In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import traceback
import time
import json
import pandas as pd

In [2]:
def load_cookies_from_file(driver, path):
    """Read a JSON list of cookies from *path* and add each one to *driver*.

    Every cookie is normalised in place before it is handed to
    ``driver.add_cookie``:

    * one leading ``.`` is removed from its ``domain`` value, and
    * a ``sameSite`` value other than "Strict", "Lax" or "None" is dropped.

    Args:
        driver: Object exposing ``add_cookie(cookie_dict)``.
        path: Location of the JSON file holding the list of cookie dicts.
    """
    accepted_same_site = ("Strict", "Lax", "None")

    with open(path, 'r') as source:
        for entry in json.load(source):
            # Strip a single leading dot from the cookie domain.
            has_dotted_domain = 'domain' in entry and entry['domain'].startswith('.')
            if has_dotted_domain:
                entry['domain'] = entry['domain'][1:]

            # Discard sameSite values outside the accepted set.
            if 'sameSite' in entry:
                if entry['sameSite'] not in accepted_same_site:
                    del entry['sameSite']

            driver.add_cookie(entry)

def save_cookies_to_file(driver, path):
    """Serialise the cookies returned by ``driver.get_cookies()`` to *path* as JSON.

    Args:
        driver: Object exposing ``get_cookies()``.
        path: Destination file; it is opened for writing, so any previous
            content is replaced.
    """
    with open(path, 'w') as sink:
        current_cookies = driver.get_cookies()
        json.dump(current_cookies, fp=sink)

def scrape_hashtag_posts(driver, hashtag, num_posts=10, scrolls_per_round=3,
                         max_stalled_rounds=5,
                         output_path='instagram_posts_urls.json'):
    """Collect post URLs from the Instagram keyword-search page for *hashtag*.

    The page is opened, then scrolled in rounds. After each round every
    ``<a>`` element is inspected and hrefs containing ``/p/`` are added to
    the result set. Scraping stops when ``num_posts`` URLs have been
    gathered or when ``max_stalled_rounds`` consecutive rounds add nothing
    new. The collected URLs are written to ``output_path`` as a JSON list.

    Args:
        driver: Selenium WebDriver with an already authenticated session.
        hashtag: Search keyword; it is URL-encoded before being placed in
            the query string.
        num_posts: Maximum number of unique post URLs to collect.
        scrolls_per_round: How many scroll-to-bottom steps to perform before
            each harvest of links.
        max_stalled_rounds: Number of consecutive rounds without a new URL
            after which scraping gives up.
        output_path: File that receives the JSON list of URLs.

    Returns:
        set: The unique post URLs that were found.
    """
    # Local imports keep this function self-contained.
    from urllib.parse import quote
    from selenium.common.exceptions import StaleElementReferenceException

    # Encode the keyword so spaces, '#', '&' etc. cannot corrupt the query.
    query = quote(str(hashtag), safe='')
    driver.get(f"https://www.instagram.com/explore/search/keyword/?q={query}")

    # Wait for the page to load
    time.sleep(15)
    print("Page loaded")

    unique_posts = set()
    count = 0  # consecutive rounds that produced no new URL
    while len(unique_posts) < num_posts:
        current_posts = len(unique_posts)

        # Scroll down to trigger loading of more posts.
        for i in range(scrolls_per_round):
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(3)  # Wait for new posts to load
            print(f"Scrolled {i+1} times")

        post_links = driver.find_elements(By.TAG_NAME, 'a')
        print(f"Found {len(post_links)} links")

        # Walk the links in document order; uniqueness is handled by the
        # set of hrefs, so the elements themselves need no de-duplication.
        for link in post_links:
            if len(unique_posts) >= num_posts:
                break
            try:
                href = link.get_attribute('href')
            except StaleElementReferenceException:
                # The anchor was removed from the DOM after it was located;
                # skip it rather than abort the whole scrape.
                continue
            if href and '/p/' in href:  # Filter only post URLs
                unique_posts.add(href)

        if len(unique_posts) == current_posts:
            count += 1
        else:
            count = 0
        if count >= max_stalled_rounds:
            break
        print(f"Found {len(unique_posts)} unique posts")

    with open(output_path, 'w') as json_file:
        json_file.write(json.dumps(list(unique_posts), indent=4))
    print("Count:", count)
    return unique_posts

def extract_post_metadata(driver, post_url):
    """Open *post_url* and return the metadata that can be read from the page.

    Currently only the posting timestamp is extracted: the ``datetime``
    attribute of the ``<time class='x1p4m5qa'>`` element.

    Args:
        driver: Selenium WebDriver used to load the post.
        post_url: URL of the post to inspect.

    Returns:
        dict: Always contains the key ``'date_posted'``. Its value is the
        element's ``datetime`` attribute, or ``''`` when the element is
        missing or the lookup failed.
    """
    # Local import keeps this function self-contained.
    from selenium.common.exceptions import NoSuchElementException

    driver.get(post_url)
    time.sleep(5)  # Wait for the page to load

    # Start from the fallback so the key exists on every path.
    metadata = {'date_posted': ''}
    try:
        date_element = driver.find_element(By.XPATH, "//time[@class='x1p4m5qa']")
        metadata['date_posted'] = date_element.get_attribute('datetime')
    except NoSuchElementException:
        # No timestamp element on the page: keep the empty fallback.
        pass
    except Exception as e:
        # Anything else is unexpected; report it but keep going so one bad
        # post does not stop a long scrape. KeyboardInterrupt/SystemExit
        # are deliberately not caught.
        print(f"An error occurred while extracting metadata from {post_url}: {e}")
        traceback.print_exc()

    return metadata

def scrape_metadata(driver, post_urls):
    """Extract metadata for every URL in *post_urls* and return it as a list.

    Before every 5th post (including the very first) the driver's cookies
    are written to ``cookies.json``, read back and the page is refreshed.
    Before every 10th post (except the first) the function sleeps for 300
    seconds.

    Args:
        driver: Selenium WebDriver used for all page loads.
        post_urls: Iterable of post URLs.

    Returns:
        list: One metadata dict per URL, each with an added ``'url'`` key.
    """
    collected = []
    # `done` equals the number of posts processed so far.
    for done, post_url in enumerate(post_urls):
        if done % 5 == 0:
            # Round-trip the session cookies through cookies.json.
            save_cookies_to_file(driver, "cookies.json")
            time.sleep(5)  # Wait for cookies to be saved
            load_cookies_from_file(driver, "cookies.json")
            time.sleep(5)  # Wait for cookies to be loaded
            driver.refresh()

        if done > 0 and done % 10 == 0:
            # Long pause every 10 posts to reduce the risk of being blocked.
            print("Pausing for 300 seconds...")
            time.sleep(300)

        record = extract_post_metadata(driver, post_url)
        record['url'] = post_url
        collected.append(record)
    return collected

if __name__ == "__main__":
    # Script entry point: restore a logged-in Instagram session from saved
    # cookies, verify it, then collect post URLs for one hashtag.
    HASHTAG = 'lashes'  # The hashtag you want to scrape
    CHROMEDRIVER_PATH = r'E:\Tools\chromedriver-win64\chromedriver-win64\chromedriver.exe'  # Update with the path to chromedriver
    CHROME_BINARY_PATH = r'E:\Tools\chrome-win64\chrome-win64\chrome.exe'  # Update with the path to chrome binary
    COOKIES_PATH = "cookies.json"

    service = Service(CHROMEDRIVER_PATH)
    options = webdriver.ChromeOptions()
    options.add_experimental_option("detach", True)  # Keep the browser open after the script ends
    # options.add_argument('--headless=new')  # Run in headless mode
    options.binary_location = CHROME_BINARY_PATH
    driver = webdriver.Chrome(service=service, options=options)

    try:
        # Cookies can only be added for the domain that is currently open,
        # so load the site first.
        driver.get("https://www.instagram.com/")
        print("Waiting for login...")
        time.sleep(3)  # Wait for the page to load

        load_cookies_from_file(driver, COOKIES_PATH)  # Load cookies from file
        time.sleep(5)  # Wait for cookies to be loaded
        driver.refresh()  # Refresh to apply cookies
        print("Cookies loaded")

        # Check if the session is still valid: a URL containing "login"
        # is treated as a failed session.
        driver.get("https://www.instagram.com/")
        print("Checking session...")
        time.sleep(5)
        if "login" in driver.current_url:
            print("Session is not valid. Please check the cookies.")
        else:
            print("Session is valid. Proceeding to scrape.")

            # scrape_hashtag_posts navigates to the search page itself, so
            # no separate driver.get() is needed here.
            post_urls = scrape_hashtag_posts(driver, HASHTAG, 10000)  # Scrape posts

            # Print out the collected post URLs
            for url in post_urls:
                print(url)

            # NOTE: per-post metadata is not collected here. To enable it,
            # pass post_urls to scrape_metadata(driver, post_urls) and dump
            # the returned list to 'instagram_posts_metadata.json'.

    except Exception as e:
        print(f"An error occurred: {e}")
        traceback.print_exc()  # Print the stack trace for more details

    finally:
        driver.quit()  # Close the browser

Waiting for login...
Cookies loaded
Checking session...
Session is valid. Proceeding to scrape.
Page loaded
Scrolled 1 times
Scrolled 2 times
Scrolled 3 times
Found 72 links
Found 48 unique posts
Scrolled 1 times
Scrolled 2 times
Scrolled 3 times
Found 72 links
Found 96 unique posts
Scrolled 1 times
Scrolled 2 times
Scrolled 3 times
Found 72 links
Found 144 unique posts
Scrolled 1 times
Scrolled 2 times
Scrolled 3 times
Found 72 links
Found 192 unique posts
Scrolled 1 times
Scrolled 2 times
Scrolled 3 times
Found 72 links
Found 240 unique posts
Scrolled 1 times
Scrolled 2 times
Scrolled 3 times
Found 72 links
Found 288 unique posts
Scrolled 1 times
Scrolled 2 times
Scrolled 3 times
Found 72 links
Found 336 unique posts
Scrolled 1 times
Scrolled 2 times
Scrolled 3 times
Found 72 links
Found 384 unique posts
Scrolled 1 times
Scrolled 2 times
Scrolled 3 times
Found 72 links
Found 431 unique posts
Scrolled 1 times
Scrolled 2 times
Scrolled 3 times
Found 72 links
Found 479 unique posts
Scro