In [None]:
"""
This script automates Instagram hashtag page scraping using Selenium.
It logs into Instagram, navigates to a hashtag page, and collects unique post URLs.
The process continues until TARGET_POSTS unique posts are collected (500 by default), with tqdm progress tracking.
"""

In [None]:
import os
import random
import time
from time import sleep

import pandas as pd
from dotenv import load_dotenv
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from tqdm import tqdm

# Load variables from a .env file (python-dotenv) into the process environment so the
# os.getenv lookups for the INSTA_* credentials in the login function can find them.
load_dotenv()

In [None]:
# Set to True to run Chrome without a visible window.
# NOTE(review): the collection loop further down moves the mouse by fixed pixel
# offsets, which presumably were measured on a maximized, visible window — confirm
# the grid geometry before enabling headless mode.
HEADLESS = False

chrome_options = Options()
if HEADLESS:
    chrome_options.add_argument("--headless")  # Run in background

# Always hand the options object to Chrome so any flag configured above actually
# takes effect (previously the options were built and then silently discarded).
driver = webdriver.Chrome(options=chrome_options)
driver.maximize_window()

In [None]:
# Login function
def login_to_instagram(max_retries=3, wait_timeout=15):
    """Log in to Instagram through the module-level ``driver``, retrying on failure.

    Credentials are read from the ``INSTA_USERNAME_2`` and ``INSTA_PASSWORD_2``
    environment variables. Each attempt reloads the Instagram home page, waits
    for the username field, types both credentials with randomized pauses and
    submits the form.

    Args:
        max_retries: Maximum number of login attempts; must be at least 1.
        wait_timeout: Seconds to wait for the username field to become usable
            on each attempt before that attempt counts as failed.

    Returns:
        True once the login form has been submitted. The resulting page is not
        inspected, so this does not prove that Instagram accepted the credentials.

    Raises:
        ValueError: If either credential is missing from the environment, or
            if ``max_retries`` is smaller than 1.
        TimeoutException, NoSuchElementException: Re-raised from the final
            attempt when every attempt failed to find the login form.
    """
    username_val = os.getenv("INSTA_USERNAME_2")
    password_val = os.getenv("INSTA_PASSWORD_2")

    if not username_val or not password_val:
        raise ValueError("Instagram credentials not found in environment variables")
    if max_retries < 1:
        # range(0) would skip the loop and return None without ever attempting a login
        raise ValueError("max_retries must be at least 1")

    # Imported locally so the notebook's import cell does not need to change;
    # selenium itself is already a dependency of this notebook.
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.ui import WebDriverWait

    for attempt in range(max_retries):
        try:
            driver.get("https://www.instagram.com/")
            # Wait for the login form explicitly rather than trusting a fixed sleep;
            # raises TimeoutException if the field never becomes usable.
            username = WebDriverWait(driver, wait_timeout).until(
                EC.element_to_be_clickable((By.NAME, "username"))
            )
            sleep(random.uniform(2, 3))

            # enter username (reuse the value validated above)
            username.send_keys(username_val)
            sleep(random.uniform(2, 3))

            # enter password
            password = driver.find_element(By.NAME, "password")
            password.send_keys(password_val)
            sleep(random.uniform(1.5, 2.5))

            # submit
            password.submit()
            # Give the post-login redirect time to settle before the caller navigates away
            sleep(random.uniform(4, 6))
            print("Successfully logged in to Instagram")
            return True

        except (TimeoutException, NoSuchElementException) as e:
            print(f"Login attempt {attempt + 1} failed: {e}")
            if attempt == max_retries - 1:
                print("Maximum login retries exceeded")
                raise
            sleep(5)  # Wait before retry


# Login
# Executed when this cell runs: signs in on the shared driver before any hashtag
# page is opened. Raises if the credentials are missing or every attempt fails.
login_to_instagram()

In [None]:
# Number of unique post URLs the collection loop gathers before it stops.
TARGET_POSTS = 500

In [None]:
# Hashtag to scrape; a leading '#' is tolerated and stripped when building the URL.
# The value is also used as the hash_tag column and as the output CSV file name.
HASHTAG = "jeetwin"
url = f"https://www.instagram.com/explore/tags/{HASHTAG.lstrip('#')}/"
driver.get(url)
# Randomized pause to let the hashtag page finish loading before elements are looked up.
time.sleep(random.uniform(5, 6.5))

In [None]:
def extract_metadata():
    """Return the URL currently shown by the module-level ``driver``.

    The collection loop calls this right after clicking a grid tile, so the
    value is expected to be the address of the post that was just opened.
    """
    return driver.current_url

In [None]:
# Accumulators filled by the collection loop.
# unique_urls: set of post URLs already seen, used for de-duplication.
# metadata_list: one dict per collected post, later turned into a DataFrame.
# NOTE: re-running this cell discards everything collected so far.
unique_urls = set()
metadata_list = []

In [None]:
# Locate the element the mouse will start from — presumably the first tile of the
# post grid (TODO confirm). The absolute XPath is tied to Instagram's current DOM
# and will raise NoSuchElementException as soon as the page layout changes.
div_post = driver.find_element(
    By.XPATH,
    "/html/body/div[1]/div/div/div[2]/div/div/div[1]/div[1]/div[1]/section/main/div/div/div[1]/div/div[1]/div[1]/div/a/div/div[2]",
)
# Short randomized pause before interacting with the located element.
time.sleep(random.uniform(1, 1.5))

In [None]:
# Park the mouse pointer over the starting tile. The collection loop clicks at the
# current pointer position and then moves by relative offsets from here.
actions = ActionChains(driver)
actions.move_to_element(div_post).perform()

In [None]:
# Grid geometry used by the collection loop to step the mouse from tile to tile.
col = 5  # number of tiles per row
row_num = 0  # rows completed so far (incremented by the loop)
col_num = 1  # 1-based column of the tile currently under the pointer
post_width = 235  # 234.66 rounded
post_height = 313  # 312.86 rounded
# Alternative values kept for reference (3 tiles per row, 320 px square tiles):
# col = 3
# row_num = 0
# col_num = 1
# post_width = 320  # 319.38 rounded
# post_height = 320  # 319.38 rounded

In [None]:
%%time
# Give up after this many consecutive clicks that produce no new post URL, so the
# loop cannot spin forever once the hashtag page has no fresh posts left.
MAX_STALLED_CLICKS = 50
stalled_clicks = 0

# `initial` keeps the bar accurate when this cell is re-run after an interruption
# with URLs already present in unique_urls.
pbar = tqdm(
    total=TARGET_POSTS,
    initial=len(unique_urls),
    desc="Collecting posts",
    unit="post",
)

try:
    while len(unique_urls) < TARGET_POSTS:
        # Click on the current position
        actions.click().perform()
        time.sleep(random.uniform(0.3, 0.7))
        # Extract metadata
        single_post_url = extract_metadata()
        # If the browser is still on the hashtag listing, the click did not open a
        # post; recording that URL would add a bogus row to the results.
        opened_post = "/explore/tags/" not in single_post_url
        # Only add if it's a unique URL
        if opened_post and single_post_url not in unique_urls:
            unique_urls.add(single_post_url)
            metadata_list.append({"url": single_post_url, "hash_tag": HASHTAG})
            pbar.update(1)
            stalled_clicks = 0
        else:
            stalled_clicks += 1

        time.sleep(random.uniform(0.3, 0.7))

        # Close the post
        actions.send_keys(Keys.ESCAPE).perform()
        time.sleep(random.uniform(0.3, 0.7))

        # Stop only after the post has been closed, so the page is left in a clean state
        if stalled_clicks >= MAX_STALLED_CLICKS:
            print(f"\nNo new posts after {stalled_clicks} consecutive clicks; stopping early")
            break

        # Move to next post
        if col_num < col:
            # Move right - create fresh ActionChains to avoid accumulation
            actions = ActionChains(driver)
            actions.move_by_offset(post_width, 0).perform()
            col_num += 1
        else:
            # Move back to first column and scroll down
            # Create fresh ActionChains and move left by (col-1) * width
            actions = ActionChains(driver)
            actions.move_by_offset(-(col - 1) * post_width, 0).perform()
            col_num = 1
            row_num += 1

            # Scroll the page by one row; the pointer keeps its viewport position,
            # so it ends up over the first tile of the next row.
            driver.execute_script(f"window.scrollBy(0, {post_height});")
            time.sleep(random.uniform(0.5, 1))

        time.sleep(random.uniform(0.3, 0.7))

except KeyboardInterrupt:
    # Everything collected so far stays in unique_urls / metadata_list
    print("\nScraping interrupted by user")
finally:
    pbar.close()

In [None]:
# Fixing the column list keeps the CSV header stable even when nothing was collected.
metadata_df = pd.DataFrame(metadata_list, columns=["url", "hash_tag"])

# to_csv does not create missing folders; make sure the target directory exists so
# a long scraping run is not lost to a missing "posts" folder.
output_path = f"posts/{HASHTAG}.csv"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
metadata_df.to_csv(output_path, index=False)

In [None]:
# Sanity check: number of collected rows (one per unique post URL).
len(metadata_df)

In [None]:
# Preview the first rows of the collected data.
metadata_df.head()

In [None]:
# Close the browser and end the WebDriver session; run this last.
driver.quit()