In [None]:
"""
This notebook logs into Instagram using Selenium (credentials come from the INSTA_USERNAME
and INSTA_PASSWORD environment variables), reads a CSV file of post URLs and hashtags,
and downloads image posts (skipping videos) into a specified folder.
Existing images are not re-downloaded.
"""

In [None]:
import os
import random
import time
from time import sleep
from urllib.parse import urlsplit

import pandas as pd
import requests
from dotenv import load_dotenv
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from tqdm import tqdm

# Load variables from a local .env file (if present) into the process environment;
# INSTA_USERNAME and INSTA_PASSWORD are read from the environment at login time.
load_dotenv()

In [None]:
# Start a visible Chrome session; every later cell uses it through the global `driver`.
# Headless alternative, kept for reference. It needs
# `from selenium.webdriver.chrome.options import Options`, which this notebook does not import:
# chrome_options = Options()
# chrome_options.add_argument("--headless")  # Run in background
# driver = webdriver.Chrome(options=chrome_options)
driver = webdriver.Chrome()
driver.maximize_window()

In [None]:
# Login function
def login_to_instagram(max_retries=3):
    """Log in to Instagram through the module-level Selenium ``driver``.

    Credentials are read once from the ``INSTA_USERNAME`` and
    ``INSTA_PASSWORD`` environment variables. Each attempt loads the home
    page, waits for the login form, types the credentials with short random
    pauses between steps and submits the form.

    Note that success is reported as soon as the form has been submitted; the
    resulting page is not inspected, so rejected credentials are not detected
    here.

    Args:
        max_retries: Maximum number of attempts before giving up.

    Returns:
        True once the login form has been submitted, or False when
        ``max_retries`` is less than 1 and no attempt was made.

    Raises:
        ValueError: If either credential is missing or empty.
        TimeoutException, NoSuchElementException: Re-raised from the final
            attempt when every attempt failed.
    """
    username_val = os.getenv("INSTA_USERNAME")
    password_val = os.getenv("INSTA_PASSWORD")

    if not username_val or not password_val:
        raise ValueError("Instagram credentials not found in environment variables")

    for attempt in range(max_retries):
        try:
            driver.get("https://www.instagram.com/")
            # Wait for the login form explicitly instead of relying on a fixed
            # sleep being long enough for the page to render.
            username = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.NAME, "username"))
            )
            sleep(random.uniform(2, 3))

            # enter username (use the values validated above, not a second env lookup)
            username.send_keys(username_val)
            sleep(random.uniform(2, 3))

            # enter password
            password = driver.find_element(By.NAME, "password")
            password.send_keys(password_val)
            sleep(random.uniform(1.5, 2.5))

            # submit
            password.submit()
            sleep(random.uniform(4, 6))
            print("Successfully logged in to Instagram")
            return True

        except (TimeoutException, NoSuchElementException) as e:
            print(f"Login attempt {attempt + 1} failed: {e}")
            if attempt == max_retries - 1:
                print("Maximum login retries exceeded")
                raise
            sleep(5)  # Wait before retry

    # Only reachable when max_retries < 1: nothing was attempted.
    return False


# Log in once up front; the post pages are loaded afterwards in this same browser session.
login_to_instagram()

In [None]:
def extract_shortcode(url):
    """Extract the shortcode (post ID) from an Instagram post URL.

    The shortcode is the last non-empty segment of the URL path. Query
    strings (``?igshid=...``), fragments and surrounding whitespace are
    ignored, so ``https://www.instagram.com/p/ABC123/?img_index=1`` yields
    ``"ABC123"``.

    Args:
        url: Post URL, or a bare shortcode.

    Returns:
        The final path segment; an empty string if the URL has no path.
    """
    # Take only the path so that "?..." and "#..." never end up in the result.
    path = urlsplit(url.strip()).path
    return path.strip("/").split("/")[-1]

In [None]:
# Hashtag processed in this run; it selects both the input CSV and the output sub-folder.
HASHTAG = "rajabets"
CSV_PATH = f"posts/{HASHTAG}.csv"
# The default root is specific to one machine. Set INSTA_OUTPUT_ROOT (in the
# environment or in .env) to store the images somewhere else; an unset or
# empty value falls back to the default.
OUTPUT_ROOT = (
    os.getenv("INSTA_OUTPUT_ROOT")
    or "/media/aatman/Aatman/scams-media/instagram_scrape_selenium"
)
OUTPUT_DIR = os.path.join(OUTPUT_ROOT, HASHTAG)
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [None]:
def download_instagram_image(post_url, hashtag):
    """Visit a post page and save its image as ``<shortcode>.jpg`` in ``OUTPUT_DIR``.

    The post is opened in the module-level Selenium ``driver``; the image
    URL is taken from the ``src`` of the ``<img>`` matched by a fixed XPath
    and fetched with ``requests``. Processing is best-effort: a failure on
    one post never raises, so a long batch keeps going.

    Args:
        post_url: URL of the Instagram post. Non-string or blank values
            (e.g. NaN from a CSV row with no URL) are skipped.
        hashtag: Hashtag the post was collected under. Currently unused;
            kept so existing callers keep working.

    Returns:
        True if a new image file was written, False otherwise (invalid URL,
        file already present, no image found, HTTP error or any other
        failure).
    """
    # A missing URL arrives from pandas as NaN (a float), not a string.
    if not isinstance(post_url, str) or not post_url.strip():
        return False

    post_id = extract_shortcode(post_url)
    filename = os.path.join(OUTPUT_DIR, f"{post_id}.jpg")

    # Skip if already downloaded
    if os.path.exists(filename):
        return False

    try:
        driver.get(post_url)
        sleep(random.uniform(1, 2))

        # NOTE(review): this absolute XPath depends on the exact page layout
        # and will stop matching when the markup changes; the shorter
        # "//article//img" was left in the original as a possible alternative.
        image_element = WebDriverWait(driver, 5).until(
            EC.presence_of_element_located(
                (
                    By.XPATH,
                    "/html/body/div[1]/div/div/div[2]/div/div/div[1]/div[1]/div[1]/section/main/div/div[1]/div/div[1]/div/div/div/div/div/div[1]/div/div/div[1]/img",
                )
            )
        )
        image_url = image_element.get_attribute("src")

        if not image_url:
            return False

        # Download image
        response = requests.get(image_url, timeout=10)
        if response.status_code != 200:
            print(
                f"Failed to download image for {post_id}. HTTP {response.status_code}"
            )
            return False

        # Write to a temporary name and rename afterwards, so an interrupted
        # write can never leave a truncated .jpg that later runs would treat
        # as already downloaded.
        partial_path = f"{filename}.part"
        with open(partial_path, "wb") as f:
            f.write(response.content)
        os.replace(partial_path, filename)
        return True

    except TimeoutException:
        # No matching <img> appeared within the wait; presumably a video post
        # (or a changed layout). Skipped quietly, as before.
        return False
    except Exception as exc:
        # Deliberately broad: one bad post must not abort the batch, but the
        # failure is reported instead of being swallowed.
        print(f"Error processing {post_url}: {exc}")
        return False

In [None]:
# Load the post list for the configured hashtag and report its row count.
df = pd.read_csv(CSV_PATH)
print(f"Found {df.shape[0]} posts in CSV.")

In [None]:
# Preview the first rows; later cells rely on the `url` and `hash_tag` columns.
df.head()

In [None]:
# Occurrences per URL, and the subset of URLs that appear more than once.
duplicate_counts = df["url"].value_counts()
duplicates = duplicate_counts.loc[duplicate_counts.gt(1)]
# Number of rows that repeat an earlier URL (first occurrences are not counted).
total_duplicate_rows = df["url"].duplicated().sum()
print(int(total_duplicate_rows))

In [None]:
# Flag URLs containing "explore" (case-insensitive); rows with a missing URL count as no match.
explore_mask = df["url"].str.contains("explore", case=False, na=False)
print(df["url"].loc[explore_mask].head())
explore_count = explore_mask.sum()
print(f"Total rows containing 'explore': {explore_count}")

In [None]:
%%time
# Visit each distinct post URL once. Rows without a URL are dropped and
# repeated URLs collapsed (the first row of each URL is kept), so a post that
# yields no image is not reloaded for every duplicate row. `df` is not modified.
unique_posts = df.dropna(subset=["url"]).drop_duplicates(subset=["url"])
for post_url, post_hashtag in tqdm(
    zip(unique_posts["url"], unique_posts["hash_tag"]),
    total=len(unique_posts),
    desc="Downloading posts",
):
    download_instagram_image(post_url, post_hashtag)

In [None]:
# Close the browser and end the WebDriver session now that the run is finished.
driver.quit()