This file includes starter code to scrape the top visible comments from Instagram posts and can be adapted as needed. When scraping, please be sure to respect the platform's rate limits.

Note: Instagram may change its underlying HTML, which could cause the script to break. This version was confirmed working as of 15.01.2025.

#### Load libraries

In [1]:
import os
import json
import time
import random
from tqdm import tqdm

from seleniumwire import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

#### Helper Functions

In [2]:
"""
Function to create a selenium driver.
Inputs:
    1. proxy_dict: format similar to requests library. ex: proxy_dict = {'https': f'http://{username}:{password}@{proxy}:{port}'}.
    2. headless_flag: True if mode should be headless else False. Default: False
    3. min_wait: Minimum duration (in sec) to wait. Default: 7
    4. max_wait: Maximum duration (in sec) to wait. Default: 10
"""
def create_selenium_driver(proxy_dict, headless_flag=False, min_wait=7, max_wait=15):
    selenium_options = proxy_dict
    chrome_options = Options()
    chrome_options.add_argument("--no-sandbox")

    if headless_flag:
        chrome_options.add_argument("--headless")

    driver = webdriver.Chrome(
        seleniumwire_options=selenium_options, 
        options=chrome_options              
    )
    wait = WebDriverWait(driver, random.randint(min_wait, max_wait))

    return driver, wait


"""
Function to click on decline optional cookies button.
Inputs:
    1. wait: wait pointer from driver.
"""
def decline_optional_cookies(wait):
    try:
        decline_btn = wait.until(
            EC.element_to_be_clickable(
                (By.XPATH, "//button[normalize-space()='Decline optional cookies']")
            )
        )
        decline_btn.click()
    except Exception:
        print("No 'Decline optional cookies' button found or it was not clickable.")


"""
Function to click on close login popup.
Inputs:
    1. wait: wait pointer from driver.
"""
def close_login_popup(wait):
    try:
        close_div = wait.until(
            EC.element_to_be_clickable(
                (By.CSS_SELECTOR, "svg[aria-label='Close']")
                
            )
        )
        close_div.click()
    except Exception:
        print("No login popup found or it was not clickable.")


"""
Function to click the login button to display the input fields for entering account details.
Multiple checks to handle different UX.
Inputs:
    1. wait: wait pointer from driver.
"""
def check_login_button(wait):
    try:
        login_btn = wait.until(
            EC.element_to_be_clickable(
                (By.XPATH, "//div[@role='button' and normalize-space()='Log In']")
            )
        )
        login_btn.click()
    except Exception:
        print("No 'Log In' button found or it was not clickable.")

    try:
        login_btn = wait.until(
            EC.element_to_be_clickable(
                (By.XPATH, "/html/body/div[1]/div/div/div[2]/div/div/div[1]/div[1]/div[1]/section/main/div[1]/div[2]/div/div/div/div/div[2]/div/div[2]/div[2]/div/a")
            )
        )
        login_btn.click()
    except Exception:
        print("No 'Log In' button found or it was not clickable.")

    try:
        login_btn = wait.until(
            EC.element_to_be_clickable(
                (By.XPATH, "/html/body/div[1]/div/div/div[2]/div/div/div[1]/div[1]/div[1]/section/div/div/div[2]/div/div/div/div[1]/a]")
            )
        )
        login_btn.click()
    except Exception:
        print("No 'Log In' button found or it was not clickable.")


"""
Function to enter Instagram account details.
Inputs:
    1. wait: wait pointer from driver.
    2. uname: account username or email id.
    3. pwd: account password. 
"""
def enter_account_details(wait, uname, pwd):
    try:
        username = wait.until(EC.element_to_be_clickable(
            (By.CSS_SELECTOR, 'input[aria-label="Phone number, username or email address"]')
        ))
        username.clear()
        username.send_keys(uname)
    except Exception:
        print("Username error", e)

    try:
        password = wait.until(EC.element_to_be_clickable(
            (By.CSS_SELECTOR, 'input[aria-label="Password"]')
        ))
        password.clear()
        password.send_keys(pwd)
    except Exception:
        print("Password error")


"""
Function to click on 'Log In' button after entering account details 
Inputs:
    1. wait: wait pointer from driver.
"""
def submit_account_details(wait):
    try:
        login_btn = wait.until(
            EC.element_to_be_clickable(
                (By.XPATH, "//button[@type='submit' and .//div[normalize-space()='Log in'] and not(@disabled)]")
            )
        )
        login_btn.click()
    except Exception:
        print("No 'Log In' button found or it was not clickable.")


"""
Function to click 'Not Now' button after logging in.
Inputs:
    1. wait: wait pointer from driver.
"""
def click_not_now(wait):
    try:
        dont_save = wait.until(
            EC.element_to_be_clickable(
                (By.XPATH, "//div[@role='button' and normalize-space()='Not now']")
            )
        )
        dont_save.click()
    except Exception:
        print("No 'Not now' button found or it was not clickable.")

#### Enter details here

In [3]:
# Proxy settings.
# Note: Some proxies might not require a username or password. Leave proxy_uname and proxy_pwd empty
# in that case; the proxy URL then takes the form 'http://{proxy_ip}:{proxy_port}'.
proxy_uname = ''
proxy_pwd = ''
proxy_ip = ''
proxy_port = 0

# Only embed the 'user:password@' part when credentials were actually provided,
# so that a proxy without authentication does not get an empty ':@' in its URL.
proxy_auth = f'{proxy_uname}:{proxy_pwd}@' if (proxy_uname or proxy_pwd) else ''
proxy_dict = {'https': f'http://{proxy_auth}{proxy_ip}:{proxy_port}'}

# Instagram Account Details
uname = ''
pwd = ''

# Directory to save files
save_path = ''

# List of Post URLs to scrape
# URL must be of the form https://www.instagram.com/p/{url_id}/
# If URL is of the form https://www.instagram.com/reel/{url_id}/ or https://www.instagram.com/{username}/reel/{url_id}/, it should be converted to the above form.
urls = []

#### Main code

In [None]:
# Log in to Instagram once before scraping. The steps below are best-effort helpers
# (each prints a message instead of raising when its element is missing), separated
# by random pauses.

# Starting URL. Choose one from the two below and test.
# Some accounts will work with the first and some with the second depending on which browser the account was created on.
# Note: urls[0] raises an IndexError when the `urls` list has been left empty.
url = urls[0]
# url = 'https://www.instagram.com/accounts/login/'

# Open the browser on the starting URL and dismiss the cookie banner if it is shown.
driver, wait = create_selenium_driver(proxy_dict)
driver.get(url)
decline_optional_cookies(wait)

# Reveal the login form.
time.sleep(random.randint(2, 3))
check_login_button(wait)

# First attempt: fill in the credentials and submit them.
time.sleep(random.randint(2, 5))
enter_account_details(wait, uname, pwd)

time.sleep(random.randint(2, 5))
submit_account_details(wait)

# The credentials are entered and submitted a second time.
# NOTE(review): presumably a retry for layouts where the first attempt does not go through - confirm.
time.sleep(random.randint(2, 5))
enter_account_details(wait, uname, pwd)

time.sleep(random.randint(2, 5))
submit_account_details(wait)
decline_optional_cookies(wait)

# Dismiss the 'Not now' prompt and any cookie banner that shows up after logging in.
time.sleep(random.randint(2, 5))
click_not_now(wait)
decline_optional_cookies(wait)

# Longer pause (15-25 sec) before the scraping loop starts.
time.sleep(random.randint(15, 25))

# Local time at which the login phase finished; not used in the code visible here.
time_now = time.strftime("%Y-%m-%d %H:%M:%S")

# Scrape every post URL in its own tab and store the results as
# {url_id}_comments.json and {url_id}_timestamps.json inside save_path.
# Posts whose comments file already exists are skipped.
from urllib.parse import urlsplit

try:
    for url in urls:
        # The post id is the last non-empty segment of the URL path. Working on the
        # parsed path keeps this correct for URLs without a trailing slash or with
        # a query string.
        path_segments = [part for part in urlsplit(url).path.split('/') if part]
        url_id = path_segments[-1] if path_segments else ''

        # os.path.join keeps an empty save_path relative to the working directory
        # instead of pointing at the filesystem root.
        comments_file = os.path.join(save_path, f'{url_id}_comments.json')
        timestamps_file = os.path.join(save_path, f'{url_id}_timestamps.json')

        if os.path.exists(comments_file):
            print("File already exists")
            continue

        # open new URL in new tab
        driver.switch_to.new_window('tab')
        driver.get(url)
        decline_optional_cookies(wait)

        try:
            # Collect comments using 'class' attribute. Note: Collected texts also include caption and number of likes of the post along with the comments.
            classes = "x1lliihq x1plvlek xryxfnj x1n2onr6 xyejjpt x15dsfln x193iq5w xeuugli x1fj9vlw x13faqbe x1vvkbs x1s928wv xhkezso x1gmr53x x1cpjm7i x1fgarty x1943h6x x1i0vuye xvs91rp xo1l8bm x5n08af x10wh9bi xpm28yp x8viiok x1o7cslx"
            selector = "span." + ".".join(classes.split())
            content = wait.until(
                EC.presence_of_all_elements_located((By.CSS_SELECTOR, selector))
            )

            content = [text.text for text in content]

            # save collected comments
            if save_path:
                os.makedirs(save_path, exist_ok=True)

            with open(comments_file, 'w') as f:
                json.dump(content, f)

            # Collect timestamps of the comments using 'class' attribute. These timestamps can be mapped to the comments after removing the caption and number of likes.
            classes = "x1ejq31n x18oe1m7 x1sy0etr xstzfhl x1roi4f4 xexx8yu xyri2b x18d9i69 x1c1uobl x1n2onr6"
            selector = "time." + ".".join(classes.split())
            timestamp = wait.until(
                EC.presence_of_all_elements_located((By.CSS_SELECTOR, selector))
            )
            timestamp = [t.get_attribute('datetime') for t in timestamp]

            # save collected timestamps
            with open(timestamps_file, 'w') as f:
                json.dump(timestamp, f)

        except Exception as e:
            print(f"Error extracting comments for {url}", e)

        # close this URL
        driver.close()

        # switch to first tab
        if driver.window_handles:
            driver.switch_to.window(driver.window_handles[0])

        # random pause between posts
        time.sleep(random.randint(3, 8))
finally:
    # Always shut the browser down, even when a post fails outside the inner try.
    driver.quit()