In [1]:
# === User-Agent Rotation ===
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.action_chains import ActionChains
from selenium.common.exceptions import NoSuchElementException
import json
import random
import time
import traceback
from fake_useragent import UserAgent
import requests
from selenium.webdriver.common.proxy import Proxy, ProxyType

In [2]:
def get_random_user_agent():
    ua = UserAgent()
    return ua.random

def load_cookies_from_file(driver, path):
    with open(path, 'r') as cookiesfile:
        cookies = json.load(cookiesfile)
        for cookie in cookies:
            if 'domain' in cookie and cookie['domain'].startswith('.'):
                cookie['domain'] = cookie['domain'][1:]
            if 'sameSite' in cookie and cookie['sameSite'] not in ["Strict", "Lax", "None"]:
                del cookie['sameSite']
            driver.add_cookie(cookie)

def save_cookies_to_file(driver, path):
    with open(path, 'w') as cookiesfile:
        json.dump(driver.get_cookies(), cookiesfile)

def simulate_user_interaction(driver):
    """Simulate human-like mouse movements and scrolling."""
    action = ActionChains(driver)
    action.move_by_offset(random.randint(1, 300), random.randint(1, 300)).perform()  # Random mouse movement
    driver.execute_script("window.scrollBy(0, 300);")  # Scroll slightly
    time.sleep(random.uniform(1, 3))  # Random delay

def collect_hashtag_posts(driver, hashtag, num_posts=10):
    driver.get(f"https://www.instagram.com/explore/search/keyword/?q={hashtag}")
    time.sleep(15)
    print("Page loaded")

    unique_posts = set()
    count = 0
    while len(unique_posts) < num_posts:
        current_posts = len(unique_posts)

        for i in range(3):
            # simulate_user_interaction(driver)  # Simulate user behavior during scrolling
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(random.uniform(2, 5))  # Random delay after each scroll
            print(f"Scrolled {i+1} times")

        post_links = driver.find_elements(By.TAG_NAME, 'a')
        print(f"Found {len(post_links)} links")

        post_links = list(set(post_links))  # Remove duplicates

        for link in post_links:
            href = link.get_attribute('href')
            if href and '/p/' in href:
                if len(unique_posts) >= num_posts:
                    break
                unique_posts.add(href)

        if len(unique_posts) == current_posts:
            count += 1
        else:
            count = 0

        if count == 5:
            break

        print(f"Found {len(unique_posts)} unique posts")

    unique_posts_json = json.dumps(list(unique_posts), indent=4)
    with open(f'instagram_posts_urls_{len(unique_posts)}.json', 'w') as json_file:
        json_file.write(unique_posts_json)
    print("Count:", count)
    return unique_posts

def get_proxies():
    response = requests.get('https://api.proxyscrape.com/v4/free-proxy-list/get?request=display_proxies&protocol=socks4&proxy_format=protocolipport&format=text&timeout=20000')
    proxies = response.text.strip().split('\n')
    return proxies

def set_proxy(options, proxy):
    proxy = proxy.replace('socks4://', '')
    options.add_argument(f'--proxy-server=socks4://{proxy}')

def extract_post_metadata(driver, post_url):
    driver.get(post_url)
    time.sleep(5)

    metadata = {}
    try:
        # Check for restricted content warning
        try:
            restricted_warning = driver.find_element(By.XPATH, "//span[contains(text(), 'Restricted profile')]")
            if restricted_warning:
                print(f"Restricted content warning found for {post_url}. Skipping...")
                return None
        except NoSuchElementException:
            pass

        # Check for "page isn't available" message
        try:
            page_not_available = driver.find_element(By.XPATH, "//span[contains(text(), \"Sorry, this page isn't available.\")]")
            if page_not_available:
                print(f"Page not available message found for {post_url}. Skipping...")
                return None
        except NoSuchElementException:
            pass

        # Try to extract the date posted
        date_element = driver.find_element(By.XPATH, "//time[@class='x1p4m5qa']")
        metadata['date_posted'] = date_element.get_attribute('datetime')

    except Exception as e:
        print(f"An error occurred while extracting metadata from {post_url}: {e}")
        traceback.print_exc()

    return metadata

def scrape_metadata(driver, post_urls, output_file='instagram_posts_metadata.json'):
    # Load existing data from the JSON file
    try:
        with open(output_file, 'r') as json_file:
            print("Loading existing metadata...")
            data = json.load(json_file)
    except (FileNotFoundError, json.JSONDecodeError):
            print("No existing metadata found")
            data = []

    if data:
        # Remove metadata entries without a 'date_posted' field
        data = [metadata for metadata in data if metadata['date_posted']]
    
    for url in post_urls:
        if any(metadata['url'] == url for metadata in data):
            print(f"Metadata for {url} already exists. Skipping...")
            continue
        if len(data) % 10 == 0 and len(data) > 0:
            wait_time = random.randint(30, 300)
            print(f"Pausing for {wait_time} seconds...")
            time.sleep(wait_time)

        metadata = extract_post_metadata(driver, url)

        if metadata is None:  # If metadata is None, skip this post
            continue

        if not metadata:  # If metadata is empty, wait and retry randomly from 1 to 3 hours
            wait_time = random.randint(3600, 10800)
            print(f"No metadata found for {url}. Waiting {wait_time} seconds before retrying...")
            time.sleep(wait_time)
            metadata = extract_post_metadata(driver, url)  # Retry after an hour

        metadata['url'] = url
        data.append(metadata)

        # Save metadata to file after each post
        with open(output_file, 'w') as json_file:
            json.dump(data, json_file, indent=4)

    return data

if __name__ == "__main__":
    HASHTAG = 'lashartist'
    service = Service(r'E:\Tools\chromedriver-win64\chromedriver-win64\chromedriver.exe')

    # Fetch proxies
    proxies = get_proxies()
    proxy_index = 0

    # Chrome options with random User-Agent
    options = webdriver.ChromeOptions()
    options.add_experimental_option("detach", True)
    options.add_argument(f"user-agent={get_random_user_agent()}")  # Rotate User-Agent
    options.binary_location = r'E:\Tools\chrome-win64\chrome-win64\chrome.exe'

    # Session 1: Scrape post URLs with loaded cookies




    # Session 2: Scrape post metadata without loading cookies

    driver = webdriver.Chrome(service=service, options=options)

    try:
        with open('instagram_posts_urls_10000.json', 'r') as json_file:
            post_urls = json.load(json_file)
        metadata_list = scrape_metadata(driver, post_urls)

        metadata_json = json.dumps(metadata_list, indent=4)
        print(metadata_json)
        
        with open('instagram_posts_metadata.json', 'w') as json_file:
            json_file.write(metadata_json)

    except Exception as e:
        print(f"An error occurred: {e}")
        traceback.print_exc()

    finally:
        driver.quit()

    # Session 3: Check if there are empty metadata entries and retry, if necessary

    # driver = webdriver.Chrome(service=service, options=options)

    # try:
    #     with open('instagram_posts_metadata.json', 'r') as json_file:
    #         metadata_list = json.load(json_file)

    #     # Identify empty metadata entries based on the 'date_posted' field
    #     empty_metadata = [metadata for metadata in metadata_list if not metadata['date_posted']]
    #     print(f"Found {len(empty_metadata)} empty metadata entries")

    #     if empty_metadata:
    #         post_urls = [metadata['url'] for metadata in empty_metadata]
    #         retried_metadata_list = scrape_metadata(driver, post_urls)

    #         # Remove empty metadata entries from the original list
    #         # metadata_list = [metadata for metadata in metadata_list if metadata['date_posted']]

    #         # Add retried metadata to the list
    #         # metadata_list.extend(retried_metadata_list)

    #         # metadata_json = json.dumps(metadata_list, indent=4)
    #         # print(metadata_json)

    #         # with open('instagram_posts_metadata.json', 'w') as json_file:
    #         #     json_file.write(metadata_json)
    # except Exception as e:
    #     print(f"An error occurred: {e}")
    #     traceback.print_exc()

    # finally:
    #     driver.quit()


Loading existing metadata...
An error occurred: 'date_posted'


Traceback (most recent call last):
  File "C:\Users\Alex Hoang\AppData\Local\Temp\ipykernel_9616\597891650.py", line 182, in <module>
    metadata_list = scrape_metadata(driver, post_urls)
  File "C:\Users\Alex Hoang\AppData\Local\Temp\ipykernel_9616\597891650.py", line 125, in scrape_metadata
    data = [metadata for metadata in data if metadata['date_posted']]
  File "C:\Users\Alex Hoang\AppData\Local\Temp\ipykernel_9616\597891650.py", line 125, in <listcomp>
    data = [metadata for metadata in data if metadata['date_posted']]
KeyError: 'date_posted'
