# Get Post List
Given a list of Instagram account names, collect the links to the first 24 posts (or reels) shown on each account's profile.

In [50]:
# Imports and Helper Functions

import csv
from datetime import datetime
import os
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import random
from selenium.common.exceptions import NoSuchElementException, TimeoutException

def login_to_instagram(driver, username, password):
    """Open the Instagram login page and submit the given credentials.

    Args:
        driver: Selenium WebDriver used to drive the browser.
        username: Text typed into the form field named "username".
        password: Text typed into the form field named "password".

    Raises:
        TimeoutException: If the username field is not usable within 20 seconds.
        NoSuchElementException: If the password field or the login button
            cannot be located once the username field is present.
    """
    driver.get('https://www.instagram.com/accounts/login/')

    # Wait for the form itself rather than sleeping a fixed 8 seconds: a slow
    # page load would otherwise make the lookups below fail outright.
    username_input = WebDriverWait(driver, 20).until(
        EC.element_to_be_clickable((By.NAME, "username"))
    )
    password_input = driver.find_element(By.NAME, "password")

    username_input.send_keys(username)
    password_input.send_keys(password)

    # Press the login button
    login_button = driver.find_element(By.XPATH, '//*[@id="loginForm"]/div/div[3]/button')
    login_button.click()

    # NOTE(review): this is printed as soon as the form is submitted; the
    # outcome of the login is not actually verified here.
    print(f"Successfully logged in using {username} account!")
    # Give the post-login redirect time to finish before the caller continues.
    time.sleep(8)

# Function to navigate to the profile page
def navigate_to_profile(driver, account_name):
    """Load the Instagram profile page of account_name in the browser.

    After a successful load the function pauses for a random 3-5 seconds.
    Any exception raised along the way is printed and swallowed, so the
    function always returns None.
    """
    try:
        profile_url = f"https://www.instagram.com/{account_name}/"
        driver.get(profile_url)
        print(f"Navigated to profile: {account_name}")
        pause_seconds = random.uniform(3, 5)
        time.sleep(pause_seconds)
    except Exception as err:
        print(f"Error navigating to {account_name}: {err}")

def check_account_private(account_name, driver):
    """Report whether the page currently loaded in driver is a private profile.

    Looks for a <span> containing "This account is private". When its text
    is exactly that message, a row for account_name is appended to
    post_link_tracker.csv (a header row is written first if the file does
    not exist yet).

    Args:
        account_name: Account name recorded in the tracker row.
        driver: Selenium WebDriver already showing the profile page.

    Returns:
        True if the private-account message was found, otherwise False.

    Raises:
        OSError: If the tracker file cannot be written.
    """
    tracker_file = "post_link_tracker.csv"

    # Only the DOM lookup is guarded, so a failure while writing the tracker
    # is no longer misreported as "Account should be public."
    try:
        account_priv_element = driver.find_element(
            By.XPATH, "//span[contains(text(), 'This account is private')]")
        account_priv_message = account_priv_element.text
    except NoSuchElementException:
        # No banner on the page: the normal outcome for a public account.
        print("Account should be public.")
        return False
    except Exception as e:
        # Best-effort: keep treating the account as public, but show the
        # real error instead of hiding it.
        print(f"Could not check whether {account_name} is private: {e}")
        return False

    if account_priv_message != "This account is private":
        return False

    print("This account is private. Skipping.")

    # Check if file exists, create with header if not
    file_exists = os.path.exists(tracker_file)
    with open(tracker_file, mode='a', encoding='utf-8', newline='') as csvfile:
        writer = csv.writer(csvfile)
        if not file_exists:
            writer.writerow(["Account Name", "Timestamp", "Comments"])  # Write header if new file
        writer.writerow([account_name, datetime.now().strftime("%Y-%m-%d %H:%M:%S"), "Account is Private"])
        print(f"{account_name} is private, added to {tracker_file}")

    return True

In [None]:
# XPath matching anchors whose href points at a post ("/p/") or a reel ("/reel/").
_POST_LINK_XPATH = '//a[contains(@href, "/p/") or contains(@href, "/reel/")]'


def _record_collected_count(account_name, total_urls):
    """Append a "<total_urls> urls collected" row for account_name to the tracker CSV."""
    with open("post_link_tracker.csv", mode='a', encoding='utf-8', newline='') as tracker_csv:
        csv.writer(tracker_csv).writerow([
            account_name,
            datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            f"{total_urls} urls collected",
        ])


def get_top_links(driver, account_name, target_count=24):
    """Collect post/reel links from a profile into post_links/<account>_post_links.csv.

    URLs already stored in the account's CSV are loaded first so they are not
    written twice. If the CSV does not exist yet, the profile is opened,
    private accounts are skipped (via check_account_private), and a new CSV
    with a "url" header is created. The profile page is then scrolled,
    appending every previously unseen link, until target_count links are
    stored or the page stops growing for three consecutive-or-not attempts.

    Args:
        driver: Selenium WebDriver with a logged-in session.
        account_name: Instagram account whose profile is scraped.
        target_count: Number of links to collect before stopping (default 24).

    Returns:
        None. Results are written to the per-account CSV, and a summary row is
        appended to post_link_tracker.csv when collection finishes. No tracker
        row is written if the initial wait for post links times out.
    """
    csv_filepath = f"post_links/{account_name}_post_links.csv"

    # Load existing reel URLs from the CSV to avoid duplicates
    links_found = set()
    total_urls = 0  # Counter to keep track of the total URLs read
    profile_loaded = False  # Avoids loading the same profile page twice

    if os.path.exists(csv_filepath):
        print(f"CSV for {account_name} exists: {csv_filepath}. Reading in existing URLs.")
        with open(csv_filepath, 'r', newline='', encoding='utf-8') as csvfile:
            reader = csv.reader(csvfile)
            next(reader, None)  # Skip the header; tolerate a completely empty file
            for row in reader:
                if not row:
                    # csv.reader yields [] for blank lines; row[0] would raise.
                    continue
                links_found.add(row[0])
                total_urls += 1  # Increment the counter for each URL read
            print(f"{total_urls} found for {account_name} from CSV.")
    else:
        navigate_to_profile(driver, account_name)
        profile_loaded = True
        if check_account_private(account_name, driver):
            return

        print(f"No existing CSV for {account_name} found. Creating CSV.")
        # The output directory may not exist on a fresh checkout.
        os.makedirs(os.path.dirname(csv_filepath), exist_ok=True)
        with open(csv_filepath, 'w', newline='', encoding='utf-8') as csvfile:
            csv.DictWriter(csvfile, fieldnames=["url"]).writeheader()

    if total_urls >= target_count:
        print(f"{total_urls} total links already found for {account_name} in csv. Skipping.")
        _record_collected_count(account_name, total_urls)
        return

    if not profile_loaded:
        navigate_to_profile(driver, account_name)

    with open(csv_filepath, 'a', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        try:
            WebDriverWait(driver, 20).until(
                EC.presence_of_element_located((By.XPATH, _POST_LINK_XPATH)))
        except TimeoutException:
            print("Timeout while waiting for initial post links.")
            return

        # Variables for scrolling control
        scroll_increment = 3000  # Scroll increment
        last_height = driver.execute_script("return document.documentElement.scrollHeight")
        last_reel_count = len(driver.find_elements(By.XPATH, _POST_LINK_XPATH))

        # Number of scroll attempts that produced no growth, and the cap on them
        retry_count = 0
        max_retries = 3

        while total_urls < target_count:
            # Reset per pass so the "no new links" message below reflects
            # this pass only, not any earlier one.
            new_links_found = False

            # Find all reel links on the page
            posts_found = driver.find_elements(By.XPATH, _POST_LINK_XPATH)
            current_post_count = len(posts_found)
            print(f"Found {current_post_count} links on webpage, {total_urls} collected in running total.")

            for post in posts_found:
                post_url = post.get_attribute('href')
                if post_url in links_found:
                    continue

                # New link (not in CSV): add it to the CSV and the set
                links_found.add(post_url)
                writer.writerow([post_url])
                new_links_found = True
                print(f"New link added to CSV: {post_url}")
                total_urls += 1

                if total_urls >= target_count:
                    _record_collected_count(account_name, total_urls)
                    print(f"{total_urls} urls found for {account_name}. Updated Tracker, Moving to next profile.")
                    return

            # Scroll down to load more reels
            driver.execute_script("window.scrollBy(0, arguments[0]);", scroll_increment)
            time.sleep(3)  # Pause to allow new content to load
            new_height = driver.execute_script("return document.documentElement.scrollHeight")
            print(f"Just tried scrolling: last height: {last_height}  new height: {new_height} (should be different)")

            if new_height == last_height:
                # The page did not grow; give it one more nudge before judging.
                time.sleep(1)
                driver.execute_script("window.scrollBy(0, arguments[0]);", scroll_increment)
                time.sleep(2)  # Pause to allow new content to load
                new_height = driver.execute_script("return document.documentElement.scrollHeight")
                print(f"Just tried scrolling: last height: {last_height}  new height: {new_height} (should be different)")

            if not new_links_found:
                print(f"No new reel links found, trying again... last height: {last_height}  new height: {new_height}")

            # Neither the page height nor the number of links changed: count a failed attempt
            if new_height == last_height and current_post_count == last_reel_count:
                retry_count += 1
                print(f"Attempt {retry_count}/{max_retries}: No new content loaded. Stopping.")
                if retry_count >= max_retries:
                    print(f"Max retries reached. Stopping the scraping process.")
                    _record_collected_count(account_name, total_urls)
                    print(f"{total_urls} urls found for {account_name}. Updated Tracker, Moving to next profile.")
                    break

            # Update last height and last reel count for the next iteration
            last_height = new_height
            last_reel_count = current_post_count
            print(f"Updating scroll height for this attempt: last height: {last_height}  new height: {new_height}")
    

In [47]:
import pandas as pd
# Read the aggregated account table from disk.
account_names_df = pd.read_csv("aggregated_account_counts.csv")
# Materialise the "Account Name" column as a plain Python list, in file order.
account_names_list = [*account_names_df['Account Name']]

In [98]:
# Sanity check: number of account names loaded from the CSV.
len(account_names_list)

10842

In [109]:
def multiple_profiles(account_names_list, login_user, login_password=None):
    """Collect post links for every account that is not yet in the tracker.

    Accounts whose name already appears in post_link_tracker.csv are skipped;
    every other account is passed to get_top_links. If the tracker file is
    missing or empty it is created with its header row first.

    Args:
        account_names_list: Iterable of Instagram account names to process.
        login_user: Instagram username used to log in.
        login_password: Password for login_user. When omitted, the
            INSTAGRAM_PASSWORD environment variable is used, falling back to
            the value that was previously hard-coded here.

    The Chrome session is always closed, even if processing is interrupted
    or raises.
    """
    tracker_file = "post_link_tracker.csv"

    # A set keeps the per-account "already visited?" check O(1).
    visited_profiles = set()
    if os.path.exists(tracker_file) and os.path.getsize(tracker_file) > 0:
        with open(tracker_file, mode="r", encoding="utf-8", newline='') as file:
            csv_reader = csv.DictReader(file)
            visited_profiles = {
                row["Account Name"] for row in csv_reader if row.get("Account Name")
            }
    else:
        with open(tracker_file, mode='w', encoding='utf-8', newline='') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(["Account Name", "Timestamp", "Comments"])  # Write header if file doesn't exist

    if login_password is None:
        # NOTE(review): the literal fallback keeps existing two-argument calls
        # working, but a credential should not live in source. Remove it once
        # callers pass login_password or set INSTAGRAM_PASSWORD.
        login_password = os.environ.get("INSTAGRAM_PASSWORD", 'wellesley')

    driver = webdriver.Chrome()
    try:
        login_to_instagram(driver, login_user, login_password)
        time.sleep(5)

        for account_name in account_names_list:
            print(f"----- Processing {account_name} -----")
            if account_name in visited_profiles:
                print(f"Already collected lists from {account_name}. Skipping")
                continue
            get_top_links(driver, account_name)
    finally:
        # Runs on errors and KeyboardInterrupt too, so no browser is left open.
        driver.quit()

In [113]:
# Process every account after the first ten, logging in as "bingbongding2024".
multiple_profiles(account_names_list[10:], "bingbongding2024")

Successfully logged in using bingbongding2024 account!
----- Processing lavina.hk -----
Already collected lists from lavina.hk. Skipping
----- Processing catharina.cheung -----
Already collected lists from catharina.cheung. Skipping
----- Processing supwayhk -----
Already collected lists from supwayhk. Skipping
----- Processing dji_hkmacau -----
Already collected lists from dji_hkmacau. Skipping
----- Processing thepeninsulaboutique -----
Already collected lists from thepeninsulaboutique. Skipping
----- Processing doubleducks_official -----
Already collected lists from doubleducks_official. Skipping
----- Processing andrechiang_sg -----
Already collected lists from andrechiang_sg. Skipping
----- Processing lea.cantalloube.hk -----
Already collected lists from lea.cantalloube.hk. Skipping
----- Processing lighterluxe -----
Already collected lists from lighterluxe. Skipping
----- Processing dunfallandy.house -----
Already collected lists from dunfallandy.house. Skipping
----- Processing 

KeyboardInterrupt: 

# Code Archive