# Profile Information Scraper
Data Collected:
* name
* account country
* date joined
* verified date (if applicable)
* meta description for further parsing (# followers, # following, # posts, bio)

In [None]:
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException
import re

### Helper functions

In [None]:
def login_to_instagram(driver, username, password):
    """Open the Instagram login page and submit the given credentials.

    Args:
        driver: Selenium WebDriver used to drive the browser.
        username: Account name typed into the ``username`` field.
        password: Password typed into the ``password`` field.

    Note:
        The success message is printed unconditionally after the click; the
        function does not verify that the login was actually accepted.
    """
    driver.get('https://www.instagram.com/accounts/login/')
    # Fixed pause to give the login form time to render.
    time.sleep(3)

    # Locate both credential fields first, then type into them in order.
    credential_fields = [
        driver.find_element("name", field_name)
        for field_name in ("username", "password")
    ]
    for field, text in zip(credential_fields, (username, password)):
        field.send_keys(text)

    # Submit the form through the login button.
    driver.find_element("xpath", '//*[@id="loginForm"]/div/div[3]/button').click()

    print("Successfully logged in!")
    # Fixed pause so the post-login page can load before the caller continues.
    time.sleep(5)

# Function to navigate to the profile page
def navigate_to_url(driver, account_url):
    """Load ``account_url`` in the browser, reporting (not raising) failures.

    Args:
        driver: Selenium WebDriver used to drive the browser.
        account_url: URL to open.

    Any exception raised while navigating is printed and swallowed, so the
    caller always regains control.
    """
    try:
        driver.get(account_url)
        success_message = f"Navigated to url: {account_url}"
        print(success_message)
        # Fixed pause to let the page finish loading.
        time.sleep(3)
    except Exception as navigation_error:
        print(f"Error navigating to {account_url}: {navigation_error}")

# Function to get account information
def _read_labelled_value(wait, aria_label, not_found_message):
    """Return the second text line of the ``div`` with the given aria-label, or "NA".

    Args:
        wait: ``WebDriverWait`` used to wait for the element to be present.
        aria_label: Exact value of the element's ``aria-label`` attribute.
        not_found_message: Message printed when the element never appears.
    """
    try:
        element = wait.until(
            EC.presence_of_element_located((By.XPATH, f"//div[@aria-label='{aria_label}']"))
        )
    except (NoSuchElementException, TimeoutException):
        print(not_found_message)
        return "NA"

    # The code expects "<label>\n<value>"; guard against a missing value line
    # instead of letting an IndexError abort the whole scrape.
    text_lines = element.text.split("\n")
    if len(text_lines) < 2:
        print(f"'{aria_label}' element has no value line.")
        return "NA"
    return text_lines[1]


def get_profile_info(driver):
    """Collect profile details from the page currently loaded in ``driver``.

    Args:
        driver: Selenium WebDriver already positioned on the target page.

    Returns:
        dict with the keys ``meta_info``, ``profile_name``, ``date_joined``,
        ``account_country`` and ``verified_date``. Every key is always present;
        a value that could not be read is the string "NA".
    """
    profile_info = {}
    wait = WebDriverWait(driver, 5)

    try:
        # Follower/following/post counts and bio (from the description meta tag)
        meta_description = driver.find_element(By.XPATH, "//meta[@name='description']").get_attribute("content")
        profile_info["meta_info"] = meta_description
    except NoSuchElementException:
        profile_info["meta_info"] = "NA"
        print("Meta description not found.")

    try:
        # Get profile name
        profile_name_element = driver.find_element(By.XPATH, '/html/body/div[2]/div/div/div/div[2]/div/div/div[1]/div[2]/div/div[1]/section/main/div/header/section[2]/div/div[1]/div[1]/div/a/h2/span')
        profile_info["profile_name"] = profile_name_element.text
        # Clicking the name presumably opens the panel holding the fields
        # read below (date joined, country, verified) -- confirm against the UI.
        profile_name_element.click()
        time.sleep(3)
    except NoSuchElementException:
        profile_info["profile_name"] = "NA"
        print("Profile name element not found.")
    except Exception as e:
        # Keep a name that was already read (e.g. only the click failed), but
        # never leave the key missing: callers look for "NA" to detect gaps.
        profile_info.setdefault("profile_name", "NA")
        print(f"Error while retrieving profile name: {e}")

    # Get date joined
    profile_info["date_joined"] = _read_labelled_value(
        wait, "Date joined", "Date joined element not found or took too long to load."
    )

    profile_info["account_country"] = _read_labelled_value(
        wait, "Account based in", "Account country element not found or took too long to load."
    )

    # Try to find verified date
    profile_info["verified_date"] = _read_labelled_value(
        wait, "Verified", "Verified date element not found or took too long to load."
    )

    return profile_info


In [None]:
# For parsing meta description
# Multipliers for the abbreviated count suffixes.
_COUNT_MULTIPLIERS = {"K": 1_000, "M": 1_000_000, "B": 1_000_000_000}

# One count token: digits with optional thousands separators / decimal point
# and an optional K/M/B suffix, e.g. "987", "1,234", "12.5K", "3M".
_COUNT_PATTERN = r"(\d[\d.,]*[KMBkmb]?)"

# The bio group is greedy so that double quotes inside the bio do not cut it
# short; it extends to the last closing quote of the description.
_META_DESCRIPTION_RE = re.compile(
    _COUNT_PATTERN + r" Followers, "
    + _COUNT_PATTERN + r" Following, "
    + _COUNT_PATTERN + r' Posts - (.*?) \(@(.*?)\) on Instagram: "([\s\S]*)"'
)


def count_to_numeric(count_string):
    """Convert a displayed count such as "987", "1,234", "12.5K" or "3M" to an int.

    Args:
        count_string: Count text; may contain thousands separators (",") and
            an optional K/M/B suffix (case-insensitive) with a decimal part.

    Returns:
        The count as an integer.

    Raises:
        ValueError: If the text is not a recognisable number.
    """
    text = count_string.strip().replace(",", "")
    # text[-1:] (not text[-1]) so an empty string falls through to int() and
    # raises ValueError rather than IndexError.
    multiplier = _COUNT_MULTIPLIERS.get(text[-1:].upper())
    if multiplier is None:
        return int(text)
    # Abbreviated counts may be fractional ("1.5K"); round() absorbs float
    # noise such as 2.3 * 1_000_000 == 2299999.9999999995.
    return round(float(text[:-1]) * multiplier)


def parse_meta_description(meta_description):
    """
    Parses the Instagram meta description to extract follower count, following count,
    post count, profile name, profile username, and bio.

    Args:
        meta_description: Content of the page's description meta tag.

    Returns:
        dict with ``num_followers``, ``num_following``, ``num_posts`` (ints),
        ``profile_name``, ``profile_username`` and ``bio`` (newlines replaced
        by spaces), or None when the text does not have the expected shape.
    """
    # Non-string input (e.g. None) is treated as "no match" instead of raising.
    match = (
        _META_DESCRIPTION_RE.search(meta_description)
        if isinstance(meta_description, str)
        else None
    )

    # Debugging output
    if not match:
        print("Regex did not match. Check the pattern and input string.")
        return None

    followers, following, posts, profile_name, profile_username, bio = match.groups()

    return {
        "num_followers": count_to_numeric(followers),
        "num_following": count_to_numeric(following),
        "num_posts": count_to_numeric(posts),
        "profile_name": profile_name.strip(),
        "profile_username": profile_username.strip(),
        "bio": bio.strip().replace('\n', ' '),
    }

In [None]:
# Function to process multiple URLs
def multiple_profile_info(driver, reel_urls):
    """Scrape profile information for every URL in ``reel_urls``.

    Args:
        driver: Selenium WebDriver used to drive the browser.
        reel_urls: Iterable of URLs to visit, in order.

    Returns:
        Tuple ``(results, missing_info)``: the profile-info dict for each URL,
        and the URLs for which any field other than ``verified_date`` is "NA".
    """
    collected = []
    incomplete_urls = []

    for url in reel_urls:
        navigate_to_url(driver, url)
        info = get_profile_info(driver)
        collected.append(info)

        # 'verified_date' is optional, so its absence does not count as a gap.
        absent_fields = [
            field
            for field, value in info.items()
            if field != "verified_date" and value == "NA"
        ]
        if absent_fields:
            incomplete_urls.append(url)

    return collected, incomplete_urls
