In [1]:
# ------------------- IMPORT LIBRARIES ------------------------------------
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.keys import Keys
import csv
import logging
import os
import time
import random
from datetime import datetime

In [2]:
# ------------------- GLOBAL CONSTANTS ------------------------------------
# Page that google_search() loads before locating the search box (name="q") and typing a query.
GOOGLE_URL = 'https://www.google.com/'

In [3]:
# -------- SET UP SELENIUM WEBDRIVER (FOR CHROME AUTOMATION) -----------------
def setup_webdriver():
    """Create a Chrome WebDriver configured for the scraping session.

    The browser is started in incognito mode with the "detach" experimental
    option enabled, the Blink ``AutomationControlled`` feature disabled, and a
    user-agent string picked at random from a small built-in pool.

    Returns:
        webdriver.Chrome: a new driver instance built from those options.
    """
    # Pool of desktop user-agent strings (Chrome, Safari, Firefox, Edge).
    agent_pool = (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.5735.110 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.2 Safari/605.1.15',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:92.0) Gecko/20100101 Firefox/92.0',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36 Edg/91.0.864.59',
    )

    opts = Options()
    opts.add_experimental_option("detach", True)
    # Intended to make the automated browser harder to detect as a bot.
    opts.add_argument("--disable-blink-features=AutomationControlled")
    opts.add_argument("--incognito")
    # Headless mode is intentionally left off; add "--headless" here to enable it.
    opts.add_argument("user-agent=" + random.choice(agent_pool))

    return webdriver.Chrome(options=opts)
    

In [4]:
# ------------------------- MIMIC HUMAN BEHAVIOR --------------------------
def random_delay(min_delay, max_delay):
    """Pause for a random number of seconds between ``min_delay`` and ``max_delay``.

    The duration is drawn with ``random.uniform`` and passed to ``time.sleep``;
    it is used throughout the scraper to make the pacing look less mechanical.

    Args:
        min_delay: lower bound of the pause, in seconds.
        max_delay: upper bound of the pause, in seconds.

    Returns:
        None
    """
    time.sleep(random.uniform(min_delay, max_delay))


In [5]:
# ---------------------- GOOGLE SEARCH FUNCTION ---------------------------
def google_search(driver, query, num_pages, category, country):
    """Run one Google query and harvest Instagram usernames from its result pages.

    For every processed page, the text of each ``cite.qLRx3b`` element that
    contains "instagram.com" is cleaned with ``clean_list`` and the resulting
    usernames are written immediately with ``store_in_csv``, so pages already
    processed are on disk even if a later page fails.

    Args:
        driver: Selenium WebDriver used to drive the browser.
        query: search string sent to the Google search box.
        num_pages: maximum number of result pages to process.
        category: label stored next to every username in the CSV.
        country: label stored next to every username in the CSV.

    Returns:
        list: usernames gathered from all processed pages (duplicates are
        removed within a page, not across pages). An empty list is returned
        when the search box cannot be located. Errors are printed rather than
        raised; whatever was collected up to that point is returned.
    """
    links_collected = []
    try:
        # Start from a clean session on the Google home page.
        driver.delete_all_cookies()
        driver.get(GOOGLE_URL)
        random_delay(2, 5)  # Random delay before searching

        # Perform search
        try:
            search_box = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.NAME, "q")))
            random_delay(15, 25)  # Pause before typing the search query

            # The whole query is sent in a single call (no per-character typing).
            search_box.send_keys(query)
            search_box.send_keys(Keys.RETURN)  # Press Enter to perform the search
            print("Google search performed")
        except TimeoutException:
            print("Failed to locate the Google search box.")
            return []

        # Scroll after initial page load to simulate user browsing behavior
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight * {});".format(random.uniform(0.3, 0.9)))
        random_delay(2, 4)  # Brief pause after scrolling

        # Process each page sequentially
        for page in range(num_pages):
            random_delay(2, 4)  # Pause between result pages

            # Scroll again while on the current page
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight * {});".format(random.uniform(0.3, 0.9)))
            random_delay(2, 4)  # Pause after scrolling

            print(f"\nProcessing page {page + 1}")

            try:
                # Extract links from current page
                search_results = WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "cite.qLRx3b")))
                links = [result.text for result in search_results if "instagram.com" in result.text]
                cleaned_links = clean_list(links)
                links_collected.extend(cleaned_links)

                print(f"Collected {len(cleaned_links)} cleaned links from page {page + 1}")
                store_in_csv(cleaned_links, category, country)

                # This was the last page requested: do not spend 10-15 s and an
                # extra request navigating to a page that would never be read.
                if page == num_pages - 1:
                    break

                # Try to go to next page
                try:
                    next_button = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.ID, "pnnext")))
                    random_delay(10, 15)  # Pause before clicking the "Next" button
                    next_button.click()
                except TimeoutException:
                    print(f"No more pages after page {page + 1}")
                    break

            except Exception as e:
                # Any failure on a page (including a timeout waiting for results)
                # ends this query; links from earlier pages are kept.
                print(f"Error processing page {page + 1}: {e}")
                break

    except Exception as e:
        print(f"An error occurred: {e}")

    return links_collected

In [6]:
# -------------------- CLEANING LINKS FUNCTION ---------------------------
def clean_list(links):
    """Extract unique Instagram usernames from the link texts of one result page.

    Each link is expected to look like ``<host> › <username> › ...``: the
    username is the segment right after the first ' › ' separator, cut at the
    first '/' if one is present.

    Args:
        links: iterable of link text strings.

    Returns:
        list: usernames without duplicates, in order of first appearance.
        Entries that are not strings, have no ' › ' separator, or yield an
        empty username are skipped (the first two with a printed notice).
    """
    # dict keys keep insertion order, giving a deterministic de-duplication
    # (a set would return the usernames in arbitrary order).
    seen = {}

    for link in links:
        segments = link.split(' › ') if isinstance(link, str) else []
        if len(segments) < 2:
            print(f"Error cleaning link: unexpected format {link!r}")
            continue

        # Keep only the first path component (drops things like "/Post").
        username = segments[1].split('/')[0].strip()
        if username:
            seen[username] = None

    return list(seen)

In [7]:
# ------------------ FUNCTION TO STORE IN CSV ----------------------------
def store_in_csv(links, category, country, filename='instagram_usernames2.csv'):
    """Append one CSV row per link, adding the header when the file is new or empty.

    Args:
        links: sequence of usernames to store.
        category: value written in the 'Category' column of every row.
        country: value written in the 'Country' column of every row.
        filename: path of the CSV file to append to.

    Returns:
        None. Failures are reported with a printed message instead of being
        raised, so a storage problem does not abort the caller.
    """
    try:
        # The header is needed when the file does not exist yet, and also when
        # it exists but has no content (otherwise it would never get one).
        needs_header = not os.path.isfile(filename) or os.path.getsize(filename) == 0

        # Append mode creates the file if it is missing.
        with open(filename, mode='a', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)

            if needs_header:
                writer.writerow(['username', 'Category', 'Country'])  # Add header row

            writer.writerows([link, category, country] for link in links)

        print(f"Saved {len(links)} links to {filename}")

    except Exception as e:
        print(f"Error saving to CSV: {e}")



In [8]:
def generate_dork_queries(categories, countries):
    """Build one Google dork query for every (category, country) combination.

    Each query restricts results to instagram.com, combines the category with a
    fixed group of e-commerce keywords and the country expression, and excludes
    URLs containing "explore".

    Args:
        categories: iterable of category search expressions.
        countries: iterable of country search expressions.

    Returns:
        list: ``(query, category, country)`` tuples, ordered by category first
        and country second.
    """
    site_filter = 'site:instagram.com'
    shop_terms = '("ecommerce" OR "tienda online" OR "online shop" OR "shop" OR "tienda virtual" OR "distribuidores")'

    return [
        (f'{site_filter} {category} AND {shop_terms} AND {country} -inurl:explore', category, country)
        for category in categories
        for country in countries
    ]


In [9]:
# ------------------- MAIN SCRAPING LOGIC --------------------------------
def main(num_pages=6, close_browser=True):
    """Run every category/country dork query through Google, in random order.

    Logging goes to ``scraper.log``. Each query is handed to ``google_search``,
    which stores the usernames it finds; a 5-10 s pause follows every query.

    Args:
        num_pages: maximum number of result pages processed per query.
        close_browser: when True (default) the WebDriver session is shut down
            once scraping ends, including after an error or an interrupt.
            Pass False to leave the browser window open for inspection.

    Returns:
        None
    """
    logging.basicConfig(filename='scraper.log', level=logging.INFO)

    # Categories with SEO-optimized keywords grouped by OR operators
    categories = [
        '"ropa"',
        '"calzado"',
        '("accesorios" OR "joyería" OR "relojes")',
        '"bolsos"',
        '"cuidado * piel"',
        '("belleza" OR "maquillaje" OR "cosméticos")',
        '("perfumes" OR "fragancias")',
        '("accesorios * smartphones" OR "tecnología" OR "dispositivos")',
        '("ropa deportiva" OR "fitness")',
    ]

    # Spanish-speaking countries and regions
    countries = [
        '("España" OR "Espana" OR "Madrid")',
        '("México" OR "Mexico" OR "mx")',
        '("Colombia" OR "Bogotá" OR "Col")',
        '("Argentina" OR "Buenos Aires" OR "Arg")',
        '("Chile" OR "Santiago" OR "Cl")',
    ]

    # Generate dork queries for e-commerce categories and countries
    queries = generate_dork_queries(categories, countries)
    # One shuffle gives the same random visiting order as repeatedly picking
    # and removing a random element, without the repeated list scans.
    random.shuffle(queries)

    driver = setup_webdriver()

    try:
        for search_query, category, country in queries:
            logging.info(f"Performing search with query: {search_query}")

            # Perform the Google search
            google_search(driver, search_query, num_pages, category, country)
            random_delay(5, 10)  # Longer delay between different search queries

    except Exception as e:
        # logging.exception keeps the traceback alongside the message.
        logging.exception(f"An error occurred during scraping: {e}")

    finally:
        # Ensure the browser is closed so no browser/driver process is left
        # behind after the run.
        if close_browser:
            try:
                driver.quit()
            except Exception as e:
                logging.warning(f"Failed to close the browser cleanly: {e}")
        print("Work done")


In [10]:
# Start the scraper when this cell/module is executed as the main program.
if __name__ == "__main__":
    main()

Google search performed

Processing page 1
Collected 8 cleaned links from page 1
Saved 8 links to instagram_usernames2.csv

Processing page 2
Collected 10 cleaned links from page 2
Saved 10 links to instagram_usernames2.csv

Processing page 3
Collected 8 cleaned links from page 3
Saved 8 links to instagram_usernames2.csv

Processing page 4
Collected 7 cleaned links from page 4
Saved 7 links to instagram_usernames2.csv

Processing page 5
Collected 9 cleaned links from page 5
Saved 9 links to instagram_usernames2.csv

Processing page 6
Collected 7 cleaned links from page 6
Saved 7 links to instagram_usernames2.csv
Google search performed

Processing page 1
Collected 10 cleaned links from page 1
Saved 10 links to instagram_usernames2.csv

Processing page 2
Collected 10 cleaned links from page 2
Saved 10 links to instagram_usernames2.csv

Processing page 3
Collected 10 cleaned links from page 3
Saved 10 links to instagram_usernames2.csv

Processing page 4
Collected 10 cleaned links from pa

KeyboardInterrupt: 