In [8]:
import pandas as pd
import asyncio
import os
from datetime import datetime
from tqdm.notebook import tqdm
import random

In [9]:
# Read the account table and collect the `handle` values of the rows whose
# `platform` column equals 'Facebook'.
df = pd.read_csv('data.csv')
fb_handles = df.loc[df['platform'] == 'Facebook', 'handle'].to_list()
len(fb_handles)

551

In [10]:
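# NOTE: early prototype of scrape_facebook_like_number, kept commented out for
# reference only; the live definition (with timeouts and error handling) is in a later cell.
# import asyncio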
# from playwright.async_api import async_playwright

# async def scrape_facebook_like_number(username):
#     url = f"https://www.facebook.com/{username}/"

#     async with async_playwright() as p:
#         browser = await p.chromium.launch(headless=True)
#         page = await browser.new_page(user_agent=
#             "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
#             "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
#         )

#         await page.goto(url, wait_until="networkidle")

#         try:
#             await page.locator('div[aria-label="Close"]').first.click()
#         except:
#             pass

#         strongs = page.locator("strong.html-strong")
#         strong_texts = await strongs.all_text_contents()

#         await browser.close()

#         for s in strong_texts:
#             s_clean = s.strip()
#             if s_clean.replace("K","").replace("M","").replace(".", "").isdigit():
#                 return s_clean

#         return None


# result = await scrape_facebook_like_number("DunleavyGovernor")
# result


In [None]:
from playwright.async_api import async_playwright

def load_facebook_cache(cache_file='facebook_cache.csv'):
    """Read the scrape cache from ``cache_file``.

    Args:
        cache_file: Path of the CSV cache to read.

    Returns:
        The cache as a ``pandas.DataFrame``. An empty DataFrame is returned
        when the file does not exist or cannot be read; in both cases a
        message is printed instead of raising.
    """
    # Guard clause: nothing on disk yet, so start from an empty cache.
    if not os.path.exists(cache_file):
        print(f"No cache file found. Will create {cache_file}")
        return pd.DataFrame()

    try:
        cached_profiles = pd.read_csv(cache_file)
        print(f"‚úì Loaded {len(cached_profiles)} cached profiles from {cache_file}")
    except Exception as load_error:
        # Best effort: an unreadable cache is reported and treated as empty.
        print(f"Warning: Could not load cache file: {load_error}")
        return pd.DataFrame()
    return cached_profiles

def save_facebook_cache(data, cache_file='facebook_cache.csv'):
    """Write ``data`` to ``cache_file`` as CSV, replacing any previous cache.

    The rows are first written to a sibling ``<cache_file>.tmp`` file and then
    moved over the real cache with ``os.replace``. An interruption in the
    middle of the write therefore leaves the previous cache intact instead of
    a truncated file.

    Args:
        data: Anything ``pandas.DataFrame`` accepts (here: a list of row dicts).
        cache_file: Destination path of the CSV cache.

    Returns:
        None. Failures are reported with a printed warning, never raised.
    """
    tmp_file = f"{cache_file}.tmp"
    try:
        df = pd.DataFrame(data)
        df.to_csv(tmp_file, index=False)
        # Swap the finished file into place in one step.
        os.replace(tmp_file, cache_file)
        print(f"‚úì Saved {len(df)} profiles to {cache_file}")
    except Exception as e:
        print(f"Warning: Could not save cache: {e}")
    finally:
        # After a successful replace the temp file is gone; this only cleans
        # up the leftovers of a failed or interrupted write.
        if os.path.exists(tmp_file):
            try:
                os.remove(tmp_file)
            except OSError:
                pass

async def scrape_facebook_like_number(username):
    """Open a Facebook page in headless Chromium and return its first count-like label.

    The page at ``https://www.facebook.com/<username>/`` is loaded, a
    ``div[aria-label="Close"]`` element is clicked if one can be clicked within
    2 seconds, and the text of every ``strong.html-strong`` element is
    collected. The first text that consists only of digits once ``K``, ``M``,
    ``.`` and ``,`` are removed is returned (presumably the like/follower
    count -- confirm against the current page markup).

    Args:
        username: Page handle appended to the facebook.com URL.

    Returns:
        The matching text, stripped (e.g. ``"1.2K"``), or ``None`` when no
        element matches or any ``Exception`` occurs while scraping (the error
        is printed). Cancellation and keyboard interrupts are not swallowed.
    """
    url = f"https://www.facebook.com/{username}/"

    try:
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            try:
                page = await browser.new_page(user_agent=
                    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                    "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
                )

                await page.goto(url, wait_until="networkidle", timeout=30000)

                try:
                    await page.locator('div[aria-label="Close"]').first.click(timeout=2000)
                except Exception:
                    # Best effort: the dialog may simply not be there. This was a
                    # bare `except:`, which also swallowed asyncio.CancelledError
                    # and KeyboardInterrupt (both BaseException subclasses), so an
                    # interrupt arriving here was silently ignored.
                    pass

                strong_texts = await page.locator("strong.html-strong").all_text_contents()
            finally:
                # Close the browser on every exit path, including timeouts.
                await browser.close()

        for s in strong_texts:
            s_clean = s.strip()
            # Count-like text: only digits remain after dropping K/M suffixes
            # and decimal/thousands separators.
            digits_only = s_clean.replace("K", "").replace("M", "").replace(".", "").replace(",", "")
            if digits_only.isdigit():
                return s_clean

        return None
    except Exception as e:
        print(f"Error scraping {username}: {e}")
        return None

async def scrape_all_facebook_pages(handles, cache_file='facebook_cache.csv', use_cache=True, delay_min=2, delay_max=10, retry_failed=True):
    """Scrape the like count of every handle, reusing and updating a CSV cache.

    Args:
        handles: Iterable of Facebook page handles. Duplicates are collapsed
            (first occurrence kept) so each page is requested at most once.
        cache_file: CSV file used to load and persist results.
        use_cache: When False the existing cache is ignored, every handle is
            fetched, and the cache file is overwritten with the new results.
        delay_min: Lower bound, in seconds, of the random pause between requests.
        delay_max: Upper bound, in seconds, of the random pause between requests.
        retry_failed: When True (default), cached rows whose ``like_count`` is
            missing are treated as not cached and fetched again. Pass False to
            keep the previous behaviour of never retrying a cached failure.

    Returns:
        A list of row dicts, cached rows first and newly fetched rows after,
        each in the order of ``handles``. Newly fetched rows have the keys
        ``username``, ``like_count`` (``None`` on failure) and ``timestamp``.
    """
    # Load cache
    cache_df = load_facebook_cache(cache_file) if use_cache else pd.DataFrame()

    # Index the cache once by username (first row wins) instead of filtering
    # the DataFrame again for every handle.
    cache_rows = {}
    if not cache_df.empty and 'username' in cache_df.columns:
        for row in cache_df.to_dict('records'):
            cache_rows.setdefault(row['username'], row)

    def is_usable(handle):
        """Return True when the cache holds a row for `handle` that should be reused."""
        row = cache_rows.get(handle)
        if row is None:
            return False
        # A missing like_count marks an earlier failed attempt.
        return not (retry_failed and pd.isna(row.get('like_count')))

    # Order-preserving de-duplication.
    unique_handles = list(dict.fromkeys(handles))

    # Cached rows go straight into the results; everything else is fetched.
    results = [dict(cache_rows[h]) for h in unique_handles if is_usable(h)]
    handles_to_fetch = [h for h in unique_handles if not is_usable(h)]

    if not handles_to_fetch:
        print(f"\n‚úÖ All {len(unique_handles)} profiles loaded from cache!")
        return results

    print(f"\nüìä Need to fetch {len(handles_to_fetch)} new profiles")
    print(f"‚úÖ Using {len(results)} cached profiles\n")

    # Fetch new data with progress bar
    last_index = len(handles_to_fetch) - 1
    for index, handle in enumerate(tqdm(handles_to_fetch, desc="Scraping Facebook pages")):
        like_count = await scrape_facebook_like_number(handle)

        data = {
            'username': handle,
            'like_count': like_count,
            'timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        }

        results.append(data)
        cache_rows[handle] = data

        if like_count:
            tqdm.write(f"‚úì {handle}: {like_count} likes")
        else:
            tqdm.write(f"‚úó {handle}: Failed to retrieve data")

        # Persist after every attempt so an interrupted run keeps its progress.
        # The whole cache index is written, not just `results`, so cached rows
        # for handles outside this call are not dropped from the file.
        save_facebook_cache(list(cache_rows.values()), cache_file)

        # Add random delay between requests (by position, not by value, so a
        # repeated handle value cannot skip a pause).
        if index < last_index:
            delay = random.uniform(delay_min, delay_max)
            tqdm.write(f"‚è±Ô∏è  Waiting {delay:.1f} seconds...")
            await asyncio.sleep(delay)

    return results

In [None]:
fb_data = await scrape_all_facebook_pages(
    fb_handles, 
    cache_file='facebook_cache.csv', 
    use_cache=True,
    delay_min=2,  
    delay_max=5
)

‚úì Loaded 182 cached profiles from facebook_cache.csv

üìä Need to fetch 369 new profiles
‚úÖ Using 182 cached profiles



Scraping Facebook pages:   0%|          | 0/369 [00:00<?, ?it/s]

‚úó RepSchneider: Failed to retrieve data
‚úì Saved 183 profiles to facebook_cache.csv
‚è±Ô∏è  Waiting 2.3 seconds...
‚úó OfficialRepDannyDavis: Failed to retrieve data
‚úì Saved 184 profiles to facebook_cache.csv
‚è±Ô∏è  Waiting 8.3 seconds...
‚úó OfficialRepDannyDavis: Failed to retrieve data
‚úì Saved 184 profiles to facebook_cache.csv
‚è±Ô∏è  Waiting 8.3 seconds...


CancelledError: 

In [None]:
# Turn the list of scraped row dicts into a table and display it.
df_facebook = pd.DataFrame(data=fb_data)
df_facebook