## Chunk Scrapping

In [11]:
import time
import random
import traceback
import undetected_chromedriver as uc
from selenium.webdriver.common.by import By
from selenium.common.exceptions import WebDriverException, NoSuchElementException

# --- Setup undetected Chrome ---
options = uc.ChromeOptions()
options.add_argument("--start-maximized")
options.add_argument("--disable-blink-features=AutomationControlled")

# Launch browser
driver = uc.Chrome(options=options)

# --- Step 1: Manual Login ---
driver.get("https://www.instagram.com/accounts/login/")
print("üîê Please log in manually in the browser. Then come back here and press ENTER.")
input()

# --- Step 2: Visit Hashtag Page ---
hashtag = "upsc"
driver.get(f"https://www.instagram.com/explore/tags/{hashtag}/")
time.sleep(random.uniform(5, 7))

# --- Step 3: Scroll and collect post links ---
post_links = set()
print("üîÑ Scrolling through #upsc posts...")
for _ in range(10):  # simulate scrolling
    links = driver.find_elements(By.CSS_SELECTOR, 'a[href*="/p/"]')
    for link in links:
        post_links.add(link.get_attribute("href"))
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(random.uniform(3, 5))

print(f"‚úÖ Collected {len(post_links)} post links.\n")

# --- Step 4: Visit each post and extract usernames ---
usernames = set()
posts_to_check = list(post_links)[:20]  # limit for safety

print(f"üîç Visiting {len(posts_to_check)} posts to extract usernames...")

for i, post_url in enumerate(posts_to_check):
    try:
        driver.get(post_url)
        time.sleep(random.uniform(5, 8))  # delay to avoid detection

        try:
            user_elem = driver.find_element(By.XPATH, "//header//a[contains(@href, '/') and not(contains(@href, '/p/'))]")
        except NoSuchElementException:
            user_elem = driver.find_element(By.XPATH, "//a[contains(@href, '/') and @role='link']")

        username = user_elem.get_attribute("href").split("/")[-2]
        usernames.add(username)
        print(f"{i+1:02d} ‚úÖ {username} from {post_url}")

    except WebDriverException as wd_err:
        print(f"{i+1:02d} ‚ùå WebDriver error: {wd_err}")
        print("üîÅ Skipping this post... possibly browser crash or refusal.")
        continue
    except Exception as e:
        print(f"{i+1:02d} ‚ùå Unknown error at {post_url}")
        traceback.print_exc()
        continue

# --- Step 5: Save usernames to file ---
with open("upsc_usernames_raw.txt", "w", encoding="utf-8") as f:
    for u in usernames:
        f.write(f"{u}\n")

print(f"\nüßæ Finished. Saved {len(usernames)} usernames to upsc_usernames_raw.txt")

# --- Optional: Close browser ---
driver.quit()


üîê Please log in manually in the browser. Then come back here and press ENTER.


 


üîÑ Scrolling through #upsc posts...
‚úÖ Collected 231 post links.

üîç Visiting 20 posts to extract usernames...
01 ‚úÖ ssc_cgl_chsl_mts_gd_notes from https://www.instagram.com/p/DIgNLEztECK/
02 ‚úÖ vikash___divyakriti from https://www.instagram.com/p/DIBZnqczZ4M/
03 ‚úÖ _mission_upsc_official from https://www.instagram.com/p/DHlvcYIy9ho/
04 ‚úÖ yadav.sarkar.official from https://www.instagram.com/p/DHqiBHQJvYW/
05 ‚úÖ upsc._.wala from https://www.instagram.com/p/DF67TXizdSi/
06 ‚úÖ iaspublicschool from https://www.instagram.com/p/DH3eN9Pv7F7/
07 ‚úÖ understand.upsc from https://www.instagram.com/p/DIdMLDri4fQ/
08 ‚úÖ ias_mentors from https://www.instagram.com/p/DH3Jz74PaNY/
09 ‚úÖ indianmasterminds from https://www.instagram.com/p/DHsAu3ZyPE5/
10 ‚úÖ upsc_wallah8 from https://www.instagram.com/p/DINf6s3I0Po/
11 ‚úÖ ncert_notes_questions from https://www.instagram.com/p/DHRDGNtvweZ/
12 ‚úÖ onevisionmedia.in from https://www.instagram.com/p/DE1MoQJIo9y/
13 ‚úÖ tireless.study from htt

## Part-1: Bulk Scrapping UPSC Influencers Username

In [9]:
import time
import random
import traceback
import undetected_chromedriver as uc
from selenium.webdriver.common.by import By
from selenium.common.exceptions import WebDriverException, NoSuchElementException

# ---------------------
# PARAMETERS
# ---------------------
hashtags = [
    "upsc", "upscmotivation", "upscmemes", "upsccurrentaffairs",
    "ias", "iasdream", "civilservices", "upscprelims", "mains2024"
]
hashtags = [tag.lower() for tag in hashtags]  # Normalize to lowercase

MAX_POSTS_PER_TAG = 120
SCROLLS = 15
intermediate_file = "usernames_partial.txt"
final_output = "upsc_usernames_raw3.txt"

# ---------------------
# BROWSER SETUP
# ---------------------
options = uc.ChromeOptions()
options.add_argument("--start-maximized")
options.add_argument("--disable-blink-features=AutomationControlled")
driver = uc.Chrome(options=options)

# ---------------------
# MANUAL LOGIN
# ---------------------
driver.get("https://www.instagram.com/accounts/login/")
print("üîê Please log in manually, then press ENTER here.")
input()

usernames_all = set()

# ---------------------
# MAIN SCRAPE LOOP
# ---------------------
for tag in hashtags:
    print(f"\nüìç Exploring #{tag}")
    driver.get(f"https://www.instagram.com/explore/tags/{tag}/")
    time.sleep(random.uniform(5, 8))

    post_links = set()
    for _ in range(SCROLLS):
        links = driver.find_elements(By.CSS_SELECTOR, 'a[href*="/p/"]')
        for link in links:
            href = link.get_attribute("href")
            if href:
                post_links.add(href)
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(random.uniform(2, 4))

    post_links = list(post_links)[:MAX_POSTS_PER_TAG]
    print(f"üîç Collected {len(post_links)} post URLs")

    for i, post_url in enumerate(post_links):
        try:
            driver.get(post_url)
            time.sleep(random.uniform(4, 7))

            username = None
            try:
                # Most reliable: header anchor
                user_elem = driver.find_element(By.XPATH, "//header//a[contains(@href, '/') and not(contains(@href, '/p/'))]")
                username = user_elem.get_attribute("href").split("/")[-2]
            except NoSuchElementException:
                try:
                    user_elem = driver.find_element(By.XPATH, "//a[contains(@href, '/') and @role='link']")
                    username = user_elem.get_attribute("href").split("/")[-2]
                except:
                    pass

            if username and username not in usernames_all:
                usernames_all.add(username)
                print(f"{i+1:03d} ‚úÖ {username}")

                with open(intermediate_file, "a", encoding="utf-8") as f:
                    f.write(username + "\n")
            else:
                print(f"{i+1:03d} ‚ö†Ô∏è Duplicate or missing username")

        except WebDriverException as wd_err:
            print(f"{i+1:03d} ‚ùå WebDriver error: {wd_err}")
            continue
        except Exception:
            print(f"{i+1:03d} ‚ùå Error at {post_url}")
            traceback.print_exc()
            continue

# ---------------------
# SAVE FINAL LIST
# ---------------------
final_usernames = sorted(usernames_all)
with open(final_output, "w", encoding="utf-8") as f:
    for u in final_usernames:
        f.write(u + "\n")

print(f"\nüéØ Done! Collected {len(final_usernames)} unique usernames.")
driver.quit()


üîê Please log in manually, then press ENTER here.


 



üìç Exploring #upsc
üîç Collected 120 post URLs
001 ‚úÖ kumarmuktendra__irs
002 ‚úÖ upscwithira
003 ‚úÖ gk.tutor
004 ‚úÖ indianmasterminds
005 ‚úÖ learnwithsujitdutta
006 ‚úÖ vidyarthika.in
007 ‚úÖ topdailyca
008 ‚úÖ annapurnasingh99
009 ‚úÖ ias_study.in.02
010 ‚úÖ officer_dream_
011 ‚úÖ ncert_notes_questions
012 ‚úÖ upsc._.wala
013 ‚úÖ upsc_wallah8
014 ‚ö†Ô∏è Duplicate or missing username
015 ‚ö†Ô∏è Duplicate or missing username
016 ‚úÖ reetupriya1234
017 ‚úÖ my_upsc_journal
018 ‚úÖ handwritten_upscnotes
019 ‚úÖ nextias
020 ‚úÖ ssc_gd_chsl_cgl_mts_2024
021 ‚ö†Ô∏è Duplicate or missing username
022 ‚úÖ currentaffairsonly_upsc
023 ‚ö†Ô∏è Duplicate or missing username
024 ‚úÖ motivation_5524
025 ‚úÖ upscwith_sagar
026 ‚úÖ tireless.study
027 ‚ö†Ô∏è Duplicate or missing username
028 ‚ö†Ô∏è Duplicate or missing username
029 ‚úÖ ips_babu_amit
030 ‚ö†Ô∏è Duplicate or missing username
031 ‚úÖ rahulvgopal.ips
032 ‚úÖ drishtiiasenglish
033 ‚úÖ itubeclasses_upsc
034 ‚úÖ collectorbabu_
035 ‚úÖ a

InvalidSessionIdException: Message: invalid session id
Stacktrace:
	GetHandleVerifier [0x01118073+60707]
	GetHandleVerifier [0x011180B4+60772]
	(No symbol) [0x00F404FE]
	(No symbol) [0x00F7B898]
	(No symbol) [0x00FACF06]
	(No symbol) [0x00FA89D5]
	(No symbol) [0x00FA7F66]
	(No symbol) [0x00F136E5]
	(No symbol) [0x00F13C3E]
	(No symbol) [0x00F140CD]
	GetHandleVerifier [0x0135BB53+2435075]
	GetHandleVerifier [0x013570F3+2416035]
	GetHandleVerifier [0x0137349C+2531660]
	GetHandleVerifier [0x0112F145+155125]
	GetHandleVerifier [0x01135AED+182173]
	(No symbol) [0x00F133B0]
	(No symbol) [0x00F12BC3]
	GetHandleVerifier [0x0147D23C+3620588]
	BaseThreadInitThunk [0x75955D49+25]
	RtlInitializeExceptionChain [0x76EDCF0B+107]
	RtlGetAppContainerNamedObjectPath [0x76EDCE91+561]


## Part-2:  Verifying UPSC Influencers

In [None]:
import time
import csv
import random
import re
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import undetected_chromedriver as uc

# UPSC-relevant keywords
upsc_keywords = [
    "upsc", "ias", "cse", "prelims", "mains", "civil services", "upscmotivation", "upscmemes",
    "answer writing", "syllabus", "gk", "current affairs", "quiz"
]

# Normalize and match keywords (hashtags included)
def is_upsc_caption(text):
    if not text:
        return False
    text_clean = re.sub(r'[^\w\s#]', '', text.lower())
    words = set(text_clean.replace('#', ' #').split())
    keyword_set = set(upsc_keywords + [f"#{k}" for k in upsc_keywords])
    return any(word in keyword_set for word in words)

# Setup Chrome
options = uc.ChromeOptions()
options.add_argument("--start-maximized")
options.add_argument("--disable-blink-features=AutomationControlled")
driver = uc.Chrome(options=options)
wait = WebDriverWait(driver, 10)

# Login manually
driver.get("https://www.instagram.com/accounts/login/")
print("üîê Please log in manually. Then press ENTER here.")
input()

# Read usernames
with open("usernames_partial.txt", "r", encoding="utf-8") as f:
    usernames = [line.strip() for line in f if line.strip()]
print(f"üîç Loaded {len(usernames)} usernames")

# Extract followers
def extract_followers():
    try:
        span = driver.find_element(By.XPATH, "//header//ul/li[2]//span")
        title_attr = span.get_attribute("title")
        return parse_follower_string(title_attr or span.text)
    except:
        return None

def parse_follower_string(text):
    try:
        text = text.lower().replace(",", "").replace(" ", "").strip()
        text = text.replace("followers", "")
        if "m" in text:
            return int(float(text.replace("m", "")) * 1_000_000)
        elif "k" in text:
            return int(float(text.replace("k", "")) * 1_000)
        return int(text)
    except:
        return None

# Simulate scroll to load posts
def scroll_to_load_posts(scroll_times=4):
    last_height = driver.execute_script("return document.body.scrollHeight")
    for _ in range(scroll_times):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(random.uniform(2.5, 3.5))
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

# Extract caption text from full page (post or reel)
def get_fullpage_caption():
    try:
        cap_elem = wait.until(EC.presence_of_element_located((
            By.XPATH, '//div[@data-testid="video-caption"] | //div[@role="presentation"]//ul')))
        return cap_elem.text.strip()
    except:
        return ""

# Main analysis loop
results = []
for i, username in enumerate(usernames):
    profile_url = f"https://www.instagram.com/{username}/"
    print(f"\n{i+1:03d}/{len(usernames)} ‚ûú Visiting: {profile_url}")
    try:
        driver.get(profile_url)
        time.sleep(random.uniform(4, 6))
        scroll_to_load_posts()

        followers = extract_followers()
        print(f"   üëÄ Followers: {followers}")
        if followers is None or not (10000 <= followers <= 1_000_000):
            print("   üö´ Skipping (followers out of range)")
            continue

        # Collect both posts and reels
        post_links = []
        links = driver.find_elements(By.XPATH, "//a[contains(@href, '/p/') or contains(@href, '/reel/')]")
        for elem in links:
            href = elem.get_attribute("href")
            if href and href not in post_links:
                post_links.append(href)
            if len(post_links) >= 10:
                break

        if not post_links:
            print("   üö´ No posts found.")
            continue

        upsc_post_count = 0
        for post_url in post_links:
            print(f"      ‚ûú Analyzing: {post_url}")
            try:
                driver.get(post_url)
                time.sleep(random.uniform(3, 4))

                caption = get_fullpage_caption()
                if is_upsc_caption(caption):
                    upsc_post_count += 1
                    print(f"         ‚úÖ UPSC content found.")
                else:
                    print(f"         ‚ùå No UPSC content.")

            except Exception as e:
                print(f"   ‚ö†Ô∏è Post load error: {post_url} ‚Äì {e}")

        score = upsc_post_count / len(post_links)
        if score >= 0.8:
            print(f"   ‚úÖ Accepted: {username} ({followers} followers, {upsc_post_count}/{len(post_links)} UPSC posts)")
            results.append((username, followers, profile_url))
        else:
            print(f"   ‚ùå Rejected: {username} (only {upsc_post_count}/{len(post_links)} UPSC posts)")

    except Exception as e:
        print(f"   ‚ùå Profile error: {username} ‚Äì {e}")
        continue

# Save results
import pandas as pd

# Save to Excel
df = pd.DataFrame(results, columns=["username", "followers", "profile_url"])
df.to_excel("upsc_verified_accounts.xlsx", index=False)

print(f"\nüéØ DONE! Saved {len(results)} verified UPSC accounts to upsc_verified_accounts.xlsx")
driver.quit()


üîê Please log in manually. Then press ENTER here.


 


üîç Loaded 406 usernames

001/406 ‚ûú Visiting: https://www.instagram.com/upsc_ias.ips_motivesan/
   üëÄ Followers: 17700
      ‚ûú Analyzing: https://www.instagram.com/upsc_ias.ips_motivesan/reel/DIwCYskvwYV/
         ‚ùå No UPSC content.
      ‚ûú Analyzing: https://www.instagram.com/upsc_ias.ips_motivesan/p/DIwAWUHv1GH/
         ‚úÖ UPSC content found.
      ‚ûú Analyzing: https://www.instagram.com/upsc_ias.ips_motivesan/reel/DItu6LUv48F/
         ‚úÖ UPSC content found.
      ‚ûú Analyzing: https://www.instagram.com/upsc_ias.ips_motivesan/reel/DIk1bFtPpsS/
         ‚úÖ UPSC content found.
      ‚ûú Analyzing: https://www.instagram.com/upsc_ias.ips_motivesan/p/DIkw-IEPyNM/
         ‚ùå No UPSC content.
      ‚ûú Analyzing: https://www.instagram.com/upsc_ias.ips_motivesan/reel/DIbeGHbPT51/
         ‚úÖ UPSC content found.
      ‚ûú Analyzing: https://www.instagram.com/upsc_ias.ips_motivesan/p/DIW2sCYhRWQ/
         ‚ùå No UPSC content.
      ‚ûú Analyzing: https://www.instagram.com/

## Final Code

In [11]:
import time
import csv
import random
import re
from pathlib import Path
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import undetected_chromedriver as uc

# UPSC-relevant keywords
upsc_keywords = [
    "upsc", "ias", "cse", "prelims", "mains", "civil services", "upscmotivation", "upscmemes",
    "answer writing", "syllabus", "gk", "current affairs", "quiz"
]

# File paths
partial_file = "usernames_partial.txt"
verified_file = "upsc_verified_accounts.csv"
rejected_file = "rejected_usernames.txt"
skipped_file = "skipped_usernames.txt"

# Keyword matcher (case-insensitive, hashtag aware)
def is_upsc_caption(text):
    if not text:
        return False
    text_clean = re.sub(r'[^\w\s#]', '', text.lower())
    words = set(text_clean.replace('#', ' #').split())
    keyword_set = set(upsc_keywords + [f"#{k}" for k in upsc_keywords])
    return any(word in keyword_set for word in words)

# Selenium setup
options = uc.ChromeOptions()
options.add_argument("--start-maximized")
options.add_argument("--disable-blink-features=AutomationControlled")
driver = uc.Chrome(options=options)
wait = WebDriverWait(driver, 10)

# Manual login
driver.get("https://www.instagram.com/accounts/login/")
print("üîê Please log in manually. Then press ENTER here.")
input()

# Load usernames to process
with open(partial_file, "r", encoding="utf-8") as f:
    all_usernames = [line.strip() for line in f if line.strip()]

# Load previously processed usernames
processed = set()
for fname in [verified_file, rejected_file, skipped_file]:
    if Path(fname).exists():
        with open(fname, "r", encoding="utf-8") as f:
            if fname.endswith(".csv"):
                next(f)  # skip header
                processed.update(line.split(",")[0].strip() for line in f if line.strip())
            else:
                processed.update(line.strip() for line in f if line.strip())

print(f"üß† {len(processed)} usernames already processed.")
usernames = [u for u in all_usernames if u not in processed]
print(f"üîç Starting with {len(usernames)} unprocessed usernames...")

# Follower helpers
def extract_followers():
    try:
        span = driver.find_element(By.XPATH, "//header//ul/li[2]//span")
        title_attr = span.get_attribute("title")
        return parse_follower_string(title_attr or span.text)
    except:
        return None

def parse_follower_string(text):
    try:
        text = text.lower().replace(",", "").replace(" ", "").strip().replace("followers", "")
        if "m" in text:
            return int(float(text.replace("m", "")) * 1_000_000)
        elif "k" in text:
            return int(float(text.replace("k", "")) * 1_000)
        return int(text)
    except:
        return None

# Scroll profile to load more posts
def scroll_to_load_posts(scrolls=4):
    last_height = driver.execute_script("return document.body.scrollHeight")
    for _ in range(scrolls):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(random.uniform(2.5, 3.5))
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

# Extract caption from post or reel
def get_caption_text():
    try:
        cap_elem = wait.until(EC.presence_of_element_located((
            By.XPATH, '//div[@data-testid="video-caption"] | //div[@role="presentation"]//ul')))
        return cap_elem.text.strip()
    except:
        return ""

# Process each username
for i, username in enumerate(usernames):
    profile_url = f"https://www.instagram.com/{username}/"
    print(f"\n{i+1:03d}/{len(usernames)} ‚ûú Visiting: {profile_url}")
    try:
        driver.get(profile_url)
        time.sleep(random.uniform(4, 6))
        scroll_to_load_posts()

        followers = extract_followers()
        print(f"   üëÄ Followers: {followers}")
        if not followers or not (10000 <= followers <= 1_000_000):
            print("   üö´ Skipping (followers out of range)")
            with open(rejected_file, "a", encoding="utf-8") as f:
                f.write(username + "\n")
            with open(skipped_file, "a", encoding="utf-8") as f:
                f.write(username + "\n")
            continue

        links = driver.find_elements(By.XPATH, "//a[contains(@href, '/p/') or contains(@href, '/reel/')]")
        post_links = []
        for elem in links:
            href = elem.get_attribute("href")
            if href and href not in post_links:
                post_links.append(href)
            if len(post_links) >= 12:
                break

        if not post_links:
            print("   üö´ No posts found.")
            with open(rejected_file, "a", encoding="utf-8") as f:
                f.write(username + "\n")
            with open(skipped_file, "a", encoding="utf-8") as f:
                f.write(username + "\n")
            continue

        upsc_count = 0
        for post_url in post_links:
            print(f"      ‚ûú Analyzing: {post_url}")
            try:
                driver.get(post_url)
                time.sleep(random.uniform(3, 5))
                caption = get_caption_text()
                if is_upsc_caption(caption):
                    upsc_count += 1
                    print("         ‚úÖ UPSC content found")
                else:
                    print("         ‚ùå No UPSC content")
            except:
                print(f"         ‚ö†Ô∏è Error loading post")

        ratio = upsc_count / len(post_links)
        if ratio >= 0.8:
            print(f"   ‚úÖ ACCEPTED: {username} ({followers} followers, {upsc_count}/{len(post_links)} UPSC posts)")
            with open(verified_file, "a", encoding="utf-8", newline="") as f:
                writer = csv.writer(f)
                if Path(verified_file).stat().st_size == 0:
                    writer.writerow(["username", "followers", "profile_url"])
                writer.writerow([username, followers, profile_url])
            with open(partial_file, "a", encoding="utf-8") as f:
                f.write(username + "\n")
        else:
            print(f"   ‚ùå REJECTED: {username} (only {upsc_count}/{len(post_links)} UPSC posts)")
            with open(rejected_file, "a", encoding="utf-8") as f:
                f.write(username + "\n")

        with open(skipped_file, "a", encoding="utf-8") as f:
            f.write(username + "\n")

    except Exception as e:
        print(f"   ‚ùå Error on {username}: {e}")
        continue

driver.quit()
print("\nüéØ All usernames processed successfully.")


üîê Please log in manually. Then press ENTER here.


 


üß† 1 usernames already processed.
üîç Starting with 383 unprocessed usernames...

001/383 ‚ûú Visiting: https://www.instagram.com/upscprepias/
   üëÄ Followers: 29700
      ‚ûú Analyzing: https://www.instagram.com/abhi.why/p/C3MocnLvLd_/
         ‚úÖ UPSC content found
      ‚ûú Analyzing: https://www.instagram.com/upscprepias/p/DIqPNo-v5wm/
         ‚ùå No UPSC content
      ‚ûú Analyzing: https://www.instagram.com/upscprepias/reel/DImEUI4yylp/
         ‚úÖ UPSC content found
      ‚ûú Analyzing: https://www.instagram.com/upscprepias/p/DIg0IvZP_TY/
         ‚úÖ UPSC content found
      ‚ûú Analyzing: https://www.instagram.com/upscprepias/reel/DIgBko7PC42/
         ‚úÖ UPSC content found
      ‚ûú Analyzing: https://www.instagram.com/upscprepias/p/DIc0aHdvVdB/
         ‚úÖ UPSC content found
      ‚ûú Analyzing: https://www.instagram.com/upscprepias/p/DIbOFh0vVza/
         ‚úÖ UPSC content found
      ‚ûú Analyzing: https://www.instagram.com/upscprepias/reel/DIaMtOnyeLK/
         ‚

## Testing Phase

In [3]:
import time
import random
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import undetected_chromedriver as uc
from datetime import datetime
import os

# --- CONFIG ---
chrome_driver_path = r"C:\Users\abhin\chromedriver\chromedriver-win64\chromedriver.exe"  # <<<<<<<< UPDATE THIS
target_profile = "https://www.instagram.com/upsc.ias_gk/"
upsc_keywords = [
    "upsc", "civil service", "ias", "ips", "prelims", "mains", "cse",
    "current affairs", "daily quiz", "ias preparation", "quiz", "gk",
    "general knowledge", "education", "psc", "coaching", "mock test"
]
max_posts_to_check = 15  # Limit for performance
debug_screenshot_path = "debug_profile_page.png"

# --- Launch Browser ---
options = uc.ChromeOptions()
options.add_argument("--start-maximized")
driver = uc.Chrome(driver_executable_path=chrome_driver_path, options=options)

# --- STEP 1: Login ---
driver.get("https://www.instagram.com/")
input("üîê Please log in manually, then press ENTER here.\n")

# --- STEP 2: Go to Profile ---
print(f"\nüéØ Testing profile: {target_profile}")
driver.get(target_profile)
time.sleep(5)

# --- STEP 3: Scroll to Load Posts ---
def scroll_page(pause=2, max_scrolls=5):
    last_height = driver.execute_script("return document.body.scrollHeight")
    for i in range(max_scrolls):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(random.uniform(pause, pause + 1))
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

scroll_page()

# --- STEP 4: Collect Post Links ---
links = set()
posts = driver.find_elements(By.XPATH, '//a[contains(@href, "/p/") or contains(@href, "/reel/")]')
for post in posts:
    href = post.get_attribute("href")
    if href and ("/p/" in href or "/reel/" in href):
        links.add(href)

post_links = list(links)[:max_posts_to_check]

print(f"üì∏ Found {len(post_links)} post links.")

if not post_links:
    driver.save_screenshot(debug_screenshot_path)
    print(f"\n‚ùå No posts loaded. Open '{debug_screenshot_path}' to investigate.")
    print(f"\n‚ùå No posts to analyze.")
    driver.quit()
    exit()

# --- STEP 5: Analyze Posts ---
upsc_count = 0

for idx, url in enumerate(post_links, 1):
    print(f"\nüîé ({idx}/{len(post_links)}) Visiting: {url}")
    try:
        driver.get(url)
        time.sleep(random.uniform(3, 5))

        # Try to locate caption
        caption = ""
        try:
            # Try common full-reel/post caption container
            caption_elem = WebDriverWait(driver, 5).until(
                EC.presence_of_element_located((By.XPATH, '//div[@data-testid="video-caption"] | //div[@role="presentation"]//ul'))
            )
            caption = caption_elem.text.lower()
        except:
            print("‚ö†Ô∏è Could not extract caption for this post.")

        if not caption:
            continue

        print(f"   üìù Caption: {caption[:100]}...")

        # Check for UPSC keywords
        found_keywords = [kw for kw in upsc_keywords if kw in caption]
        if found_keywords:
            upsc_count += 1
            print(f"   ‚úÖ UPSC content found: {found_keywords}")
        else:
            print(f"   ‚ùå No UPSC content detected.")

        time.sleep(random.uniform(2, 4))

    except Exception as e:
        print(f"‚ö†Ô∏è Error on post: {url} -> {e}")
        continue

# --- Final Summary ---
print("\n" + "=" * 50)
print(f"\nüìä Checked {len(post_links)} posts.")
print(f"üéØ UPSC-related posts found: {upsc_count}/{len(post_links)}")

if len(post_links) > 0:
    score = upsc_count / len(post_links)
    if score >= 0.8:
        print(f"\n‚úÖ This profile is UPSC-focused ({score:.0%} match)")
    elif score >= 0.5:
        print(f"\nüü° This profile has mixed UPSC content ({score:.0%} match)")
    else:
        print(f"\n‚ùå This profile is not UPSC-focused ({score:.0%} match)")

driver.quit()


üîê Please log in manually, then press ENTER here.
 



üéØ Testing profile: https://www.instagram.com/upsc.ias_gk/
üì∏ Found 15 post links.

üîé (1/15) Visiting: https://www.instagram.com/upsc.ias_gk/reel/DIJg2Stte1b/
   üìù Caption: upsc.ias_gk
‡§∞‡•ã‡§ú‡§æ‡§®‡§æ ‡§ï‡•Å‡§õ ‡§®‡§Ø‡§æ ‡§∏‡§ø‡§ñ‡§®‡•á ‡§ï‡•á ‡§≤‡§ø‡§è ‡§´‡•â‡§≤‡•ã ‡§ï‡§∞‡•á‡§Ç ‚ù§

1. ‡§™‡•ã‡§∏‡•ç‡§ü ‡§ï‡•ã ‡§≤‡§æ‡§á‡§ï ‡§ï‡§∞‡•á
2. ‡§π‡§Æ‡§æ‡§∞‡•á ‡§™‡•á‡§ú ‡§ï‡•ã ‡§´‡•â‡§≤‡•ã ‡§ï‡§∞‡•á‡§Ç
...
   ‚úÖ UPSC content found: ['upsc', 'ias', 'ips', 'quiz', 'gk', 'education', 'psc']

üîé (2/15) Visiting: https://www.instagram.com/upsc.ias_gk/reel/DINa3_Vy18A/
   üìù Caption: upsc.ias_gk
‡§∞‡•ã‡§ú‡§æ‡§®‡§æ ‡§ï‡•Å‡§õ ‡§®‡§Ø‡§æ ‡§∏‡•Ä‡§ñ‡§®‡•á ‡§ï‡•á ‡§≤‡§ø‡§è ‡§π‡§Æ‡§æ‡§∞‡•á ‡§™‡•á‡§ú ‡§ï‡•ã ‡§´‡•â‡§≤‡•ã ‡§ï‡§∞‡•á‡§Çü•∞

‡§ï‡•Å‡§õ ‡§≤‡•ã‡§ó ‡§ó‡§Ç‡§¶‡•Ä id ‡§ï‡•ã ‡§´‡•â‡§≤‡•ã ‡§ï‡§∞ ‡§≤‡•á‡§§‡•á ‡§π‡•à,...
   ‚úÖ UPSC content found: ['upsc', 'ias', 'ips', 'quiz', 'gk', 'education', 'psc']

üîé (3/15) Visiting: https://www.instagram.com/upsc.ias_gk/reel/DIQm-CBOZgw/
   üìù 

KeyboardInterrupt: 