### working attempt

In [22]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import pandas as pd
import time
import os

In [None]:
#setting up chrome driver
options = Options()
options.add_argument("--headless")
options.add_argument("--window-size=1920,1080")
driver = webdriver.Chrome(options=options)

def check_yc_s25_on_linkedin(linkedin_url):
    """Opens LinkedIn page and checks for 'YC S25' mention"""
    if not linkedin_url:
        return 0, None
    try:
        driver.get(linkedin_url)
        time.sleep(3)
        soup = BeautifulSoup(driver.page_source, "html.parser")
        text = soup.get_text().lower()
        return (1 if "yc s25" in text else 0), text.strip()[:1000]
    except:
        return 0, None

# Step 1: Go to YC directory
driver.get("https://www.ycombinator.com/companies?batch=Summer%202025")
WebDriverWait(driver, 15).until(
    EC.presence_of_all_elements_located((By.CSS_SELECTOR, "a._company_i9oky_355"))
)

# Step 2: Scroll to the bottom to load all companies
last_height = driver.execute_script("return document.body.scrollHeight")
while True:
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(2)
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        break
    last_height = new_height

companies = driver.find_elements(By.CSS_SELECTOR, "a._company_i9oky_355")
data = []

for company in companies:
    try:
        name = company.find_element(By.CLASS_NAME, "_coName_i9oky_470").text
        description = company.find_element(By.CSS_SELECTOR, "div.text-sm").text
        link = company.get_attribute("href")

        # Open company page in new tab
        driver.execute_script("window.open('');")
        driver.switch_to.window(driver.window_handles[1])
        driver.get(link)

        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "div.prose"))
        )

        full_description = driver.find_element(By.CSS_SELECTOR, "div.prose").text

        # Get website (non-link <div>)
        try:
            website_elem = driver.find_element(By.XPATH, "//div[contains(@class, 'group-hover:underline')]")
            website = website_elem.text.strip()
        except:
            website = None

        # Get LinkedIn profile from icon/button
        try:
            linkedin_elem = driver.find_element(By.XPATH, "//a[contains(@aria-label, 'LinkedIn profile')]")
            linkedin_url = linkedin_elem.get_attribute("href")
        except:
            linkedin_url = None

        yc_s25_flag, linkedin_snippet = check_yc_s25_on_linkedin(linkedin_url)

        data.append({
            "Company Name": name,
            "Short Description": description,
            "Full Description": full_description,
            "YC Page": link,
            "Website": website,
            "LinkedIn URL": linkedin_url,
            "Mentions YC S25": yc_s25_flag,
            "LinkedIn Snippet": linkedin_snippet
        })

        driver.close()
        driver.switch_to.window(driver.window_handles[0])
    except Exception as e:
        print(f"Error processing {name}: {e}")
        driver.switch_to.window(driver.window_handles[0])
        continue

# Save results
df_new = pd.DataFrame(data)

if os.path.exists("full_yc_s25_companies_with_linkedin.csv"):
    df_existing = pd.read_csv("full_yc_s25_companies_with_linkedin.csv")
    df_combined = pd.concat([df_existing, df_new]).drop_duplicates(subset="Company Name", keep="last")
else:
    df_combined = df_new

df_combined.to_csv("attempt3_full_yc_s25_companies_with_linkedin.csv", index=False, encoding="utf-8")
driver.quit()
print("✅ Done and saved.")


Error processing Synthetic Society: HTTPConnectionPool(host='localhost', port=37293): Read timed out. (read timeout=120)
✅ Done and saved.


In [24]:
df_combined.head()  # Display the first few rows of the dataframe

Unnamed: 0,Company Name,Short Description,Full Description,YC Page,Website,LinkedIn URL,Mentions YC S25,LinkedIn Snippet
85,Synthetic Society,Synthetic Agents to Simulate Real Users,Synthetic Agents to Simulate Real Users,https://www.ycombinator.com/companies/syntheti...,https://syntheticsociety.ai/,,0,
0,Nox Metals,"The modern factory for fast, low-cost metals.","The modern factory for fast, low-cost metals.",https://www.ycombinator.com/companies/nox-metals,https://noxmetals.co/,https://www.linkedin.com/in/zanehh/,0,linkedin: log in or sign up\n\n\n\n\n\n\n\n\n\...
1,Uplift AI,Foundational Voice AI for underserved languages,Foundational Voice AI for underserved languages,https://www.ycombinator.com/companies/uplift-ai,https://upliftai.org/,https://linkedin.com/in/muhammad-sabir,0,muhammad bin sabir - uplift ai | linkedin\n\n\...
2,RealRoots,RealRoots is a mobile app that guarantees wome...,RealRoots is a mobile app that guarantees wome...,https://www.ycombinator.com/companies/realroots,http://www.therealroots.com,https://linkedin.com/in/tara-kappel-5baa6095,0,sign up | linkedin\n \n\n\n\n\n\n\n\n\n\n...
3,Luminal,Simple ML framework and cloud for generating f...,Simple ML framework and cloud for generating f...,https://www.ycombinator.com/companies/luminal,https://luminalai.com,https://linkedin.com/in/joe-fioti-24a986189,0,sign up | linkedin\n \n\n\n\n\n\n\n\n\n\n...


In [25]:
df_combined = df_combined.drop('LinkedIn Snippet', axis=1)

In [26]:
df_combined.to_csv("clean_attempt3_full_yc_s25_companies_with_linkedin.csv", index=False, encoding="utf-8")

In [29]:
df_combined.head()  # Display the first few rows of the dataframe

Unnamed: 0,Company Name,Short Description,Full Description,YC Page,Website,LinkedIn URL,Mentions YC S25
85,Synthetic Society,Synthetic Agents to Simulate Real Users,Synthetic Agents to Simulate Real Users,https://www.ycombinator.com/companies/syntheti...,https://syntheticsociety.ai/,,0
0,Nox Metals,"The modern factory for fast, low-cost metals.","The modern factory for fast, low-cost metals.",https://www.ycombinator.com/companies/nox-metals,https://noxmetals.co/,https://www.linkedin.com/in/zanehh/,0
1,Uplift AI,Foundational Voice AI for underserved languages,Foundational Voice AI for underserved languages,https://www.ycombinator.com/companies/uplift-ai,https://upliftai.org/,https://linkedin.com/in/muhammad-sabir,0
2,RealRoots,RealRoots is a mobile app that guarantees wome...,RealRoots is a mobile app that guarantees wome...,https://www.ycombinator.com/companies/realroots,http://www.therealroots.com,https://linkedin.com/in/tara-kappel-5baa6095,0
3,Luminal,Simple ML framework and cloud for generating f...,Simple ML framework and cloud for generating f...,https://www.ycombinator.com/companies/luminal,https://luminalai.com,https://linkedin.com/in/joe-fioti-24a986189,0


In [31]:
# Check unique values in LinkedIn URL column
linkedin_url_unique = df_combined['LinkedIn URL'].count()
print("Unique values in LinkedIn URL column:")
print(linkedin_url_unique)

# Check unique values in Mentions YC S25 column
mentions_yc_s25_unique = df_combined['Mentions YC S25'].value_counts()
print("\nUnique values in Mentions YC S25 column:")
print(mentions_yc_s25_unique)

Unique values in LinkedIn URL column:
87

Unique values in Mentions YC S25 column:
Mentions YC S25
0    76
1    12
Name: count, dtype: int64


#### cleaning up

In [5]:
data = pd.read_csv("new_data_parsed.csv", index_col=0)

In [6]:
data.head()

Unnamed: 0,Company Name,Short Description,Full Description,YC Page,Website,LinkedIn URL,Mentions YC S25
0,Nox Metals,"The modern factory for fast, low-cost metals.","The modern factory for fast, low-cost metals.",https://www.ycombinator.com/companies/nox-metals,https://noxmetals.co/,https://www.linkedin.com/in/zanehh/,1
1,Uplift AI,Foundational Voice AI for underserved languages,Foundational Voice AI for underserved languages,https://www.ycombinator.com/companies/uplift-ai,https://upliftai.org/,https://linkedin.com/in/muhammad-sabir,0
2,RealRoots,RealRoots is a mobile app that guarantees wome...,RealRoots is a mobile app that guarantees wome...,https://www.ycombinator.com/companies/realroots,http://www.therealroots.com,https://linkedin.com/in/tara-kappel-5baa6095,0
3,Luminal,Simple ML framework and cloud for generating f...,Simple ML framework and cloud for generating f...,https://www.ycombinator.com/companies/luminal,https://luminalai.com,https://linkedin.com/in/joe-fioti-24a986189,0
4,Outrove,Ultra-realistic AI Recruiter,Ultra-realistic AI Recruiter,https://www.ycombinator.com/companies/outrove,https://www.outrove.ai/,https://www.linkedin.com/in/ahmedelshireef/,0


In [7]:
data = data.drop("Short Description", axis=1)
data.to_csv("clean_new_data.csv", index=False, encoding="utf-8")

In [8]:
data.head()

Unnamed: 0,Company Name,Full Description,YC Page,Website,LinkedIn URL,Mentions YC S25
0,Nox Metals,"The modern factory for fast, low-cost metals.",https://www.ycombinator.com/companies/nox-metals,https://noxmetals.co/,https://www.linkedin.com/in/zanehh/,1
1,Uplift AI,Foundational Voice AI for underserved languages,https://www.ycombinator.com/companies/uplift-ai,https://upliftai.org/,https://linkedin.com/in/muhammad-sabir,0
2,RealRoots,RealRoots is a mobile app that guarantees wome...,https://www.ycombinator.com/companies/realroots,http://www.therealroots.com,https://linkedin.com/in/tara-kappel-5baa6095,0
3,Luminal,Simple ML framework and cloud for generating f...,https://www.ycombinator.com/companies/luminal,https://luminalai.com,https://linkedin.com/in/joe-fioti-24a986189,0
4,Outrove,Ultra-realistic AI Recruiter,https://www.ycombinator.com/companies/outrove,https://www.outrove.ai/,https://www.linkedin.com/in/ahmedelshireef/,0
