In [37]:
# %%
%pip install webdriver-manager
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import re
import os
import time
from typing import List, Literal, Optional, Tuple
from pydantic import BaseModel
from openai import OpenAI
from tqdm import tqdm
from enum import Enum
from multiprocessing import Pool
from typing import List, Union
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, TimeoutError
import threading
from functools import partial
from dotenv import load_dotenv
import pandas as pd

# Load environment variables (presumably from a local .env file) so the
# os.getenv() lookups below can see the API credentials.
load_dotenv()

# Shared OpenAI client; the API key is read from the OPENAI_API_KEY variable.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Credentials sent as the "key" and "cx" parameters by google_search().
# os.getenv() yields None when a variable is not set.
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
SEARCH_ENGINE_ID = os.getenv("SEARCH_ENGINE_ID")

# Faculty: structured-output schema for the GPT-generated data.
# get_hobbies() passes this class as `response_format` to the OpenAI parse
# call and returns the parsed instance, so the field names below are what the
# rest of the notebook reads (e.g. `sample_run.hobbies`).
class Faculty(BaseModel):
    # Research interests as a single string (presumably prose, per the name).
    research_interests_paragraph: str
    # Research interests as a list of strings, one entry per interest.
    research_interests_as_commaseperated_list: List[str]
    # Hobbies / non-academic interests as a single string.
    hobbies: str
    # Hobbies as a list of strings, one entry per hobby.
    hobbies_as_commaseperated_list: List[str]



Note: you may need to restart the kernel to use updated packages.


In [38]:
from selenium import webdriver 
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from typing import List
import time

def search(query: str, max_results: int = 8) -> List[str]:
    """Search DuckDuckGo's HTML endpoint with headless Chrome and return result URLs.

    Args:
        query: Free-text search query typed into the DuckDuckGo search box.
        max_results: Upper bound on how many of the leading result anchors are
            inspected (defaults to 8, the previously hard-coded value).

    Returns:
        The non-empty ``href`` values of the first ``max_results`` result
        links. An empty list is returned if anything goes wrong (browser
        start-up, page load, element lookup); the error is printed, not raised.
    """
    chrome_options = Options()
    chrome_options.add_argument("--headless")  # Enable headless mode
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')

    # Sentinel so the `finally` clause can tell whether Chrome actually
    # started, without introspecting locals().
    driver = None
    try:
        driver = webdriver.Chrome(options=chrome_options)
        driver.get("https://html.duckduckgo.com/html/")

        # Find and fill search box
        search_box = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.NAME, "q"))
        )
        search_box.send_keys(query)
        search_box.submit()

        # Wait for results and extract links
        time.sleep(2)  # Allow time for results to load
        results = driver.find_elements(By.CSS_SELECTOR, "a.result__a")

        # get_attribute() returns None for an anchor without an href; drop
        # those so the return value really is a list of strings.
        hrefs = (result.get_attribute('href') for result in results[:max_results])
        return [href for href in hrefs if href]

    except Exception as e:
        print(f"Error during DuckDuckGo search for query '{query}': {e}")
        return []

    finally:
        # Always release the browser process, even on failure.
        if driver is not None:
            driver.quit()

if __name__ == "__main__":
    # Smoke test: run one live search through search() and print the URLs it
    # returns (the cell output below shows a sample result).
    results = search("Lydia liu princeton cs")
    print(results)

['https://www.lydiatliu.com/', 'https://www.cs.princeton.edu/people/profile/ltliu', 'https://csweb-prod-old.cs.princeton.edu/people/profile/ltliu', 'https://www.cs.princeton.edu/news/lydia-liu-expert-social-impacts-machine-learning-has-joined-faculty', 'https://citp.princeton.edu/citp-people/lydia-liu/', 'https://scholar.google.com/citations?user=IQ2eTA8AAAAJ', 'https://gradfutures.princeton.edu/grad-stories/lydia-t-liu', 'https://www.lydiatliu.com/prospective']


In [39]:
# %%
# def search(query: str) -> List[str]:
#     search_url = "https://html.duckduckgo.com/html/"
#     params = {
#         'q': query ,
#     }
#     headers = {
#         'User-Agent': 'Mozilla/5.0'
#     }
#     try:
#         response = requests.post(search_url, data=params, headers=headers, timeout=10)
#         response.raise_for_status()
#     except Exception as e:
#         print(f"Error during DuckDuckGo search for query '{query}': {e}")
#         return []
    
#     print(response.text)

#     soup = BeautifulSoup(response.text, 'html.parser')
#     links = []
#     for result in soup.find_all('a', {'class': 'result__a'}, href=True):
#         links.append(result['href'])
#         if len(links) >= 8:
#             break
#     print(links)
#     return links

def extract_text_with_timeout(url: str, max_chars: int = 10000, timeout: int = 10) -> Tuple[str, str]:
    """Fetch a page and return its visible text, giving up after ``timeout`` seconds.

    Args:
        url: Page to download.
        max_chars: Maximum number of characters of extracted text to keep.
        timeout: Seconds allowed both for the HTTP request and for the whole
            fetch-and-parse step.

    Returns:
        ``(text, url)``. ``text`` is the page text with ``<script>`` and
        ``<style>`` content removed, truncated to ``max_chars``. It is the
        empty string on HTTP 403, on any request/parse error, or on timeout.
        This function never raises for fetch failures.
    """
    def _extract() -> Tuple[str, str]:
        try:
            headers = {
                'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36',
            }
            response = requests.get(url, headers=headers, timeout=timeout)
            if response.status_code == 403:
                return "", url
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')
            # Script/style bodies are not human-readable content.
            for tag in soup(["script", "style"]):
                tag.decompose()
            text = soup.get_text(separator=" ", strip=True)
            return text[:max_chars], url
        except Exception:
            # Best-effort scraping: any failure just means "no text".
            return "", url

    # Do NOT use `with ThreadPoolExecutor(...)` here: leaving the with-block
    # calls shutdown(wait=True), which blocks until the worker finishes and
    # therefore defeats the timeout below.
    executor = ThreadPoolExecutor(max_workers=1)
    try:
        return executor.submit(_extract).result(timeout=timeout)
    except TimeoutError:
        print(f"Timeout for {url}")
        return "", url
    finally:
        # Return immediately; a still-running worker finishes in the background.
        executor.shutdown(wait=False)

def get_text(query: str, max_chars: int = 10000) -> Tuple[str, List[str]]:
    """Search DuckDuckGo for ``query`` and concatenate the text of the result pages.

    Args:
        query: Search query handed to ``search``.
        max_chars: Per-page cap on extracted text, forwarded to
            ``extract_text_with_timeout`` (previously accepted but ignored).

    Returns:
        ``(combined_text, urls)``: the page texts joined by blank lines and
        the URLs that produced non-empty text, in result order. If no page
        yields text, returns ``("No valid information found.", [])``.
    """
    valid_texts: List[str] = []
    valid_urls: List[str] = []

    for candidate_url in search(query):
        text, fetched_url = extract_text_with_timeout(candidate_url, max_chars=max_chars)
        # Pages that failed, timed out or were empty come back as "".
        if text.strip():
            valid_texts.append(text)
            valid_urls.append(fetched_url)
        # time.sleep(0.5)  # Respectful delay

    if not valid_texts:
        return "No valid information found.", []

    return "\n\n".join(valid_texts), valid_urls



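# Alternative search backend using the Google Custom Search API.
# Started as an experiment, but get_hobbies() below currently relies on it via get_text_w_google().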

def google_search(query: str, num_results: int =8) -> List[str]:
    """Return result links for ``query`` from the Google Custom Search JSON API.

    Args:
        query: Search query string.
        num_results: Desired number of links. The API serves at most 10 per
            request, so larger values are capped at 10; values <= 0 return
            an empty list without issuing a request.

    Returns:
        A list of result URLs (possibly empty). Any request, HTTP or parsing
        error is printed and an empty list is returned instead of raising.
    """
    if num_results <= 0:
        return []

    search_url = "https://www.googleapis.com/customsearch/v1"
    params = {
        "key": GOOGLE_API_KEY,
        "cx": SEARCH_ENGINE_ID,
        "q": query,
        "num": min(num_results, 10),  # Max results (1-10 per request)
    }
    try:
        # Without a timeout a stalled connection would hang the whole run.
        response = requests.get(search_url, params=params, timeout=10)
        response.raise_for_status()
        # "items" is absent when the search has no hits.
        results = response.json().get("items", [])
        return [item["link"] for item in results]
    except Exception as e:
        print(f"Error during Google search for query '{query}': {e}")
        return []




# Example usage: run the Selenium-backed search() defined earlier in the
# notebook for a sample query and print the links it returns.
links = search("tigers")
print(links)

def get_text_w_google(query: str, max_chars: int = 10000) -> Tuple[str, List[str]]:
    """Search via ``google_search`` and concatenate the text of the result pages.

    Args:
        query: Search query handed to ``google_search``.
        max_chars: Per-page cap on extracted text, forwarded to
            ``extract_text_with_timeout`` (previously accepted but ignored).

    Returns:
        ``(combined_text, urls)``: the page texts joined by blank lines and
        the URLs that produced non-empty text, in result order. If no page
        yields text, returns ``("No valid information found.", [])``.
    """
    valid_texts: List[str] = []
    valid_urls: List[str] = []

    for candidate_url in google_search(query):
        text, fetched_url = extract_text_with_timeout(candidate_url, max_chars=max_chars)
        # Pages that failed, timed out or were empty come back as "".
        if text.strip():
            valid_texts.append(text)
            valid_urls.append(fetched_url)
        # time.sleep(0.5)  # Respectful delay

    if not valid_texts:
        return "No valid information found.", []

    return "\n\n".join(valid_texts), valid_urls


# get_text_w_google("MIT CSAIL")



['https://www.freep.com/story/sports/mlb/tigers/2024/12/28/detroit-tigers-alex-bregman-mlb-free-agency/77247901007/', 'https://www.detroitnews.com/story/sports/mlb/tigers/2024/12/27/tigers-reach-agreement-with-free-agent-infielder-gleyber-torres/77257853007/', 'https://www.mlb.com/news/gleyber-torres-contract-with-tigers', 'https://www.mlb.com/tigers/news', 'https://www.sportingnews.com/us/mlb/detroit-tigers', 'https://apnews.com/article/tigers-gleyber-torres-yankees-2281de1383f52831a19d37191a49a2db', 'https://en.wikipedia.org/wiki/Tiger', 'https://www.mlb.com/news/tigers-earn-comeback-win-in-series-opener-vs-royals']


In [40]:
# personal_website = client.chat.completions.create(
#     model="gpt-4o",
#     messages=[
#         {"role": "system", "content": "You are an helpful assitant designed to be factually accurate"},
#         {"role": "user", "content": "can you give me Princeton Professor Lydia Liu's non-academic intrests"},
#     ]
# ).choices[0].message.content

# print(personal_website)

In [41]:
# %%
def get_hobbies(name: str) -> Faculty:
    """Ask GPT for a structured summary of a person's hobbies and research interests.

    Web text about ``name`` is collected with ``get_text_w_google`` and sent
    to the model, whose reply is parsed into a ``Faculty`` instance.

    Args:
        name: Search query identifying the person (e.g. name plus university).

    Returns:
        The parsed ``Faculty`` object produced by the model.

    Raises:
        ValueError: If the model reply contains no parsed payload.
        Exception: Any error raised by the OpenAI call is logged and re-raised.
            Previously such a failure surfaced as an unrelated
            ``UnboundLocalError`` at the ``return`` statement.
    """
    base_text, _ = get_text_w_google(name)

    # Stronger and more detailed prompt.
    # Each fragment ends with explicit punctuation/whitespace because adjacent
    # string literals are concatenated with no separator.
    prompt = (
        "Based on the following information about the individual, provide a comprehensive and detailed "
        "description of their hobbies and research interests with a special focus on hobbies. The description should be greater than 200 words, "
        "highlighting the fun and qualitative aspects of the person's life outside of their professional work. "
        "If there are multiple people with the same name, please only consider the individual who is a professor at the named university. "
        "Additionally, provide a comprehensive bullet-point list of their non-academic interests. Ensure the list "
        f"captures in detail the main hobbies mentioned in the text: \n\n {base_text} \n\n"
    )

    try:
        response = client.beta.chat.completions.parse(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": "You are an assistant that provides detailed and comprehensive descriptions of individuals' hobbies and personal interests based on provided information."},
                {"role": "user", "content": prompt}
            ],
            response_format=Faculty
        )
        hobbies_data = response.choices[0].message.parsed
        # `parsed` can be None (e.g. the model refused); fail loudly rather
        # than hand callers a None that breaks on attribute access.
        if hobbies_data is None:
            raise ValueError("model response contained no parsed Faculty data")
    except Exception as e:
        print(f"Error processing hobbies for {name}: {e}")
        raise

    return hobbies_data


In [42]:
# # %%
# # Load the existing CSV
# df = pd.read_csv('mit_csail_pis.csv')

# # Initialize new columns
# df['gpt_research_interest_paragraph'] = ""
# df['gpt_research_interest_bullet'] = ""
# df['gpt_hobbies'] = ""
# df['gpt_hobbies_bullet'] = ""

# # Iterate over each PI to fetch and append hobbies data
# for index, row in tqdm(df.iterrows(), total=df.__len__(), desc="Processing PIs"):
#     name = row['Principal Investigator']
#     faculty_data = get_hobbies(name)
    
#     df.at[index, 'gpt_hobbies'] = faculty_data.hobbies
#     df.at[index, 'gpt_hobbies_bullet'] = "; ".join(faculty_data.hobbies_as_commaseperated_list)
#     df.at[index, 'gpt_research_interest_paragraph'] = faculty_data.research_interests
#     df.at[index, 'gpt_research_interest_bullet'] = "; ".join(faculty_data.research_interests_as_commaseperated_list)

# # Save the updated CSV
# df.to_csv('mit_csail_pis_enriched.csv', index=False)
# print("Enriched CSV file 'mit_csail_pis_enriched.csv' has been created successfully.")

In [43]:
# Run the cell above for all PIs - tried to make this faster :D
# Here: a single sample call to get_hobbies() for one professor.
sample_run = get_hobbies("ariel procaccia harvard professor")

In [44]:
# Show the hobbies string followed by the hobbies list from the sample run.
print(sample_run.hobbies, '\n\n\n', sample_run.hobbies_as_commaseperated_list)

print("\n\n\n research interests \n\n\n")
# Same layout for the research-interests string and list.
print(sample_run.research_interests_paragraph, '\n\n\n' ,sample_run.research_interests_as_commaseperated_list)

Outside of his professional commitments, Ariel Procaccia enjoys a rich tapestry of hobbies that enrich his life beyond academia. He has a deep appreciation for philosophy, often engaging in discussions surrounding ethical implications of technology and governance. Ariel is also an avid reader, with a particular fondness for science fiction and philosophical texts, which stimulate his imagination and intellectual curiosity. He finds joy in spending time with his family, often involving outdoor activities, which provide a balance to his rigorous academic schedule. To further connect with broader societal issues, Ariel participates in community service projects, particularly those aimed at supporting refugees and homeless initiatives, reflecting his belief in the power of technology to transform lives. These hobbies not only enhance his personal well-being but also inform his professional work, as they encourage him to remain grounded in the practical and ethical considerations of technol