In [1]:
# %%
import requests
from bs4 import BeautifulSoup
import re
import os
import time
from typing import List, Literal, Optional, Tuple
from pydantic import BaseModel
from openai import OpenAI
from tqdm import tqdm
from enum import Enum
from multiprocessing import Pool
from typing import List, Union
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, TimeoutError
import threading
from functools import partial
from dotenv import load_dotenv
import pandas as pd

# Load environment variables (presumably from a local .env file via
# python-dotenv) before any of the os.getenv() lookups below run.
load_dotenv()

# Shared OpenAI client used by get_hobbies(); the key comes from OPENAI_API_KEY.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Credentials for the Google Custom Search request made in google_search().
# os.getenv() returns None when a variable is not set.
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
SEARCH_ENGINE_ID = os.getenv("SEARCH_ENGINE_ID")

# Structured-output schema for the GPT response: get_hobbies() passes this
# class as `response_format` and returns the parsed instance.
#
# NOTE(review): documentation is kept in `#` comments rather than a class
# docstring because pydantic appears to copy a model docstring into the JSON
# schema description, which would change what is sent to the API - confirm
# before converting these comments into a docstring.
#
# The "commaseperated" spelling is part of the field names that other cells
# read, so it is left untouched here.
class Faculty(BaseModel):
    # Free-text description of the person's research interests.
    research_interests: str
    # The same research interests as individual list items.
    research_interests_as_commaseperated_list: List[str]
    # Free-text description of the person's hobbies.
    hobbies: str
    # The same hobbies as individual list items.
    hobbies_as_commaseperated_list: List[str]


In [2]:
# %%
def search(query: str) -> List[str]:
    """Search DuckDuckGo's HTML endpoint and return up to 8 result URLs.

    The query is suffixed with " MIT Professor hobbies" before being sent.

    DuckDuckGo's HTML results commonly link through a scheme-relative
    redirect of the form ``//duckduckgo.com/l/?uddg=<encoded target>``.
    Such links cannot be fetched directly with ``requests`` (no scheme), so
    they are unwrapped to the real target URL. Links that are already
    absolute are returned unchanged.

    Args:
        query: Name or phrase to search for.

    Returns:
        A list of at most 8 URLs; an empty list if the request fails.
    """
    # Function-scope import so this cell does not depend on edits to the
    # notebook's import cell.
    from urllib.parse import parse_qs, urlparse

    search_url = "https://html.duckduckgo.com/html/"
    params = {
        'q': query + " MIT Professor hobbies",
    }
    headers = {
        'User-Agent': 'Mozilla/5.0'
    }
    try:
        response = requests.post(search_url, data=params, headers=headers, timeout=10)
        response.raise_for_status()
    except Exception as e:
        # Best-effort: report the failure and let the caller continue.
        print(f"Error during DuckDuckGo search for query '{query}': {e}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')
    links = []
    for result in soup.find_all('a', {'class': 'result__a'}, href=True):
        href = result['href']
        # Scheme-relative links ("//host/...") need a scheme to be usable.
        if href.startswith('//'):
            href = 'https:' + href
        parsed = urlparse(href)
        # Unwrap DuckDuckGo redirect links; parse_qs percent-decodes the value.
        if parsed.netloc.endswith('duckduckgo.com') and parsed.path.startswith('/l/'):
            target = parse_qs(parsed.query).get('uddg')
            if target:
                href = target[0]
        links.append(href)
        if len(links) >= 8:
            break
    return links

def extract_text_with_timeout(url: str, max_chars: int = 10000, timeout: int = 10) -> Tuple[str, str]:
    """Fetch a page and return its visible text, giving up after ``timeout`` seconds.

    The download and parse run in a single worker thread. If the worker has
    not finished within ``timeout`` seconds the function returns immediately
    with empty text; the worker is left to finish in the background.

    Args:
        url: Page to fetch.
        max_chars: Maximum number of characters of extracted text to return.
        timeout: Seconds used both as the ``requests`` timeout and as the
            overall wait limit for the worker thread.

    Returns:
        ``(text, url)``. ``text`` is ``""`` on HTTP 403, on any fetch/parse
        error, or on timeout.
    """
    def _extract():
        try:
            headers = {
                'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36',
            }
            response = requests.get(url, headers=headers, timeout=timeout)
            if response.status_code == 403:
                return "", url
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')
            # Drop script/style elements so only visible text remains.
            for tag in soup(["script", "style"]):
                tag.decompose()
            text = soup.get_text(separator=" ", strip=True)
            return text[:max_chars], url
        except Exception:
            # Best-effort scraping: any failure counts as "no text".
            return "", url

    # Do not use `with ThreadPoolExecutor(...)` here: leaving the with-block
    # calls shutdown(wait=True), which blocks until the worker finishes and
    # would defeat the timeout enforced by future.result() below.
    executor = ThreadPoolExecutor(max_workers=1)
    try:
        future = executor.submit(_extract)
        return future.result(timeout=timeout)
    except TimeoutError:
        print(f"Timeout for {url}")
        return "", url
    finally:
        # Release the executor without waiting for a still-running worker.
        executor.shutdown(wait=False)

def get_text(query: str, max_chars: int = 10000) -> Tuple[str, List[str]]:
    """Search DuckDuckGo for ``query`` and collect text from the result pages.

    Args:
        query: Name or phrase to search for.
        max_chars: Per-page character limit, forwarded to
            ``extract_text_with_timeout``. The joined text of several pages
            can therefore be longer than ``max_chars``.

    Returns:
        ``(text, urls)`` where ``text`` is the non-empty page texts joined by
        blank lines and ``urls`` lists the pages they came from. If no page
        yields text, returns ``("No valid information found.", [])``.
    """
    valid_texts: List[str] = []
    valid_urls: List[str] = []

    for candidate_url in search(query):
        # Forward max_chars so the caller's limit is actually applied.
        text, fetched_url = extract_text_with_timeout(candidate_url, max_chars=max_chars)
        if text.strip():
            valid_texts.append(text)
            valid_urls.append(fetched_url)
        # time.sleep(0.5)  # Respectful delay

    if not valid_texts:
        return "No valid information found.", []

    return "\n\n".join(valid_texts), valid_urls



# Google Custom Search variant of the search/fetch helpers (get_hobbies calls get_text_w_google)

def google_search(query: str, num_results: int = 8) -> List[str]:
    """Query the Google Custom Search JSON API and return result links.

    The query is suffixed with " MIT Professor hobbies" before being sent.
    Credentials are read from the module-level ``GOOGLE_API_KEY`` and
    ``SEARCH_ENGINE_ID``.

    Args:
        query: Name or phrase to search for.
        num_results: Desired number of results. The API accepts 1-10 per
            request, so the value is clamped into that range.

    Returns:
        A list of result URLs; an empty list if the request fails.
    """
    search_url = "https://www.googleapis.com/customsearch/v1"
    params = {
        "key": GOOGLE_API_KEY,
        "cx": SEARCH_ENGINE_ID,
        "q": query + " MIT Professor hobbies",
        # Out-of-range values are rejected by the API, so clamp to 1-10.
        "num": max(1, min(num_results, 10)),
    }
    try:
        # Same 10 s timeout as the DuckDuckGo search; without it a stalled
        # connection would hang the notebook indefinitely.
        response = requests.get(search_url, params=params, timeout=10)
        response.raise_for_status()
        results = response.json().get("items", [])
        # Skip any item without a "link" instead of discarding all results.
        return [item["link"] for item in results if "link" in item]
    except Exception as e:
        # Best-effort: report the failure and let the caller continue.
        print(f"Error during Google search for query '{query}': {e}")
        return []

# # Example usage:
# links = google_search("MIT CSAIL")
# print(links)

def get_text_w_google(query: str, max_chars: int = 10000) -> Tuple[str, List[str]]:
    """Search Google for ``query`` and collect text from the result pages.

    Args:
        query: Name or phrase to search for.
        max_chars: Per-page character limit, forwarded to
            ``extract_text_with_timeout``. The joined text of several pages
            can therefore be longer than ``max_chars``.

    Returns:
        ``(text, urls)`` where ``text`` is the non-empty page texts joined by
        blank lines and ``urls`` lists the pages they came from. If no page
        yields text, returns ``("No valid information found.", [])``.
    """
    valid_texts: List[str] = []
    valid_urls: List[str] = []

    for candidate_url in google_search(query):
        # Forward max_chars so the caller's limit is actually applied.
        text, fetched_url = extract_text_with_timeout(candidate_url, max_chars=max_chars)
        if text.strip():
            valid_texts.append(text)
            valid_urls.append(fetched_url)
        # time.sleep(0.5)  # Respectful delay

    if not valid_texts:
        return "No valid information found.", []

    return "\n\n".join(valid_texts), valid_urls


# get_text_w_google("MIT CSAIL")



In [3]:
# %%
def get_hobbies(name: str) -> Faculty:
    """Generate a structured hobbies / research-interest summary for a person.

    Web text about ``name`` is gathered with ``get_text_w_google`` and sent to
    the chat model, whose reply is parsed into a ``Faculty`` instance.

    Args:
        name: The person's name, used as the search query.

    Returns:
        The parsed ``Faculty``. If the API call fails or yields no parsed
        result, a ``Faculty`` with empty strings and empty lists is returned
        so callers can keep reading its attributes.
    """
    # NOTE(review): when no page text is found, base_text is the sentinel
    # "No valid information found." and is still sent to the model - confirm
    # whether the model call should be skipped in that case.
    base_text, _ = get_text_w_google(name)

    # Stronger and more detailed prompt
    prompt = (
        "Based on the following information about the individual, provide a comprehensive and detailed "
        "description of their hobbies and resrach interests with a special focus on hobbies The description should be greater than 200 words, "
        "highlighting the fun and qualitative aspects of the person's life outside of their professional work. "
        "Additionally, provide a comprehensive bullet-point list of their hobbies. Ensure the list is extensive and "
        f"captures all possible hobbies mentioned in the text: \n\n {base_text} \n\n"
    )

    try:
        response = client.beta.chat.completions.parse(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": "You are an assistant that provides detailed and comprehensive descriptions of individuals' hobbies and personal interests based on provided information."},
                {"role": "user", "content": prompt}
            ],
            response_format=Faculty
        )
        hobbies_data = response.choices[0].message.parsed
    except Exception as e:
        print(f"Error processing hobbies for {name}: {e}")
        # Previously the name was left unbound here, so the return below
        # raised UnboundLocalError and masked the real error.
        hobbies_data = None

    if hobbies_data is None:
        # Covers both a failed call and a reply with no parsed payload.
        hobbies_data = Faculty(
            research_interests="",
            research_interests_as_commaseperated_list=[],
            hobbies="",
            hobbies_as_commaseperated_list=[],
        )

    return hobbies_data


In [4]:
# # %%
# # Load the existing CSV
# df = pd.read_csv('mit_csail_pis.csv')

# # Initialize new columns
# df['gpt_research_interest_paragraph'] = ""
# df['gpt_research_interest_bullet'] = ""
# df['gpt_hobbies'] = ""
# df['gpt_hobbies_bullet'] = ""

# # Iterate over each PI to fetch and append hobbies data
# for index, row in tqdm(df.iterrows(), total=df.__len__(), desc="Processing PIs"):
#     name = row['Principal Investigator']
#     faculty_data = get_hobbies(name)
    
#     df.at[index, 'gpt_hobbies'] = faculty_data.hobbies
#     df.at[index, 'gpt_hobbies_bullet'] = "; ".join(faculty_data.hobbies_as_commaseperated_list)
#     df.at[index, 'gpt_research_interest_paragraph'] = faculty_data.research_interests
#     df.at[index, 'gpt_research_interest_bullet'] = "; ".join(faculty_data.research_interests_as_commaseperated_list)

# # Save the updated CSV
# df.to_csv('mit_csail_pis_enriched.csv', index=False)
# print("Enriched CSV file 'mit_csail_pis_enriched.csv' has been created successfully.")

In [5]:
# Run the cell above for all PIs - tried to make this faster :D
# This cell itself only performs a single sample run for one PI.
sample_run = get_hobbies("Alan Edelman")

In [6]:
# Show the generated hobbies paragraph, then the hobbies list.
print(sample_run.hobbies, '\n\n\n', sample_run.hobbies_as_commaseperated_list)

# Separator heading, then the research-interest paragraph and list.
print("\n\n\n resrach intrests \n\n\n")
print(sample_run.research_interests, '\n\n\n', sample_run.research_interests_as_commaseperated_list)

Outside of his rigorous academic pursuits, Alan Edelman has cultivated a rich personal life filled with engaging hobbies that reflect his diverse interests and personality. As a devoted dog owner, he finds joy and relaxation in spending time with his little corgi, whose cheerful presence often features in the class videos for his renowned computational thinking class. This delightful companionship not only adds a playful dimension to his life but also provides a wonderful outlet for outdoor activities, such as walks and playful interactions, allowing him to escape the cerebral demands of academia. Alan also seems to embrace a philosophy of enjoying life to the fullest. He is known to appreciate the lighter side of academic life, often expressing the importance of having fun while working and teaching. This vibrant approach to life suggests that he enjoys engaging in thoughtful discussions or light-hearted banter with colleagues and students, further enriching his social interactions an