In [3]:
import csv

def parse_pis(file_path):
    """Parse a plain-text dump of PI listings into a list of record dicts.

    The file is read as UTF-8, blank lines are dropped, and the remaining
    lines are split into blocks at every line that is exactly ``PI``. Lines
    that appear before the first ``PI`` separator also form a block.

    Within a block the expected order is:

    1. an optional membership identifier (``Core``, ``Dual``, ``Core/Dual``,
       ``Dual/Core``), which is skipped;
    2. the PI's name;
    3. up to two header lines (position, repeated name), which are skipped;
    4. any number of contact pairs: a marker line (``e``, ``p`` or ``r``)
       followed by its value (email, phone, room number);
    5. an optional single line of research interests.

    Args:
        file_path: Path of the text file to parse.

    Returns:
        A list of dicts, one per block, with the keys
        ``'Principal Investigator'``, ``'Email'``, ``'Phone'``, ``'Room No'``
        and ``'Research Interests'``. Missing fields are empty strings.
    """
    identifiers = {'Core/Dual', 'Dual/Core', 'Core', 'Dual'}
    contact_fields = {'e': 'Email', 'p': 'Phone', 'r': 'Room No'}

    with open(file_path, 'r', encoding='utf-8') as file:
        lines = [line.strip() for line in file if line.strip()]  # Remove empty lines

    # Split lines into blocks at 'PI'; the separator itself is never stored.
    blocks = []
    current_block = []
    for line in lines:
        if line == 'PI':
            if current_block:
                blocks.append(current_block)
                current_block = []
        else:
            current_block.append(line)
    if current_block:
        blocks.append(current_block)

    pis = []
    for block in blocks:
        pi = {
            'Principal Investigator': '',
            'Email': '',
            'Phone': '',
            'Room No': '',
            'Research Interests': ''
        }
        i = 0
        # Optional membership identifier
        if i < len(block) and block[i] in identifiers:
            i += 1
        # Name
        if i < len(block):
            pi['Principal Investigator'] = block[i]
            i += 1
        # Skip at most two header lines (position, repeated name), but never
        # swallow a contact marker: an entry that lacks one of the header
        # lines must still have its contact details parsed.
        skipped = 0
        while skipped < 2 and i < len(block) and block[i] not in contact_fields:
            i += 1
            skipped += 1
        # Contact info: marker line followed by its value
        while i < len(block) and block[i] in contact_fields:
            field = contact_fields[block[i]]
            i += 1
            # A marker followed directly by another marker (or by the end of
            # the block) has no value; leave the field empty instead of
            # consuming the next marker as the value.
            if i < len(block) and block[i] not in contact_fields:
                pi[field] = block[i]
                i += 1
        # Research interests, if a line remains ('PI' can never appear here
        # because separators are dropped while building the blocks).
        if i < len(block):
            pi['Research Interests'] = block[i]
        pis.append(pi)

    return pis

def write_csv(pis, output_file):
    """Write PI records to ``output_file`` as a UTF-8 CSV with a header row.

    Args:
        pis: Iterable of dicts keyed by the column names below. Keys that a
            record does not provide (e.g. ``'Hobbies'``) are written as
            empty cells.
        output_file: Destination path; an existing file is overwritten.
    """
    columns = [
        'Principal Investigator',
        'Email',
        'Phone',
        'Room No',
        'Research Interests',
        'Hobbies',
    ]
    # newline='' lets the csv module control line endings itself.
    with open(output_file, 'w', newline='', encoding='utf-8') as out:
        dict_writer = csv.DictWriter(out, fieldnames=columns)
        dict_writer.writeheader()
        dict_writer.writerows(pis)

# Script entry point: parse the raw PI listing and export it as a CSV.
# The names assigned here (input_file, output_csv, pis) are module-level.
if __name__ == "__main__":
    input_file = 'mit_csail_pis.txt'  # Replace with your actual file path
    output_csv = 'mit_csail_pis.csv'
    # One dict per PI block found in the input file.
    pis = parse_pis(input_file)
    write_csv(pis, output_csv)
    # Report the number of parsed entries as a quick sanity check.
    print(f"CSV file '{output_csv}' has been created successfully with {len(pis)} entries.")


CSV file 'mit_csail_pis.csv' has been created successfully with 131 entries.


In [4]:
import pandas as pd

# Load the exported CSV into a DataFrame to sanity-check it.
df = pd.read_csv('mit_csail_pis.csv')

# Row count, using the len() builtin rather than calling __len__ directly.
print(len(df))

# Preview the first rows (rich display as the cell's last expression).
df.head()

131


Unnamed: 0,Principal Investigator,Email,Phone,Room No,Research Interests,Hobbies
0,Hal Abelson,hal@mit.edu,253-5856,32-G516,"Programming, privacy, App Inventor",
1,Ted Adelson,adelson@csail.mit.edu,253-0645,32-310,"Vision, touch sensing, robotics",
2,Anant Agarwal,agarwal@edx.org,253-1448,,"Systems, architecture, online learning, edX",
3,Pulkit Agrawal,PULKITAG@MIT.EDU,,,,
4,Mohammad Alizadeh,alizadeh@csail.mit.edu,,32-G920,,


In [5]:
# Great, this works

import requests
from bs4 import BeautifulSoup
import re
import os
import time
from typing import List, Literal, Optional, Tuple
from pydantic import BaseModel
from openai import OpenAI
from tqdm import tqdm
from enum import Enum
from multiprocessing import Pool
from typing import List, Union
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, TimeoutError
import threading
from functools import partial
from dotenv import load_dotenv


# python-dotenv: populate the process environment from a .env file, if one is
# found, so the os.getenv() lookups below can see the keys.
load_dotenv()

# OpenAI client; the API key is read from the OPENAI_API_KEY environment
# variable (os.getenv returns None when it is not set).
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))


# Credentials read by google_search() (sent as the "key" and "cx" parameters).
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
SEARCH_ENGINE_ID = os.getenv("SEARCH_ENGINE_ID")

def search(query: str, max_results: int = 8) -> List[str]:
    """Search DuckDuckGo's HTML endpoint for ``query`` plus the word "hobbies".

    Args:
        query: Search terms, typically a person's name.
        max_results: Maximum number of result links to return.

    Returns:
        Up to ``max_results`` result URLs in page order. An empty list is
        returned (and the error printed) if the request fails.
    """
    from urllib.parse import parse_qs, urlparse

    search_url = "https://html.duckduckgo.com/html/"
    params = {
        # A separating space is required; plain concatenation would glue the
        # last query word to "hobbies" (e.g. "Ted Adelsonhobbies").
        'q': f"{query} hobbies"
    }
    headers = {
        'User-Agent': 'Mozilla/5.0'
    }
    try:
        response = requests.post(search_url, data=params, headers=headers, timeout=10)
        response.raise_for_status()
    except Exception as e:
        print(f"Error during DuckDuckGo search for query '{query}': {e}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')
    links = []
    for result in soup.find_all('a', {'class': 'result__a'}, href=True):
        if len(links) >= max_results:
            break
        href = result['href']
        # Result anchors may be DuckDuckGo redirect links of the form
        # //duckduckgo.com/l/?uddg=<percent-encoded target>. Unwrap them so
        # callers get a directly fetchable URL; other hrefs pass through.
        try:
            parsed = urlparse(href)
        except ValueError:
            parsed = None
        if parsed and parsed.netloc.endswith('duckduckgo.com') and parsed.path.startswith('/l/'):
            target = parse_qs(parsed.query).get('uddg')
            if target:
                href = target[0]
        links.append(href)
    return links

def extract_text_with_timeout(url: str, max_chars: int = 10000, timeout: int = 10) -> Tuple[str, str]:
    """Fetch ``url`` and return its visible text, giving up after ``timeout`` seconds.

    Args:
        url: Page to download.
        max_chars: Maximum number of characters of extracted text to return.
        timeout: Seconds allowed both for the HTTP request and for the
            overall fetch-and-parse step.

    Returns:
        A ``(text, url)`` tuple. ``text`` is an empty string when the page
        answers 403, the request or parsing fails, or the time limit is hit.
    """
    def _extract():
        try:
            headers = {
                'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36',
            }
            response = requests.get(url, headers=headers, timeout=timeout)
            if response.status_code == 403:
                return "", url
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')
            # Drop non-visible content before extracting text.
            for tag in soup(["script", "style"]):
                tag.decompose()
            text = soup.get_text(separator=" ", strip=True)
            return text[:max_chars], url
        except Exception:
            # Best-effort scraping: any failure counts as "no text".
            return "", url

    # The executor is managed by hand rather than with a `with` block: leaving
    # a `with` block calls shutdown(wait=True), which would block until the
    # worker finishes and so defeat the timeout below.
    executor = ThreadPoolExecutor(max_workers=1)
    try:
        return executor.submit(_extract).result(timeout=timeout)
    except TimeoutError:
        print(f"Timeout for {url}")
        return "", url
    finally:
        # Do not wait for a still-running worker; it finishes in the background.
        executor.shutdown(wait=False)

def get_text(query: str, max_chars: int = 10000) -> Tuple[str, List[str]]:
    """Search for ``query`` and gather the text of every result page that loads.

    Args:
        query: Search terms passed to ``search``.
        max_chars: Per-page character cap forwarded to
            ``extract_text_with_timeout``.

    Returns:
        A ``(text, urls)`` tuple: the non-empty page texts joined by blank
        lines, and the URLs they came from. If no page yields text, returns
        ``("No valid information found.", [])``.
    """
    valid_texts = []
    valid_urls = []

    for candidate in search(query):
        # Forward max_chars so the caller's limit is actually applied.
        text, fetched_url = extract_text_with_timeout(candidate, max_chars=max_chars)
        if text.strip():
            valid_texts.append(text)
            valid_urls.append(fetched_url)

    if not valid_texts:
        return "No valid information found.", []

    return "\n\n".join(valid_texts), valid_urls


# get_text("MIT CSAIL")

In [6]:
# Trying the Google Custom Search API as an alternative, just for fun - not needed

def google_search(query: str, num_results: int = 8) -> List[str]:
    """Query the Google Custom Search JSON API for ``query`` plus "hobbies".

    Uses the module-level ``GOOGLE_API_KEY`` and ``SEARCH_ENGINE_ID``.

    Args:
        query: Search terms, typically a person's name.
        num_results: Number of results to request; clamped to the 1-10
            range the API accepts per request.

    Returns:
        The result links, or an empty list (with the error printed) if the
        request or response parsing fails.
    """
    search_url = "https://www.googleapis.com/customsearch/v1"
    params = {
        "key": GOOGLE_API_KEY,
        "cx": SEARCH_ENGINE_ID,
        # A separating space is required; plain concatenation would glue the
        # last query word to "hobbies".
        "q": f"{query} hobbies",
        "num": max(1, min(num_results, 10)),  # Max results (1-10 per request)
    }
    try:
        # A timeout keeps a stalled connection from hanging the notebook.
        response = requests.get(search_url, params=params, timeout=10)
        response.raise_for_status()
        results = response.json().get("items", [])
        return [item["link"] for item in results]
    except Exception as e:
        print(f"Error during Google search for query '{query}': {e}")
        return []

# # Example usage:
# links = google_search("MIT CSAIL")
# print(links)

def get_text_w_google(query: str, max_chars: int = 10000) -> Tuple[str, List[str]]:
    """Google-backed variant of the text gatherer.

    Runs ``google_search`` for ``query`` and collects the text of every
    result page that loads.

    Args:
        query: Search terms passed to ``google_search``.
        max_chars: Per-page character cap forwarded to
            ``extract_text_with_timeout``.

    Returns:
        A ``(text, urls)`` tuple: the non-empty page texts joined by blank
        lines, and the URLs they came from. If no page yields text, returns
        ``("No valid information found.", [])``.
    """
    valid_texts = []
    valid_urls = []

    for candidate in google_search(query):
        # Forward max_chars so the caller's limit is actually applied.
        text, fetched_url = extract_text_with_timeout(candidate, max_chars=max_chars)
        if text.strip():
            valid_texts.append(text)
            valid_urls.append(fetched_url)

    if not valid_texts:
        return "No valid information found.", []

    return "\n\n".join(valid_texts), valid_urls


# get_text_w_google("MIT CSAIL")


In [None]:
# Okay, so now we just search the web for the PI's name followed by their social interests - let me play with this a bit. The query is their name followed by some suffix (this could be optimized later; for now I am thinking of just Name + "hobbies").

# We will then feed that into GPT-4o mini or Gemini 2.0 Flash (whichever is faster) to summarize the person's social side and write a small bio about it.


# Plain Python list of PI names taken from the DataFrame's
# 'Principal Investigator' column.
names = df['Principal Investigator'].tolist()

# Faculty type
class Faculty(BaseModel):
    # Structured-output schema passed as `response_format` in get_hobbies().
    # NOTE(review): the field names, including their misspellings, are the
    # JSON keys of the model's reply; renaming them changes that output.
    # Free-text description of research interests.
    resrach_interests: str
    # The same research interests as individual items.
    resserach_interests_as_commaseperated_list: List[str]
    # Free-text description of hobbies.
    hobbies: str
    # The same hobbies as individual items.
    hobbies_as_commaseperated_list: List[str]


def get_hobbies(name: str) -> str:
    """Ask gpt-4o-mini to describe a person's hobbies from scraped web text.

    The text gathered by ``get_text(name)`` is sent as the user message and
    the reply is constrained to the ``Faculty`` schema.

    Args:
        name: Person to look up.

    Returns:
        The content of the first completion choice's message.
    """
    scraped_text, _sources = get_text(name)
    conversation = [
        {"role": "system", "content": "In a few sentences, describe the hobbies of this person."},
        {"role": "user", "content": scraped_text},
    ]
    completion = client.beta.chat.completions.parse(
        model="gpt-4o-mini",
        messages=conversation,
        response_format=Faculty,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

# Smoke test on a single PI; the call's return value is the cell's output.
get_hobbies("Ted Adelson")


# I can prompt-engineer this further and make it much better - let's do that, actually! That's the difference between good and great - okay, let me do that now.

'{"resrach_interests":"Professor Ted Adelson\'s research interests include the development of advanced tactile sensors for robotic manipulation, exploring the intersection of human and machine vision, and creating technologies that enable robots to use touch similarly to humans. He aims to integrate visual and tactile information to enhance robotics capabilities in various applications, from healthcare to manufacturing.","resserach_interests_as_commaseperated_list":["Tactile sensors for robotics","Human vision","Machine vision","Robotic manipulation","AI and deep learning in robotics"],"hobbies":"In his spare time, Ted enjoys exploring outdoor activities like hiking, as well as engaging in creative projects that blend technology with art.","hobbies_as_commaseperated_list":["Hiking","Creative tech projects","Exploring nature","Photography"]}'