In [1]:
import os
import re
import time
from typing import List, Tuple
from urllib.parse import parse_qs, urlparse

import openai
import pandas as pd
import requests
from bs4 import BeautifulSoup
from pydantic import BaseModel
from tqdm import tqdm

# Take the OpenAI key from the environment so it never lives in the notebook;
# os.getenv returns None when OPENAI_API_KEY is unset.
openai.api_key = os.getenv("OPENAI_API_KEY")

def _resolve_result_url(href: str) -> str:
    """Return the destination URL for a DuckDuckGo result link.

    Direct links are returned unchanged.  Protocol-relative links (``//host``)
    get an ``https:`` scheme, and links routed through DuckDuckGo's ``/l/``
    redirect are unwrapped to the target carried in their ``uddg`` query
    parameter, so the caller always receives something ``requests.get`` can
    fetch directly.
    """
    if href.startswith("//"):
        href = "https:" + href
    parsed = urlparse(href)
    if parsed.netloc.endswith("duckduckgo.com") and parsed.path.startswith("/l/"):
        # parse_qs percent-decodes the value for us.
        target = parse_qs(parsed.query).get("uddg")
        if target:
            return target[0]
    return href


def serach(query: str, max_results: int = 7) -> List[str]:
    """Search DuckDuckGo's HTML endpoint and return the top result URLs.

    The misspelled name is kept because other cells call it as ``serach``.

    Args:
        query: Search terms, sent as the ``q`` form field.
        max_results: Maximum number of links to return (default 7, the
            previously hard-coded limit).  Values <= 0 yield an empty list
            without touching the network.

    Returns:
        Up to ``max_results`` result URLs in page order, or ``[]`` when the
        request fails (the error is printed rather than raised).
    """
    if max_results <= 0:
        return []

    search_url = "https://html.duckduckgo.com/html/"
    params = {'q': query}
    # A browser-like User-Agent is sent with the request.
    headers = {'User-Agent': 'Mozilla/5.0'}
    try:
        response = requests.post(search_url, data=params, headers=headers, timeout=10)
        response.raise_for_status()
    except requests.RequestException as e:
        # Best-effort: report and fall back to "no results" instead of raising.
        print(f"Error during DuckDuckGo search for query '{query}': {e}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')
    # `limit` stops the scan once enough result anchors have been collected.
    anchors = soup.find_all('a', {'class': 'result__a'}, href=True, limit=max_results)
    return [_resolve_result_url(anchor['href']) for anchor in anchors]
            
# Smoke test: performs a live search when the cell runs and prints the URLs.
print(serach("yash dagade"))

def extract_text(url: str, max_chars: int = 10000) -> str:
    """Download a page and return its visible text, truncated.

    Args:
        url: Address of the page to fetch.
        max_chars: Upper bound on the length of the returned text.

    Returns:
        The page text with ``<script>`` and ``<style>`` content removed and
        fragments joined by single spaces, cut to ``max_chars`` characters.
        An empty string is returned (after printing the error) when the
        request fails or the server answers with an HTTP error status.
    """
    request_headers = {'User-Agent': 'Mozilla/5.0'}
    try:
        page = requests.get(url, headers=request_headers, timeout=10)
        page.raise_for_status()
    except requests.exceptions.HTTPError as http_err:
        print(f"HTTP error occurred: {http_err}")
        return ""
    except Exception as err:
        print(f"An error occurred: {err}")
        return ""

    document = BeautifulSoup(page.text, 'html.parser')

    # Drop non-visible content before extracting text.
    for tag in document.find_all(["script", "style"]):
        tag.decompose()

    visible_text = document.get_text(separator=" ", strip=True)
    return visible_text[:max_chars]


# print(extract_text("https://cse.umn.edu/me/news/future-me-eden-prairie-high-school-student-wins-state-science-fair"))

def get_text(querey: str, max_chars: int = 10000, delay: float = 1.0) -> Tuple[str, List[str]]:
    """Search for a query and gather the text of every result page.

    Args:
        querey: Search terms passed to ``serach`` (spelling kept so existing
            keyword callers keep working).
        max_chars: Per-page character limit forwarded to ``extract_text``.
        delay: Seconds to wait between consecutive page fetches; values <= 0
            disable the pause.

    Returns:
        A ``(combined_text, urls)`` tuple.  ``combined_text`` joins the
        non-empty page texts with blank lines; ``urls`` is the full list of
        search results, including pages whose text could not be fetched.
    """
    urls = serach(querey)
    texts = []
    for index, url in enumerate(urls):
        # Respectful delay *between* requests: skip it before the first fetch
        # so nothing is wasted sleeping after the last one.
        if index and delay > 0:
            time.sleep(delay)
        text = extract_text(url, max_chars)
        if text:
            texts.append(text)
    combined_text = "\n\n".join(texts)
    return combined_text, urls


# str() of the (text, urls) tuple yields its repr: the URL list is included,
# and newlines inside the scraped text appear as escaped "\n" sequences.
test_example = str(get_text("yash dagade"))




In [2]:
# Structured profile that gpt_request asks the model to fill in; the class is
# passed as `response_format` so the reply is parsed into these fields.
class GPTResponse(BaseModel):
    summary: str  # free-text description of the person
    urls: List[str]  # requested from the model; not displayed by the cells in this notebook
    fun_fact: str  # single notable detail about the person
    age: int  # the system prompt allows educated guesses, so this may be an estimate
    net_worth: int  # integer estimate; the prompt does not specify a currency or unit
    
def gpt_request(query: str) -> GPTResponse:
    """Ask the model to turn scraped text into a structured profile.

    Args:
        query: Source material about the person, embedded verbatim in the
            prompt (callers in this notebook pass ``str(get_text(...))``).

    Returns:
        The reply parsed into a ``GPTResponse``.

    Raises:
        ValueError: If the reply carries no parsed content (for example when
            the model refuses), instead of silently returning ``None`` and
            failing later on attribute access.
    """
    prompt = f"Based on the following information, fill out a summary, fun fact, age, and net worth of the person: \n\n{query}\n\n Find the follwoing Summary: \nFun fact: \nAge: \nNet worth: \n\n\n Please provide the output in JSON format ensuring that the types are as follows: summary: str     urls: List[str]     fun_fact: str     age: int     net_worth: int "
    response = openai.beta.chat.completions.parse(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": "You are an assistant that structures professional profiles based on provided information. You are to make educated guesses based on the information provided."},
            {"role": "user", "content": prompt}
        ],
        # Passing the pydantic class makes the SDK parse the reply into it.
        response_format=GPTResponse
    )
    message = response.choices[0].message
    if message.parsed is None:
        # `refusal` is read defensively in case the SDK version lacks it.
        reason = getattr(message, "refusal", None) or "no parsed content returned"
        raise ValueError(f"Model did not return a structured profile: {reason}")
    return message.parsed

# Issues a live API request when the cell runs, using the text scraped earlier.
yash_information = gpt_request(test_example)

In [3]:
# Show the parsed profile fields; `urls` is part of the model but not printed.
print(f"""
Age: {yash_information.age}
Net Worth: {yash_information.net_worth}
Fun Fact: {yash_information.fun_fact}
Summary: {yash_information.summary}
""")


Age: 17
Net Worth: 0
Fun Fact: Yash Dagade not only won multiple awards at science fairs but is also leading a project aimed at developing a device to curb distracted driving, showing a commitment to both clean energy and public safety.
Summary: Yash Dagade is an innovative 11th-grade student from Eden Prairie High School who recently won the state science fair for his project on high-altitude wind energy. He conducted research in the Plasma Power Propulsion Lab at the University of Minnesota, resulting in a vertical airborne wind turbine project that secured five regional awards and two at the state level. Additionally, Yash is a leading developer of a device to reduce distracted driving, aiming to create accessible technology for safer driving practices. He plans to study mechanical engineering after high school and has ambitions to impact clean energy solutions.



In [4]:
def scrape_person(name: str) -> str:
    """Build a short text report about a person from web search results.

    Args:
        name: Search query identifying the person.

    Returns:
        A multi-line, indented string listing the age, net worth, fun fact
        and summary produced by ``gpt_request``.
    """
    # str() flattens the (text, urls) tuple into one string for the prompt.
    scraped = str(get_text(name))
    profile = gpt_request(scraped)

    report = f"""
    Age: {profile.age}
    Net Worth: {profile.net_worth}
    Fun Fact: {profile.fun_fact}
    Summary: {profile.summary}
    """
    return report
    
# scrape_person("yash dagade")

In [5]:
# End-to-end run: search, scrape, and model call all happen on execution.
print(scrape_person("yash dagade"))


    Age: 16
    Net Worth: 100000
    Fun Fact: Yash is actively involved in multiple engineering projects, including one that aims to harness high-altitude wind power and another that creates a device to help prevent distracted driving, demonstrating his diverse interests in technology and safety.
    Summary: Yash Dagade is an accomplished 11th grade student from Eden Prairie High School focused on mechanical engineering and clean energy solutions. He achieved recognition for his innovative project, a vertical airborne wind farm prototype, which won multiple awards at both regional and state science fairs, allowing him to advance to the International Science and Engineering Fair. He is also actively involved in developing a device to curb distracted driving, showcasing his commitment to safety and technology in the community.
    


In [7]:
# Same pipeline with an extra keyword in the query; pages that fail to load
# are reported by extract_text and skipped.
print(scrape_person("ayush jain Syntra"))

HTTP error occurred: 403 Client Error: Forbidden for url: https://olhscurrent.org/25185/showcase/one-year-one-percent-chance-two-million-dollars/
HTTP error occurred: 403 Client Error: Forbidden for url: https://pitchbook.com/profiles/company/638174-53

    Age: 25
    Net Worth: 1000000
    Fun Fact: Ayush successfully grew a tutoring program from 0 to 82 schools, which was subsequently acquired by the school board.
    Summary: Ayush Jain is the Co-Founder and CEO of Syntra, a healthcare software company backed by Y Combinator, dedicated to revolutionizing electronic health records (EHR) for private practice doctors by automating administrative tasks and enabling better insights into treatment efficacies. His work focuses on improving healthcare accessibility, especially in aging therapies, and he has been recognized as a TIME Fellow for contributions to the fields of healthcare and AI.
    


In [None]:
# Name-only query: the report reflects whatever pages the search returns.
print(scrape_person("Brian Mason"))


    Age: 58
    Net Worth: 10000
    Fun Fact: Despite his conviction, Mason had been involved in the community and previously worked in various capacities, though his current situation dramatically contrasts his past roles.
    Summary: Brian Mason, a 58-year-old man, was sentenced to eight years in prison for the reckless homicide of his lifelong friend, Michelle Elliott, who was accidentally shot while he was showing her a gun. The tragic incident occurred on March 26, 2023, and Mason was later found guilty of felony reckless homicide along with charges related to possessing a firearm while under disability due to a previous conviction. His actions, described as lawless and reckless by the presiding judge, have sparked controversy and discussion within the local community.
    


In [39]:
# Another name-only run of the full search -> scrape -> model pipeline.
print(scrape_person("Daniel Bao"))

HTTP error occurred: 403 Client Error: Forbidden for url: https://www.researchgate.net/profile/Daniel-Bao

    Age: 26
    Net Worth: 50000
    Fun Fact: Dr. Bao has a keen interest in Clinical Informatics and combines his medical expertise with machine learning skills.
    Summary: Dr. Daniel Bao is a physician specializing in Diagnostic Radiology, practicing in Kingwood, Texas. He graduated from the University of Texas Medical Branch at Galveston in 2023 and currently works at HCA Houston Healthcare Kingwood. Despite being early in his career, he is actively involved in clinical informatics and has a background in machine learning with Python.
    
