In [31]:
import os
import re
import time
from typing import List, Tuple
from urllib.parse import parse_qs, urlparse

import openai
import pandas as pd
import requests
from bs4 import BeautifulSoup
from pydantic import BaseModel
from tqdm import tqdm

openai.api_key = os.getenv("OPENAI_API_KEY")

def _unwrap_ddg_href(href: str) -> str:
    """Return the destination URL for a DuckDuckGo result href.

    DuckDuckGo's HTML endpoint may wrap results in a redirect of the form
    ``//duckduckgo.com/l/?uddg=<percent-encoded target>``. Such links are
    unwrapped to the target; direct links are returned unchanged.
    """
    parsed = urlparse(href)
    if parsed.netloc.endswith("duckduckgo.com") and parsed.path.startswith("/l/"):
        # parse_qs percent-decodes the value, yielding the real destination.
        target = parse_qs(parsed.query).get("uddg")
        if target:
            return target[0]
    if href.startswith("//"):
        # Scheme-relative link: give it a scheme so it can be fetched later.
        return "https:" + href
    return href


def serach(query: str, max_results: int = 7) -> List[str]:
    """Search DuckDuckGo's HTML endpoint and return the top result URLs.

    Args:
        query: Free-text search query.
        max_results: Maximum number of links to return. Defaults to 7;
            a value of 0 or less yields an empty list.

    Returns:
        Up to ``max_results`` result URLs in page order, with DuckDuckGo
        redirect wrappers removed. An empty list is returned when the
        request fails or the page contains no result anchors.
    """
    search_url = "https://html.duckduckgo.com/html/"
    params = {'q': query}
    headers = {'User-Agent': 'Mozilla/5.0'}
    try:
        response = requests.post(search_url, data=params, headers=headers, timeout=10)
        response.raise_for_status()
    except Exception as e:
        # Best-effort by design: report the failure and let callers continue
        # with an empty result set instead of aborting the notebook.
        print(f"Error during DuckDuckGo search for query '{query}': {e}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')
    links: List[str] = []
    # Result titles are the anchors carrying the "result__a" class.
    for result in soup.find_all('a', {'class': 'result__a'}, href=True):
        if len(links) >= max_results:
            break
        links.append(_unwrap_ddg_href(result['href']))
    return links
            
# Manual check: run one live search and print the list of links it returns.
print(serach("yash dagade"))

def extract_text(url: str, max_chars: int = 10000) -> str:
    """Download a page and return its visible text, truncated to ``max_chars``.

    Args:
        url: Address of the page to fetch.
        max_chars: Upper bound on the number of characters returned.

    Returns:
        The page text with ``<script>`` and ``<style>`` content removed and
        fragments joined by single spaces, cut to ``max_chars`` characters.
        An empty string is returned (after printing the error) when the
        request fails or the server answers with an HTTP error status.
    """
    headers = {'User-Agent': 'Mozilla/5.0'}
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
    except requests.exceptions.HTTPError as e:
        print(f"HTTP error occurred: {e}")
        return ""
    except Exception as e:
        # Best-effort: connection errors, timeouts, malformed URLs, etc.
        print(f"An error occurred: {e}")
        return ""

    # Hand BeautifulSoup the raw bytes so it can detect the encoding itself
    # (e.g. from a <meta charset> tag). `response.text` relies on the
    # header-declared charset and falls back to a default guess when the
    # header has none, which can garble non-ASCII pages. The header charset is
    # only passed along as a hint when the server actually declared one.
    content_type = response.headers.get('Content-Type', '')
    declared_encoding = response.encoding if 'charset=' in content_type.lower() else None
    soup = BeautifulSoup(response.content, 'html.parser', from_encoding=declared_encoding)

    # Drop non-visible content before extracting text.
    for script in soup(["script", "style"]):
        script.decompose()

    text = soup.get_text(separator=" ", strip=True)
    return text[:max_chars]


# print(extract_text("https://cse.umn.edu/me/news/future-me-eden-prairie-high-school-student-wins-state-science-fair"))

def get_text(querey: str, max_chars: int = 10000, delay: float = 1.0) -> Tuple[str, List[str]]:
    """Search for a query and gather the text of every result page.

    Args:
        querey: Search query forwarded to ``serach``.
        max_chars: Per-page character limit forwarded to ``extract_text``.
        delay: Seconds to wait between consecutive page fetches. Defaults
            to 1.0; values of 0 or less disable the wait.

    Returns:
        A ``(combined_text, urls)`` tuple. ``combined_text`` joins the
        non-empty page texts with blank lines. ``urls`` is the full list
        returned by the search, including pages that yielded no text.
    """
    urls = serach(querey)
    texts = []
    for index, url in enumerate(urls):
        # Respectful delay between requests; nothing to wait for before the
        # first fetch, and no trailing sleep after the last one.
        if index and delay > 0:
            time.sleep(delay)
        text = extract_text(url, max_chars)
        if text:
            texts.append(text)
    combined_text = "\n\n".join(texts)
    return combined_text, urls


# get_text returns a (combined_text, urls) tuple; str() turns that tuple into
# its repr, so test_example is one string in which the scraped text's newlines
# appear as escaped "\n" sequences, followed by the list of URLs.
test_example = str(get_text("yash dagade"))


['https://cse.umn.edu/me/news/future-me-eden-prairie-high-school-student-wins-state-science-fair', 'https://www.yashdagade.com/index.html', 'https://www.linkedin.com/in/yashdagade', 'https://cse.umn.edu/aem/news/future-mechanical-engineering-eden-prairie-high-school-student-wins-state-science-fair', 'https://www.kare11.com/article/news/local/kare11-sunrise/eden-prairie-distraction-free-driving-club-eyedas/89-e3831ebe-e3f2-40cc-8579-432a2615e15c', 'https://www.eplocalnews.org/2024/02/16/eagles-swim-and-dive-outpaced-stma-celebrated-senior-night/', 'https://www.eplocalnews.org/2023/04/10/ephs-junior-one-of-four-finalists-in-science-competition/']


In [32]:
class GPTResponse(BaseModel):
    # Schema for the structured model reply; gpt_request passes this class as
    # `response_format` and returns an instance of it.
    # (Documented with comments rather than a docstring so the schema derived
    # from this class stays unchanged.)

    # Free-text summary of the person.
    summary: str
    # URLs reported by the model.
    # NOTE(review): none of the visible cells read this field — confirm it is needed.
    urls: List[str]
    # A single fun fact about the person.
    fun_fact: str
    # Age as an integer; the schema has no way to express "unknown".
    age: int
    # Net worth as an integer.
    # NOTE(review): currency/units are not specified in the prompt — confirm.
    net_worth: int
    
def gpt_request(query: str) -> GPTResponse:
    """Ask the model to structure scraped text into a ``GPTResponse``.

    Args:
        query: Text describing the person (the scraped page contents),
            embedded verbatim into the prompt.

    Returns:
        The reply parsed into a ``GPTResponse`` instance.

    Raises:
        ValueError: If the reply carries no parsed payload (for example
            because the model refused), so callers never receive ``None``
            in place of a ``GPTResponse``.
    """
    prompt = f"Based on the following information, fill out a summary, fun fact, age, and net worth of the person: \n\n{query}\n\n Find the follwoing Summary: \nFun fact: \nAge: \nNet worth: \n\n\n Please provide the output in JSON format ensuring that the types are as follows: summary: str     urls: List[str]     fun_fact: str     age: int     net_worth: int "
    response = openai.beta.chat.completions.parse(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": "You are an assistant that structures professional profiles based on provided information. You are to make educated guesses based on the information provided."},
            {"role": "user", "content": prompt}
        ],
        # Passing the pydantic class makes the SDK parse the reply into it.
        response_format=GPTResponse
    )
    message = response.choices[0].message
    if message.parsed is None:
        # Fail here with context instead of letting callers hit an
        # AttributeError on `None` when they read the fields.
        refusal = getattr(message, "refusal", None)
        raise ValueError(f"Model returned no structured output (refusal: {refusal!r})")
    return message.parsed

# Send the scraped sample to the model (live OpenAI API call) and keep the
# structured result for the display cell below.
yash_information = gpt_request(test_example)

In [33]:
# Show the structured fields returned by the model for the sample query.
# The `urls` field of the response is not displayed here.
print(f"""
Age: {yash_information.age}
Net Worth: {yash_information.net_worth}
Fun Fact: {yash_information.fun_fact}
Summary: {yash_information.summary}
""")


Age: 17
Net Worth: 0
Fun Fact: In his spare time, Yash has volunteered to restore vintage airplanes.
Summary: Yash Dagade is an accomplished 11th-grade student from Eden Prairie High School who recently won the Minnesota State Science Fair with his innovative project, "WATT from VAWT: Design of A Novel Vertical Airborne Wind Turbine (VAWT) Clean Energy Farm." Through his work in high-altitude wind energy, he has achieved significant recognition, including advancing to the International Science and Engineering Fair. He has also collaborated with graduate students and professors in the field of mechanical engineering, showcasing his talent and dedication to renewable energy solutions.



In [None]:
def scrape_person(name: str) -> str:
    """Search the web for a person and return a formatted profile.

    Args:
        name: The person's name, used as the search query.

    Returns:
        A multi-line string with the model's age, net worth, fun fact and
        summary for the person. If the search produced neither page text
        nor URLs, a short message saying so is returned instead and the
        model is not called.
    """
    text, urls = get_text(name)  # scraped page text and the result URLs
    if not text and not urls:
        # With nothing to ground the answer, the model could only invent a
        # profile; skip the API call and say so.
        return f"No information could be retrieved for '{name}'."

    # Build the model input from the parts rather than str() of the tuple,
    # which would embed escaped newlines and quotes from the tuple's repr.
    model_input = f"{text}\n\nSource URLs:\n" + "\n".join(urls)
    model_output = gpt_request(model_input)  # age, net worth, fun fact, summary

    return f"""
    Age: {model_output.age}
    Net Worth: {model_output.net_worth}
    Fun Fact: {model_output.fun_fact}
    Summary: {model_output.summary}
    """
    
# scrape_person("yash dagade")

'\n    Age: 17\n    Net Worth: 0\n    Fun Fact: Yash Dagade developed an AI-based device, EyeDAS, to help reduce distracted driving, which he created during his time working with the Distraction Free Driving Club at his high school.\n    Summary: Yash Dagade, an accomplished 11th-grade student from Eden Prairie High School, has demonstrated remarkable talent and initiative in mechanical engineering and clean energy innovations. He has been actively involved in notable projects such as a vertical airborne wind turbine that won multiple awards at state and regional science fairs, and an AI-driven device to reduce distracted driving. Currently preparing for advanced studies in mechanical engineering, he showcased his work at the International Science and Engineering Fair, highlighting his contributions to sustainable energy solutions and vehicle safety technologies. As an incoming student at Duke University, Yash combines his academic pursuits with a passion for building innovative projec

In [None]:
# Run the full pipeline (search, scrape, model call) for another name.
# NOTE(review): the saved output under this cell describes "Yash Dagade", not
# the name queried here — re-run to check whether the output is stale or the
# search/model returned results for the wrong person.
print(scrape_person("pranav ponnosamy"))


    Age: 17
    Net Worth: 0
    Fun Fact: Yash co-invented the EyeDAS device to aid in reducing distracted driving, showcasing his commitment to practical engineering solutions.
    Summary: Yash Dagade is an exemplary student in 11th grade at Eden Prairie High School, known for his innovative research in mechanical engineering and energy. He won the state science fair with his project on a vertical airborne wind turbine, advancing to the International Science and Engineering Fair after earning multiple awards at regional and state levels. Yash is also involved in the Distraction Free Driving Club, where he co-invented the EyeDAS device aimed at reducing distracted driving. He is set to attend Duke University as an AB scholar and aspires to build impactful technologies.
    
