In [None]:
# Run this

import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
import os
import time
from typing import List
from pydantic import BaseModel
import openai
from tqdm import tqdm

openai.api_key = os.getenv("OPENAI_API_KEY")

def serach(query: str, max_results: int = 7) -> List[str]:
    """Search DuckDuckGo's HTML endpoint and return the top result links.

    The (misspelled) name is kept so existing callers keep working.

    Args:
        query: Free-text search query.
        max_results: Maximum number of links to return. Defaults to 7, the
            limit that used to be hard-coded.

    Returns:
        Up to ``max_results`` result URLs in page order. An empty list is
        returned when the request fails, so a failed search looks the same
        to callers as a search without hits.
    """
    # Imported locally so the function does not rely on additional
    # module-level imports.
    from urllib.parse import parse_qs, urlparse

    def _unwrap_redirect(href: str) -> str:
        """Return the target of a DuckDuckGo redirect link, else ``href``."""
        # Result anchors may point at a redirect of the form
        # ``//duckduckgo.com/l/?uddg=<encoded target>`` rather than at the
        # page itself; such scheme-less links cannot be fetched directly.
        parsed = urlparse(href)
        is_redirect = (
            parsed.netloc.endswith("duckduckgo.com")
            and parsed.path.rstrip("/") == "/l"
        )
        if is_redirect:
            targets = parse_qs(parsed.query).get("uddg")
            if targets:
                return targets[0]
        return href

    if max_results <= 0:
        return []

    search_url = "https://html.duckduckgo.com/html/"
    params = {'q': query}
    headers = {'User-Agent': 'Mozilla/5.0'}
    try:
        response = requests.post(search_url, data=params, headers=headers, timeout=10)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error during DuckDuckGo search for query '{query}': {e}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')
    # ``limit`` stops the tree search as soon as enough anchors are found.
    anchors = soup.find_all(
        'a', {'class': 'result__a'}, href=True, limit=max_results
    )
    return [_unwrap_redirect(anchor['href']) for anchor in anchors]
            
# print(serach("yash dagade"))

def extract_text(url: str, max_chars: int = 10000) -> str:
    """Download ``url`` and return its visible text, truncated.

    Args:
        url: Address of the page to fetch.
        max_chars: Upper bound on the length of the returned string.

    Returns:
        The page text with ``<script>`` and ``<style>`` contents removed and
        fragments joined by single spaces, cut to ``max_chars`` characters.
        An empty string is returned when the download fails: HTTP error
        statuses are dropped silently, any other failure is printed first.
    """
    try:
        page = requests.get(
            url,
            headers={'User-Agent': 'Mozilla/5.0'},
            timeout=10,
        )
        page.raise_for_status()
    except requests.exceptions.HTTPError:
        # Error statuses (404, 403, ...) are expected noise; stay quiet.
        return ""
    except Exception as e:
        print(f"An error occurred: {e}")
        return ""

    document = BeautifulSoup(page.text, 'html.parser')

    # Drop non-visible content before pulling out the text.
    for tag in document.find_all(["script", "style"]):
        tag.decompose()

    visible_text = document.get_text(separator=" ", strip=True)
    return visible_text[:max_chars]


# print(extract_text("https://cse.umn.edu/me/news/future-me-eden-prairie-high-school-student-wins-state-science-fair"))

def get_text(querey: str, max_chars: int = 10000) -> str:
    """Search for ``querey`` and return the text of the result pages.

    Args:
        querey: Search query passed to ``serach`` (parameter name kept
            as-is for existing keyword callers).
        max_chars: Per-page character limit passed to ``extract_text``.

    Returns:
        A single string: the non-empty page texts separated by blank
        lines, then a blank line, then every result URL on its own line.
        URLs whose page could not be read are still listed.
    """
    urls = serach(querey)
    texts = []
    for position, url in enumerate(urls):
        if position:
            # Respectful delay between consecutive requests; there is
            # nothing to wait for before the first one.
            time.sleep(1)
        text = extract_text(url, max_chars)
        if text:
            texts.append(text)
    combined_text = "\n\n".join(texts)
    return combined_text + "\n\n" + "\n".join(urls)


# Smoke test: run the search-and-scrape pipeline for one sample query and
# print the combined page text followed by the result URLs. This performs
# live network requests every time the cell runs.
test_example = get_text("yash dagade")

print(test_example)


The Future of ME: Eden Prairie High School Student Wins State Science Fair | Mechanical Engineering | College of Science and Engineering Skip to main content Go to the U of M home page One Stop MyU : For Students, Faculty, and Staff Search College of Science and Engineering Mechanical Engineering Mental Health Advocates Academics Undergraduate Graduate Research Impact Areas Technical Areas Biosystems & Bioengineering Combustion & Engines Fluid Mechanics Fluid Power Materials & Mechanics Particle Technology Plasmas Sensing & Controls Thermodynamics & Heat Transfer Undergraduate Research People Faculty Researchers & Postdocs Admin & Operations Staff Lab Directory Connect News Events Events Calendar ME3MT Alumni Contact Faculty Openings Diversity, Equity, & Inclusion Mission and Resources ME Ambassadors Program Breadcrumb Home The Latest From Mechanical Engineering The Future of ME: Eden Prairie High School Student Wins State Science Fair The Future of ME: Eden Prairie High School Student

In [35]:
# Structured profile of one startup. gpt_request() passes this model as the
# ``response_format`` of its OpenAI call and returns the parsed instance.
# All scalar fields are free text; the notes below paraphrase what the prompt
# in gpt_request() asks the model to put into each one.
class StartupMemo(BaseModel):
    year_founded: str  # founding year, or a best estimate
    location: str  # where the startup is located
    industry: str  # industry the startup operates in
    number_of_employees: str  # specific number, or a range if unknown
    valuation_range: str  # estimated valuation, as a range if imprecise
    business_model_summary: str  # one- or two-word business model label
    executive_summary_paragraph: str  # paragraph on mission and operations
    management_team_paragraph: str  # paragraph about the management team
    founding_story_paragraph: str  # paragraph on how the startup was founded
    problem_statement_paragraph: str  # paragraph on the problem being solved
    products_and_services: str  # main products/services, under five words
    target_market: str  # target market, under five words
    recent_traction_milestones_paragraph: str  # recent achievements/milestones
    noteworthy_details: str  # any additional standout details
    urls: List[str]  # relevant URLs as strings

def gpt_request(search_query: str) -> StartupMemo:
    """Ask the OpenAI model to turn scraped text into a ``StartupMemo``.

    Args:
        search_query: Text to base the profile on. Despite the name this is
            the aggregated source material (for example the output of
            ``get_text``); it is embedded verbatim into the prompt.

    Returns:
        The ``StartupMemo`` parsed from the model's structured response.

    Raises:
        ValueError: If the response carries no parsed memo (for example
            because the model refused), instead of handing ``None`` to the
            caller.
    """

    prompt = f"""
    Based on the following information, please provide a comprehensive overview of the startup by filling out the details below:

    {search_query}

    Requirements:
    - Year Founded: Provide the year the startup was founded or your best estimate.
    - Location: Specify the location of the startup.
    - Industry: Mention the industry the startup operates in.
    - Number of Employees: Provide the number of employees as a specific number or a range if exact numbers are unavailable.
    - Valuation Range: Estimate the startup's valuation as a range if precise figures are not available.
    - Business Model Summary: Give a brief description (one or two words) of the business model.
    - Executive Summary: Write a detailed paragraph summarizing the startup's mission and operations.
    - Management Team: Provide a detailed paragraph about the management team.
    - Founding Story: Narrate a paragraph about how the startup was founded.
    - Problem Statement: Describe the problem the startup aims to solve in a paragraph.
    - Products and Services: List the main products and services in less than five words.
    - Target Market: Specify the target market in less than five words.
    - Recent Traction and Milestones: Detail recent achievements or milestones in a paragraph.
    - Noteworthy Details: Include any additional noteworthy details or standout features.
    - URLs: Provide a list of relevant URLs as strings.

    Please provide the output in the following JSON format, ensuring correct data types:

    {{
      "year_founded": "str",
      "location": "str",
      "industry": "str",
      "number_of_employees": "str",
      "valuation_range": "str",
      "business_model_summary": "str",
      "executive_summary_paragraph": "str",
      "management_team_paragraph": "str",
      "founding_story_paragraph": "str",
      "problem_statement_paragraph": "str",
      "products_and_services": "str",
      "target_market": "str",
      "recent_traction_milestones_paragraph": "str",
      "noteworthy_details": "str",
      "urls": ["str"]
    }}
    """

    response = openai.beta.chat.completions.parse(
        model="gpt-4o-mini",
        messages=[
            {
                "role": "system",
                "content": "You are an assistant that structures comprehensive startup profiles based on provided information. Use the best estimates where exact data is unavailable."
            },
            {
                "role": "user",
                "content": prompt
            }
        ],
        response_format=StartupMemo
    )

    message = response.choices[0].message
    parsed = message.parsed
    if parsed is None:
        # Fail here with a clear reason rather than letting the caller hit
        # an AttributeError on ``None`` later.
        refusal = getattr(message, "refusal", None)
        raise ValueError(
            f"Model returned no structured startup memo (refusal: {refusal!r})"
        )
    return parsed


# company_name = "Alivecor"
# search_data = get_text(company_name)
# print(gpt_request(search_data).products_and_services)

# Example usage:
# search_data = "Aggregated information from search results and URLs about the startup."
# startup_info = gpt_request(search_data)
# print(startup_info)

In [36]:
# Load the spreadsheet of companies that the following cells enrich.
csv_file_path = "startup_memo.csv"
df = pd.read_csv(csv_file_path)

# Output columns written by process_company(), in the order they are added.
fields_to_fill = [
    "yr founded", "location", "industry", "size",
    "valuation (est.)", "business model",
    "executive summary", "management team", "founding story",
    "problem", "products/services", "market",
    "traction", "noteworthy details",
]

# Create every output column the CSV does not have yet, filled with "".
for field in fields_to_fill:
    if field in df.columns:
        continue
    df[field] = ""

# Preview of the first rows; the result is only rendered when this is the
# last expression of a notebook cell.
df.head()

def process_company(row):
    """Fill one spreadsheet row with a model-generated startup profile.

    Searches the web for the company named in ``row["company"]``, sends the
    scraped text to ``gpt_request`` and copies the resulting ``StartupMemo``
    fields into the row.

    Args:
        row: Mapping-like row (e.g. a pandas Series) with a ``"company"``
            entry. It is modified in place.

    Returns:
        The same ``row`` with the profile columns and ``"urls"`` set.
        ``"urls"`` is stored as one space-separated string so it fits in a
        single CSV cell.
    """
    company_name = row["company"]

    # Keep a space before the "company" hint; without it the search query
    # becomes e.g. "Acmecompany".
    search_data = get_text(f"{company_name} company")

    startup_info = gpt_request(search_data)

    # Spreadsheet column -> StartupMemo attribute.
    column_to_attribute = {
        "yr founded": "year_founded",
        "location": "location",
        "industry": "industry",
        "size": "number_of_employees",
        "valuation (est.)": "valuation_range",
        "business model": "business_model_summary",
        "executive summary": "executive_summary_paragraph",
        "management team": "management_team_paragraph",
        "founding story": "founding_story_paragraph",
        "problem": "problem_statement_paragraph",
        "products/services": "products_and_services",
        "market": "target_market",
        "traction": "recent_traction_milestones_paragraph",
        "noteworthy details": "noteworthy_details",
    }
    for column, attribute in column_to_attribute.items():
        row[column] = getattr(startup_info, attribute)

    # URLs cannot contain raw spaces, so a space is an unambiguous separator.
    row["urls"] = " ".join(startup_info.urls)

    return row



In [21]:
# Enrich every row that has no founding year yet and write the result to
# "updated_startup_memo.csv", checkpointing along the way.
total_rows = df.shape[0]
output_path = "updated_startup_memo.csv"

# Row assignment only keeps values for columns that already exist, so give
# the scraped URLs a column to land in.
if "urls" not in df.columns:
    df["urls"] = ""

try:
    with tqdm(total=total_rows) as pbar:
        for position, (index, row) in enumerate(df.iterrows(), start=1):
            if pd.isna(row["yr founded"]) or row["yr founded"] == "":
                try:
                    updated = process_company(row)
                except Exception as e:
                    # One bad company (network error, model failure, ...)
                    # must not abort the whole batch; report it and move on.
                    print(
                        f"Failed to retrieve information for company "
                        f"'{row['company']}'. Skipping. ({e})"
                    )
                else:
                    # Store list values (such as the URL list) as plain text
                    # so each one fits in a single CSV cell.
                    list_columns = [
                        column
                        for column, value in updated.items()
                        if isinstance(value, list)
                    ]
                    for column in list_columns:
                        updated[column] = " ".join(map(str, updated[column]))
                    df.loc[index] = updated
            else:
                print(f"Skipping company '{row['company']}' as data already exists.")
            pbar.update(1)

            # Checkpoint every 10 rows, counted by rows handled so it does
            # not depend on the index being a 0..n-1 integer range.
            if position % 10 == 0:
                df.to_csv(output_path, index=False)
finally:
    # Always persist progress, including after a crash or a manual
    # interrupt of the cell.
    df.to_csv(output_path, index=False)

  1%|          | 1/160 [00:20<53:28, 20.18s/it]

Failed to retrieve information for company 'Unravel Data'. Skipping.


  1%|          | 1/160 [00:30<1:20:50, 30.51s/it]


KeyboardInterrupt: 