In [1]:
# Warning control
# Installs a blanket "ignore" filter, so no Python warnings (including
# deprecation notices raised by dependencies) are shown in later cell output.
import warnings
warnings.filterwarnings('ignore')

In [2]:
from crewai import Agent, Task, Crew

2025-10-07 21:19:13,004 - 13389295616 - telemetry.py-telemetry:71 - ERROR: ('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer'))
2025-10-07 21:19:58,207 - 13389295616 - telemetry.py-telemetry:71 - ERROR: ('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer'))
2025-10-07 21:20:33,389 - 13389295616 - telemetry.py-telemetry:71 - ERROR: ('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer'))
2025-10-07 21:21:48,730 - 13389295616 - telemetry.py-telemetry:71 - ERROR: ('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer'))


In [3]:
import os
import importlib
import utils
importlib.reload(utils)
from utils import get_openai_api_key, get_serper_api_key, get_openai_model_name, pretty_print_result

# Pull credentials and the model name from the project-local `utils` helpers.
# NOTE(review): `openai_api_key` is not referenced again in the visible cells —
# presumably the helper itself makes the key available to the OpenAI client; confirm.
openai_api_key = get_openai_api_key()
# Published as environment variables; presumably crewai / crewai_tools read them
# from there (TODO confirm). The assignments raise TypeError if a helper returns None.
os.environ["OPENAI_MODEL_NAME"] = get_openai_model_name()
os.environ["SERPER_API_KEY"] = get_serper_api_key()

In [4]:
from crewai_tools import SerperDevTool, \
                         ScrapeWebsiteTool, \
                         WebsiteSearchTool

In [5]:
# Import voice functions
from utils import record_audio_input, transcribe_audio, text_to_speech, play_and_save_audio


In [6]:
# One shared instance of each tool; both are passed to the research tasks
# through their `tools=[...]` argument.
search_tool, scrape_tool = SerperDevTool(), ScrapeWebsiteTool()

In [7]:
# -----------------------------
# Config - tweak to taste
# -----------------------------
# Hard cap on the number of rows in the final ranked list.
TOP_N_OVERALL = 50  # set to 10 if you want a top-10 instead
#MIN_SIGNAL_SOURCES = 2  # require at least this many sources per company

# The three values below are literal "{placeholder}" strings on purpose: they are
# embedded in task descriptions and are meant to be substituted with the `inputs`
# passed to `crew.kickoff(...)` (keys: industry, region, time_window).
DEFAULT_REGION = "{region}"  # e.g., "US & Europe" - can be overridden at runtime
DEFAULT_TIME_WINDOW = "{time_window}"  # e.g., "last 24 months" - can be overridden
INDUSTRY = "{industry}"  # required input

# Stage-labelling rules appended verbatim to the ranking task's description.
# The cap is interpolated here so the model sees the actual number rather than
# the Python identifier "TOP_N_OVERALL", which means nothing to it.
CLASSIFICATION_RUBRIC = f"""
Stage labels:
- Big Tech: public, multi-product tech platform or incumbent with either ≥10k employees or ≥10B market cap or clear category dominance.
- Late-stage startup: Series D+ or valuation ≥1B, or ≥500 employees, or estimated revenue ≥100M.
- Mid-stage startup: Series B–C, 100–499 employees, valuation ~100M–1B, or revenue ~10–100M.
- Early-stage startup: Pre-seed–Series A, <100 employees, or revenue <10M.
Signal priority when conflict appears: funding stage, then employee count, then valuation, then revenue. Always return a confidence score.
Hard caps: output is a single ranked list of at most {TOP_N_OVERALL} companies overall. Do not exceed the cap.
"""

# Output-format instructions appended verbatim to the ranking task's description.
# NOTE(review): "from the lowest rank to the highest" is ambiguous (rank 1 first,
# or least important first?) — confirm the intended order and spell it out.
OUTPUT_SCHEMA_GUIDE = """
Provide a human-friendly Markdown table, where companies are sorted from the lowest rank to the highest.
"""

#OUTPUT_SCHEMA_GUIDE = """
#Final JSON array item shape:
#{{
#  "rank": <int 1..TOP_N_OVERALL>,
#  "company": "<marketed company name>",
#  "sub_industry": "<one of the mapped sub-industries>",
#  "stage_label": "<Big Tech | Late-stage startup | Mid-stage startup | Early-stage startup>"
#}}
#Provide the final JSON and also a short, human-friendly Markdown table with the same rows.
#"""

#OUTPUT_SCHEMA_GUIDE = """
#Final JSON array item shape:
#{{
#  "rank": <int 1..TOP_N_OVERALL>,
#  "company": "<canonical company name>",
#  "sub_industry": "<one of the mapped sub-industries>",
#  "stage_label": "<Big Tech | Late-stage startup | Mid-stage startup | Early-stage startup>",
#  "why_it_matters": "<one crisp sentence>",
#  "key_signals": {{
#    "funding_stage_or_round": "<text or null>",
#    "employees": "<int or null>",
#    "valuation_or_revenue": "<text or null>",
#    "notable_products_or_share": "<text or null>"
#  }},
#  "confidence": "<0.0-1.0>",
#  "sources": ["<url1>", "<url2>", "..."]  # at least MIN_SIGNAL_SOURCES items
#}}
#Provide the final JSON and also a short, human-friendly Markdown table with the same rows.
#"""

In [8]:
# Agent 1 of 3: scopes the industry and defines the sub-industry map that the
# later research and ranking steps build on.
industry_taxonomist = Agent(
    role="Industry Taxonomist",
    goal="".join([
        "Turn the input industry into a crisp, research-ready scope with a clean sub-industry map, ",
        "inclusion-exclusion rules, synonyms, and search guidance so downstream research stays focused.",
    ]),
    backstory="".join([
        "You are a former strategy consultant turned data taxonomist. You dislike vague scopes and ",
        "imprecise buckets. You write tight, unambiguous definitions and choose pragmatic sub-industries ",
        "that reflect how the market actually organizes itself. You capture common aliases so search is robust. ",
        "You set guardrails for region and time window and explicitly note what is out of scope.",
    ]),
)

In [9]:
# Agent 2 of 3: finds candidate companies and gathers the signals
# (funding, headcount, valuation/revenue) needed for classification.
research_analyst = Agent(
    role="Research Analyst",
    goal="".join([
        "Find and enrich the most important companies in the mapped sub-industries using credible, current sources, ",
        "then assemble a clean candidate table with signals needed for stage classification and ranking.",
    ]),
    backstory="".join([
        "You are a meticulous OSINT-oriented analyst. You prefer primary and reputable sources: public filings, ",
        "S-1s, investor reports, funding databases, company pages, trusted tech media, and recent industry maps. ",
        "You reconcile conflicting facts and always keep source URLs. You avoid fluff and discard low-credibility sources.",
    ]),
)

In [10]:
# Agent 3 of 3: applies the stage rubric, ranks the candidates and enforces the
# TOP_N_OVERALL cap. The goal asks for a Markdown table only, matching the
# `classify_and_rank` task, whose expected output explicitly rules out a JSON array.
classifier_ranker = Agent(
    role="Stage Classifier and Ranker",
    goal=(
        "Apply the rubric precisely, assign stage labels with confidence scores, rank companies across the whole industry, "
        f"enforce the hard cap of {TOP_N_OVERALL}, and produce the final ranking as a compact Markdown table."
    ),
    backstory=(
        "You are a former VC analyst and product manager. You are pragmatic about imperfect data, explain your choices, "
        "and keep results scannable. You do light QA: dedupe entities, fix parent vs product mixups, check that each row "
        f"has enough sources, and ensure no more than {TOP_N_OVERALL} items make it to the final list."
    ),
)

In [11]:
# Task 1: produce the sub-industry map. The "{industry}", "{region}" and
# "{time_window}" markers stay in the text as placeholders; the latter two arrive
# through DEFAULT_REGION / DEFAULT_TIME_WINDOW, which hold the placeholder strings.
map_subindustries = Task(
    agent=industry_taxonomist,
    tools=[search_tool, scrape_tool],
    description="".join([
        "Build a practical sub-industry map for {industry}. ",
        "Output a brief scope note and a list of 6-12 sub-industries max. ",
        "For each sub-industry, include: 1-line definition, common aliases, and inclusion-exclusion notes.\n\n",
        "Region default: {}\n".format(DEFAULT_REGION),
        "Time window: {}\n\n".format(DEFAULT_TIME_WINDOW),
        "Rules:\n",
        "- Use real-market groupings that practitioners recognize.\n",
        "- Keep names short and unambiguous.\n",
        "- Note key overlaps and what to exclude to avoid double counting.\n",
        "- This map will drive research and tagging for the final list.\n",
        "- Do not list any companies yet.\n",
        "Tooling note: When using the Serper search tool, pass a plain string to search_query (not a dict). Example: 'fintech sub-industry map US last 12 months'.\n",
    ]),
    expected_output=(
        # Earlier JSON-based output spec, kept for reference:
        #"A JSON object with:\n"
        #"{\n"
        #'  "industry": "<input>",\n'
        #'  "scope_note": "<2-4 sentences>",\n'
        #'  "sub_industries": [\n'
        #'    {"name": "<name>", "definition": "<one line>", "aliases": ["a","b"], "include": "<short>", "exclude": "<short>"}\n'
        #"  ]\n"
        #"}\n"
        #"Plus a short Markdown list rendering the sub-industries."
        "Deliver as a compact Markdown table for a quick skim."
    ),
)

In [12]:
# Task 2: research candidate companies for every sub-industry in the map.
# "{industry}", "{region}" and "{time_window}" remain as placeholders in the final
# string (the latter two via DEFAULT_REGION / DEFAULT_TIME_WINDOW) so they can be
# substituted from the kickoff inputs; only TOP_N_OVERALL is baked in here.
mine_companies = Task(
    description=(
        "Using the sub-industry map produced earlier, research the most important companies in each sub-industry of {industry}. "
        "Collect signals required for stage classification and ranking. Bias toward current scale and impact in the specified region and time window.\n\n"
        # State the region/time window explicitly, as map_subindustries does, so
        # "the specified region and time window" above actually refers to something.
        f"Region: {DEFAULT_REGION}\n"
        f"Time window: {DEFAULT_TIME_WINDOW}\n\n"
        "What to capture per company:\n"
        "- Canonical name and homepage URL\n"
        "- Sub-industry tag from the map\n"
        "- One-line description and notable products\n"
        # Blank line restores the section break that was lost when the
        # "min sources" bullet below was disabled.
        "- Funding stage or latest round, employees, valuation or revenue if available\n\n"
        #"- At least {min_sources} credible sources with URLs and last-updated dates\n\n"
        "Quality rules:\n"
        "- Prefer primary sources and recent data. Avoid low-credibility blogs.\n"
        "- Reconcile conflicting facts. Note uncertainty briefly if needed.\n"
        "- Remove duplicates and product-level entries if a parent company is the actual entity.\n"
        f"- It is fine to collect more than {TOP_N_OVERALL} candidates at this stage, but keep it tight and relevant.\n"
        # Trailing newlines keep the tooling note from being glued onto this bullet.
        "- Make sure to include all relevant companies in the search query and the final list.\n\n"
        "Tooling note: When using the Serper search tool, pass a plain string to search_query (not a dict). Example: 'fintech sub-industry map US last 12 months'.\n"
    ),
    expected_output=(
        #"A JSON array named `candidates` where each item includes:\n"
        #"{\n"
        #'  "company": "<name>", "url": "<homepage>", "sub_industry": "<from map>",\n'
        #'  "description": "<one line>",\n'
        #'  "signals": {"funding_stage_or_round":"<text>", "employees":"<int or null>", "valuation_or_revenue":"<text or null>", "notable_products_or_share":"<text or null>"},\n'
        #'  "sources": [{"url":"<url>", "last_updated":"<YYYY-MM or YYYY-MM-DD>", "why_trustworthy":"<short>"}]\n'
        #"}\n"
        #"Deliver as JSON plus a compact Markdown table for a quick skim."
        "Deliver as a compact Markdown table for a quick skim."
    ),
    agent=research_analyst,
    tools=[search_tool, scrape_tool],
)

In [13]:
# Task 3: classify, rank and cap the candidate list, returning a Markdown table.
# The rubric and the output guide are appended verbatim from the config cell.
classify_and_rank = Task(
    description=(
        "Take the candidate companies and produce the single ranked list for {industry}. "
        f"Apply the classification rubric and output at most {TOP_N_OVERALL} rows overall. "
        # Confidence is still computed (it is a tie-breaker in step 3) but stays out of
        # the table, in line with expected_output below.
        "Each row must include a stage label; keep a confidence score per company for tie-breaking, "
        "but leave it out of the final table.\n\n"
        "Do the following in order:\n"
        "1) Canonicalize and dedupe entities. Fix parent vs product labeling.\n"
        "2) Assign stage_label using the rubric below. Use available signals. If signals conflict, use the priority order.\n"
        "3) Score importance across the whole industry with a simple blend: scale (employees or revenue), traction or market share, funding stage, and mindshare. "
        "Break ties by confidence and data recency. Keep the method simple and explain it in one sentence.\n"
        f"4) Enforce hard cap of {TOP_N_OVERALL} companies. Do not exceed it under any circumstance.\n"
        # Trailing newlines keep the "Classification rubric:" header on its own line.
        "5) Produce a Markdown table as the final result.\n\n"
        #"5) QA pass: each company must have at least " + str(MIN_SIGNAL_SOURCES) + " credible sources. Remove rows that do not meet the bar.\n"
        #"6) Produce final JSON and a short Markdown table.\n\n"
        "Classification rubric:\n"
        + CLASSIFICATION_RUBRIC + "\n\n"
        "Output format guide:\n"
        + OUTPUT_SCHEMA_GUIDE + "\n"
    ),
    expected_output=(
        #"Two parts:\n"
        #"1) Final JSON array named `top_list` with at most TOP_N_OVERALL items matching the schema guide.\n"
        "A concise Markdown table with columns: Rank, Company, Sub-industry, Stage\n"
        "Not a JSON array.\n"
        "Don't include confidence level in the markdown table. Make sure the text is concise and to the point."
    ),
    agent=classifier_ranker,
)

In [14]:
# Wire the three agents and their tasks into one crew. Tasks are listed in
# pipeline order: map sub-industries -> mine companies -> classify and rank.
crew = Crew(
    tasks=[map_subindustries, mine_companies, classify_and_rank],
    agents=[industry_taxonomist, research_analyst, classifier_ranker],
    memory=True,
    verbose=True,
)

In [18]:
# Simplified voice input - no hanging issues
import sys

def get_input_with_voice_option(prompt, default_value):
    """Ask for one value, offering voice input first and typed input as fallback.

    Args:
        prompt: Heading shown above the menu; its lower-cased form is reused in
            the typed-input prompt.
        default_value: Returned when the typed answer is empty, when the user
            interrupts, or when any other exception occurs.

    Returns:
        The transcription if voice input is chosen and succeeds, otherwise the
        stripped typed answer, otherwise ``default_value``.
    """
    def _emit(*chunks):
        # Write every chunk, then flush once so the text appears before input() blocks.
        for chunk in chunks:
            sys.stdout.write(chunk)
        sys.stdout.flush()

    _emit(
        f"\n=== {prompt} ===\n",
        "Choose input method:\n",
        "1. Voice input (upload audio file)\n",
        "2. Text input (type your answer)\n",
    )

    try:
        wants_voice = input("Enter choice (1 or 2, or press Enter for text): ").strip() == "1"

        if wants_voice:
            _emit(
                "\nVoice input: Please provide the path to your audio file.\n",
                "Record audio on your phone/computer, save it, then provide the file path.\n",
            )

            audio_data = record_audio_input()
            if not audio_data:
                _emit("No audio file provided. Falling back to text input.\n")
            else:
                transcript = transcribe_audio(audio_data, "en")
                if transcript:
                    _emit(f"Transcribed: {transcript}\n")
                    return transcript
                _emit("Transcription failed. Falling back to text input.\n")

        # Reached for choice "2", Enter, any other choice, or a failed voice attempt.
        typed = input(f"Enter {prompt.lower()} (or press Enter for default): ").strip()
        return typed or default_value

    except KeyboardInterrupt:
        _emit("\nInput cancelled. Using default value.\n")
        return default_value
    except Exception as e:
        _emit(f"Error: {e}. Using default value.\n")
        return default_value

# Collect the three run parameters (each call offers voice first, text as fallback)
industry = get_input_with_voice_option("Industry Input", "fintech")
region = get_input_with_voice_option("Region Input", "United States")
time_window = get_input_with_voice_option("Time Window Input", "last 12 months")

# Echo the resolved values so the user can confirm them before the crew runs
sys.stdout.writelines([
    "\nFinal inputs:\n",
    f"Industry: {industry}\n",
    f"Region: {region}\n",
    f"Time Window: {time_window}\n",
])
sys.stdout.flush()



=== Industry Input ===
Choose input method:
1. Voice input (upload audio file)
2. Text input (type your answer)

Voice input: Please provide the path to your audio file.
Record audio on your phone/computer, save it, then provide the file path.
Voice input: Please provide the path to your audio file.
Supported formats: mp3, wav, m4a, ogg, flac
Example: /path/to/your/audio.mp3 or ./audio.wav
Note: Audio will be transcribed in English.
File not found: /inputs/fintech.wav
No audio file provided. Falling back to text input.

=== Region Input ===
Choose input method:
1. Voice input (upload audio file)
2. Text input (type your answer)

Voice input: Please provide the path to your audio file.
Record audio on your phone/computer, save it, then provide the file path.
Voice input: Please provide the path to your audio file.
Supported formats: mp3, wav, m4a, ogg, flac
Example: /path/to/your/audio.mp3 or ./audio.wav
Note: Audio will be transcribed in English.
Successfully loaded audio file: ./inpu

In [19]:
# Simple text-only input (backup option if voice input has issues)
# Uncomment and run this cell if the voice input cell above doesn't work properly
#
# Once uncommented, the lines below set the same three variables
# (industry, region, time_window) that the kickoff cell reads, using the same
# defaults as the voice cell: 'fintech', 'United States', 'last 12 months'.

# industry = input("Enter an industry you'd like to research (or press Enter for 'fintech'): ").strip()
# if not industry:
#     industry = "fintech"

# region = input("Enter the region you'd like to focus on (or press Enter for 'United States'): ").strip()
# if not region:
#     region = "United States"

# time_window = input("Enter the time window you'd like to consider (or press Enter for 'last 12 months'): ").strip()
# if not time_window:
#     time_window = "last 12 months"

# print(f"Industry: {industry}")
# print(f"Region: {region}")
# print(f"Time Window: {time_window}")

# As shipped, this cell only prints a reminder and changes no variables.
print("This is a backup cell. Uncomment the lines above if you need simple text input.")

This is a backup cell. Uncomment the lines above if you need simple text input.


In [20]:
# Example of how you might kick it off at runtime:
import signal
import time

# Wall-clock budget for the whole crew run (10 minutes).
TIMEOUT_SECONDS = 600


def timeout_handler(signum, frame):
    """SIGALRM handler: abort the running crew by raising TimeoutError.

    Args:
        signum: Signal number supplied by the interpreter (unused).
        frame: Interrupted stack frame supplied by the interpreter (unused).

    Raises:
        TimeoutError: always.
    """
    raise TimeoutError("CrewAI execution timed out after 10 minutes")


# SIGALRM only exists on Unix-like systems, and signal.signal() raises ValueError
# when called outside the main thread. In both cases run without a time limit
# instead of crashing the cell.
alarm_armed = False
previous_alarm_handler = None
if hasattr(signal, "SIGALRM"):
    try:
        previous_alarm_handler = signal.signal(signal.SIGALRM, timeout_handler)
        signal.alarm(TIMEOUT_SECONDS)
        alarm_armed = True
    except ValueError:
        print("Timeout guard unavailable outside the main thread; running without a time limit.")
else:
    print("Timeout guard unavailable on this platform (no SIGALRM); running without a time limit.")

# Drop any result left over from an earlier run, so that a failed or timed-out
# run cannot leave stale output for the voice-output cell to read aloud.
result = None

try:
    print(f"Starting research for {industry} in {region} over {time_window}...")
    print("This may take several minutes. Please be patient...")

    # The keys match the {industry}, {region} and {time_window} placeholders
    # used in the task descriptions.
    result = crew.kickoff(
        inputs={
            "industry": industry,
            "region": region,
            "time_window": time_window,
        }
    )
    print("Research completed successfully!")
    print(result)

except TimeoutError:
    print("Research timed out after 10 minutes. Try reducing the scope or simplifying the tasks.")
except Exception as e:
    print(f"An error occurred: {e}")
finally:
    if alarm_armed:
        signal.alarm(0)  # Cancel the alarm
        # Put back whatever handler was installed before this cell ran
        # (None means it was not installed from Python and cannot be restored).
        if previous_alarm_handler is not None:
            signal.signal(signal.SIGALRM, previous_alarm_handler)

Starting research for fintech in United States over last 12 months...
This may take several minutes. Please be patient...


Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Research completed successfully!
| Rank | Company       | Sub-industry                  | Stage            |
|------|---------------|-------------------------------|------------------|
| 1    | MoneyLion     | LendTech                      | Early-stage      |
| 2    | Acorns        | WealthTech                    | Early-stage      |
| 3    | Wealthfront   | WealthTech                    | Early-stage      |
| 4    | Hippo         | InsurTech                     | Mid-stage        |
| 5    | Root Insurance| InsurTech                     | Mid-stage        |
| 6    | Figure Technologies | LendTech               | Late-stage       |
| 7    | Marqeta       | Embedded Finance              | Late-stage       |
| 8    | Plaid         | Open Banking & Data Aggregation| Late-stage      |
| 9    | Chime         | Embedded Finance / Digital Payments | Late-stage  |
| 10   | SoFi          | LendTech / Digital Payments   | Late-stage       |
| 11   | LendingClub   | LendTech                     |

In [21]:
# Generate voice output for the research results
import re

# Results longer than this are shortened before being sent to text-to-speech.
# NOTE(review): 4000 presumably mirrors the TTS backend's input limit — confirm
# against utils.text_to_speech.
TTS_MAX_CHARS = 4000
TABLE_EXCERPT_CHARS = 2000  # how much of an extracted table is spoken
RAW_EXCERPT_CHARS = 3500    # how much of table-less output is spoken

# A Markdown table here means a run of consecutive lines that each start and end
# with a pipe. Matching line by line (MULTILINE, no DOTALL) keeps the match from
# stretching from the first "|" in the text to the last one.
MARKDOWN_TABLE_RE = re.compile(
    r"^[ \t]*\|.*\|[ \t]*$(?:\n[ \t]*\|.*\|[ \t]*$)*",
    re.MULTILINE,
)


def summarize_for_speech(result_text, industry, region, time_window):
    """Build the text that is handed to text-to-speech.

    Args:
        result_text: Full crew output as a string.
        industry: Industry name mentioned in the spoken intro.
        region: Region mentioned in the spoken intro.
        time_window: Time window mentioned in the spoken intro.

    Returns:
        The intro followed by the whole result when it is at most TTS_MAX_CHARS
        long; otherwise the intro followed by the first Markdown table (cut to
        TABLE_EXCERPT_CHARS) or, when no table is found, by the first
        RAW_EXCERPT_CHARS characters. Shortened variants end with "...".
    """
    intro = f"Research completed for {industry} in {region} over {time_window}. "
    if len(result_text) <= TTS_MAX_CHARS:
        return f"{intro}Results: {result_text}"

    table_match = MARKDOWN_TABLE_RE.search(result_text)
    if table_match:
        table_text = table_match.group(0)
        return f"{intro}Here are the top companies found: {table_text[:TABLE_EXCERPT_CHARS]}..."
    return f"{intro}Results: {result_text[:RAW_EXCERPT_CHARS]}..."


if 'result' in locals() and result:
    print("\n=== Generating Voice Output ===")

    # Extract text from result for TTS
    result_text = str(result)
    summary = summarize_for_speech(result_text, industry, region, time_window)

    # Generate and play speech
    print("Converting results to speech...")
    audio_content = text_to_speech(summary)

    if audio_content:
        print("Playing audio results...")
        audio_file = play_and_save_audio(audio_content, "research_results")
        print(f"\nVoice output completed! Audio saved to: {audio_file}")
    else:
        print("Failed to generate voice output.")
else:
    print("No results available for voice output.")



=== Generating Voice Output ===
Converting results to speech...
Playing audio results...


Audio saved to: /Users/boris/Library/CloudStorage/OneDrive-HarvardBusinessSchool/_EC Year/AI Venture Studio/Codebase/aistudio/HW4/research_results_20251007_212207.mp3

Voice output completed! Audio saved to: /Users/boris/Library/CloudStorage/OneDrive-HarvardBusinessSchool/_EC Year/AI Venture Studio/Codebase/aistudio/HW4/research_results_20251007_212207.mp3
