In [7]:
import pandas as pd
import os

# The notebook runs one directory below the project root, so the root is taken
# to be the parent of the current working directory. Adjust if the notebook moves.
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))

# Locate and read the two source tables
unsuccessful_path = os.path.join(project_root, 'data', 'Merged_Unsuccessful_V2.csv')
successful_path = os.path.join(project_root, 'data', 'Merged_Successful_V2.csv')
unsuccessful_df = pd.read_csv(unsuccessful_path)
successful_df = pd.read_csv(successful_path)

# Label every row with its outcome: 0 = unsuccessful, 1 = successful
unsuccessful_df['success'] = 0
successful_df['success'] = 1

# Draw a fixed-seed sample: 40 unsuccessful rows and 10 successful rows
unsuccessful_sample = unsuccessful_df.sample(n=40, random_state=42)
successful_sample = successful_df.sample(n=10, random_state=42)

# Stack the two samples, then shuffle the rows (fixed seed) and renumber the index
merged_sample = (
    pd.concat([unsuccessful_sample, successful_sample], ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Write the experiment dataset next to the source tables
output_path = os.path.join(project_root, 'data', 'Experiment_Dataset.csv')
merged_sample.to_csv(output_path, index=False)

# Report what was written and the class balance of the sample
n_successful = merged_sample['success'].sum()
n_unsuccessful = len(merged_sample) - n_successful
print(f"Merged dataset with 50 samples saved to {output_path}")
print(f"Dataset shape: {merged_sample.shape}")
print(f"Successful companies: {n_successful}")
print(f"Unsuccessful companies: {n_unsuccessful}")

Merged dataset with 50 samples saved to /Users/wangxiang/Desktop/Startup-Success-Forecasting-Framework/data/Experiment_Dataset.csv
Dataset shape: (50, 18)
Successful companies: 10
Unsuccessful companies: 40


In [8]:
# Print the first row of the shuffled sample as plain text
print(merged_sample.iloc[:1])

   Unnamed: 0 org_name                              org_uuid  \
0        3080  Mytower  076d86c0-0ddb-414e-a7d1-4021d82e4f95   

                                founder_linkedin_url  \
0  https://www.linkedin.com/in/meiri-shemesh-b673...   

                                         json_string  \
0  {"version": 1, "hits": 1, "results": 1, "kgver...   

                                     structured_info  \
0  {'name': 'Meiri Shemesh', 'gender': '', 'birth...   

                                           paragraph          domain  \
0  Meiri Shemesh is known for their contribution ...  mytowerapp.com   

      status founded_on                                      category_list  \
0  operating     1/1/16  Internet of Things,Property Development,Proper...   

                     category_groups_list country_code      city  \
0  Internet Services,Real Estate,Software          ISR  Tel Aviv   

                                   short_description  \
0  A Unified All-in-One Innovative Pr

In [9]:
import sys
import os
import json
import pandas as pd
import toml
from pathlib import Path
from tqdm import tqdm
import logging

# Configure logging so each message carries a timestamp and its level
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# The project root is the parent of the working directory; add it to the
# import path so project modules can be imported from this notebook
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
sys.path.append(project_root)

# Read the secrets file from <parent of cwd>/.streamlit/secrets.toml when it
# exists; otherwise continue with no secrets and log a warning
secrets_path = Path(os.getcwd()).parent / '.streamlit' / 'secrets.toml'
if not secrets_path.exists():
    logger.warning(f"No secrets file found at {secrets_path}")
    secrets = {}
else:
    with open(secrets_path, 'r') as secrets_file:
        secrets = toml.load(secrets_file)
    logger.info(f"Secrets loaded from {secrets_path}")

# Export every top-level secret as an environment variable (values are stringified)
for key, value in secrets.items():
    os.environ[key] = str(value)

# Imported here, after the project root has been added to sys.path and the
# secrets have been exported
from ssff_framework import StartupFramework

# Read the experiment dataset
input_path = os.path.join(project_root, 'data', 'Experiment_Dataset.csv')
df = pd.read_csv(input_path)

# Create the framework instance used to analyse each startup
framework = StartupFramework()

# Function to flatten nested dictionaries
def flatten_dict(d, parent_key='', sep='_'):
    """Collapse a nested dictionary into a single-level dictionary.

    Keys of nested dictionaries are joined to their parent key with ``sep``.
    Only ``dict`` values are descended into; every other value (lists
    included) is copied through unchanged.

    Args:
        d: Dictionary to flatten.
        parent_key: Prefix applied to every key of ``d``; empty means no prefix.
        sep: Separator placed between a parent key and a child key.

    Returns:
        A new flat dictionary. Keys keep depth-first insertion order; if two
        paths produce the same flattened key, the later value wins.
    """
    flattened = {}
    for key, value in d.items():
        full_key = key if not parent_key else f"{parent_key}{sep}{key}"
        if not isinstance(value, dict):
            flattened[full_key] = value
            continue
        # Nested dictionary: merge its flattened entries under the combined key
        flattened.update(flatten_dict(value, full_key, sep=sep))
    return flattened

# One flat dictionary per company is collected here and turned into a table later
results = []

# Process each row in the dataset
for index, row in tqdm(df.iterrows(), total=len(df), desc="Processing companies"):
    logger.info(f"Processing company {index + 1}/{len(df)}")
    
    # Prepare startup info string
    startup_info_str = f"""
    {row['long_description']}
    Founder background: {row['paragraph']}
    """
    
    try:
        # Run analysis and flatten nested dictionaries in the result
        analysis_result = framework.analyze_startup(startup_info_str)
        flat_result = flatten_dict(analysis_result)
    except Exception as exc:
        # A failure on one company must not discard the results already
        # collected: log the traceback, record the error on this row, and go on.
        # KeyboardInterrupt is not an Exception subclass, so a manual stop
        # still interrupts the run.
        logger.exception(f"Company {index + 1} analysis failed")
        flat_result = {'error': f"{type(exc).__name__}: {exc}"}
    
    # Add input data to the result so each output row can be traced to its input
    flat_result['input_description'] = row['long_description']
    flat_result['input_founder_background'] = row['paragraph']
    flat_result['input_success'] = row['success']
    
    results.append(flat_result)
    
    # Log summary of the result ('N/A' when a key is absent, e.g. for a failed row)
    logger.info(f"Company {index + 1} analysis summary:")
    logger.info(f"Prediction: {flat_result.get('Categorical Prediction', 'N/A')}")
    logger.info(f"Overall Score: {flat_result.get('Final Decision_overall_score', 'N/A')}")
    logger.info(f"Recommendation: {flat_result.get('Final Decision_recommendation', 'N/A')}")
    logger.info("---")

# Convert results to DataFrame (one row per processed company)
results_df = pd.DataFrame(results)

# Save results to CSV
output_path = os.path.join(project_root, 'data', 'Experiment_Results.csv')
results_df.to_csv(output_path, index=False)

logger.info(f"Analysis complete. Results saved to {output_path}")
logger.info(f"Total rows processed: {len(results_df)}")
logger.info(f"Number of columns in result: {len(results_df.columns)}")

# Display a sample of the results.
# The summary columns come from the analysis output and may be absent, so only
# the ones actually present are selected; a fixed selection would raise KeyError.
summary_columns = ['Categorical Prediction', 'Final Decision_overall_score', 'Final Decision_recommendation']
present_columns = [col for col in summary_columns if col in results_df.columns]
missing_columns = [col for col in summary_columns if col not in results_df.columns]
if missing_columns:
    logger.warning(f"Summary columns missing from results: {missing_columns}")

logger.info("\nSample of results:")
logger.info(results_df[present_columns].head())

2024-11-26 19:18:58,353 - INFO - Secrets loaded from /Users/wangxiang/Desktop/Startup-Success-Forecasting-Framework/.streamlit/secrets.toml


KeyboardInterrupt: 

In [15]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns

import os

# Load the results.
# The path is resolved from the project root (the parent of the notebook's
# working directory), as in the other cells, rather than from a machine-specific
# absolute path, so the cell also runs on other checkouts of the project.
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
results_path = os.path.join(project_root, 'data', 'Experiment_Results_Final.csv')
df = pd.read_csv(results_path)

# Show the first row to inspect the available columns
df.head(1)

Unnamed: 0,Final Analysis_overall_score,Final Analysis_summary,Final Analysis_strengths,Final Analysis_weaknesses,Final Analysis_recommendation,Final Analysis_outcome,Market Analysis_market_size,Market Analysis_growth_rate,Market Analysis_competition,Market Analysis_market_trends,...,Startup Info_team_dynamics,Startup Info_web_traffic_growth,Startup Info_social_media_presence,Startup Info_investment_rounds,Startup Info_regulatory_approvals,Startup Info_patents,input_description,input_founder_background,input_success,error
0,7.0,The startup's mytower platform demonstrates st...,['Comprehensive feature set tailored for high-...,['Potential gaps in technology expertise withi...,It is advisable to further investigate potenti...,Unsuccessful,The exact current market size for smart buildi...,The market is expected to experience a signifi...,The competitive landscape features various pla...,Key trends include the rise of smart apartment...,...,,,,,,,A Unified All-in-One Innovative Property Manag...,Meiri Shemesh is known for their contribution ...,0,


### Baseline Framework: Run the baseline on the 50 startups and store the results in a CSV file

In [20]:
import pandas as pd
import os
import sys
from dotenv import load_dotenv

# Pull variables from a .env file into the process environment
load_dotenv()

# Read the OpenAI key from the environment (None when unset); it is not
# validated in this cell
api_key = os.environ.get("OPENAI_API_KEY")

# The project root is the parent of the working directory; add it to the
# import path so project modules can be imported from this notebook
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
sys.path.append(project_root)

# Read the experiment dataset
dataset_path = os.path.join(project_root, 'data', 'Experiment_Dataset.csv')
startup_data = pd.read_csv(dataset_path)

# Imported here, after the project root has been added to sys.path
from baseline_framework import BaselineFramework

# Create the baseline framework instance
baseline_framework = BaselineFramework()

# Quick look at the loaded data: first row, then (rows, columns)
print(startup_data.head(1))
print(startup_data.shape)

# #Process each startup
# logger.info("Starting startup analysis...")

# for idx, row in tqdm(startup_data.iterrows(), total=len(startup_data), desc="Processing startups"):
#     try:
#         # Create startup info string
#         startup_info_str = f"""
#         {row['long_description']}
#         Founder background: {row['paragraph']}
#         """
        
#         # Run analysis
#         analysis_result = baseline_framework.analyze_startup(startup_info_str)
#     except Exception as e:
#         print(f"Error processing startup {idx}: {str(e)}")
        
  



FileNotFoundError: No secrets found. Valid paths for a secrets.toml file or secret directories are: /Users/wangxiang/.streamlit/secrets.toml, /Users/wangxiang/Desktop/Startup-Success-Forecasting-Framework/experiments/.streamlit/secrets.toml

In [18]:
from dotenv import load_dotenv
import os

# Pull variables from a .env file into the process environment
load_dotenv()

# Fail fast when the OpenAI key is unset or empty
api_key = os.environ.get("OPENAI_API_KEY")
if api_key is None or api_key == "":
    raise ValueError("OPENAI_API_KEY not found in environment variables.")