In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder
import joblib
import os
import numpy as np

In [6]:
# --- Configuration -----------------------------------------------------------
# Location of the cleaned dataset. The default is the original author's local
# path; set the STARTUP_DATA_PATH environment variable to run the notebook on
# another machine without editing this cell.
PROCESSED_DATA_PATH = os.environ.get(
    "STARTUP_DATA_PATH",
    r"A:/startup-health-dashboard/data/processed/cleaned_startups.csv",
)

# Output directory (relative to the current working directory) and the two
# artefact files written at the end of the notebook.
MODELS_DIR = "models"
MODEL_PATH = os.path.join(MODELS_DIR, "startup_model.pkl")
COLUMNS_PATH = os.path.join(MODELS_DIR, "model_columns.pkl")

In [8]:
# Read the processed dataset from disk, report its (rows, columns) shape,
# and preview the first five rows as the cell output.
df = pd.read_csv(filepath_or_buffer=PROCESSED_DATA_PATH)
print("Dataset Shape:", df.shape)
df.head(n=5)

Dataset Shape: (9810, 5)


Unnamed: 0,year,state,industry,count,last_update
0,2024,Assam,Advertising,1,2025-07-31 19:30:00.697
1,2024,Assam,Architecture Interior Design,1,2025-07-31 19:30:00.963
2,2024,Assam,Automotive,3,2025-07-31 19:30:01.478
3,2024,Assam,Chemicals,3,2025-07-31 19:30:02.257
4,2024,Assam,Telecommunication & Networking,2,2025-07-31 19:30:02.767


In [9]:
# --- Feature preparation -----------------------------------------------------
# Model inputs and the regression target.
features = ['year', 'state', 'industry']
target = 'count'
# Subset of `features` holding strings that must be turned into integer codes.
categorical_features = ['state', 'industry']

# Keep only the modelling columns and drop incomplete rows. The explicit copy
# guarantees the encoding below never writes into `df`, so re-running this
# cell always starts from the untouched raw frame.
df_model = df[features + [target]].dropna().copy()

# One fitted LabelEncoder per categorical column, kept so the same
# string -> integer mapping can be reused when the model is served.
encoders = {}
for col in categorical_features:
    le = LabelEncoder()
    # Replace the column outright. `df_model.loc[:, col] = ...` writes into the
    # existing column in place and can leave the integer codes stored with
    # object dtype instead of a numeric dtype.
    df_model[col] = le.fit_transform(df_model[col])
    encoders[col] = le

X = df_model[features]
y = df_model[target]

print("Features used:", features)
X.head()

Features used: ['year', 'state', 'industry']


Unnamed: 0,year,state,industry
0,2024,3,2
1,2024,3,8
2,2024,3,10
3,2024,3,12
4,2024,3,50


In [10]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)
print("Training size:", X_train.shape, " | Test size:", X_test.shape)

Training size: (7848, 3)  | Test size: (1962, 3)


In [11]:
# Build a 100-tree random forest with a fixed seed and fit it on the training
# split. `fit` returns the estimator itself, so construction and training chain.
model = RandomForestRegressor(random_state=42, n_estimators=100).fit(X_train, y_train)
print("✅ Model training complete!")

✅ Model training complete!


In [12]:
# Score the fitted model on the held-out split.
preds = model.predict(X_test)

# Error metrics on the test set; RMSE is the square root of the MSE.
mae = mean_absolute_error(y_test, preds)
rmse = np.sqrt(mean_squared_error(y_test, preds))
r2 = r2_score(y_test, preds)

# Emit the report in one call, one metric per line, rounded to two decimals.
print(
    f"📊 Model Evaluation Results:",
    f"MAE  : {mae:.2f}",
    f"RMSE : {rmse:.2f}",
    f"R²   : {r2:.2f}",
    sep="\n",
)

📊 Model Evaluation Results:
MAE  : 8.16
RMSE : 17.51
R²   : 0.84


In [13]:
# Make sure the output directory exists (no error if it already does).
os.makedirs(MODELS_DIR, exist_ok=True)

# Side-car bundle: the feature column order used for training plus the fitted
# label encoders for the categorical columns.
preprocessing_bundle = {'columns': list(X.columns), 'encoders': encoders}

# Persist the trained estimator and the bundle as two separate pickle files.
joblib.dump(model, MODEL_PATH)
joblib.dump(preprocessing_bundle, COLUMNS_PATH)

print(
    f"✅ Model saved at {MODEL_PATH}",
    f"✅ Encoders + Columns saved at {COLUMNS_PATH}",
    sep="\n",
)

✅ Model saved at models\startup_model.pkl
✅ Encoders + Columns saved at models\model_columns.pkl


In [14]:
# Round-trip check: reload the pickled model written above and predict on the
# first five held-out rows.
# NOTE: joblib/pickle files can execute code on load; only load trusted artefacts.
loaded_model = joblib.load(MODEL_PATH)
first_test_rows = X_test.iloc[:5]
test_preds = loaded_model.predict(first_test_rows)
print("Sample Predictions:", test_preds)

Sample Predictions: [ 3.95  3.33 26.88 23.48  2.16]
