In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, accuracy_score,classification_report, confusion_matrix

# Loading the DATA

In [3]:
df = pd.read_csv("startup data.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,state_code,latitude,longitude,zip_code,id,city,Unnamed: 6,name,labels,...,object_id,has_VC,has_angel,has_roundA,has_roundB,has_roundC,has_roundD,avg_participants,is_top500,status
0,1005,CA,42.35888,-71.05682,92101,c:6669,San Diego,,Bandsintown,1,...,c:6669,0,1,0,0,0,0,1.0,0,acquired
1,204,CA,37.238916,-121.973718,95032,c:16283,Los Gatos,,TriCipher,1,...,c:16283,1,0,0,1,1,1,4.75,1,acquired
2,1001,CA,32.901049,-117.192656,92121,c:65620,San Diego,San Diego CA 92121,Plixi,1,...,c:65620,0,0,1,0,0,0,4.0,1,acquired
3,738,CA,37.320309,-122.05004,95014,c:42668,Cupertino,Cupertino CA 95014,Solidcore Systems,1,...,c:42668,0,0,0,1,1,1,3.3333,1,acquired
4,1002,CA,37.779281,-122.419236,94105,c:65806,San Francisco,San Francisco CA 94105,Inhale Digital,0,...,c:65806,1,1,0,0,0,0,1.0,1,closed


# Preprossessing the DATA

In [4]:
null_counter = df.isnull().sum()
null_counter.sort_values(ascending=False)

closed_at                   588
Unnamed: 6                  493
age_last_milestone_year     152
age_first_milestone_year    152
state_code.1                  1
Unnamed: 0                    0
is_biotech                    0
is_software                   0
is_web                        0
is_mobile                     0
is_enterprise                 0
is_advertising                0
is_gamesvideo                 0
is_ecommerce                  0
is_othercategory              0
is_consulting                 0
is_otherstate                 0
object_id                     0
has_VC                        0
has_angel                     0
has_roundA                    0
has_roundB                    0
has_roundC                    0
has_roundD                    0
avg_participants              0
is_top500                     0
category_code                 0
is_NY                         0
is_TX                         0
first_funding_at              0
latitude                      0
longitud

# DATA Cleaning
**Keeping only necessary features**

In [5]:
cols_to_drop = ['has_roundA','is_top500','has_roundB','has_roundC','has_roundD','state_code','id','Unnamed: 0','Unnamed: 6','closed_at','state_code.1','labels','zip_code','founded_at','longitude','latitude','first_funding_at','last_funding_at','city','name','object_id','age_first_milestone_year', 'age_last_milestone_year']
df.drop(cols_to_drop,axis=1,inplace=True)
null_counter = df.isnull().sum()
null_counter.sort_values(ascending=False)

age_first_funding_year    0
age_last_funding_year     0
avg_participants          0
has_angel                 0
has_VC                    0
is_othercategory          0
is_consulting             0
is_biotech                0
is_ecommerce              0
is_gamesvideo             0
is_advertising            0
is_enterprise             0
is_mobile                 0
is_web                    0
is_software               0
category_code             0
is_otherstate             0
is_TX                     0
is_MA                     0
is_NY                     0
is_CA                     0
milestones                0
funding_total_usd         0
funding_rounds            0
relationships             0
status                    0
dtype: int64

# Building new DATA for needed information

In [6]:
data_1 = df[df['status']=='acquired'].groupby(['category_code']).agg({'status':'count'}).reset_index()
data_1.columns=['category_code','total_success']

data_2 = df[df['status']=='closed'].groupby(['category_code']).agg({'status':'count'}).reset_index()
data_2.columns=['category_code','total_closed']

data_3=df.groupby(['category_code']).agg({'status':'count'}).reset_index()
data_3.columns=['category_code','total_startup']

data_1 = data_1.merge(data_2, on='category_code', how='outer').merge(data_3, on='category_code', how='outer')

data_1['success_rate'] = round((data_1['total_success'] / data_1['total_startup']) * 100, 2)

most_success_rate = data_1.sort_values('success_rate', ascending=False)
most_success_rate

Unnamed: 0,category_code,total_success,total_closed,total_startup,success_rate
31,transportation,2.0,,2,100.0
30,sports,1.0,,1,100.0
14,hospitality,1.0,,1,100.0
18,music,6.0,,6,100.0
13,health,3.0,,3,100.0
32,travel,7.0,1.0,8,87.5
20,news,7.0,1.0,8,87.5
1,analytics,16.0,3.0,19,84.21
26,security,15.0,4.0,19,78.95
8,enterprise,56.0,17.0,73,76.71


# Combaining the DATAs for objectif

In [7]:
combined_data = df.merge(data_1[['category_code', 'success_rate']], on='category_code')
# Cleaning
combined_data.dropna(subset='success_rate',inplace=True)
combined_data

Unnamed: 0,age_first_funding_year,age_last_funding_year,relationships,funding_rounds,funding_total_usd,milestones,is_CA,is_NY,is_MA,is_TX,...,is_gamesvideo,is_ecommerce,is_biotech,is_consulting,is_othercategory,has_VC,has_angel,avg_participants,status,success_rate
0,2.2493,3.0027,3,3,375000,3,1,0,0,0,...,0,0,0,0,1,0,1,1.00,acquired,100.00
1,2.1616,5.6384,13,4,41500019,5,0,1,0,0,...,0,0,0,0,1,0,0,4.00,acquired,100.00
2,3.8849,5.8630,9,2,6700000,4,0,1,0,0,...,0,0,0,0,1,1,0,8.00,acquired,100.00
3,0.8356,5.7534,13,5,24750000,3,1,0,0,0,...,0,0,0,0,1,1,1,2.40,acquired,100.00
4,1.9041,1.9041,13,1,1500000,3,0,1,0,0,...,0,0,0,0,1,0,0,1.00,acquired,100.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
918,0.8329,0.8329,5,1,154000,2,1,0,0,0,...,0,0,0,0,1,0,1,1.00,closed,33.33
919,0.5808,0.5808,13,1,500000,0,1,0,0,0,...,0,0,0,0,1,0,1,1.00,closed,33.33
920,-0.1041,6.3644,13,5,23500000,3,0,0,0,1,...,0,0,0,0,1,1,0,1.75,acquired,100.00
921,0.6658,5.1315,15,4,41000000,2,0,0,1,0,...,0,0,0,0,1,1,0,1.75,acquired,100.00


# Building the success prediction model

In [8]:
inputs = pd.get_dummies(combined_data, columns=combined_data.columns.tolist(), drop_first=True)

output = combined_data['success_rate']

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [10]:
X_train, X_test, y_train, y_test = train_test_split(inputs, output, test_size=0.1, random_state=42)
model = RandomForestRegressor(n_estimators=100, random_state=42)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

import math
rmse = math.sqrt(mse)
print("Root Mean Squared Error:", rmse)

# new_data = ... # Prepare new data for prediction
# predicted_success_rate = model.predict(new_data)

Mean Squared Error: 0.0003493844086019251
Root Mean Squared Error: 0.0186918273211028


In [11]:
import joblib
joblib.dump(model, 'businessRater.pkl')

['businessRater.pkl']

In [12]:
def predictSuccess(input_array):
    saved_model = joblib.load('businessRater.pkl')
    successRate = saved_model.predict(input_array)
    return successRate

In [21]:
input_array = X_test.iloc[[4]]
success_rate = predictSuccess(input_array)[0]
print(success_rate)

65.81999999999992


# Building the funding estimation model

## Filtring/ Creating the DATA for objectif

In [100]:
rate_range = [success_rate - 10, success_rate + 10]
# Filter the DataFrame based on the success rate range
filtered_df = combined_data[combined_data['success_rate'].between(rate_range[0],rate_range[1])]

filtered_df

Unnamed: 0,age_first_funding_year,age_last_funding_year,relationships,funding_rounds,funding_total_usd,milestones,is_CA,is_NY,is_MA,is_TX,...,is_gamesvideo,is_ecommerce,is_biotech,is_consulting,is_othercategory,has_VC,has_angel,avg_participants,status,success_rate
79,1.0329,1.0329,5,1,2600000,2,1,0,0,0,...,0,0,0,0,0,0,0,4.00,acquired,64.58
80,1.6712,4.6849,14,3,5750000,4,1,0,0,0,...,0,0,0,0,0,1,1,1.00,acquired,64.58
81,1.0849,5.3370,8,5,10400000,2,1,0,0,0,...,0,0,0,0,0,1,1,1.75,closed,64.58
82,4.9041,4.9041,0,1,350000,0,0,0,0,0,...,0,0,0,0,0,1,0,1.00,closed,64.58
83,0.7425,1.5808,10,3,4575000,3,0,1,0,0,...,0,0,0,0,0,0,1,5.00,acquired,64.58
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
898,2.1616,4.9562,2,2,4000000,0,0,0,0,0,...,0,0,0,0,1,0,0,2.00,closed,63.64
899,0.7671,0.7671,1,1,50000,1,1,0,0,0,...,0,0,0,0,1,0,1,1.00,closed,63.64
900,0.0000,2.7205,4,4,3800000,4,0,0,0,1,...,0,0,0,0,1,0,1,4.75,acquired,63.64
901,0.9397,0.9397,4,1,100000,2,1,0,0,0,...,0,0,0,0,1,0,1,3.00,acquired,63.64


In [114]:
new_data = filtered_df.loc[:, ['funding_total_usd', 'success_rate']]
new_data

Unnamed: 0,funding_total_usd,success_rate
79,2600000,64.58
80,5750000,64.58
81,10400000,64.58
82,350000,64.58
83,4575000,64.58
...,...,...
898,4000000,63.64
899,50000,63.64
900,3800000,63.64
901,100000,63.64


## Funding estimation model

In [120]:
X = new_data[['success_rate']]
y = new_data['funding_total_usd']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# Initialize and train the Random Forest Regressor model
model = RandomForestRegressor()
model.fit(X_train, y_train)

# Make predictions
predictions = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, predictions)
print("Mean Squared Error:", mse) # dealing with high values means high MSE

predicted_funding_total_usd = model.predict([[success_rate]])
print(f"Predicted Funding Total USD for {success_rate}:", predicted_funding_total_usd)

Mean Squared Error: 945232141729949.2
Predicted Funding Total USD for 65.81999999999992: [90604045.95203735]




In [121]:
joblib.dump(model, 'fundingEstimer.pkl')

['fundingEstimer.pkl']

In [122]:
def estimateFund(success_rate):
    saved_model = joblib.load('fundingEstimer.pkl')
    predicted_funding_total_usd = saved_model.predict([[success_rate]])
    return predicted_funding_total_usd