# Predict Startup Success Project

In [None]:
# Data Manipulation
import numpy as np
import pandas as pd

# Preprocessing
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

# Machine learning
from sklearn.model_selection import train_test_split
from sklearn import model_selection, tree, preprocessing, metrics, linear_model
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Colab-only step: prompt for a file upload from the local machine.
# Presumably used to provide '50_Startups.csv', which the next cell reads — confirm.
from google.colab import files
uploaded = files.upload()

In [None]:
# Read the startups CSV into a DataFrame
data = pd.read_csv(filepath_or_buffer='50_Startups.csv')

In [None]:
# Preview the first 10 rows of the dataset
data.head(10)

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94
5,131876.9,99814.71,362861.36,New York,156991.12
6,134615.46,147198.87,127716.82,California,156122.51
7,130298.13,145530.06,323876.68,Florida,155752.6
8,120542.52,148718.95,311613.29,New York,152211.77
9,123334.88,108679.17,304981.62,California,149759.96


In [None]:
# Column names, non-null counts and dtypes
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   R&D Spend        50 non-null     float64
 1   Administration   50 non-null     float64
 2   Marketing Spend  50 non-null     float64
 3   State            50 non-null     object 
 4   Profit           50 non-null     float64
dtypes: float64(4), object(1)
memory usage: 2.1+ KB


In [None]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns
data.describe()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit
count,50.0,50.0,50.0,50.0
mean,73721.6156,121344.6396,211025.0978,112012.6392
std,45902.256482,28017.802755,122290.310726,40306.180338
min,0.0,51283.14,0.0,14681.4
25%,39936.37,103730.875,129300.1325,90138.9025
50%,73051.08,122699.795,212716.24,107978.19
75%,101602.8,144842.18,299469.085,139765.9775
max,165349.2,182645.56,471784.1,192261.83


In [None]:
# Per-column dtypes; "State" is the only non-numeric (object) column
data.dtypes

R&D Spend          float64
Administration     float64
Marketing Spend    float64
State               object
Profit             float64
dtype: object

In [None]:
# Count the missing values in each column
data.isnull().sum(axis=0)

R&D Spend          0
Administration     0
Marketing Spend    0
State              0
Profit             0
dtype: int64

In [None]:
# Define features and label

# Features: every column except the last one (the three spend columns and "State").
features = data.iloc[:, :-1].values
# Label: the last column ("Profit") as a 1-D array of shape (n_samples,).
# Selecting with a list ([-1]) would yield an (n_samples, 1) column vector, which
# makes scikit-learn emit a DataConversionWarning when a forest is fitted on it.
label = data.iloc[:, -1].values

In [None]:
# Encode "State" using OneHotEncoder

from sklearn.compose import ColumnTransformer

# One-hot encode the categorical "State" column; the remaining (numeric) feature
# columns are passed through unchanged and placed after the encoded columns.
transformer = ColumnTransformer(
    transformers=[("OneHot", OneHotEncoder(), ["State"])],
    remainder='passthrough',
)

# Encode straight from `data` instead of from `features` itself: the cell is then
# idempotent (re-running it no longer one-hot encodes an already-encoded array),
# and the numeric columns keep a float dtype instead of being boxed as Python objects.
features = transformer.fit_transform(data.iloc[:, :-1])
features

array([[0.0, 0.0, 1.0, 165349.2, 136897.8, 471784.1],
       [1.0, 0.0, 0.0, 162597.7, 151377.59, 443898.53],
       [0.0, 1.0, 0.0, 153441.51, 101145.55, 407934.54],
       [0.0, 0.0, 1.0, 144372.41, 118671.85, 383199.62],
       [0.0, 1.0, 0.0, 142107.34, 91391.77, 366168.42],
       [0.0, 0.0, 1.0, 131876.9, 99814.71, 362861.36],
       [1.0, 0.0, 0.0, 134615.46, 147198.87, 127716.82],
       [0.0, 1.0, 0.0, 130298.13, 145530.06, 323876.68],
       [0.0, 0.0, 1.0, 120542.52, 148718.95, 311613.29],
       [1.0, 0.0, 0.0, 123334.88, 108679.17, 304981.62],
       [0.0, 1.0, 0.0, 101913.08, 110594.11, 229160.95],
       [1.0, 0.0, 0.0, 100671.96, 91790.61, 249744.55],
       [0.0, 1.0, 0.0, 93863.75, 127320.38, 249839.44],
       [1.0, 0.0, 0.0, 91992.39, 135495.07, 252664.93],
       [0.0, 1.0, 0.0, 119943.24, 156547.42, 256512.92],
       [0.0, 0.0, 1.0, 114523.61, 122616.84, 261776.23],
       [1.0, 0.0, 0.0, 78013.11, 121597.55, 264346.06],
       [0.0, 0.0, 1.0, 94657.16, 145077.58

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    features, label, test_size=0.2, random_state=1
)

In [None]:
# Apply DecisionTreeRegressor model
# random_state fixes how ties between equally good splits are broken,
# so the fitted tree (and its scores) are the same on every run.
DTR = DecisionTreeRegressor(max_depth=3, random_state=1)
DTR.fit(X_train, y_train)

DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=3,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=None, splitter='best')

In [None]:
# R^2 on the training split, then on the held-out test split (one value per line)
print(DTR.score(X_train, y_train), DTR.score(X_test, y_test), sep="\n")

0.9719776806321837
0.8784483297473074


In [None]:
# Test DecisionTreeRegressor model with varying max_depth
# Seeded so the comparison across depths is reproducible on re-run.
DTR = DecisionTreeRegressor(max_depth=5, random_state=1)
DTR.fit(X_train, y_train)

# R^2 on the training split, then on the test split
print(DTR.score(X_train, y_train))
print(DTR.score(X_test, y_test))

0.9995250727754222
0.9230541340901166


In [None]:
# Decision tree with max_depth=7; seeded so the scores are reproducible on re-run.
DTR = DecisionTreeRegressor(max_depth=7, random_state=1)
DTR.fit(X_train, y_train)

# R^2 on the training split, then on the test split
print(DTR.score(X_train, y_train))
print(DTR.score(X_test, y_test))

1.0
0.8905503328260032


In [None]:
# Decision tree with max_depth=9; seeded so the scores are reproducible on re-run.
# This is the last assignment to DTR, so it is the tree cross-validated later on.
DTR = DecisionTreeRegressor(max_depth=9, random_state=1)
DTR.fit(X_train, y_train)

# R^2 on the training split, then on the test split
print(DTR.score(X_train, y_train))
print(DTR.score(X_test, y_test))

1.0
0.892453518197833


In [None]:
# Apply RandomForestRegressor model
# Each tree is trained on a random bootstrap sample, so an unseeded forest gives
# different scores on every run; random_state makes the result reproducible.
RF = RandomForestRegressor(n_estimators=3, random_state=1)
# ravel() hands the forest a 1-D target, which is the shape it expects.
RF.fit(X_train, y_train.ravel())
print(RF.score(X_train, y_train))
print(RF.score(X_test, y_test))

0.9630557361911887
0.9520143328776981


In [None]:
# Compare random forests built with 4 to 9 trees.
for i in range(4, 10):
    # The tree count must follow the loop variable so that the printed label
    # describes the model that was actually fitted. The fixed seed ensures that
    # differences between iterations come from n_estimators, not from random
    # bootstrap sampling.
    RF = RandomForestRegressor(n_estimators=i, random_state=1)
    RF.fit(X_train, y_train.ravel())
    print("n_estimator = ", i)
    print("Training Score =", RF.score(X_train, y_train))
    print("TEsting Score = ", RF.score(X_test, y_test))

n_estimator =  4
Training Score = 0.9481676514337896
TEsting Score =  0.8680332361511687
n_estimator =  5
Training Score = 0.9630116555727363
TEsting Score =  0.9477358609531193
n_estimator =  6
Training Score = 0.9665372455400745
TEsting Score =  0.8580292546814693
n_estimator =  7
Training Score = 0.9550414761197068
TEsting Score =  0.8974980801060569
n_estimator =  8
Training Score = 0.9559495999083187
TEsting Score =  0.954793841158082
n_estimator =  9
Training Score = 0.9465297730845903
TEsting Score =  0.9551350729700253


In [None]:
# Cross validate models
from sklearn.model_selection import cross_val_score


In [None]:
# 10-fold cross-validation of the most recently fitted DTR and RF on the training split.
# The target is flattened to 1-D with ravel(): passing a column vector makes the
# random forest emit a DataConversionWarning on every fold.
DTR_scores = cross_val_score(DTR, X_train, y_train.ravel(), cv=10)
RF_scores = cross_val_score(RF, X_train, y_train.ravel(), cv=10)

print("mean DTR cross validation score: {}".format(np.mean(DTR_scores)))
print("mean RF cross validation score: {}".format(np.mean(RF_scores)))

mean DTR cross validation score: 0.5647107802701816
mean RF cross validation score: 0.5844238717531504


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


Conclusion:

Increasing the maximum depth of the decision tree resulted in higher test scores than the initial depth of 3, but it also led to overfitting: at depths 7 and 9 the training score reached 1.0 while the test score stayed near 0.89. The random forest with `n_estimators` of 9 produced the best testing score without being overfit to the training data. Cross-validation confirmed that the random forest scored higher than the decision tree. Although the cross-validation scores were significantly lower than the models' test scores, this may be due to the small dataset: with 10 folds over the 40 training rows, each validation fold contains only 4 samples, so the per-fold scores are very noisy.

So, the random forest algorithm is the better choice for this problem. The model returned excellent testing scores and can be used to predict which startups in the dataset are likely to succeed.