## Load Data

In [None]:
import pandas as pd
import numpy as np
import matplotlib
from matplotlib import pylab as plt
import sklearn
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor

In [None]:
#read the raw startup table and turn any literal 'NaN' strings into real nulls
df = pd.read_csv('Startup Data.csv').replace('NaN', np.nan)

#rows 124 and 832 are repeats of the same data point: the element-wise comparison
#is kept as an inspection aid (its result is not stored), then the repeat is dropped
df.iloc[124] == df.iloc[832]
df = df.drop(index=832)

FileNotFoundError: ignored

In [None]:
#check for unique id columns that can be ignored: a column is a unique id when
#it has as many distinct values as it has rows
id_like_columns = ['Unnamed: 0', 'id', 'name', 'object_id']
for id_col in id_like_columns:
    print(df[id_col].value_counts().shape == df[id_col].shape)

#state code is repeated twice with one mismatched state name
print((df['state_code.1'] == df['state_code']).value_counts())

#remove the identifier columns and the repeated state code in place
df.drop(columns=id_like_columns + ['state_code.1'], inplace=True)

In [None]:
#preview the first five rows of df
df.head(n=5)

## EDA

In [None]:
#summary statistics and histograms of the continuous features
#the feature list is spelled out here (same entries as ss_ftrs in the Preprocessing
#section) so this cell also works when the notebook is run top to bottom;
#ss_ftrs itself is only defined further down
hist_ftrs = ['latitude', 'longitude', 'age_first_funding_year', 'age_last_funding_year',
             'age_first_milestone_year', 'age_last_milestone_year', 'relationships',
             'funding_rounds', 'avg_participants', 'funding_total_usd']
columns = df.columns
for col in columns:
    #iterate in df's column order and skip everything that is not continuous
    if col not in hist_ftrs:
        continue
    print(df[col].describe())
    df[col].plot.hist()
    plt.xlabel(col)
    plt.ylabel('count')
    plt.show()

In [None]:
#bar plots

In [None]:
#comparing how well california startups did relative to the rest of the country
#each entry: (columns to group by, x-axis label, y-axis label); the first plot shows
#the status shares and the second the has_VC shares, both split by is_CA
ca_plot_specs = [
    (['is_CA', 'status'], 'Outside California (0), In California (1)', 'fraction of startups'),
    (['is_CA', 'has_VC'], 'In California', 'fraction with VC funding'),
]
for group_cols, x_label, y_label in ca_plot_specs:
    count_matrix = df.groupby(group_cols).size().unstack()
    #divide every row by its total so each bar stacks up to 1
    row_totals = count_matrix.sum(axis=1)
    count_matrix_norm = count_matrix.div(row_totals, axis=0)
    count_matrix_norm.plot(kind='bar', stacked=True, color=['k', 'grey'])
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.legend(loc=4)
    plt.show()

In [None]:
#bar chart of the number of companies in each industry (category_code)
industry_counts = df['category_code'].value_counts()
industry_counts.plot(kind='bar', color='turquoise')
plt.ylabel('number of companies')
plt.show()

In [None]:
#plot showing industry type against the mean of funding_total_usd within that industry
mean_funding_by_industry = df.groupby('category_code')['funding_total_usd'].mean()
mean_funding_by_industry.plot(kind='bar', color='k')
plt.ylabel('total funding')
plt.show()

In [None]:
#comparing age of first funding year and age at first milestone shows a high concentration near (0,0)
#this is likely due to many companies who do not receive any funding
#it is interesting to note the few outliers that were funded much later in their lives
#small, mostly transparent markers keep the dense region readable
df.plot.scatter(x='age_first_funding_year', y='age_first_milestone_year', s=10, alpha=0.1)
plt.show()

In [None]:
#funding vs status: share of each status within the has_VC groups
count_matrix = df.groupby(['has_VC', 'status']).size().unstack()
#divide every row by its total so each bar stacks up to 1
group_sizes = count_matrix.sum(axis=1)
count_matrix_norm = count_matrix.div(group_sizes, axis=0)
count_matrix_norm.plot(kind='bar', stacked=True, color=['turquoise', 'lavender'])
plt.xlabel('VC funding')
plt.ylabel('status of company')
plt.legend(loc=4)
plt.show()

In [None]:
#raw number of startups for each (has_VC, status) combination
vc_status_sizes = df.groupby(['has_VC', 'status']).size()
vc_status_sizes.unstack()

## Split Data

In [None]:
#target vector is the status column; the feature matrix is every other column
y = df['status']
X = df.drop(columns='status')

In [None]:
#data split: 60% train, 20% validation, 20% test
random_state = 7

#separate out training set
X_train, X_other, y_train, y_other = train_test_split(X, y, train_size=0.6, random_state=random_state)

#split the remaining 40% evenly into validation and test sets
#(this must split X_other/y_other: splitting X, y again would put training rows
#into the validation and test sets)
X_val, X_test, y_val, y_test = train_test_split(X_other, y_other, train_size=0.5, random_state=random_state)

## Missing Data

In [None]:
#fraction of missing entries per feature, reported only for features that have any
null_mask = df.isnull()
perc_missing_per_ftr = null_mask.sum(axis=0) / len(df)
ftrs_with_missing = perc_missing_per_ftr[perc_missing_per_ftr > 0]
print('fraction of missing values in features:')
print(ftrs_with_missing)
print('data types of the features with missing values:')
print(df[ftrs_with_missing.index].dtypes)
#fraction of rows that contain at least one missing entry
frac_missing = (null_mask.sum(axis=1) != 0).sum() / len(df)
print('fraction of points with missing values:',frac_missing)

In [None]:
#categorical missing values: nulls become their own category, 'missing'
for missing_cat_col in ("closed_at", "Unnamed: 6"):
    df[missing_cat_col] = df[missing_cat_col].replace(np.nan, "missing")

In [None]:
#continuous missing values - multivariate imputation
milestone_ftrs = ['age_first_milestone_year', 'age_last_milestone_year']
print(df[milestone_ftrs].head())

#iterative imputation with a small random forest as the per-feature estimator
imputer = IterativeImputer(estimator = RandomForestRegressor(n_estimators=10), random_state=1000)
X_impute = imputer.fit_transform(df[milestone_ftrs])
#label the imputed array with the column names (not with the DataFrame itself) and
#reuse df's row index so the imputed values stay aligned with the rows of df
df_imp = pd.DataFrame(data=X_impute, columns=milestone_ftrs, index=df.index)
#NOTE(review): the imputer is fit on all of df and df_imp is not written back into
#df/X in this notebook - confirm whether it should be fit on the training rows only

#print(df_train_imp[['LotFrontage','MasVnrArea','GarageYrBlt']].head())

#df_CV_imp = pd.DataFrame(data=imputer.transform(df_CV), columns = df_train.columns)
#df_test_imp = pd.DataFrame(data=imputer.transform(df_test), columns = df_train.columns)

## Preprocessing

In [None]:
#all remaining features split between categorical/continuous/dates to be transformed to continuous
#categorical features are one hot encoded, continuous features are standard scaled
#every feature belongs to exactly one list, otherwise it would be transformed twice

#NOTE(review): 'labels' may be another encoding of the target 'status' - confirm it
#is a legitimate feature before training on it
cat_ftrs = ['state_code', 'zip_code', 'city','labels', 'is_CA','is_NY','is_MA',
        'is_TX', 'is_otherstate', 'category_code', 'is_software', 'is_web','is_mobile',
        'is_enterprise','is_advertising', 'is_gamesvideo', 'is_ecommerce', 'is_biotech',
        'is_consulting', 'is_othercategory', 'has_VC', 'has_angel', 'has_roundA',
        'has_roundB', 'has_roundC', 'has_roundD','is_top500', 'milestones']

#avg_participants is listed only here: it is treated as a continuous feature
ss_ftrs = ['latitude', 'longitude', 'age_first_funding_year', 'age_last_funding_year',
            'age_first_milestone_year', 'age_last_milestone_year', 'relationships',
            'funding_rounds', 'avg_participants', 'funding_total_usd',]

#date columns, not yet used by the preprocessor
dates = ['founded_at', 'closed_at', 'first_funding_at', 'last_funding_at']


In [None]:
#transform dates into continuous variables using epoch time

In [None]:


#one hot encode the categorical features and standardize the continuous ones
#dense output is requested through sparse_threshold=0 on the ColumnTransformer rather
#than through the encoder's sparse/sparse_output keyword, whose name depends on the
#installed scikit-learn version
preprocessor = ColumnTransformer(transformers =
        [('onehot',  OneHotEncoder(handle_unknown='ignore'), cat_ftrs),
          ('ss', StandardScaler(), ss_ftrs)],
        sparse_threshold=0)

clf = Pipeline(steps = [('preprocessor', preprocessor)])

#fit on the training set only, then apply the fitted transformers to validation and test
#NOTE(review): ss_ftrs contains the milestone age columns that the Missing Data section
#imputes into df_imp only; X_train/X_val/X_test appear to still carry those NaNs -
#confirm and add an imputation step before fitting a model that cannot handle NaN
X_train_prep = clf.fit_transform(X_train)
X_val_prep = clf.transform(X_val)
X_test_prep = clf.transform(X_test)

print(X_train_prep[:5])
#for col in cols:
   # if col in std_ftrs:
       # X_train = scaler.fit_transform(X_train[[col]])


#X_train_ohe = enc.fit_transform(X_train[['state_code']])
#X_train_ohe

## Pipeline

In [None]:
def ML_pipeline(X, y, preprocessor, ML_algo, param_grid):
    """Tune and evaluate a classifier over ten different random splits.

    For each of ten seeds (0, 7, ..., 63) the data is split 80/20 into
    other/test, the hyperparameters in ``param_grid`` are tuned with 4-fold
    cross-validation on the "other" part using accuracy, and the refit best
    model is scored on the held-out test part.

    Parameters
    ----------
    X : feature matrix accepted by ``preprocessor``.
    y : target labels.
    preprocessor : transformer used as the first pipeline step.
    ML_algo : estimator used as the second pipeline step, named 'regressor'.
    param_grid : grid for GridSearchCV; estimator parameters must be
        prefixed with ``regressor__``.

    Returns
    -------
    best_models : list of the ten refit best pipelines, one per split.
    test_scores : list of the ten test-set accuracies, one per split.
    """
    test_scores = []
    best_models = []

    for i in range(10):
        #one seed per iteration drives both the train/test split and the fold shuffling
        seed = 7 * i
        # split data to other/test 80/20, and then use KFold with 4 folds
        X_other, X_test, y_other, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)
        kf = KFold(n_splits=4, shuffle=True, random_state=seed)
        # preprocessing is part of the pipeline, so it is refit inside every CV fold
        # the step keeps the name 'regressor' because param_grid keys are prefixed with it
        pipe = Pipeline(steps=[('preprocessor', preprocessor), ('regressor', ML_algo)])
        # GridSearchCV scores every hyperparameter combination on train and validation folds
        grid = GridSearchCV(pipe, param_grid=param_grid, scoring=make_scorer(accuracy_score),
                            cv=kf, return_train_score=True)
        # fitting also refits the best combination on all of X_other
        grid.fit(X_other, y_other)
        # predict the held-out test set with the refit best model
        y_pred = grid.predict(X_test)
        # score the test set with the same metric used for tuning (accuracy);
        # mean_squared_error was never imported and does not apply to class labels
        test_scores.append(accuracy_score(y_test, y_pred))
        best_models.append(grid.best_estimator_)
    return best_models, test_scores

## Models

In [None]:
#support vector classifier tuned over the same log-spaced grid (1e-3 ... 1e4) for C and gamma
log_grid = np.logspace(-3, 4, num=8)
ML_algo = SVC()
param_grid = {'regressor__C': log_grid, 'regressor__gamma': log_grid}
ML_pipeline(X, y, preprocessor, ML_algo, param_grid)