In [13]:
import os
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from sklearn.model_selection import cross_validate, train_test_split, learning_curve, GridSearchCV

import warnings
warnings.filterwarnings("ignore")

In [14]:
data_path=os.path.join(os.path.abspath(os.getcwd()),'raw_data')
df=pd.read_csv(os.path.join(data_path,'startups_modified.csv'))

In [15]:
df.head()

Unnamed: 0,num_funding_rounds,last_equity_funding_total,employeeCount,Advertising,Agriculture and Farming,Clothing and Apparel,Commerce and Shopping,Community and Lifestyle,Computer Hardware,Consumer Electronics,...,series_b,series_c,post_ipo_equity,series_d,series_e,series_f,series_g,series_h,days_between_dates,Target
0,1,652650.0,31.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0,0,0,0,0,0,0,0,880,0.0
1,2,50000.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,2159,0.0
2,1,500000.0,6.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0.0
3,1,120000.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,158,0.0
4,2,9600000.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,354,0.0


In [19]:
df.columns.to_list()

['num_funding_rounds',
 'last_equity_funding_total',
 'employeeCount',
 'Advertising',
 'Agriculture and Farming',
 'Clothing and Apparel',
 'Commerce and Shopping',
 'Community and Lifestyle',
 'Computer Hardware',
 'Consumer Electronics',
 'Consumer Goods',
 'Content and Publishing',
 'Data and Analytics',
 'Design',
 'Education',
 'Energy',
 'Environment and Sustainability',
 'Events',
 'Financial Services',
 'Food and Beverage',
 'Gaming',
 'Government and Military',
 'Health Care',
 'HumanResources',
 'Legal',
 'Life Sciences',
 'Logistics',
 'Manufacturing',
 'Media and Entertainment',
 'Messaging and Telecommunications',
 'Music and Audio',
 'Natural Resources',
 'Navigation and Mapping',
 'Payments',
 'Privacy and Security',
 'Professional Services',
 'Real Estate and Construction',
 'Sales and Marketing',
 'Software_x',
 'Sports',
 'Transportation',
 'Travel and Tourism',
 'Video',
 'AR and VR',
 'Artificial Intelligence',
 'Biotechnology',
 'BlockChain',
 'Hardware',
 'Scienc

# Data preparation

In [4]:
df=df[df.select_dtypes(include=['int64', 'float64']).columns.to_list()]
df.fillna(0, inplace =True)

X=df.drop(columns='Target', axis=1)

y=df.Target

In [5]:
scaler=StandardScaler()
columns_to_scale=['num_funding_rounds', 'last_equity_funding_total', 'employeeCount','Round 1','Round 2', 'Round 3', 'Round 4', 'Round 5']
X[columns_to_scale]=scaler.fit_transform(X[columns_to_scale])

In [6]:
X.head()

Unnamed: 0,num_funding_rounds,last_equity_funding_total,employeeCount,Advertising,Agriculture and Farming,Clothing and Apparel,Commerce and Shopping,Community and Lifestyle,Computer Hardware,Consumer Electronics,...,equity_crowdfunding,series_b,series_c,post_ipo_equity,series_d,series_e,series_f,series_g,series_h,days_between_dates
0,-0.63731,-0.086773,-0.051789,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0,0,0,0,0,0,0,0,0,880
1,-0.001235,-0.086773,-0.109726,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,2159
2,-0.63731,-0.081532,-0.109726,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,-0.63731,-0.086773,-0.109726,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,158
4,-0.001235,-0.086773,-0.123631,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,354


In [7]:
X.head()

Unnamed: 0,num_funding_rounds,last_equity_funding_total,employeeCount,Advertising,Agriculture and Farming,Clothing and Apparel,Commerce and Shopping,Community and Lifestyle,Computer Hardware,Consumer Electronics,...,equity_crowdfunding,series_b,series_c,post_ipo_equity,series_d,series_e,series_f,series_g,series_h,days_between_dates
0,-0.63731,-0.086773,-0.051789,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0,0,0,0,0,0,0,0,0,880
1,-0.001235,-0.086773,-0.109726,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,2159
2,-0.63731,-0.081532,-0.109726,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,-0.63731,-0.086773,-0.109726,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,158
4,-0.001235,-0.086773,-0.123631,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,354


In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Logistic Regression

In [9]:
model = LogisticRegression(max_iter=1000)

In [10]:
cv_results = cross_validate(model, X, y, cv=5, scoring=['accuracy','recall','precision'])
print(f'test_accuracy: {cv_results["test_accuracy"].mean()}')
print(f'test_precision: {cv_results["test_precision"].mean()}')
print(f'test_recall: {cv_results["test_recall"].mean()}')

test_accuracy: 0.947482650246416
test_precision: 0.908841993677197
test_recall: 0.11639097744360902


## Grid search

In [11]:
grid_dict= {'penalty':['l2', 'none'],
            'C':[0.01, 0.1, 0.5, 1, 1.5],
            'class_weight':['balanced', 'none']}

In [12]:
scoring_list=['accuracy','recall','precision', 'f1']

search = GridSearchCV(
    model,
    grid_dict, 
    scoring = scoring_list[2],
    cv = 5,
    n_jobs=-1
) 

search.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

KeyboardInterrupt: 

In [None]:
# Best score
print(f'best score: {search.best_score_}')

# Best Params
print(f'best params: {search.best_params_}')

# Best estimator
print(f'best estimator: {search.best_estimator_}')

In [None]:
#model.fit(X_train,y_train)

model=search.best_estimator_

model.score(X_test,y_test)

In [None]:
y_true = y_test
y_pred = model.predict(X_test)

print('Accuracy =', round(accuracy_score(y_true, y_pred), 3)) # Accuracy
print('Precision =', round(precision_score(y_true, y_pred), 3)) # Precision
print('Recall =', round(recall_score(y_true, y_pred), 3)) # Recall
print('F1 score =', round(f1_score(y_true, y_pred), 3)) # F1 score

In [None]:
results_df = pd.DataFrame({"actual": y_test,"predicted": y_pred}) #Store results in a dataframe

confusion_matrix = pd.crosstab(index= results_df['actual'],columns = results_df['predicted'])

confusion_matrix