In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
%matplotlib inline
plt.style.use('ggplot')

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import recall_score, precision_score

import seaborn as sns

import statsmodels.api as sm
from scipy.stats import norm, t
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold


In [2]:
# df produced from EDA.ipynb

df = pd.read_csv('data/model_df.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,name,category,main_category,currency,state,backers,country,usd_pledged_real,usd_goal_real,comments_count,updates_count,spotlight,staff_pick,blurb,launch_year,launch_month,launch_day,active_days
0,0,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,0,0,GB,0.0,1533.95,0,0,0,0,A series of poems about pure love pursued by ...,2015,8,11,58
1,1,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,0,15,US,2421.0,30000.0,0,11,0,1,Back artists creating short movies of Greeting...,2017,9,2,59
2,2,Where is Hank?,Narrative Film,Film & Video,USD,0,3,US,220.0,45000.0,0,0,0,0,"Abandoned Property. Sounds of struggle, muffle...",2013,1,12,44
3,3,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,0,1,US,1.0,5000.0,0,8,0,0,Keeping Great Reggae Music Alive Through RAW T...,2012,3,17,29
4,4,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,0,14,US,1283.0,19500.0,1,1,0,1,Community Film Project is creating a group tha...,2015,7,4,55


## Split Train/Test Set

In [3]:
# df without or 'backers' or 'usd_pledged_real' causing data leakage

df_copy = df.drop(['backers', 'usd_pledged_real'], axis=1)

In [4]:
# Use usd_pledged_real / backers as regression target

# df_copy2 = df.drop(['state', backers])
# df_copy3 = df.drop(['state', usd_pledged_real])

In [5]:
# get dummies for main_category, currency, and country

df_copy = pd.get_dummies(df_copy, columns= ['category','main_category', 'currency', 'country'], drop_first = True)

In [6]:
# all numerical data types
df_copy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 354477 entries, 0 to 354476
Columns: 220 entries, Unnamed: 0 to country_US
dtypes: float64(1), int64(10), object(2), uint8(207)
memory usage: 105.1+ MB


In [7]:
#split data into target and features
y = df_copy['state']
X = df_copy.loc[:, df_copy.columns != 'state']

In [8]:
#give % of each
y.value_counts() / y.shape

0    0.625513
1    0.374487
Name: state, dtype: float64

In [9]:
# Test train split to create holdout set (X_test, y_test) with 20% of data
# stratify maintains the class balance

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True, stratify=y)


In [10]:
# Split again on training
X_split_train, X_split_test, y_split_train, y_split_test = train_test_split(X_train, y_train, test_size=0.2, random_state=42, shuffle=True, stratify=y_train)


In [11]:
# Feature Dataframes without 'name' or 'blurb' column
# y_train, y1_train, y2_train are all equal

X1_train = X_split_train.drop(['name', 'blurb'], axis=1)
y1_train = y_split_train
X1_test = X_split_test.drop(['name', 'blurb'], axis=1)
y1_test = y_split_test

In [12]:
#NLP Dataframe

X2_train = X_split_train[['name', 'blurb']];
y2_train = y_split_train
X2_test = X_split_train[['name', 'blurb']];
y2_test = y_split_test

### Grid Search / Random Forest Model

In [13]:
parameters = {'criterion':('gini', 'entropy'), 'max_depth':[1,2,None], 'max_features': [1, 'sqrt', 'log2', None]} #max_depth 3 different times
rfc = RandomForestClassifier(n_estimators=50) #number of trees (default 100)
#gridsearch Cross Validate
gs = GridSearchCV(rfc, parameters, scoring = ['accuracy','f1', 'roc_auc'], refit='accuracy', n_jobs = -1) 
# other scoring strings
# https://scikit-learn.org/stable/modules/model_evaluation.html #model-evaluation
# refit makes it the primary ranking score out of the list we gave
# n_jobs = -1 runs the process using all cores on your computer

gs.fit(X1_train, y1_train)

GridSearchCV(cv=None, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=50, n_jobs=None,
                                              oob_score=False,
                                              ran

In [14]:
#5 K-folds done by default

pd.DataFrame(gs.cv_results_).T

# Shows Entropy is better than gini looking at mean_test_score

# 3 Done for Gini and Entropy for each depth (1,2, None)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,23
mean_fit_time,6.41585,8.70566,7.58152,39.2989,5.14215,9.54869,7.09921,62.0257,52.3381,46.3303,...,6.15341,38.7457,5.43957,9.47567,7.18942,67.3684,56.8362,49.3934,32.5812,34.4742
std_fit_time,0.240377,0.194854,0.111228,0.638871,0.0896048,0.162302,0.120981,1.71175,1.68328,3.00979,...,0.159022,0.0570856,0.149442,0.2812,0.110519,0.411197,1.50259,1.90411,7.65487,1.8809
mean_score_time,0.88958,0.869508,0.858197,1.00018,0.92748,0.983894,0.9969,1.01086,6.18034,2.59517,...,0.968233,0.94951,0.977963,0.98714,0.986854,0.968767,5.82309,2.42404,1.65246,0.292456
std_score_time,0.0238986,0.0146719,0.0190117,0.0112057,0.0049385,0.0110207,0.0195085,0.0418517,0.121371,0.100384,...,0.00821977,0.0230358,0.0368376,0.0137542,0.0280613,0.0321957,0.138896,0.488105,0.329649,0.0430857
param_criterion,gini,gini,gini,gini,gini,gini,gini,gini,gini,gini,...,entropy,entropy,entropy,entropy,entropy,entropy,entropy,entropy,entropy,entropy
param_max_depth,1,1,1,1,2,2,2,2,,,...,1,1,2,2,2,2,,,,
param_max_features,1,sqrt,log2,,1,sqrt,log2,,1,sqrt,...,log2,,1,sqrt,log2,,1,sqrt,log2,
params,"{'criterion': 'gini', 'max_depth': 1, 'max_fea...","{'criterion': 'gini', 'max_depth': 1, 'max_fea...","{'criterion': 'gini', 'max_depth': 1, 'max_fea...","{'criterion': 'gini', 'max_depth': 1, 'max_fea...","{'criterion': 'gini', 'max_depth': 2, 'max_fea...","{'criterion': 'gini', 'max_depth': 2, 'max_fea...","{'criterion': 'gini', 'max_depth': 2, 'max_fea...","{'criterion': 'gini', 'max_depth': 2, 'max_fea...","{'criterion': 'gini', 'max_depth': None, 'max_...","{'criterion': 'gini', 'max_depth': None, 'max_...",...,"{'criterion': 'entropy', 'max_depth': 1, 'max_...","{'criterion': 'entropy', 'max_depth': 1, 'max_...","{'criterion': 'entropy', 'max_depth': 2, 'max_...","{'criterion': 'entropy', 'max_depth': 2, 'max_...","{'criterion': 'entropy', 'max_depth': 2, 'max_...","{'criterion': 'entropy', 'max_depth': 2, 'max_...","{'criterion': 'entropy', 'max_depth': None, 'm...","{'criterion': 'entropy', 'max_depth': None, 'm...","{'criterion': 'entropy', 'max_depth': None, 'm...","{'criterion': 'entropy', 'max_depth': None, 'm..."
split0_test_accuracy,0.625504,0.835144,0.625504,1,0.625504,0.882463,0.634518,1,0.999185,1,...,0.625504,1,0.625504,0.933132,0.650828,1,0.998612,1,1,1
split1_test_accuracy,0.625504,0.626033,0.625504,0.999978,0.625504,0.82948,0.625747,0.999978,0.999559,0.999978,...,0.625504,0.999978,0.625504,0.912768,0.625504,0.999978,0.999273,0.999978,0.999934,0.999978


In [15]:
# gives best parameters
# Max_features = None

gs.best_estimator_

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=1, max_features=None,
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=50,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [16]:
gs.best_score_

0.9999823682717033

In [17]:
#best hyperparameters

gs.best_params_

{'criterion': 'gini', 'max_depth': 1, 'max_features': None}

In [18]:
gs.scorer_

{'accuracy': make_scorer(accuracy_score),
 'f1': make_scorer(f1_score, average=binary),
 'roc_auc': make_scorer(roc_auc_score, needs_threshold=True)}

In [19]:
hp = {'n_estimators': 50, 'oob_score': True}
hyperparameters = {**gs.best_params_, **hp}
hyperparameters

{'criterion': 'gini',
 'max_depth': 1,
 'max_features': None,
 'n_estimators': 50,
 'oob_score': True}

In [20]:
model = RandomForestClassifier(**hyperparameters).fit(X1_train, y1_train)

In [21]:
model.oob_score_

0.9999823682911347

In [22]:
importances = model.feature_importances_   ;importances

array([0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [23]:
indicies = importances.argsort()[::-1] # shows indexes from most important to least
indicies

array([  4, 216,  67,  78,  77,  76,  75,  74,  73,  72,  71,  70,  69,
        68,  66,  80,  65,  64,  63,  62,  61,  60,  59,  58,  57,  56,
        79,  81,  54,  82, 105, 104, 103, 102, 101, 100,  99,  98,  97,
        96,  95,  94,  93,  92,  91,  90,  89,  88,  87,  86,  85,  84,
        83,  55,  53, 107,  26,  24,  23,  22,  21,  20,  19,  18,  17,
        16,  15,  14,  13,  12,  11,  10,   9,   8,   7,   6,   5,   3,
         2,   1,  25,  27,  52,  28,  51,  50,  49,  48,  47,  46,  45,
        44,  43,  42,  41,  40,  39,  38,  37,  36,  35,  34,  33,  32,
        31,  30,  29, 106, 108, 215, 176, 187, 186, 185, 184, 183, 182,
       181, 180, 179, 178, 177, 175, 109, 174, 173, 172, 171, 170, 169,
       168, 167, 166, 165, 164, 188, 189, 190, 191, 214, 213, 212, 211,
       210, 209, 208, 207, 206, 205, 204, 203, 202, 201, 200, 199, 198,
       197, 196, 195, 194, 193, 192, 163, 162, 161, 134, 132, 131, 130,
       129, 128, 127, 126, 125, 124, 123, 122, 121, 120, 119, 11

In [24]:
# top 10 features in model

X.columns[indicies].tolist()[:10]

['updates_count',
 'country_SE',
 'category_Fine Art',
 'category_Hardware',
 'category_Graphic Novels',
 'category_Graphic Design',
 'category_Glass',
 'category_Gaming Hardware',
 'category_Games',
 'category_Gadgets']

In [25]:
# what is important for each individual tree in the forest
# model.estimators is all of your trees

std = np.std([tree.feature_importances_ for tree in model.estimators_], axis = 0)
std

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [26]:
# plt.bar(X.columns, importances[indicies], color="#33ffcc", yerr=std[indicies], align="center", alpha=0.7, error_kw=dict(ecolor='blue', lw=1, capsize=5, capthick=2))
# plt.ylim([0, 1]);

## Logistic Regression Model

In [28]:
# Baseline Model

model = LogisticRegression(max_iter=300)
model.fit(X1_train,y1_train)

y_hat = model.predict(X1_train)
model.score(X1_test,y1_test)
# model.coef_  #show coefficients

0.8433450288273358