In [39]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
%matplotlib inline
plt.style.use('ggplot')

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import recall_score, precision_score, log_loss, accuracy_score, roc_auc_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

import seaborn as sns

import statsmodels.api as sm
from scipy.stats import norm, t
from sklearn.metrics import mean_squared_error, r2_score, confusion_matrix
from sklearn.linear_model import LinearRegression, LogisticRegression, LogisticRegressionCV
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

from src.helpers import *

In [40]:
# df produced from EDA.ipynb

df = pd.read_csv('data/model_df.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,name,category,main_category,currency,state,backers,country,usd_pledged_real,usd_goal_real,comments_count,updates_count,spotlight,staff_pick,blurb,reward_count,launch_year,launch_month,launch_day,active_days
0,0,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,0,0,GB,0.0,1533.95,0,0,0,0,A series of poems about pure love pursued by ...,1,2015,8,11,58
1,1,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,0,15,US,2421.0,30000.0,0,11,0,1,Back artists creating short movies of Greeting...,11,2017,9,2,59
2,2,Where is Hank?,Narrative Film,Film & Video,USD,0,3,US,220.0,45000.0,0,0,0,0,"Abandoned Property. Sounds of struggle, muffle...",16,2013,1,12,44
3,3,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,0,1,US,1.0,5000.0,0,8,0,0,Keeping Great Reggae Music Alive Through RAW T...,6,2012,3,17,29
4,4,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,0,14,US,1283.0,19500.0,1,1,0,1,Community Film Project is creating a group tha...,8,2015,7,4,55


## Split Train/Test Set

In [3]:
# df without or 'backers' or 'usd_pledged_real' causing data leakage
# spotlight is only for successful projects so this also causes data leakage
# staff_pick varies to occur on day 1 or later on after project launch, causing data leakage

df_copy = df.drop(['Unnamed: 0','name','backers', 'usd_pledged_real', 'spotlight', 'staff_pick'], axis=1)

#dropping year month day due to non-cyclic nature, later use sklearn DictVectorizer
df_copy = df_copy.drop(['launch_year','launch_month', 'launch_day'], axis =1)

In [4]:
# Use usd_pledged_real / backers as regression target

# df_copy2 = df.drop(['state', backers])
# df_copy3 = df.drop(['state', usd_pledged_real])

In [5]:
df_copy.columns

Index(['category', 'main_category', 'currency', 'state', 'country',
       'usd_goal_real', 'comments_count', 'updates_count', 'blurb',
       'active_days'],
      dtype='object')

In [6]:
# get dummies for category, main_category, currency, and country

df_copy = pd.get_dummies(df_copy, columns= ['category','main_category', 'currency', 'country'], drop_first = True)

In [7]:
# all numerical data types
df_copy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 354477 entries, 0 to 354476
Columns: 213 entries, state to country_US
dtypes: float64(1), int64(4), object(1), uint8(207)
memory usage: 86.2+ MB


In [8]:
#split data into target and features
y = df_copy['state']
X = df_copy.loc[:, df_copy.columns != 'state']

In [9]:
#give % of each
# class imbalance (baseline model accuracy of 62.6% for highest class imbalance)
y.value_counts() / y.shape

0    0.625513
1    0.374487
Name: state, dtype: float64

In [10]:
# Test train split to create holdout set (X_test, y_test) with 20% of data
# stratify maintains the class balance

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True, stratify=y)


### Grid Search / Random Forest Model

In [11]:
# remove columns with words

X_num_train = X_train.loc[:, X_train.columns != 'blurb']
X_num_test = X_test.loc[:, X_test.columns != 'blurb']

In [12]:
parameters = {'criterion':('gini', 'entropy'), 'max_depth':[1,2,None], 'max_features': [1, 'sqrt', 'log2', None]} #max_depth 3 different times
rfc = RandomForestClassifier(n_estimators=50) #number of trees (default 100)
#gridsearch Cross Validate
gs = GridSearchCV(rfc, parameters, scoring = ['accuracy','f1', 'roc_auc'], refit='accuracy', n_jobs = -1) 
# other scoring strings
# https://scikit-learn.org/stable/modules/model_evaluation.html #model-evaluation
# refit makes it the primary ranking score out of the list we gave
# n_jobs = -1 runs the process using all cores on your computer

gs.fit(X_num_train, y_train)

GridSearchCV(cv=None, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=50, n_jobs=None,
                                              oob_score=False,
                                              ran

In [13]:
#5 K-folds done by default

pd.DataFrame(gs.cv_results_).T

# Shows Entropy is better than gini looking at mean_test_accuracy

# 3 Done for Gini and Entropy for each depth (1,2, None)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,23
mean_fit_time,14.2615,16.4969,14.9784,52.443,7.11159,13.3992,10.2213,104.786,136.495,167.603,...,9.278,53.8397,8.66121,13.8198,10.7725,107.466,138.093,176.966,159.787,585.106
std_fit_time,0.141594,0.134374,0.374038,2.38411,0.0813684,0.227437,0.115371,0.711181,0.713674,1.12443,...,0.134621,0.179101,0.248608,0.335233,0.132798,0.2182,1.39729,3.30816,0.909426,42.1449
mean_score_time,1.18343,1.08178,1.0657,1.43917,1.27807,1.37274,1.38741,1.60718,13.3929,11.5661,...,1.43728,1.41936,1.50305,1.4568,1.46268,1.63873,13.7374,11.9898,11.3764,2.27166
std_score_time,0.0162677,0.00829799,0.0128263,0.0148934,0.0587167,0.026254,0.0141111,0.0532555,0.24089,0.544384,...,0.0390109,0.0387066,0.0712461,0.0637382,0.0659265,0.0726393,0.163795,0.203869,1.26026,0.375683
param_criterion,gini,gini,gini,gini,gini,gini,gini,gini,gini,gini,...,entropy,entropy,entropy,entropy,entropy,entropy,entropy,entropy,entropy,entropy
param_max_depth,1,1,1,1,2,2,2,2,,,...,1,1,2,2,2,2,,,,
param_max_features,1,sqrt,log2,,1,sqrt,log2,,1,sqrt,...,log2,,1,sqrt,log2,,1,sqrt,log2,
params,"{'criterion': 'gini', 'max_depth': 1, 'max_fea...","{'criterion': 'gini', 'max_depth': 1, 'max_fea...","{'criterion': 'gini', 'max_depth': 1, 'max_fea...","{'criterion': 'gini', 'max_depth': 1, 'max_fea...","{'criterion': 'gini', 'max_depth': 2, 'max_fea...","{'criterion': 'gini', 'max_depth': 2, 'max_fea...","{'criterion': 'gini', 'max_depth': 2, 'max_fea...","{'criterion': 'gini', 'max_depth': 2, 'max_fea...","{'criterion': 'gini', 'max_depth': None, 'max_...","{'criterion': 'gini', 'max_depth': None, 'max_...",...,"{'criterion': 'entropy', 'max_depth': 1, 'max_...","{'criterion': 'entropy', 'max_depth': 1, 'max_...","{'criterion': 'entropy', 'max_depth': 2, 'max_...","{'criterion': 'entropy', 'max_depth': 2, 'max_...","{'criterion': 'entropy', 'max_depth': 2, 'max_...","{'criterion': 'entropy', 'max_depth': 2, 'max_...","{'criterion': 'entropy', 'max_depth': None, 'm...","{'criterion': 'entropy', 'max_depth': None, 'm...","{'criterion': 'entropy', 'max_depth': None, 'm...","{'criterion': 'entropy', 'max_depth': None, 'm..."
split0_test_accuracy,0.625509,0.625509,0.625509,0.814571,0.625509,0.634924,0.625509,0.814571,0.866054,0.870638,...,0.625509,0.807906,0.625509,0.626549,0.625509,0.807906,0.866707,0.870938,0.86981,0.870515
split1_test_accuracy,0.62552,0.62552,0.62552,0.814902,0.62552,0.643258,0.62552,0.814902,0.867198,0.87113,...,0.62552,0.809172,0.62552,0.630968,0.62552,0.809172,0.866916,0.870724,0.869208,0.870142


In [14]:
# gives best parameters
# Max_features = None

gs.best_estimator_

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='sqrt',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=50,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [15]:
gs.best_score_

0.8701111833047713

In [16]:
#best hyperparameters

gs.best_params_

{'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt'}

In [17]:
gs.scorer_

{'accuracy': make_scorer(accuracy_score),
 'f1': make_scorer(f1_score, average=binary),
 'roc_auc': make_scorer(roc_auc_score, needs_threshold=True)}

In [18]:
hp = {'n_estimators': 50, 'oob_score': True}
hyperparameters = {**gs.best_params_, **hp}
hyperparameters

{'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'n_estimators': 50,
 'oob_score': True}

In [19]:
model = RandomForestClassifier(**hyperparameters).fit(X_num_train, y_train)

In [20]:
model.oob_score_

0.8684361787284761

In [21]:
# obvious data leakage

importances = model.feature_importances_

In [22]:
indicies = importances.argsort()[::-1] # shows indexes from most important to least

In [23]:
# top 10 features in model

X_num_train.columns[indicies].tolist()[:10]

['updates_count',
 'comments_count',
 'usd_goal_real',
 'active_days',
 'main_category_Music',
 'main_category_Games',
 'main_category_Theater',
 'main_category_Technology',
 'category_Video Games',
 'category_Tabletop Games']

#### From Kickstarter Website
"Project updates" refer to what is essentially each project's blog posts. They are an important tool for creators to keep their backers informed; from when the campaign is live - to let backers know a milestone has been reached or to share how preparations are going - to while fulfillment is being completed - for example, a behind the scenes look at production."



In [24]:
# updates_count causing the data leakage --> remove any column that has data from after the project is completed
    # including updates_count and comments_count

df_copy=df_copy.drop(['updates_count', 'comments_count'], axis=1)
X_num_train=X_num_train.drop(['updates_count', 'comments_count'], axis=1)
X_num_test=X_num_test.drop(['updates_count', 'comments_count'], axis=1)

In [25]:
# retry grid search

gs.fit(X_num_train, y_train)
pd.DataFrame(gs.cv_results_).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,23
mean_fit_time,5.64445,10.088,8.94198,46.8154,6.84636,12.5532,9.45883,93.8894,141.061,174.571,...,9.69832,50.3097,8.26433,13.0023,10.4439,101.638,151.188,190.878,171.146,664.982
std_fit_time,1.92258,1.08138,1.06475,0.957568,0.088294,0.326097,0.323116,0.479925,1.33959,2.21958,...,0.165438,0.194955,0.218834,0.160452,0.188862,0.272382,1.58173,1.73091,1.54766,41.4843
mean_score_time,1.27683,1.15831,1.21064,1.45179,1.28925,1.29888,1.34084,1.51386,15.9645,15.2644,...,1.37848,1.41022,1.47911,1.4354,1.47342,1.52941,16.3512,13.8585,12.8766,3.47033
std_score_time,0.0476016,0.0106477,0.0332888,0.0330864,0.0462032,0.0163676,0.0272987,0.0425261,0.311017,0.895514,...,0.0728389,0.0342387,0.0787742,0.026669,0.0606117,0.047932,0.389091,0.611136,1.42903,0.704897
param_criterion,gini,gini,gini,gini,gini,gini,gini,gini,gini,gini,...,entropy,entropy,entropy,entropy,entropy,entropy,entropy,entropy,entropy,entropy
param_max_depth,1,1,1,1,2,2,2,2,,,...,1,1,2,2,2,2,,,,
param_max_features,1,sqrt,log2,,1,sqrt,log2,,1,sqrt,...,log2,,1,sqrt,log2,,1,sqrt,log2,
params,"{'criterion': 'gini', 'max_depth': 1, 'max_fea...","{'criterion': 'gini', 'max_depth': 1, 'max_fea...","{'criterion': 'gini', 'max_depth': 1, 'max_fea...","{'criterion': 'gini', 'max_depth': 1, 'max_fea...","{'criterion': 'gini', 'max_depth': 2, 'max_fea...","{'criterion': 'gini', 'max_depth': 2, 'max_fea...","{'criterion': 'gini', 'max_depth': 2, 'max_fea...","{'criterion': 'gini', 'max_depth': 2, 'max_fea...","{'criterion': 'gini', 'max_depth': None, 'max_...","{'criterion': 'gini', 'max_depth': None, 'max_...",...,"{'criterion': 'entropy', 'max_depth': 1, 'max_...","{'criterion': 'entropy', 'max_depth': 1, 'max_...","{'criterion': 'entropy', 'max_depth': 2, 'max_...","{'criterion': 'entropy', 'max_depth': 2, 'max_...","{'criterion': 'entropy', 'max_depth': 2, 'max_...","{'criterion': 'entropy', 'max_depth': 2, 'max_...","{'criterion': 'entropy', 'max_depth': None, 'm...","{'criterion': 'entropy', 'max_depth': None, 'm...","{'criterion': 'entropy', 'max_depth': None, 'm...","{'criterion': 'entropy', 'max_depth': None, 'm..."
split0_test_accuracy,0.625509,0.625509,0.625509,0.625509,0.625509,0.625509,0.625509,0.644763,0.656787,0.657581,...,0.625509,0.625509,0.625509,0.625509,0.625509,0.625509,0.657528,0.657933,0.656981,0.663505
split1_test_accuracy,0.62552,0.62552,0.62552,0.62552,0.62552,0.62552,0.62552,0.643416,0.652444,0.654789,...,0.62552,0.62552,0.62552,0.62552,0.62552,0.62552,0.651104,0.653343,0.652585,0.656905


In [26]:
gs.best_estimator_

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='entropy', max_depth=None, max_features=None,
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=50,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [27]:
gs.best_score_

0.6606965811948329

In [28]:
gs.best_params_

{'criterion': 'entropy', 'max_depth': None, 'max_features': None}

In [29]:
hp = {'n_estimators': 50, 'oob_score': True}
hyperparameters = {**gs.best_params_, **hp}
hyperparameters

{'criterion': 'entropy',
 'max_depth': None,
 'max_features': None,
 'n_estimators': 50,
 'oob_score': True}

In [30]:
model = RandomForestClassifier(**hyperparameters).fit(X_num_train, y_train)

In [31]:
model.oob_score_

0.6610492240312291

In [32]:
importances = model.feature_importances_
indicies = importances.argsort()[::-1]
X_num_train.columns[indicies].tolist()[:10]

['usd_goal_real',
 'active_days',
 'category_Tabletop Games',
 'currency_USD',
 'country_US',
 'main_category_Music',
 'main_category_Theater',
 'country_GB',
 'currency_GBP',
 'category_Hip-Hop']

In [33]:
X_num_train.head()

Unnamed: 0,usd_goal_real,active_days,category_Academic,category_Accessories,category_Action,category_Animals,category_Animation,category_Anthologies,category_Apparel,category_Apps,...,country_JP,country_LU,country_MX,"country_N,0""",country_NL,country_NO,country_NZ,country_SE,country_SG,country_US
229482,15000.0,60,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
264257,10000.0,29,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
72606,6983.24,29,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
336540,1500.0,33,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
263391,60000.0,59,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [34]:
# what is important for each individual tree in the forest
# model.estimators is all of your trees

std = np.std([tree.feature_importances_ for tree in model.estimators_], axis = 0)
std

array([4.83612607e-03, 4.64782665e-03, 1.42697483e-04, 2.47477878e-04,
       1.28323284e-04, 6.80742612e-05, 2.69300515e-04, 1.26658158e-04,
       3.99911572e-04, 4.75433758e-04, 1.48792397e-04, 3.85815222e-04,
       2.70650469e-04, 9.13001944e-05, 5.46038471e-05, 8.10137865e-05,
       1.08409048e-04, 1.14310238e-04, 8.21969688e-05, 1.14389067e-04,
       2.92068714e-04, 1.01990428e-04, 4.80164152e-05, 8.28659256e-05,
       2.78381255e-04, 2.05623776e-04, 1.64320255e-04, 2.15397434e-04,
       8.48386194e-05, 1.58267669e-04, 1.08983873e-04, 3.51465394e-04,
       6.91920094e-05, 2.21886539e-04, 3.67710374e-05, 9.88623830e-05,
       1.41273368e-04, 2.95086662e-04, 3.35541497e-04, 1.58068832e-04,
       6.11638075e-04, 2.30770139e-04, 1.49370298e-04, 2.46855681e-04,
       4.78709372e-05, 1.14051402e-04, 1.49971321e-04, 8.61126561e-05,
       1.41587314e-04, 8.03480691e-05, 1.03613839e-04, 8.92587541e-05,
       1.37732678e-04, 3.23075763e-04, 1.34518791e-04, 3.93106559e-04,
      

In [35]:
# plt.bar(X.columns, importances[indicies], color="#33ffcc", yerr=std[indicies], align="center", alpha=0.7, error_kw=dict(ecolor='blue', lw=1, capsize=5, capthick=2))
# plt.ylim([0, 1]);

## Numerical Feature Modeling

In [38]:
# cross validate on training data, 5 folds default, then take mean of fold accuracies

log = cross_val_score(LogisticRegression(max_iter =400), X_num_train, y_train, scoring = 'accuracy').mean()
log

0.6424902126062959

In [None]:
# model.coef_  #show coefficients

# NLP

In [41]:
# preprocessor from helpers.py

vectorizer = TfidfVectorizer(preprocessor=preprocesser, stop_words=stopwords_list())
vectorized_train = vectorizer.fit_transform(X_train.blurb)

In [42]:
# Vectorize Holdout set

vectorized_test = vectorizer.transform(X_test.blurb)

In [43]:
# cross validate on training data, 5 folds default, then take mean of accuracies

log_accuracy = cross_val_score(LogisticRegression(max_iter =400), vectorized_train, y_train, scoring = 'accuracy').mean()
log_accuracy

In [None]:
# random forest

In [None]:
# gradient boosted regressor

In [None]:
# neural net

In [None]:
# fig, ax = plt.subplots(1, figsize=(18,6))
# df = calculate_threshold_values(log_model.predict_proba(X)[:,1], y)
# plot_roc(ax, df)

In [None]:
# roc_auc_score(y_hat, y2_test)

In [None]:
# confusion_matrix(y_hat, y2_test)

In [None]:
# instead of cross_val_score for logistic

# cross_val = LogisticRegressionCV(cv=5, max_iter=300, n_jobs=-1).fit(train2,y2_train)
# cross_val.score(test2, y2_test)

In [None]:
# Future Direction

# Use features from shortly (~1 week) after the project has launched rather than pre-launch, 
# such as staff pick, comment count, backers and pledged instead