In [12]:
import numpy as np
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.ensemble import RandomForestClassifier
import glob
import os
import matplotlib.pyplot as plt

In [2]:
train_path = max(glob.glob('./data/train/*.csv'), key=os.path.getctime) 
test_path = max(glob.glob('./data/test/*.csv'), key=os.path.getctime) 
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

print(train_df.shape)
print(test_df.shape)

(1557, 563)
(293, 563)


In [3]:
features_to_drop = ['rewards', 'deadline', 'launched_at', 'rewards_processed',
           'description_processed', 'description_story_processed','description_risks_processed',
           'id', 'name', 'description', 'description_story', 'description_risks', 'video', 'state',
          'pledged', 'category', 'location']

#features that are dependent on time and the final outcome
to_drop_more = features_to_drop + ['staff_pick', 'spotlight', 'backers_count', 'update_count', 'faq_count']

X_train, y_train = train_df.drop(to_drop_more, axis=1), train_df['state']
X_test, y_test = test_df.drop(to_drop_more, axis=1), test_df['state']


In [4]:
# Combine train and test data set tgt

X = pd.concat([X_train, X_test])
y = pd.concat([y_train, y_test])

In [5]:
# Pretty balanced dataset
print(train_df[train_df.state == 1].shape)
print(train_df[train_df.state == 0].shape)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(784, 563)
(773, 563)
(1557, 541)
(1557,)
(293, 541)
(293,)


In [6]:
X_train.dtypes.unique()

array([dtype('int64'), dtype('float64')], dtype=object)

**Random Forest model training and testing**

In [7]:
# Random forest model 
# training
regressor = RandomForestClassifier(n_estimators=100)
regressor.fit(X_train, y_train)
# testing
y_pred_test = regressor.predict(X_test)

**Evaluate Model Performance**

In [8]:
# test accuracy
accuracy_score(y_test, y_pred_test)

0.8156996587030717

In [9]:
# test auroc 
roc_auc_score(y_test, y_pred_test)

0.8145973154362416

**Hyperparameter Tuning**

We will use RandomizedSearchCV to narrow down the range for grid search hyperparameter tuning.


In [11]:
regressor.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [16]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the grid
grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(grid)

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}


In [17]:
# Random forest model 
# training
regressor = RandomForestClassifier(n_estimators=100)

rf_randomcv = RandomizedSearchCV(
    estimator=regressor,
    param_distributions=grid,
    n_iter=100, 
    cv=5, # k-fold cv
    verbose=2,
    random_state=2022,
    n_jobs=-1 # use all processors
)

rf_randomcv.fit(X_train, y_train)


Fitting 5 folds for each of 100 candidates, totalling 500 fits


RandomizedSearchCV(cv=5, estimator=RandomForestClassifier(), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, 110,
                                                      None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [200, 400, 600, 800,
                                                         1000, 1200, 1400, 1600,
                                                         1800, 2000]},
                   random_state=2022, verbose=2)

In [18]:
rf_randomcv.best_params_

{'n_estimators': 200,
 'min_samples_split': 5,
 'min_samples_leaf': 2,
 'max_features': 'sqrt',
 'max_depth': 110,
 'bootstrap': False}

In [20]:
rf_randomcv.best_score_

0.8342814741528567

**Hyperparameter Tuning**

We will use GridSearchCV for hyperparameter tuning.


In [22]:

n_estimators = [50, 100, 150, 200, 250, 300, 350, 400]
max_features = ['sqrt']
max_depth = [80, 90, 100, 110, 120, 130, 140]
min_samples_split = [3,4,5,6,7,8]
min_samples_leaf = [1, 2, 3]
bootstrap = [False]

narrow_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(narrow_grid)

{'n_estimators': [50, 100, 150, 200, 250, 300, 350, 400], 'max_features': ['sqrt'], 'max_depth': [80, 90, 100, 110, 120, 130, 140], 'min_samples_split': [3, 4, 5, 6, 7, 8], 'min_samples_leaf': [1, 2, 3], 'bootstrap': [False]}


In [27]:
regressor = RandomForestClassifier()
rf_gridcv = GridSearchCV(
    estimator=regressor,
    param_grid=narrow_grid,
    cv=5, # k-fold cv
    verbose=2,
    n_jobs=-1 # use all processors
)

rf_gridcv.fit(X_train, y_train)

Fitting 5 folds for each of 1008 candidates, totalling 5040 fits


GridSearchCV(cv=5, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'bootstrap': [False],
                         'max_depth': [80, 90, 100, 110, 120, 130, 140],
                         'max_features': ['sqrt'],
                         'min_samples_leaf': [1, 2, 3],
                         'min_samples_split': [3, 4, 5, 6, 7, 8],
                         'n_estimators': [50, 100, 150, 200, 250, 300, 350,
                                          400]},
             verbose=2)

In [28]:
rf_gridcv.best_params_

{'bootstrap': False,
 'max_depth': 110,
 'max_features': 'sqrt',
 'min_samples_leaf': 2,
 'min_samples_split': 6,
 'n_estimators': 100}

In [29]:
rf_gridcv.best_score_

0.8394117404567567

**Use tuned classifier on test data**

In [33]:
best_clf = rf_gridcv.best_estimator_

y_pred_test = best_clf.predict(X_test)
accuracy_score(y_test, y_pred_test)

0.8122866894197952

In [34]:
roc_auc_score(y_test, y_pred_test)

0.8111250932140194