In [10]:
import numpy as np
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, classification_report
from sklearn.ensemble import RandomForestClassifier
import glob
import os
import matplotlib.pyplot as plt

In [2]:
train_path = max(glob.glob('./data/train/*.csv'), key=os.path.getctime) 
test_path = max(glob.glob('./data/test/*.csv'), key=os.path.getctime) 
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

print(train_df.shape)
print(test_df.shape)

(24756, 576)
(4369, 576)


In [4]:
# features_to_drop = ['rewards', 'deadline', 'launched_at', 'rewards_processed',
#            'description_processed', 'description_story_processed','description_risks_processed',
#            'id', 'name', 'description', 'description_story', 'description_risks', 'video', 'state',
#           'pledged', 'category', 'location']

# #features that are dependent on time and the final outcome
# to_drop_more = features_to_drop + ['staff_pick', 'spotlight', 'backers_count', 'update_count', 'faq_count']

X_train, y_train = train_df.drop('state', axis=1), train_df['state']
X_test, y_test = test_df.drop('state', axis=1), test_df['state']


In [5]:
# Combine train and test data set tgt

X = pd.concat([X_train, X_test])
y = pd.concat([y_train, y_test])

In [6]:
# Pretty balanced dataset
print(train_df[train_df.state == 1].shape)
print(train_df[train_df.state == 0].shape)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(17482, 576)
(7274, 576)
(24756, 575)
(24756,)
(4369, 575)
(4369,)


In [7]:
X_train.dtypes.unique()

array([dtype('float64'), dtype('int64')], dtype=object)

**Random Forest model training and testing**

In [20]:
# Random forest model 
# training
regressor = RandomForestClassifier(n_estimators=100, max_depth=5)
regressor.fit(X_train, y_train)
# apply model
y_pred_train = regressor.predict(X_train)
y_pred_test = regressor.predict(X_test)

**Evaluate Model Performance**

In [21]:
# train
print(classification_report(y_train, y_pred_train))

              precision    recall  f1-score   support

           0       0.91      0.66      0.76      7274
           1       0.87      0.97      0.92     17482

    accuracy                           0.88     24756
   macro avg       0.89      0.81      0.84     24756
weighted avg       0.88      0.88      0.87     24756



In [22]:
# test
print(classification_report(y_test, y_pred_test))

              precision    recall  f1-score   support

           0       0.91      0.65      0.76      1282
           1       0.87      0.97      0.92      3087

    accuracy                           0.88      4369
   macro avg       0.89      0.81      0.84      4369
weighted avg       0.88      0.88      0.87      4369



In [23]:
regressor.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': 5,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

**Hyperparameter Tuning**

We will use RandomizedSearchCV to narrow down the range for grid search hyperparameter tuning.


In [24]:
# Number of trees in random forest
n_estimators = [50, 100, 150]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [3,5,8,10]
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the grid
grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(grid)

{'n_estimators': [50, 100, 150], 'max_features': ['auto', 'sqrt'], 'max_depth': [3, 5, 8, 10], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}


In [25]:
# Random forest model 
# training
regressor = RandomForestClassifier(n_estimators=100)

rf_randomcv = RandomizedSearchCV(
    estimator=regressor,
    param_distributions=grid,
    n_iter=100, 
    cv=3, # k-fold cv
    verbose=2,
    random_state=2022,
    n_jobs=-1 # use all processors
)

rf_randomcv.fit(X_train, y_train)


Fitting 3 folds for each of 100 candidates, totalling 300 fits


RandomizedSearchCV(cv=3, estimator=RandomForestClassifier(), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [3, 5, 8, 10],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [50, 100, 150]},
                   random_state=2022, verbose=2)

In [26]:
rf_randomcv.best_params_

{'n_estimators': 100,
 'min_samples_split': 2,
 'min_samples_leaf': 2,
 'max_features': 'sqrt',
 'max_depth': 10,
 'bootstrap': False}

In [27]:
rf_randomcv.best_score_

0.8932380029083858

**Hyperparameter Tuning**

We will use GridSearchCV for hyperparameter tuning.


In [28]:

n_estimators = [50, 100, 150]
max_features = ['sqrt']
max_depth = [5,10,15]
min_samples_split = [2,4,6]
min_samples_leaf = [1, 2, 3]
bootstrap = [False]

narrow_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(narrow_grid)

{'n_estimators': [50, 100, 150], 'max_features': ['sqrt'], 'max_depth': [5, 10, 15], 'min_samples_split': [2, 4, 6], 'min_samples_leaf': [1, 2, 3], 'bootstrap': [False]}


In [29]:
regressor = RandomForestClassifier()
rf_gridcv = GridSearchCV(
    estimator=regressor,
    param_grid=narrow_grid,
    cv=3, # k-fold cv
    verbose=2,
    n_jobs=-1 # use all processors
)

rf_gridcv.fit(X_train, y_train)

Fitting 3 folds for each of 81 candidates, totalling 243 fits


GridSearchCV(cv=3, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'bootstrap': [False], 'max_depth': [5, 10, 15],
                         'max_features': ['sqrt'],
                         'min_samples_leaf': [1, 2, 3],
                         'min_samples_split': [2, 4, 6],
                         'n_estimators': [50, 100, 150]},
             verbose=2)

In [30]:
rf_gridcv.best_params_

{'bootstrap': False,
 'max_depth': 15,
 'max_features': 'sqrt',
 'min_samples_leaf': 3,
 'min_samples_split': 4,
 'n_estimators': 100}

In [31]:
rf_gridcv.best_score_

0.8976409759250282

**Use tuned classifier on test data**

In [35]:
import time

best_clf = RandomForestClassifier(
    bootstrap=False,
    max_depth=15,
    max_features='sqrt',
    min_samples_leaf=3,
    min_samples_split=4,
    n_estimators=100
    )

start_time = time.time()

best_clf.fit(X_train, y_train)
y_pred_test = best_clf.predict(X_test)

time_taken = time.time() - start_time
print("Total time taken for the program execution", time_taken) # seconds
print(classification_report(y_test, y_pred_test, digits=5))
roc_auc_score(y_test, y_pred_test)

Total time taken for the program execution 28.423868894577026
              precision    recall  f1-score   support

           0    0.89533   0.74727   0.81463      1282
           1    0.90179   0.96372   0.93173      3087

    accuracy                        0.90021      4369
   macro avg    0.89856   0.85549   0.87318      4369
weighted avg    0.89989   0.90021   0.89736      4369



0.8554943558286551

In [34]:
roc_auc_score(y_test, y_pred_test)

0.8111250932140194