In [61]:
import numpy as np
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, RepeatedStratifiedKFold
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, classification_report
from sklearn.ensemble import RandomForestClassifier
import glob
import os
import matplotlib.pyplot as plt

In [62]:
train_path = max(glob.glob('./data/train/*.csv'), key=os.path.getctime) 
test_path = max(glob.glob('./data/test/*.csv'), key=os.path.getctime) 
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

print(train_df.shape)
print(test_df.shape)

(24756, 576)
(4369, 576)


In [63]:
# features_to_drop = ['rewards', 'deadline', 'launched_at', 'rewards_processed',
#            'description_processed', 'description_story_processed','description_risks_processed',
#            'id', 'name', 'description', 'description_story', 'description_risks', 'video', 'state',
#           'pledged', 'category', 'location']

# #features that are dependent on time and the final outcome
# to_drop_more = features_to_drop + ['staff_pick', 'spotlight', 'backers_count', 'update_count', 'faq_count']

X_train, y_train = train_df.drop('state', axis=1), train_df['state']
X_test, y_test = test_df.drop('state', axis=1), test_df['state']


In [64]:
# Combine train and test data set tgt

X = pd.concat([X_train, X_test])
y = pd.concat([y_train, y_test])

In [65]:
# Pretty balanced dataset
print(train_df[train_df.state == 1].shape)
print(train_df[train_df.state == 0].shape)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(17482, 576)
(7274, 576)
(24756, 575)
(24756,)
(4369, 575)
(4369,)


In [66]:
X_train.dtypes.unique()

array([dtype('float64'), dtype('int64')], dtype=object)

**Random Forest model training and testing**

In [67]:
# Random forest model 
# training
regressor = RandomForestClassifier(n_estimators=100, max_depth=5)
regressor.fit(X_train, y_train)
# apply model
y_pred_train = regressor.predict(X_train)
y_pred_test = regressor.predict(X_test)

**Evaluate Model Performance**

In [68]:
# train
print(classification_report(y_train, y_pred_train))

              precision    recall  f1-score   support

           0       0.91      0.67      0.77      7274
           1       0.88      0.97      0.92     17482

    accuracy                           0.88     24756
   macro avg       0.89      0.82      0.85     24756
weighted avg       0.89      0.88      0.88     24756



In [69]:
# test
print(classification_report(y_test, y_pred_test))

              precision    recall  f1-score   support

           0       0.90      0.67      0.77      1282
           1       0.88      0.97      0.92      3087

    accuracy                           0.88      4369
   macro avg       0.89      0.82      0.84      4369
weighted avg       0.88      0.88      0.88      4369



In [70]:
regressor.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': 5,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

**Hyperparameter Tuning**

We will use RandomizedSearchCV for hyperparameter tuning.


In [75]:
# Number of trees in random forest
n_estimators = [50, 100, 150]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [3,5,8]
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the grid
grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(grid)

{'n_estimators': [50, 100, 150], 'max_features': ['auto', 'sqrt'], 'max_depth': [3, 5, 8], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}


In [76]:
# Random forest model 
# training
regressor = RandomForestClassifier()

cv_method = RepeatedStratifiedKFold(n_splits=5, 
                                    n_repeats=3, 
                                    random_state=2022)

rf_randomcv = RandomizedSearchCV(
    estimator=regressor,
    param_distributions=grid,
    n_iter=100, 
    cv=cv_method,
    verbose=2,
    random_state=2022,
    n_jobs=-1, # use all processors,
    scoring='roc_auc'
)

rf_randomcv.fit(X_train, y_train)


Fitting 15 folds for each of 100 candidates, totalling 1500 fits


RandomizedSearchCV(cv=RepeatedStratifiedKFold(n_repeats=3, n_splits=5, random_state=2022),
                   estimator=RandomForestClassifier(), n_iter=100, n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [3, 5, 8],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [50, 100, 150]},
                   random_state=2022, scoring='roc_auc', verbose=2)

In [77]:
rf_randomcv.best_params_

{'n_estimators': 150,
 'min_samples_split': 2,
 'min_samples_leaf': 4,
 'max_features': 'auto',
 'max_depth': 8,
 'bootstrap': False}

In [78]:
rf_randomcv.best_score_

0.9321799731140534

**Use tuned classifier on test data**

In [79]:
import time

best_clf = RandomForestClassifier(**rf_randomcv.best_params_)

start_time = time.time()

best_clf.fit(X_train, y_train)
y_pred_test = best_clf.predict(X_test)

time_taken = time.time() - start_time
print("Total time taken for the program execution", time_taken) # seconds
print(classification_report(y_test, y_pred_test, digits=5))
roc_auc_score(y_test, y_pred_test)

Total time taken for the program execution 22.117509126663208
              precision    recall  f1-score   support

           0    0.89881   0.70671   0.79127      1282
           1    0.88813   0.96696   0.92587      3087

    accuracy                        0.89059      4369
   macro avg    0.89347   0.83683   0.85857      4369
weighted avg    0.89126   0.89059   0.88637      4369



0.8368332400934522