In [8]:
import numpy as np
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, RepeatedStratifiedKFold
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, classification_report
from sklearn.ensemble import RandomForestClassifier
import glob
import os
import matplotlib.pyplot as plt

In [10]:
train_path = max(glob.glob('../data/train/*.csv'), key=os.path.getctime) 
test_path = max(glob.glob('../data/test/*.csv'), key=os.path.getctime) 
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)
print(train_df.shape, train_path)
print(test_df.shape, test_path)

(24756, 579) ../data/train\kickstarter_train_final_20221108-144153.csv
(4369, 579) ../data/test\kickstarter_test_final_20221108-144158.csv


In [11]:
# features_to_drop = ['rewards', 'deadline', 'launched_at', 'rewards_processed',
#            'description_processed', 'description_story_processed','description_risks_processed',
#            'id', 'name', 'description', 'description_story', 'description_risks', 'video', 'state',
#           'pledged', 'category', 'location']

# #features that are dependent on time and the final outcome
# to_drop_more = features_to_drop + ['staff_pick', 'spotlight', 'backers_count', 'update_count', 'faq_count']

X_train, y_train = train_df.drop('state', axis=1), train_df['state']
X_test, y_test = test_df.drop('state', axis=1), test_df['state']


In [12]:
# Combine train and test data set tgt

X = pd.concat([X_train, X_test])
y = pd.concat([y_train, y_test])

In [13]:
# Pretty balanced dataset
print(train_df[train_df.state == 1].shape)
print(train_df[train_df.state == 0].shape)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(17482, 579)
(7274, 579)
(24756, 578)
(24756,)
(4369, 578)
(4369,)


In [14]:
X_train.dtypes.unique()

array([dtype('float64'), dtype('int64')], dtype=object)

**Random Forest model training and testing**

In [7]:
# Random forest model 
# training
regressor = RandomForestClassifier(n_estimators=100, max_depth=5)
regressor.fit(X_train, y_train)
# apply model
y_pred_train = regressor.predict(X_train)
y_pred_test = regressor.predict(X_test)

**Evaluate Model Performance**

In [8]:
# train
print(classification_report(y_train, y_pred_train))

              precision    recall  f1-score   support

           0       0.84      0.28      0.42      7274
           1       0.76      0.98      0.86     17482

    accuracy                           0.77     24756
   macro avg       0.80      0.63      0.64     24756
weighted avg       0.79      0.77      0.73     24756



In [9]:
# test
print(classification_report(y_test, y_pred_test))

              precision    recall  f1-score   support

           0       0.85      0.26      0.40      1282
           1       0.76      0.98      0.86      3087

    accuracy                           0.77      4369
   macro avg       0.80      0.62      0.63      4369
weighted avg       0.79      0.77      0.72      4369



In [10]:
regressor.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': 5,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

**Hyperparameter Tuning**

We will use RandomizedSearchCV for hyperparameter tuning.


In [11]:
# Number of trees in random forest
n_estimators = [50, 100, 150, 200]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [3,5,8, 10]
# Minimum number of samples required to split a node
min_samples_split = [3, 5, 8, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [ 2, 4, 6]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the grid
grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(grid)

{'n_estimators': [50, 100, 150, 200], 'max_features': ['auto', 'sqrt'], 'max_depth': [3, 5, 8, 10], 'min_samples_split': [3, 5, 8, 10], 'min_samples_leaf': [2, 4, 6], 'bootstrap': [True, False]}


In [12]:
# Random forest model 
# training
regressor = RandomForestClassifier()

cv_method = RepeatedStratifiedKFold(n_splits=5, 
                                    n_repeats=3, 
                                    random_state=2022)

rf_randomcv = RandomizedSearchCV(
    estimator=regressor,
    param_distributions=grid,
    n_iter=100, 
    cv=cv_method,
    verbose=2,
    random_state=2022,
    n_jobs=-1, # use all processors,
    scoring='roc_auc'
)

rf_randomcv.fit(X_train, y_train)


Fitting 15 folds for each of 100 candidates, totalling 1500 fits


RandomizedSearchCV(cv=RepeatedStratifiedKFold(n_repeats=3, n_splits=5, random_state=2022),
                   estimator=RandomForestClassifier(), n_iter=100, n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [3, 5, 8, 10],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [2, 4, 6],
                                        'min_samples_split': [3, 5, 8, 10],
                                        'n_estimators': [50, 100, 150, 200]},
                   random_state=2022, scoring='roc_auc', verbose=2)

In [17]:
best_params_ = {'n_estimators': 150,
 'min_samples_split': 5,
 'min_samples_leaf': 6,
 'max_features': 'sqrt',
 'max_depth': 10,
 'bootstrap': False}

In [14]:
rf_randomcv.best_score_

0.873366168258555

**Use tuned classifier on test data**

In [20]:
import time

# best_clf = RandomForestClassifier(**rf_randomcv.best_params_)
best_clf = RandomForestClassifier(**best_params_)

start_time = time.time()

best_clf.fit(X_train, y_train)
y_pred_test = best_clf.predict(X_test)
y_pred_test_proba = best_clf.predict_proba(X_test)[:,1]

time_taken = time.time() - start_time
print("Total time taken for the program execution", time_taken) # seconds
print(classification_report(y_test, y_pred_test, digits=5))
roc_auc_score(y_test, y_pred_test_proba)

Total time taken for the program execution 39.6793999671936
              precision    recall  f1-score   support

           0    0.83727   0.46958   0.60170      1282
           1    0.81370   0.96210   0.88170      3087

    accuracy                        0.81758      4369
   macro avg    0.82549   0.71584   0.74170      4369
weighted avg    0.82062   0.81758   0.79954      4369



0.8840851398876168

In [16]:
importances = best_clf.feature_importances_
std = np.std([tree.feature_importances_ for tree in best_clf.estimators_], axis = 0)

indices = np.argsort(importances)[::-1]

print('Feature Ranking:')

for f in range(X.shape[1]):
	print('%d. features %d (%f)'% (f+1, indices[f], importances[indices[f]]))

Feature Ranking:
1. features 421 (0.104366)
2. features 4 (0.081595)
3. features 0 (0.069119)
4. features 3 (0.043552)
5. features 414 (0.040231)
6. features 418 (0.028688)
7. features 247 (0.026502)
8. features 5 (0.022260)
9. features 61 (0.020921)
10. features 416 (0.020791)
11. features 417 (0.018283)
12. features 274 (0.016516)
13. features 2 (0.016187)
14. features 219 (0.013324)
15. features 211 (0.012084)
16. features 71 (0.011113)
17. features 102 (0.011077)
18. features 300 (0.009621)
19. features 1 (0.009318)
20. features 47 (0.009218)
21. features 30 (0.008773)
22. features 301 (0.008680)
23. features 60 (0.008204)
24. features 244 (0.008041)
25. features 305 (0.007711)
26. features 81 (0.007339)
27. features 95 (0.007157)
28. features 105 (0.006510)
29. features 262 (0.006418)
30. features 293 (0.005987)
31. features 10 (0.005671)
32. features 66 (0.005602)
33. features 353 (0.005534)
34. features 215 (0.005093)
35. features 212 (0.005021)
36. features 76 (0.005000)
37. fe