In [12]:
import pandas as pd
import numpy as np
from sklearn.tree import export_graphviz
import pydot
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn import model_selection, metrics
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import validation_curve
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import RandomizedSearchCV
# Read the prepared training table (path is relative to the working directory)
# into `df`; the following cells derive the feature matrix and target from it.
# NOTE(review): column names suggest Kickstarter campaign data — confirm provenance.
df = pd.read_csv(filepath_or_buffer="KS_train_final.csv")

In [13]:
# Build the modelling inputs from the loaded frame.
# 'blurb_bayes' is removed from `df` itself, so this cell rebinds `df`. With a
# plain drop, running the cell a second time raised KeyError because the column
# was already gone; errors='ignore' makes the cell safe to re-run.
df = df.drop(columns='blurb_bayes', errors='ignore')

# X: every remaining column except the label; y: the 'funded' label column.
X = df.drop(columns='funded')
y = df['funded']

In [14]:
# Hold out 25% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# n_jobs=-1 builds the trees on all available cores. This cell fits the
# 1000-tree forest once here and ten more times inside cross_val_score, so a
# single-core run is needlessly slow; random_state still seeds the forest.
rfc = RandomForestClassifier(
    n_estimators=1000,
    max_depth=10,
    max_features='sqrt',
    random_state=42,
    n_jobs=-1,
)
rfc.fit(X_train, y_train)
rfc_predict = rfc.predict(X_test)

# 10-fold ROC AUC over the full data set. cross_val_score fits clones of the
# estimator, so the forest fitted on X_train above is left untouched.
# NOTE(review): an integer `cv` yields unshuffled folds; if the CSV rows are
# ordered (e.g. by launch year) the per-fold scores will vary widely — consider
# passing a shuffled StratifiedKFold instead. TODO confirm row ordering.
rfc_cv_score = cross_val_score(rfc, X, y, cv=10, scoring='roc_auc')

In [15]:
# Print the hold-out confusion matrix and classification report, then the
# per-fold cross-validated AUCs; each section is a heading, its value, and a
# blank-line separator.
report_sections = [
    ("=== Confusion Matrix ===", confusion_matrix(y_test, rfc_predict)),
    ("=== Classification Report ===", classification_report(y_test, rfc_predict)),
    ("=== All AUC Scores ===", rfc_cv_score),
]
for heading, payload in report_sections:
    print(heading)
    print(payload)
    print('\n')

# Final section: the average of the fold AUCs (no trailing separator).
print("=== Mean AUC Score ===")
print("Mean AUC Score - Random Forest: ", rfc_cv_score.mean())

=== Confusion Matrix ===
[[ 6533  3780]
 [ 2164 11432]]


=== Classification Report ===
              precision    recall  f1-score   support

         0.0       0.75      0.63      0.69     10313
         1.0       0.75      0.84      0.79     13596

    accuracy                           0.75     23909
   macro avg       0.75      0.74      0.74     23909
weighted avg       0.75      0.75      0.75     23909



=== All AUC Scores ===
[0.67170602 0.82320859 0.77473436 0.83635985 0.93405107 0.76545504
 0.74859925 0.76772514 0.77525375 0.78829457]


=== Mean AUC Score ===
Mean AUC Score - Random Forest:  0.7885387628041547


In [16]:
# Accuracy of the fitted forest's predictions on the held-out test split.
metrics.accuracy_score(y_true=y_test, y_pred=rfc_predict)

0.7513906896984399

In [17]:
# Accuracy on the rows the forest was fitted on, for comparison with the
# hold-out accuracy computed in the previous cell.
metrics.accuracy_score(y_true=y_train, y_pred=rfc.predict(X_train))

0.7582572324851865

In [18]:
# Rank the input columns by the fitted forest's feature importances and print
# them, most important first.
importances = list(rfc.feature_importances_)

# Sort on the raw importance and round only afterwards: sorting the rounded
# values left features that tie at two decimals in column order rather than
# in order of their actual importance.
ranked = sorted(zip(X.columns, importances), key=lambda pair: pair[1], reverse=True)
feature_importances = [(feature, round(importance, 2)) for feature, importance in ranked]

# Plain loop instead of a list comprehension used only for its print side
# effect (which built a throwaway list of None and needed `;` to hide it).
for pair in feature_importances:
    print('Variable: {:20} Importance: {}'.format(*pair))

Variable: staff_pick           Importance: 0.21
Variable: log_goal_usd         Importance: 0.2
Variable: log_days_to_launch   Importance: 0.08
Variable: is_2015              Importance: 0.07
Variable: is_2012              Importance: 0.06
Variable: is_2011              Importance: 0.05
Variable: is_2013              Importance: 0.05
Variable: is_2014              Importance: 0.04
Variable: log_days_to_dealine  Importance: 0.04
Variable: is_cat_technology    Importance: 0.03
Variable: is_cat_art           Importance: 0.02
Variable: is_cat_food          Importance: 0.02
Variable: is_2010              Importance: 0.02
Variable: is_cat_music         Importance: 0.01
Variable: is_cat_publishing    Importance: 0.01
Variable: is_cat_games         Importance: 0.01
Variable: is_cat_comics        Importance: 0.01
Variable: is_cat_other         Importance: 0.01
Variable: is_country_us        Importance: 0.01
Variable: is_loc_ny            Importance: 0.01
Variable: is_2016              Importance