# Random Forest Model

The purpose of this notebook is to train a random forest model on to predict if a donors choose project will be funded or not.

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('bmh')

%matplotlib inline

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, accuracy_score, roc_auc_score,f1_score
from sklearn.preprocessing import StandardScaler
import pickle

Load engineered features and text data

In [2]:
with open('Data/main_df.pkl', 'rb') as f:
    main_df = pickle.load(f)

In [3]:
with open('Data/word_freqs_titles.pkl', 'rb') as f:
    word_freqs_titles = pickle.load(f)

In [4]:
with open('Data/word_freqs_essays.pkl', 'rb') as f:
    word_freqs_essays = pickle.load(f)

In [5]:
with open('Data/word_freqs_needs.pkl', 'rb') as f:
    word_freqs_needs = pickle.load(f)

Combine text and non text features

In [6]:
from scipy.sparse import hstack
use_in_models = hstack((word_freqs_titles,main_df.drop(['Project ID', 'School ID', 'Teacher ID','Funded?'],axis='columns').values))

use_in_models=hstack((use_in_models,word_freqs_essays))

use_in_models=hstack((use_in_models,word_freqs_needs))

Split data into train, validation, and test sets

In [7]:
X_train_whole, X_test, y_train_whole, y_test = train_test_split(use_in_models,main_df['Funded?'],
                                                  test_size=0.2,random_state=42)

In [26]:
X_train, X_val, y_train, y_val = train_test_split(X_train_whole,y_train_whole,
                                                  test_size=0.2,random_state=42)

### Random Forest

The following code trains a random forest classifier and cross validates to find best parameters

In [22]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

clf = RandomForestClassifier(class_weight={1:3.67,0:1},n_jobs=-1,n_estimators=300, max_depth=12)
clf.fit(X_train,y_train)
# param_grid = {
#     'bootstrap': [True],
#     'max_depth': [10,12,14,16],
#     'max_features': [3,4],
#     'min_samples_leaf': [2,3],
#     'min_samples_split': [6,8],
#     'n_estimators': [20]
# }

# CV_rfc = GridSearchCV(estimator=clf, param_grid=param_grid, cv= 5)
# CV_rfc.fit(X_train, y_train)
# print(CV_rfc.best_params_)

RandomForestClassifier(bootstrap=True, class_weight={1: 3.67, 0: 1},
            criterion='gini', max_depth=12, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=300, n_jobs=-1, oob_score=False,
            random_state=None, verbose=0, warm_start=False)

In [23]:
y_pred_rf = clf.predict(X_val)
y_train_rf = clf.predict(X_train)

In [24]:
print("Train RF Accuracy: "+str(accuracy_score(y_train, y_train_rf)))
print("Train RF Recall: "+str(recall_score(y_train, y_train_rf)))
print("Train RF Precision: "+str(precision_score(y_train, y_train_rf)))
print("Train RF F1: "+str(f1_score(y_train, y_train_rf)))

print("Val RF Accuracy: "+str(accuracy_score(y_val, y_pred_rf)))
print("Val RF Recall: "+str(recall_score(y_val, y_pred_rf)))
print("Val RF Precision: "+str(precision_score(y_val, y_pred_rf)))
print("Val RF F1: "+str(f1_score(y_val, y_pred_rf)))

Train RF Accuracy: 0.6437486103472084
Train RF Recall: 0.7575339448590381
Train RF Precision: 0.36198197194259746
Train RF F1: 0.4898789325960371
Val RF Accuracy: 0.6309162292942946
Val RF Recall: 0.7289518119236265
Val RF Precision: 0.34769409957128344
Val RF F1: 0.4708181978339109
