In [11]:
import numpy as np
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.ensemble import RandomForestClassifier
import glob
import os
import matplotlib.pyplot as plt

In [2]:
train_path = max(glob.glob('./data/train/*.csv'), key=os.path.getctime) 
test_path = max(glob.glob('./data/test/*.csv'), key=os.path.getctime) 
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

print(train_df.shape)
print(test_df.shape)

(1557, 563)
(293, 563)


In [3]:
features_to_drop = ['rewards', 'deadline', 'launched_at', 'rewards_processed',
           'description_processed', 'description_story_processed','description_risks_processed',
           'id', 'name', 'description', 'description_story', 'description_risks', 'video', 'state',
          'pledged', 'category', 'location']

#features that are dependent on time and the final outcome
to_drop_more = features_to_drop + ['staff_pick', 'spotlight', 'backers_count', 'update_count', 'faq_count']

X_train, y_train = train_df.drop(to_drop_more, axis=1), train_df['state']
X_test, y_test = test_df.drop(to_drop_more, axis=1), test_df['state']


In [4]:
# Combine train and test data set tgt

X = pd.concat([X_train, X_test])
y = pd.concat([y_train, y_test])

In [5]:
# Pretty balanced dataset
print(train_df[train_df.state == 1].shape)
print(train_df[train_df.state == 0].shape)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(784, 563)
(773, 563)
(1557, 541)
(1557,)
(293, 541)
(293,)


In [6]:
X_train.dtypes.unique()

array([dtype('int64'), dtype('float64')], dtype=object)

**MODEL**

In [10]:
# Random forest model 
# training
regressor = RandomForestClassifier(n_estimators=100)
regressor.fit(X_train, y_train)
# testing
predictions = regressor.predict(X_test)

**Evaluate Model Performance**

In [15]:
# test accuracy
accuracy_score(y_test, predictions)

0.7952218430034129

In [16]:
# test auroc 
roc_auc_score(y_test, predictions)

0.7943465697240865

**Hyperparameter Tuning**

We will use GridSearchCV for hyperparameter tuning.
