In [3]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier

### Feature Engineering

In [6]:
df = pd.read_csv('../data/no_cvec_kickstarter.csv')

In [28]:
df.drop_duplicates(['id'],inplace = True)

In [45]:
df

Unnamed: 0,id,name,launched,deadline,pledged,usd_pledged,goal,backers,result,currency_AUD,...,duration_83,duration_84,duration_85,duration_86,duration_87,duration_88,duration_89,duration_90,duration_91,duration_97
0,1726200831,the suck less job search take the suck out of ...,2016-02-22 03:11:45,2016-03-23 03:11:45,4227.0,4227.00,3000.0,53,1,0,...,0,0,0,0,0,0,0,0,0,0
1,268594121,gertrude s clothes gertrude s clothes is a col...,2015-01-27 21:58:20,2015-02-26 21:58:20,555.0,445.88,500.0,13,1,0,...,0,0,0,0,0,0,0,0,0,0
2,662057057,mars attacks uprising trading cards the all n...,2020-02-03 08:59:31,2020-03-03 23:00:00,169760.0,169760.00,25000.0,526,1,0,...,0,0,0,0,0,0,0,0,0,0
3,1135037498,conquer granola power packed free from suga...,2017-01-09 20:58:09,2017-02-08 20:58:09,2371.0,2371.00,2000.0,29,1,0,...,0,0,0,0,0,0,0,0,0,0
4,1006746766,quevos the original egg white chips quevos ...,2019-04-04 11:20:10,2019-05-04 11:20:10,71786.0,71786.00,10000.0,1585,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
412669,99996661,date seat boston ios app,2016-07-29 15:36:29,2016-09-05 00:00:00,0.0,0.00,6000.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
412670,999972264,islanda,2012-02-15 04:31:10,2012-03-16 00:00:00,25.0,25.00,1700.0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
412671,999977640,the tribe,2011-06-22 03:35:14,2011-07-19 00:00:00,155.0,155.00,1500.0,5,0,0,...,0,0,0,0,0,0,0,0,0,0
412672,999986353,walls of remedy new lesbian romantic comedy f...,2010-07-01 19:40:30,2010-08-16 00:00:00,20.0,20.00,15000.0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


### Logistic Regression

In [29]:
X = df.drop(columns = ['result','pledged','usd_pledged','name','launched','deadline','id'])

In [30]:
y = df['result']

In [31]:
X_train, X_test, y_train, y_test = train_test_split(X,y)

In [32]:
lr = LogisticRegression(max_iter = 1500)

In [33]:
%%time
lr.fit(X_train,y_train)

CPU times: user 27.1 s, sys: 432 ms, total: 27.6 s
Wall time: 7.52 s


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1500,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [34]:
lr.score(X_train,y_train)

0.9183224328724784

In [35]:
lr.score(X_test,y_test)

0.9164364140614489

In [36]:
%%time
cross_val_score(lr,X_train,y_train)

CPU times: user 1min 37s, sys: 2.34 s, total: 1min 39s
Wall time: 28.2 s


array([0.91829941, 0.91767709, 0.91281326, 0.86339882, 0.86131185])

In [46]:
np.mean([0.91829941, 0.91767709, 0.91281326, 0.86339882, 0.86131185])

0.8947000860000001

In [37]:
preds = lr.predict(X_test)

In [38]:
metrics.accuracy_score(y_test,preds)

0.9164364140614489

## Random Forest

In [49]:
rf = RandomForestClassifier(max_depth = 25,random_state = 42,n_estimators = 40)

In [50]:
rf.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=25, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=40,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [51]:
rf.score(X_train,y_train)

0.9664017440737368

In [52]:
rf.score(X_test,y_test)

0.9569505241480654

In [53]:
rf_preds = rf.predict(X_test)

In [54]:
metrics.accuracy_score(y_test,rf_preds)

0.9569505241480654

In [48]:
%%time
cross_val_score(rf,X_train,y_train).mean()

CPU times: user 2min 54s, sys: 1.52 s, total: 2min 55s
Wall time: 2min 56s


0.9572536322505846

In [55]:
rf.predict_proba(X_test)

array([[0.53893431, 0.46106569],
       [0.70656723, 0.29343277],
       [0.70877916, 0.29122084],
       ...,
       [0.24721226, 0.75278774],
       [0.95039095, 0.04960905],
       [0.40129888, 0.59870112]])