In [None]:
# data avaialble on kaggle:
# https://www.kaggle.com/kemical/kickstarter-projects

# following a course on feature engineering by Mat Leonard on kaggle:
# https://www.kaggle.com/learn/feature-engineering


In [3]:
# import modules 
import pandas as pd
import numpy as np

In [7]:
# load part of the dataset
ks_data = pd.read_csv('../data/raw/ks-projects-201801.csv', parse_dates=['deadline','launched'])

In [8]:
# inspect the data 
ks_data.head()

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,failed,0,GB,0.0,0.0,1533.95
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,failed,15,US,100.0,2421.0,30000.0
2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12 00:20:50,220.0,failed,3,US,220.0,220.0,45000.0
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,5000.0,2012-03-17 03:24:11,1.0,failed,1,US,1.0,1.0,5000.0
4,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,2015-08-29,19500.0,2015-07-04 08:35:03,1283.0,canceled,14,US,1283.0,1283.0,19500.0


In [10]:
# check datatypes and missing values 
ks_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 378661 entries, 0 to 378660
Data columns (total 15 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   ID                378661 non-null  int64         
 1   name              378657 non-null  object        
 2   category          378661 non-null  object        
 3   main_category     378661 non-null  object        
 4   currency          378661 non-null  object        
 5   deadline          378661 non-null  datetime64[ns]
 6   goal              378661 non-null  float64       
 7   launched          378661 non-null  datetime64[ns]
 8   pledged           378661 non-null  float64       
 9   state             378661 non-null  object        
 10  backers           378661 non-null  int64         
 11  country           378661 non-null  object        
 12  usd pledged       374864 non-null  float64       
 13  usd_pledged_real  378661 non-null  float64       
 14  usd_

In [30]:
# create a copy of the dataset so that we don't need to reload it every time
ks = ks_data.copy()

## I. Baseline Model

#### Preparing target column

In [31]:
# prepare the classification variable (target column)
ks.state.value_counts()

failed        197719
successful    133956
canceled       38779
undefined       3562
live            2799
suspended       1846
Name: state, dtype: int64

In [32]:
# for the model we'll consider 1 -> successful and 0 -> otherwise
# ks['is_successful'] = (ks.state == 'successful').map({False:0, True:1})
# ks['is_successful'] = (ks['state'] == 'successful').astype(int)

# alternative approach using .assign()
ks = ks.assign(is_successful = (ks['state'] == 'successful').astype(int))

In [33]:
# drop live projects
ks = ks[~(ks['state'] == 'live')]

# alternative approach with .query()
# ks = ks.query('state != "live"')

#### Converting timestamps 

In [36]:
# creating new features from the 'launched' timestamp
ks = ks.assign(year = ks['launched'].dt.year,
              month = ks['launched'].dt.month,
              day = ks['launched'].dt.day,
              hour = ks['launched'].dt.hour)
ks.head()

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real,is_successful,day,year,month,hour
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,failed,0,GB,0.0,0.0,1533.95,0,11,2015,8,12
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,failed,15,US,100.0,2421.0,30000.0,0,2,2017,9,4
2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12 00:20:50,220.0,failed,3,US,220.0,220.0,45000.0,0,12,2013,1,0
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,5000.0,2012-03-17 03:24:11,1.0,failed,1,US,1.0,1.0,5000.0,0,17,2012,3,3
4,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,2015-08-29,19500.0,2015-07-04 08:35:03,1283.0,canceled,14,US,1283.0,1283.0,19500.0,0,4,2015,7,8


#### Processing categorical variables

In [40]:
# specify which columns are categorical
categorical_cols = ['main_category', 'category', 'currency', 'country']

# import encoder
from sklearn.preprocessing import LabelEncoder

# init encoder 
le = LabelEncoder()

In [None]:
# encode (approach 0)
# encoded = le.fit_transform(ks[categorical_cols]) # does not work 

In [42]:
# encode (approach 1) 
# using .apply())
encoded = ks[categorical_cols].apply(le.fit_transform)

# inspect
encoded.head()

Unnamed: 0,main_category,category,currency,country
0,12,108,5,9
1,6,93,13,22
2,6,93,13,22
3,10,90,13,22
4,6,55,13,22


In [44]:
# # encode (approach 2)
# # using a for loop
# for col in categorical_cols:
#     ks[col + '_le'] = le.fit_transform(ks[col])
    
# # inspect
# ks.head()

#### Creating train and test sets

In [47]:
# select all features that the model will use
# interestingly, we're not using the 'launched' timestamp 
data = ks[['goal','year','month','day','hour','is_successful']].join(encoded)
data.head()

Unnamed: 0,goal,year,month,day,hour,is_successful,main_category,category,currency,country
0,1000.0,2015,8,11,12,0,12,108,5,9
1,30000.0,2017,9,2,4,0,6,93,13,22
2,45000.0,2013,1,12,0,0,6,93,13,22
3,5000.0,2012,3,17,3,0,10,90,13,22
4,19500.0,2015,7,4,8,0,6,55,13,22


In [63]:
# import train test split 
from sklearn.model_selection import train_test_split 

# create X and y 
X = data.drop('is_successful', axis=1)
y = data['is_successful']

# split the datasets setting 20% aside for test 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [64]:
# double check that the target variable is equally represented in train and test datasets
print('% successful in train: {:.2%}'.format(train_y.mean()))
print('% successful in test: {:.2%}'.format(test_y.mean()))

% successful in train: 35.66%
% successful in test: 35.56%


Predictor variable is slightly imbalanced - not between train and test but it's that the % of projects that are successful is less than 50% of all projects. The most common ways to address class imbalance include:
* Up-sample minority class
* Down-sample majority class
* Change the performance metric
* Use a penalized-SVM algorithm
* Use a tree-based algorithm

For this project we'll use a tree-based algorithm and our choice of performance metric is going to be `ROC AUC`.

#### Training a RandomForest Classifier (with default settings)

In [99]:
# import classifier
from sklearn.ensemble import RandomForestClassifier

# init 
rfc = RandomForestClassifier(random_state=34)

# fit the model
rfc.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=34, verbose=0,
                       warm_start=False)

#### Making predictions and scoring the model

In [100]:
# predictions 
y_pred = rfc.predict(X_test)
y_pred_train = rfc.predict(X_train)

In [101]:
# import scoring metrics
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix

# accuracy scores
print('accuracy (train): {:.2%}'.format(accuracy_score(y_train, y_pred_train)))
print('accuracy (test): {:.2%}'.format(accuracy_score(y_test, y_pred)))

# roc auc scores 
print('roc auc (train): {:.2%}'.format(roc_auc_score(y_train, y_pred_train)))
print('roc auc (test): {:.2%}'.format(roc_auc_score(y_test, y_pred)))

accuracy (train): 99.89%
accuracy (test): 68.64%
roc auc (train): 99.87%
roc auc (test): 62.76%


The default model is **overfitting** way too much but I'm not sure how to handle this other than manually changing the `max_depth` or `min_samples_leaf` parameters. `GridSearchCV` is not helpful here because it optimises parameters based on the accuracy score of the traning dataset which in our case is already very high at `>99%`.


In [98]:
# # plot a confusion matrix
# import seaborn as sns 

# # create confusion matrix 
# cm = confusion_matrix(y_test, y_pred)
# cm

# # turn it into a nice DataFrame to plot it easier 
# cm_df = pd.DataFrame(cm, columns=np.unique(y_test), index = np.unique(y_test))
# cm_df
# cm_df.index.name = 'Actual'
# cm_df.columns.name = 'Predicted'

# # this will be used as the max value for the heatmap legend 
# calculated_vmax = (int(cm_df.max().max()/100)+1)*100

# # create heatmap 
# sns.heatmap(
#     cm_df, 
#     cmap=sns.light_palette((250, 80, 60), input='husl', n_colors=6),
# #     vmin=0,
#     vmax=calculated_vmax,
#     annot=True, fmt=',d')

## 2. Categorical Encodings