# Model training and evaluation

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn import tree
import graphviz

# multiclass classifiers
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier

In [2]:
df = pd.read_csv('data/static_01.csv', index_col=0)

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 30702 entries, 0 to 30701
Columns: 5524 entries, category_position to 5514
dtypes: float64(5516), int64(8)
memory usage: 1.3 GB


In [4]:
df.sample(5)

Unnamed: 0,category_position,days_duration,days_preparation,db_duration,goal,usd_goal,class,db_image,db_video,0,...,5505,5506,5507,5508,5509,5510,5511,5512,5513,5514
22983,3,34,8,35,7500.0,7440,3,1,1,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
23526,2,55,2,55,500.0,500,4,1,1,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
29880,1,30,81,30,2000.0,2000,4,1,1,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10633,11,25,3,25,1100.0,1586,1,1,0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2918,1,30,0,30,5000.0,5000,1,1,0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
# split to train and test sets
X_train, X_test, y_train, y_test = train_test_split(df.drop('class', axis=1), df['class'], test_size=0.30, random_state=16)

In [12]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21491 entries, 3764 to 6825
Columns: 5523 entries, category_position to 5514
dtypes: float64(5516), int64(7)
memory usage: 905.7 MB


In [13]:
X_train.head()

Unnamed: 0,category_position,days_duration,days_preparation,db_duration,goal,usd_goal,db_image,db_video,0,1,...,5505,5506,5507,5508,5509,5510,5511,5512,5513,5514
3764,12,30,54,30,20000.0,20000,1,0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
24880,9,60,13,60,350000.0,18661,0,1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3294,15,30,1,30,5000.0,5000,1,0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
22737,2,30,63,30,35000.0,35000,1,0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15138,9,60,1,60,9500.0,9500,1,0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## 1. LinearSVC

In [14]:
cls = LinearSVC()
cls = cls.fit(X_train, y_train)

In [15]:
confusion_matrix(y_test, cls.predict(X_test))

array([[ 127,   11,    1, 3750],
       [  82,   10,    0, 1081],
       [   2,    0,    2, 2076],
       [   8,    0,    1, 2060]], dtype=int64)

In [16]:
cls.score(X_train, y_train)

0.24112419152203249

In [17]:
cls.score(X_test, y_test)

0.23873629356204537

## 2. DecisionTreeClassifier

In [114]:
cls = DecisionTreeClassifier(max_depth=7, class_weight={1: 1, 2:3.239, 3:1.836, 4:1.874})

In [115]:
cls = cls.fit(X_train, y_train)

In [116]:
confusion_matrix(y_test, cls.predict(X_test))

array([[2467, 1385,    8,   29],
       [ 295,  861,    2,   15],
       [ 146,  146, 1267,  521],
       [ 100,  199,  642, 1128]], dtype=int64)

In [117]:
cls.score(X_train, y_train)

0.63017076915918291

In [118]:
cls.score(X_test, y_test)

0.6213223319943546

In [119]:
cls.feature_importances_

array([ 0.00720877,  0.0065728 ,  0.03262953, ...,  0.        ,
        0.        ,  0.        ])

In [121]:
dot_data = tree.export_graphviz(cls, out_file=None, feature_names=X_train.columns, class_names=['1','2','3','4'],filled=True, rounded=True, special_characters=True) 
graph = graphviz.Source(dot_data)
graph.render("kickstarter3")

'kickstarter3.pdf'