# Decision Trees vs Random Forests
---

In [20]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import ensemble
from sklearn.model_selection import cross_val_score
from sklearn import tree
%matplotlib inline

In [2]:
# Load the Kickstarter projects CSV; the path is relative to the notebook's working directory.
df = pd.read_csv('./data/ks-projects-201801.csv')

# Print the (rows, columns) shape, then show the first rows as the cell's output.
print(f'Shape: {df.shape}')
df.head()

Shape: (378661, 15)


Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,failed,0,GB,0.0,0.0,1533.95
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,failed,15,US,100.0,2421.0,30000.0
2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12 00:20:50,220.0,failed,3,US,220.0,220.0,45000.0
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,5000.0,2012-03-17 03:24:11,1.0,failed,1,US,1.0,1.0,5000.0
4,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,2015-08-29,19500.0,2015-07-04 08:35:03,1283.0,canceled,14,US,1283.0,1283.0,19500.0


In [6]:
# Target to predict: df['state'] (the project's outcome, e.g. 'failed' or 'canceled').

In [7]:
# For every string-typed (object) column, print its name followed by the
# number of distinct values it holds.
categorical = df.select_dtypes(include=['object'])
for col_name, col_values in categorical.items():
    print(col_name)
    print(col_values.nunique())

name
375764
category
159
main_category
15
currency
14
deadline
3164
launched
378089
state
6
country
23


In [8]:
# Drop the text/date columns that have too many unique values to one-hot encode.
# `columns=` replaces the positional axis argument (`drop(labels, 1)`), which
# pandas deprecated and newer releases no longer accept. Reassigning with
# errors='ignore' (rather than inplace=True) keeps this cell safe to re-run
# after the columns have already been removed.
df = df.drop(columns=['category', 'name', 'deadline', 'launched'], errors='ignore')

In [9]:
# One-hot encode the remaining string columns for a quick look at the encoded frame.
# Note: df still contains 'state' at this point, so the target is encoded as well;
# X is rebuilt without 'state' before any model is evaluated.
X = pd.get_dummies(df)

In [11]:
# Preview the first rows of the one-hot encoded frame.
X.head()

Unnamed: 0,ID,goal,pledged,backers,usd pledged,usd_pledged_real,usd_goal_real,main_category_Art,main_category_Comics,main_category_Crafts,...,country_JP,country_LU,country_MX,"country_N,0""",country_NL,country_NO,country_NZ,country_SE,country_SG,country_US
0,1000002330,1000.0,0.0,0,0.0,0.0,1533.95,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1000003930,30000.0,2421.0,15,100.0,2421.0,30000.0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,1000004038,45000.0,220.0,3,220.0,220.0,45000.0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,1000007540,5000.0,1.0,1,1.0,1.0,5000.0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,1000011046,19500.0,1283.0,14,1283.0,1283.0,19500.0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [14]:
# Random forest of 10 trees. Fixing random_state makes the forest's randomness
# (and therefore the cross-validation scores) reproducible from run to run.
rfc = ensemble.RandomForestClassifier(n_estimators=10, random_state=42)

# Target is the project outcome; features are every other remaining column.
# `columns=` replaces the positional axis argument (`drop('state', 1)`), which
# pandas deprecated and newer releases no longer accept.
# NOTE(review): ID looks like an arbitrary identifier, and pledged / backers /
# usd_pledged_real look like values only known once a campaign has finished --
# they may leak the outcome into the features. Confirm whether to exclude them.
Y = df['state']
X = pd.get_dummies(df.drop(columns='state'))

# Discard any feature column that contains missing values, then report the
# final (rows, columns) shape of the feature matrix.
X = X.dropna(axis=1)
X.shape

(378661, 58)

In [28]:
# 10-fold cross-validation of the random forest; one score per fold.
rf_acc = cross_val_score(estimator=rfc, X=X, y=Y, cv=10)

In [30]:
# Show the random forest's score on each of the 10 folds.
rf_acc

array([0.85460019, 0.85457378, 0.85018618, 0.85013336, 0.85060871,
       0.84770381, 0.85461508, 0.85374356, 0.85102337, 0.85452433])

In [29]:
# Mean of the random forest's per-fold scores.
rf_acc.mean()

0.8521712367277136

# Decision Tree
---

In [24]:
# Configure a constrained decision tree. Nothing is fitted in this cell;
# the tree is only trained when it is passed to cross_val_score.
# With max_features=5 each split considers a random subset of features, so a
# fixed random_state is needed for the scores to be reproducible.
decision_tree = tree.DecisionTreeClassifier(
    criterion='entropy',
    max_features=5,
    max_depth=8,
    random_state=42,
)

In [25]:
# 10-fold cross-validation of the constrained decision tree; one score per fold.
dt_acc = cross_val_score(estimator=decision_tree, X=X, y=Y, cv=10)

In [26]:
# Show the constrained tree's score on each of the 10 folds.
dt_acc

array([0.82565755, 0.82042886, 0.75979085, 0.75736129, 0.73465022,
       0.74568886, 0.76833487, 0.74174039, 0.72320085, 0.72872537])

In [27]:
# Mean of the constrained tree's per-fold scores.
dt_acc.mean()

0.7605579108265113

## Decision Tree with no constraints

In [31]:
# Configure a decision tree with no depth or feature limits. Nothing is fitted
# in this cell; the tree is only trained when it is passed to cross_val_score.
# random_state is fixed so that repeated runs give the same scores.
decision_tree_2 = tree.DecisionTreeClassifier(
    criterion='entropy',
    random_state=42,
)

In [32]:
# 10-fold cross-validation of the unconstrained decision tree; one score per fold.
dt_acc_2 = cross_val_score(estimator=decision_tree_2, X=X, y=Y, cv=10)

In [33]:
# Show the unconstrained tree's score on each of the 10 folds.
dt_acc_2

array([0.80379212, 0.81456639, 0.79742256, 0.80288906, 0.80059154,
       0.76795099, 0.80657599, 0.79271095, 0.80242968, 0.81604247])

In [34]:
# Mean of the unconstrained tree's per-fold scores.
dt_acc_2.mean()

0.8004971750788057

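The random forest not only achieved a higher mean accuracy (about 0.85, versus 0.76 for the constrained tree and 0.80 for the unconstrained tree), but its scores also varied far less across the ten cross-validation folds.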