# Arboles de decision

In [49]:
import pandas as pd
import numpy as np
from sklearn import tree
from sklearn.model_selection import cross_val_score
import os

In [50]:
d = pd.read_csv('student-por.csv', sep=';')

In [51]:
d['pass'] = d.apply(lambda row: 1 if(row['G1'] + row['G2'] + row['G3']) >= 35 else 0, axis=1)
d = d.drop(['G1', 'G2', 'G3'], axis=1)
d.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,internet,romantic,famrel,freetime,goout,Dalc,Walc,health,absences,pass
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,no,no,4,3,4,1,1,3,4,0
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,yes,no,5,3,3,1,1,3,2,0
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,yes,no,4,3,2,2,3,3,6,1
3,GP,F,15,U,GT3,T,4,2,health,services,...,yes,yes,3,2,2,1,1,5,0,1
4,GP,F,16,U,GT3,T,3,3,other,other,...,no,no,4,3,2,1,2,5,0,1


In [52]:
d = pd.get_dummies(d, columns=['sex',
                               'school',
                               'address',
                               'famsize',
                               'Pstatus',
                               'Mjob',
                               'Fjob',
                               'reason',
                               'guardian',
                               'schoolsup',
                               'famsup',
                               'paid',
                               'activities',
                               'nursery',
                               'higher',
                               'internet',
                               'romantic'])
d.head()

Unnamed: 0,age,Medu,Fedu,traveltime,studytime,failures,famrel,freetime,goout,Dalc,...,activities_no,activities_yes,nursery_no,nursery_yes,higher_no,higher_yes,internet_no,internet_yes,romantic_no,romantic_yes
0,18,4,4,2,2,0,4,3,4,1,...,1,0,0,1,0,1,1,0,1,0
1,17,1,1,1,2,0,5,3,3,1,...,1,0,1,0,0,1,0,1,1,0
2,15,1,1,1,2,0,4,3,2,2,...,1,0,0,1,0,1,0,1,1,0
3,15,4,2,1,3,0,3,2,2,1,...,0,1,0,1,0,1,0,1,0,1
4,16,3,3,1,2,0,4,3,2,1,...,1,0,0,1,0,1,1,0,1,0


## Mezclando los datos
##### 649 500, 149

In [53]:
d = d.sample(frac=1)
d_train = d[:500]
d_test = d[500:]

d_train_att = d_train.drop(['pass'], axis=1)
d_train_pass = d_train['pass']

d_test_att = d_test.drop(['pass'], axis=1)
d_test_pass = d_test['pass']

d_att = d.drop(['pass'], axis=1)
d_pass = d['pass']

passing = np.sum(d_pass)
total = len(d_pass)
percn = passing / total * 100
print(f'Passing: {passing} out of {total} ({percn}%)')

Passing: 328 out of 649 (50.53929121725732%)


In [54]:
t = tree.DecisionTreeClassifier(criterion='entropy', max_depth=5)
t.fit(d_train_att, d_train_pass)

tree.export_graphviz(t, out_file='student-performance.dot', label='all', impurity=False, proportion=True, feature_names=list(d_train_att), class_names=['fail', 'pass'], filled=True, rounded=True)

In [55]:
os.system("dot -Tpng student-performance.dot -o Cname.png")

0

In [56]:
print(t.score(d_test_att, d_test_pass))

0.6577181208053692


## Implementando 5-fold validation

In [57]:
scores = cross_val_score(t, d_att, d_pass, cv=5)

mean = scores.mean()
std = scores.std()

print(f'mean:{mean}, std:{std}')

mean:0.6655029200635451, std:0.03306352722717985


In [58]:
for max_depth in range(1, 20):
    t = tree.DecisionTreeClassifier(criterion='entropy', max_depth=max_depth)
    scores = cross_val_score(t, d_att, d_pass)
    mean = scores.mean()
    std = scores.std() * 2
    print(f'max_depth: {max_depth}, Accuracy:{mean}, (+/- {std})')



max_depth: 1, Accuracy:0.6379146043124537, (+/- 0.010041632276802761)
max_depth: 2, Accuracy:0.687226204699323, (+/- 0.017915061580203697)
max_depth: 3, Accuracy:0.6949066962507823, (+/- 0.01379484016550489)
max_depth: 4, Accuracy:0.6948782499857767, (+/- 0.04305430779195679)
max_depth: 5, Accuracy:0.6856189907265176, (+/- 0.050711716787034275)
max_depth: 6, Accuracy:0.6933563748079877, (+/- 0.01699017326196015)
max_depth: 7, Accuracy:0.6733088695454287, (+/- 0.04945029523283141)
max_depth: 8, Accuracy:0.6733230926779314, (+/- 0.02028841983252969)
max_depth: 9, Accuracy:0.6394720373214997, (+/- 0.0723313820014516)




max_depth: 10, Accuracy:0.6502247254935427, (+/- 0.016221007156926433)
max_depth: 11, Accuracy:0.6441087785173806, (+/- 0.04826306024790045)
max_depth: 12, Accuracy:0.6471738635717131, (+/- 0.02271429362290017)
max_depth: 13, Accuracy:0.622489617113273, (+/- 0.00964808661921526)
max_depth: 14, Accuracy:0.6379146043124537, (+/- 0.021065846930594573)
max_depth: 15, Accuracy:0.6379288274449565, (+/- 0.040709422915616)
max_depth: 16, Accuracy:0.6193960857939352, (+/- 0.01730408770016718)
max_depth: 17, Accuracy:0.6301914433634864, (+/- 0.013957104194693351)
max_depth: 18, Accuracy:0.6302198896284917, (+/- 0.02125102592330795)
max_depth: 19, Accuracy:0.6502531717585481, (+/- 0.030395196889251158)


