## Experimento: Arbol de Decision Titanic

In [1]:
#Empezamos igual que en el caso de regresion logistica.

import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import sklearn

from pandas import Series, DataFrame
from pylab import rcParams
from sklearn import preprocessing
#una diferencia es que importamos tree en vez de regresion logistica
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn import metrics 
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

# Read the training CSV, then assign the twelve expected column labels explicitly.
column_names = ['PassengerId','Survived','Pclass','Name','Sex','Age','SibSp','Parch','Ticket','Fare','Cabin','Embarked']
titanic = pd.read_csv("datos/titanic-train.csv")
titanic.columns = column_names
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [2]:
# Count the missing values in each column.
titanic.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [3]:
# Print each column's dtype and non-null count, plus the frame's memory usage.
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [4]:
# Drop the columns that are not used as features (identifiers, free text and the
# mostly-empty Cabin column).
# `columns=` is used because passing the axis positionally is no longer accepted by
# recent pandas versions.
titanic_data = titanic.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'])
titanic_data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [5]:
def age_approx(cols):
    """Return the passenger's age, imputing a class-based default when it is missing.

    Parameters
    ----------
    cols : pandas.Series or sequence
        Two values in the order ``(Age, Pclass)``.

    Returns
    -------
    The original age when it is present; otherwise 37 for first class,
    29 for second class and 24 for any other class.
    """
    # Unpack by iteration instead of ``cols[0]`` / ``cols[1]``: integer keys on a
    # label-indexed Series are deprecated in pandas (they are being reinterpreted as
    # labels), while iteration always yields the values in positional order.
    age, pclass = list(cols)[:2]

    if not pd.isnull(age):
        return age

    fallback_age_by_class = {1: 37, 2: 29}
    return fallback_age_by_class.get(pclass, 24)

In [6]:
# As in the logistic-regression notebook, turn every usable feature into a numeric column.
# Each step builds a new frame instead of mutating `titanic_data` in place, so this cell
# can be re-run without failing on columns that were already dropped.
titanic_imputed = titanic_data.assign(
    Age=titanic_data[['Age', 'Pclass']].apply(age_approx, axis=1)
).dropna()

# `dtype=int` keeps the indicator columns as 0/1 (recent pandas versions default to booleans).
gender = pd.get_dummies(titanic_imputed['Sex'], drop_first=True, dtype=int)
embark_location = pd.get_dummies(titanic_imputed['Embarked'], drop_first=False, dtype=int)

# Column order matters for the positional feature selection that follows:
# the remaining original columns first, then the gender and embarkation indicators.
titanic_dmy = pd.concat(
    [titanic_imputed.drop(columns=['Sex', 'Embarked', 'Pclass']), gender, embark_location],
    axis=1,
)
titanic_dmy.head(10)

Unnamed: 0,Survived,Age,SibSp,Parch,Fare,male,C,Q,S
0,0,22.0,1,0,7.2500,1,0,0,1
1,1,38.0,1,0,71.2833,0,1,0,0
2,1,26.0,0,0,7.9250,0,0,0,1
3,1,35.0,1,0,53.1000,0,0,0,1
4,0,35.0,0,0,8.0500,1,0,0,1
5,0,24.0,0,0,8.4583,1,0,1,0
6,0,54.0,0,0,51.8625,1,0,0,1
7,0,2.0,3,1,21.0750,1,0,0,1
8,1,27.0,0,2,11.1333,0,0,0,1
9,1,14.0,1,0,30.0708,0,1,0,0


**Tenemos el dataset procesado, ahora apliquemos el algoritmo de arbol de decision:**

In [7]:
# Feature matrix: columns at positions 1-7 of `titanic_dmy`; target vector: column 0.
# The column at position 8 is deliberately left out, exactly as in the original selection.
# `.iloc` replaces `.ix`, which was removed from pandas; the selected positions are unchanged.
# The explicit float cast guarantees a numeric array even if some columns are boolean.
X = titanic_dmy.iloc[:, 1:8].astype(float).values
y = titanic_dmy.iloc[:, 0].values

In [8]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
split = train_test_split(X, y, test_size=0.3, random_state=25)
X_train, X_test, y_train, y_test = split
print(X_test)

[[33.  0.  0. ...  0.  0.  0.]
 [28.  1.  0. ...  0.  1.  0.]
 [25.  1.  0. ...  1.  0.  0.]
 ...
 [41.  0.  1. ...  0.  0.  0.]
 [28.  0.  1. ...  1.  0.  0.]
 [18.  0.  0. ...  0.  0.  0.]]


In [9]:
# Train the decision tree.
# A fixed `random_state` makes the fitted tree (and therefore the reported metrics)
# reproducible between runs; without it the estimator's internal feature shuffling
# can produce a different tree each time.
clf = tree.DecisionTreeClassifier(random_state=25)
clf = clf.fit(X_train, y_train)

In [10]:
# Evaluate the tree on the held-out set.
y_pred = clf.predict(X_test)
# Store the matrix under its own name: rebinding `confusion_matrix` would shadow the
# imported function and make this cell raise a TypeError when it is run a second time.
tree_confusion = confusion_matrix(y_test, y_pred)
print(tree_confusion)
print(classification_report(y_test, y_pred))

[[132  32]
 [ 28  75]]
              precision    recall  f1-score   support

           0       0.82      0.80      0.81       164
           1       0.70      0.73      0.71       103

   micro avg       0.78      0.78      0.78       267
   macro avg       0.76      0.77      0.76       267
weighted avg       0.78      0.78      0.78       267



In [11]:
# Compare the result with logistic regression, trained and evaluated on the same split.
# The metric helpers are imported again here so this cell still works if the name
# `confusion_matrix` was rebound to an array earlier in the session.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

LogReg = LogisticRegression()
LogReg.fit(X_train, y_train)
y_pred = LogReg.predict(X_test)

# Keep the result under its own name so the `confusion_matrix` function is not shadowed.
logreg_confusion = confusion_matrix(y_test, y_pred)
print(logreg_confusion)
print(classification_report(y_test, y_pred))

[[135  29]
 [ 35  68]]
              precision    recall  f1-score   support

           0       0.79      0.82      0.81       164
           1       0.70      0.66      0.68       103

   micro avg       0.76      0.76      0.76       267
   macro avg       0.75      0.74      0.74       267
weighted avg       0.76      0.76      0.76       267



**Podemos ver en el analisis que el metodo del arbol tiene buen rendimiento, y da resultados muy similares a la regresion logistica, aunque la diferencia podria ser importante dependiendo de con que fines se realice el analisis.**