# Titanic App

¿Sobreviviria tu pasajero?

**librerias**

In [1]:
import warnings
warnings.simplefilter('ignore')

In [2]:
import matplotlib.pyplot as plt
plt.rcdefaults()

import pandas as pd
import numpy as np
from numpy import genfromtxt

from sklearn.linear_model import LogisticRegression as LogReg
from sklearn.model_selection import train_test_split

from sklearn.feature_selection import RFE

**datos**

Incluyen caracteristicas como edad, numero de ticket, cabina, etc..

El objetivo es clasificar-predecir la supervivencia.

In [3]:
df=pd.read_csv('http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic3.csv')

df.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.92,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


**exploracion**

In [4]:
df.shape

(1309, 14)

In [5]:
df.columns

Index(['pclass', 'survived', 'name', 'sex', 'age', 'sibsp', 'parch', 'ticket',
       'fare', 'cabin', 'embarked', 'boat', 'body', 'home.dest'],
      dtype='object')

In [6]:
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 14 columns):
pclass       1309 non-null int64
survived     1309 non-null int64
name         1309 non-null object
sex          1309 non-null object
age          1046 non-null float64
sibsp        1309 non-null int64
parch        1309 non-null int64
ticket       1309 non-null object
fare         1308 non-null float64
cabin        295 non-null object
embarked     1307 non-null object
boat         486 non-null object
body         121 non-null float64
home.dest    745 non-null object
dtypes: float64(3), int64(4), object(7)
memory usage: 601.9 KB


In [7]:
df.isna().sum()

pclass          0
survived        0
name            0
sex             0
age           263
sibsp           0
parch           0
ticket          0
fare            1
cabin        1014
embarked        2
boat          823
body         1188
home.dest     564
dtype: int64

In [8]:
df.describe()

Unnamed: 0,pclass,survived,age,sibsp,parch,fare,body
count,1309.0,1309.0,1046.0,1309.0,1309.0,1308.0,121.0
mean,2.294882,0.381971,29.881138,0.498854,0.385027,33.295479,160.809917
std,0.837836,0.486055,14.413493,1.041658,0.86556,51.758668,97.696922
min,1.0,0.0,0.17,0.0,0.0,0.0,1.0
25%,2.0,0.0,21.0,0.0,0.0,7.8958,72.0
50%,3.0,0.0,28.0,0.0,0.0,14.4542,155.0
75%,3.0,1.0,39.0,1.0,0.0,31.275,256.0
max,3.0,1.0,80.0,8.0,9.0,512.3292,328.0


**preparando datos**

In [9]:
# tratando los NaN

# letra de la cabina
df.cabin=df.cabin.replace(np.NaN, 'U') 
df.cabin=[e[0] for e in df.cabin.values]
df.cabin=df.cabin.replace('U', 'Unknown')

In [10]:
# titulo (Señor-Señora)
df['title']=[e.split()[1] for e in df.name.values]


df.title=[t if t in ['Mr.', 'Miss.', 
                     'Mrs.', 'Master.', 
                     'Dr.', 'Rev.'] else 'Unknown'
          for t in df.title.values ]


In [11]:
df.title.value_counts()

Mr.        736
Miss.      256
Mrs.       191
Master.     59
Unknown     51
Rev.         8
Dr.          8
Name: title, dtype: int64

In [12]:
# se crea el campo de si es mujer
df['isfemale']=np.where(df.sex=='female', 1, 0)

# se eliminan columnas innecesarias
df=df[[f for f in list(df) if f not in ['sex', 
                                        'name', 
                                        'boat',
                                        'body', 
                                        'ticket', 
                                        'home.dest']]]

# pclass a categorico string
df.pclass=np.where(df.pclass== 1,'First', 
                   np.where(df.pclass==2, 
                            'Second', 'Third'))

# nulos por desconocido en embarque
df.embarked=df.embarked.replace(np.NaN, 'Unknown') 

In [13]:
df.head()

Unnamed: 0,pclass,survived,age,sibsp,parch,fare,cabin,embarked,title,isfemale
0,First,1,29.0,0,0,211.3375,B,S,Miss.,1
1,First,1,0.92,1,2,151.55,C,S,Master.,0
2,First,0,2.0,1,2,151.55,C,S,Miss.,1
3,First,0,30.0,1,2,151.55,C,S,Mr.,0
4,First,0,25.0,1,2,151.55,C,S,Mrs.,1


In [14]:
# one-hot encoding demlas variables categoricas

def one_hot(df, columns, drop_first=True, make_na_col=True): 
    dummy=pd.get_dummies(df, 
                         columns=columns, 
                         drop_first=drop_first, 
                         dummy_na=make_na_col)
    return dummy

In [15]:
df=one_hot(df, columns=['pclass', 'cabin', 'embarked', 'title'])
df=df.dropna()
df.head()

Unnamed: 0,survived,age,sibsp,parch,fare,isfemale,pclass_Second,pclass_Third,pclass_nan,cabin_B,...,embarked_S,embarked_Unknown,embarked_nan,title_Master.,title_Miss.,title_Mr.,title_Mrs.,title_Rev.,title_Unknown,title_nan
0,1,29.0,0,0,211.3375,1,0,0,0,1,...,1,0,0,0,1,0,0,0,0,0
1,1,0.92,1,2,151.55,0,0,0,0,0,...,1,0,0,1,0,0,0,0,0,0
2,0,2.0,1,2,151.55,1,0,0,0,0,...,1,0,0,0,1,0,0,0,0,0
3,0,30.0,1,2,151.55,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,0
4,0,25.0,1,2,151.55,1,0,0,0,0,...,1,0,0,0,0,0,1,0,0,0


## Modelo

In [16]:
# train-test split

X=df.drop('survived', axis=1)
y=df.survived


X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=.2,
                                                    random_state=42)

In [17]:
logreg=LogReg()

logreg.fit(X_train, y_train.values.ravel());

In [18]:
y_pred=logreg.predict(X_test)


print('Accuracy : {:.2f}%'.format(logreg.score(X_test, y_test)*100))

Accuracy : 76.56%


In [25]:
# interpretacion de los coeficientes de logreg
print ('Coeficientes:')

logreg.coef_

array([[-2.75765583e-02, -4.52832070e-01, -1.80017317e-01,
         3.26911104e-03,  2.23981328e+00, -4.11202504e-01,
        -1.04696597e+00,  0.00000000e+00,  1.29735532e-01,
        -2.93444449e-01,  4.09772518e-01,  8.62783145e-01,
         2.27769967e-01, -9.20792515e-01, -2.38367195e-01,
        -7.00323669e-01,  0.00000000e+00, -1.00740816e+00,
        -2.66419138e-01,  1.08415599e-01,  0.00000000e+00,
         1.60665761e+00, -7.61775350e-02, -6.77347488e-01,
         6.50635629e-01, -5.03932554e-01,  1.41029664e-03,
         0.00000000e+00]])

In [22]:
coefs = pd.DataFrame({'Carac':X.columns, 'Coef':logreg.coef_[0]})
print('Caracteristicas positivas:')
coefs.sort_values('Coef', ascending=False).head(7)

Caracteristicas positivas


Unnamed: 0,Carac,Coef
4,isfemale,2.239813
21,title_Master.,1.606658
11,cabin_E,0.862783
24,title_Mrs.,0.650636
10,cabin_D,0.409773
12,cabin_F,0.22777
8,cabin_B,0.129736


In [23]:
print('Caracteristicas negativas:')
coefs.sort_values('Coef', ascending=False).tail(7)

Caracteristicas negativas


Unnamed: 0,Carac,Coef
1,sibsp,-0.452832
25,title_Rev.,-0.503933
23,title_Mr.,-0.677347
15,cabin_Unknown,-0.700324
13,cabin_G,-0.920793
17,embarked_Q,-1.007408
6,pclass_Third,-1.046966
