# Titanic App

¿Sobreviviría tu pasajero?

**librerías**

In [None]:
import warnings
# Silence every warning for the rest of the session (this also hides deprecation
# and convergence warnings raised by the libraries used below).
warnings.simplefilter('ignore')

In [None]:
import matplotlib.pyplot as plt

import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression as LogReg
from sklearn.model_selection import train_test_split as tts

**datos**

Incluyen características como edad, número de ticket, cabina, etc.

El objetivo es clasificar-predecir la supervivencia.

In [None]:
# Read the raw Titanic passenger table straight from the remote CSV.
titanic_url='http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic3.csv'
df=pd.read_csv(titanic_url)

df.head()

**exploración**

In [None]:
# (rows, columns) of the raw table
df.shape

In [None]:
# Column names available in the raw table
df.columns

In [None]:
# Dtypes and non-null counts; 'deep' makes pandas inspect object columns
# to report their actual memory usage.
df.info(memory_usage='deep')

In [None]:
# Number of missing values per column
df.isna().sum()

In [None]:
# Summary statistics of the numeric columns
df.describe()

**preparando datos**

In [None]:
# Handling the NaN values

# Cabin: keep only the first character; missing cabins become 'Unknown'.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
# The temporary 'U' placeholder keeps this cell safe to re-run: an existing
# 'Unknown' is cut back to 'U' by the second line and restored by the third.
df['cabin']=df['cabin'].replace(np.nan, 'U')
df['cabin']=[e[0] for e in df['cabin'].values]
df['cabin']=df['cabin'].replace('U', 'Unknown')

In [None]:
# Title (Mr., Mrs., ...) taken from the passenger name.
# Names are assumed to look like "Surname, Title. Given names" -- TODO confirm against the data.
# The title is read right after the first comma instead of taking the second
# whitespace token, so a multi-word surname ("van Billiard, Mr. Austin") no
# longer produces a piece of the surname and ends up as 'Unknown'.
known_titles=('Mr.', 'Miss.', 'Mrs.', 'Master.', 'Dr.', 'Rev.')


def _extract_title(name):
    """Return the title found in ``name`` if it is a known one, else 'Unknown'."""
    tokens=name.split(',', 1)[-1].split()
    return tokens[0] if tokens and tokens[0] in known_titles else 'Unknown'


df['title']=[_extract_title(e) for e in df.name.values]


In [None]:
# How many passengers fall under each title
df.title.value_counts()

In [None]:
# Flag column: 1 for female passengers, 0 otherwise
df['isfemale']=np.where(df.sex=='female', 1, 0)

# Drop the columns that are not used as features
df=df.drop(columns=['sex', 'name', 'boat', 'body', 'ticket', 'home.dest'])

# pclass as a categorical string (any value other than 1 or 2 becomes 'Third')
df['pclass']=np.where(df.pclass==1, 'First',
                      np.where(df.pclass==2, 'Second', 'Third'))

# Missing ports of embarkation become 'Unknown'.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
df['embarked']=df.embarked.replace(np.nan, 'Unknown')

In [None]:
# Quick look at the table after the clean-up steps
df.head()

In [None]:
# one-hot encoding de las variables categoricas

def one_hot(df, columns, drop_first=True):
    """One-hot encode the given columns of a DataFrame.

    Args:
        df: source DataFrame (not modified).
        columns: names of the categorical columns to expand into dummies.
        drop_first: when True, the first level of every encoded column is
            dropped so that k levels produce k-1 dummy columns.

    Returns:
        A new DataFrame in which the listed columns are replaced by their
        dummy columns.
    """
    return pd.get_dummies(data=df, drop_first=drop_first, columns=columns)

In [None]:
# Encode the categorical columns, then discard every row that still has a missing value
categoricas=['pclass', 'cabin', 'embarked', 'title']
df_dummy=one_hot(df, columns=categoricas).dropna()
df_dummy.head()

## Modelo

In [None]:
# Train/test split: 'survived' is the target, every other column is a feature

y=df_dummy['survived']
X=df_dummy.drop(columns='survived')


X_train, X_test, y_train, y_test=tts(X, y, random_state=42, test_size=.2)

In [None]:
# Logistic regression with default settings, fitted on the training split
# (fit() returns the estimator itself, so it can be chained).
logreg=LogReg().fit(X_train, y_train.values)

In [None]:
# Predictions and accuracy on the held-out split
y_pred=logreg.predict(X_test)

test_accuracy=logreg.score(X_test, y_test)
print('Accuracy : {:.2f}%'.format(test_accuracy*100))

In [None]:
# Interpreting the logistic-regression coefficients
print ('Coeficientes:')

# Raw coefficient array of the fitted model (displayed as the cell output)
logreg.coef_

In [None]:
# Pair every feature name with its coefficient so they can be ranked
coefs=pd.DataFrame({'Carac':X.columns, 'Coef':logreg.coef_[0]})
print('Caracteristicas positivas:')
# Seven largest coefficients
coefs.sort_values('Coef', ascending=False).head(7)

In [None]:
print('Caracteristicas negativas:')
# Same descending sort; the tail holds the seven smallest coefficients
coefs.sort_values('Coef', ascending=False).tail(7)

## Predicción a 'mano'

In [None]:
# Build a fictitious passenger


pasajero={
    'pclass':'Third', # First, Second, Third (class)
    'isfemale':1,     # 0-1  (gender)
    'age':20,         # 0-100 (age)
    'sibsp':3,        # (0-8) (siblings, spouses)
    'parch':0,        # (0-9)  (parents, children)
    'fare':200,       # (0-500) (fare)
    'cabin':'A',      # 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'T' or 'Unknown' (first character of the cabin)
    'embarked':'Q',   # 'Q', 'S', 'C' (Queenstown, Southampton, Cherbourg) or 'Unknown' (port)
    'title':'Mrs.',   # 'Mr.', 'Miss.', 'Mrs.', 'Master.', 'Dr.', 'Rev.', 'Unknown' (title)
    'survived':0      # placeholder so the columns line up when concatenating with df
}
 

In [None]:
# One-row frame for the passenger. Building it from a list of records lets every
# column keep its own dtype; from_dict(orient='index').T turned all of them
# (age, fare, ...) into object columns.
pas_df=pd.DataFrame([pasajero])
pas_df

In [None]:
# Reminder of the cleaned table the passenger will be stacked onto
df.head()

In [None]:
# Stack the passenger on top of the cleaned data; ignore_index renumbers the
# rows, so the passenger ends up as row 0.
pred_df=pd.concat([pas_df, df], ignore_index=True)
pred_df.head()

In [None]:
# Encode the same categorical columns that were encoded for training
pred_df=one_hot(pred_df,
               columns=['pclass', 'cabin', 'embarked', 'title'])
pred_df.head()

In [None]:
# Prediction for the fictitious passenger

# First row only (the passenger), with the columns in the training order
x=pred_df[X.columns].iloc[:1]

y_pred=logreg.predict(x)
y_prob=logreg.predict_proba(x)

# Each result is followed by an empty line
print('Etiqueta:', y_pred, end='\n\n')
print('Probabilidad [0 , 1]:', y_prob[0], end='\n\n')
print('Prob supervivencia:', y_prob[0][1])

## Plot

In [None]:
# Mean of the 'survived' column, expressed as a percentage
tasa_media_super=df.survived.mean()*100
print('Tasa media supervivencia: {:.2f}%'.format(tasa_media_super))

In [None]:
# Bar chart: mean survival rate next to the passenger's predicted probability
prob_pasajero=y_prob[0][1]*100
posiciones=range(2)

with plt.xkcd():
    plt.figure(figsize=(10,8))

    plt.bar(posiciones,
            [tasa_media_super, prob_pasajero],
            align='center',
            color=['y', 'b'],
            alpha=0.5)
    plt.xticks(posiciones, ['Tasa Supervivencia media', 'Pasajero'])

    # Red reference line at the mean survival rate
    plt.axhline(tasa_media_super, color='r')
    plt.ylim([0,100])
    plt.ylabel('Probabilidad Supervivencia')
    plt.title('¿Sobrevivirá tu pasajero? \n '+'¡{:.2f}% de probabilidad!'.format(prob_pasajero))

    plt.show();

# Preparando los datos para producción


In [None]:
def data(url='http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic3.csv',
         out_path='data/titanic.csv'):
    """Download, clean and one-hot encode the Titanic data, then save it as CSV.

    The file is written without header and without index. Its columns are
    'survived' first, then the numeric features, then the dummy columns for
    pclass, cabin, embarked and title (first level of each dropped).

    Args:
        url: location of the raw CSV (anything ``pd.read_csv`` accepts).
        out_path: destination file; its parent directory is created if missing.

    Returns:
        None. The result is the file written to ``out_path``.
    """
    import os

    # NOTE(review): the default URL is plain http on an old host -- confirm it still resolves.
    df=pd.read_csv(url)

    # Cabin: first character only, missing cabins become 'Unknown'.
    # (np.nan / fillna instead of np.NaN, which NumPy 2.0 removed.)
    df['cabin']=df['cabin'].fillna('U')
    df['cabin']=[e[0] for e in df['cabin'].values]
    df['cabin']=df['cabin'].replace('U', 'Unknown')

    # Title: read the token right after the first comma so multi-word surnames
    # ("van Billiard, Mr. Austin") do not yield a piece of the surname.
    # Assumes names look like "Surname, Title. Given names" -- TODO confirm.
    known_titles=('Mr.', 'Miss.', 'Mrs.', 'Master.', 'Dr.', 'Rev.')
    titles=[]
    for name in df['name'].values:
        tokens=name.split(',', 1)[-1].split()
        titles.append(tokens[0] if tokens and tokens[0] in known_titles else 'Unknown')
    df['title']=titles

    df['isfemale']=np.where(df['sex']=='female', 1, 0)

    df=df.drop(columns=['sex', 'name', 'boat', 'body', 'ticket', 'home.dest'])

    # Any class other than 1 or 2 is labelled 'Third'
    df['pclass']=df['pclass'].map({1:'First', 2:'Second'}).fillna('Third')

    df['embarked']=df['embarked'].fillna('Unknown')

    # dtype=int: pandas >= 2.0 returns boolean dummies by default, which would be
    # written as True/False and read back as nan by numpy.genfromtxt.
    df_dummy=pd.get_dummies(df,
                            columns=['pclass', 'cabin',
                                     'embarked', 'title'],
                            drop_first=True,
                            dtype=int)

    df_dummy=df_dummy.dropna()

    # Create the target directory so a fresh checkout does not fail on to_csv
    out_dir=os.path.dirname(out_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    df_dummy.to_csv(out_path, header=False, index=False)

    
    
    
data()  # writes the frame to data/titanic.csv without column names or index

# Web-App (main.py)

In [None]:
from flask import Flask
from flask import render_template
from flask import request
from flask import Markup

import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt

import io
import os
import base64

import numpy as np
from numpy import genfromtxt
from sklearn.linear_model import LogisticRegression as LogReg
from sklearn.model_selection import train_test_split as tts
 
# necesario en pythonanywhere
#PATH=os.path.dirname(os.path.abspath(__file__))
    
# initial defaults, passed to the template on GET requests
EMBARKED='Southampton'
FARE=33
AGE=30
GENDER='Female'
TITLE='Mrs.'
CLASS='Second'
CABIN='C'
SIBSP=0
PARCH=0


# placeholder; startup() overwrites it with the mean survival rate (in %)
tasa_media=0



# model (created unfitted; startup() fits it)
logreg=LogReg()



# flask app
app=Flask(__name__)



# antes del primer request...
def startup():
    """Load the prepared CSV, compute the mean survival rate and fit the model.

    Side effects: sets the module-level ``tasa_media`` (mean of the first CSV
    column, in percent) and fits the module-level ``logreg`` on 90% of the rows.
    """
    global tasa_media, logreg

    # Outside Jupyter (e.g. pythonanywhere) prefix the path: PATH+'/data/titanic.csv'
    data=genfromtxt('data/titanic.csv', delimiter=',')

    # Column 0 is the 'survived' label; the remaining columns are the features
    etiquetas=data[:, 0]
    caracteristicas=data[:, 1:]

    tasa_media=np.mean(etiquetas)*100

    X_train, X_test, y_train, y_test=tts(caracteristicas,
                                         etiquetas,
                                         test_size=0.1,
                                         random_state=42)

    logreg.fit(X_train, y_train)  # trained once, before any request is served


# Flask 2.3 removed Flask.before_first_request. Keep the original lazy hook where
# it still exists; otherwise train right away so the first request finds a fitted model.
if hasattr(app, 'before_first_request'):
    app.before_first_request(startup)
else:
    startup()
    
    

    
# main app
# Form fields read on POST and echoed back to the template
_FORM_FIELDS=('s_embarked', 's_fare', 's_age', 's_gender', 's_title',
              's_class', 's_cabin', 's_sibsp', 's_parch')

# Dummy-column levels, in the same order as the original hand-built feature row
_CLASS_LEVELS=('Second', 'Third')
_CABIN_LEVELS=('B', 'C', 'D', 'E', 'F', 'G', 'T', 'Unknown')
_EMBARKED_INITIALS=('Q', 'S', 'U')   # matched against the first letter of the port
_TITLE_LEVELS=('Master.', 'Miss.', 'Mr.', 'Mrs.', 'Rev.', 'Unknown')


def _one_hot_flags(value, levels):
    """Return a list of 0/1 flags, with a 1 where ``value`` equals a level."""
    return [1 if value==level else 0 for level in levels]


def _build_pasajero(form):
    """Turn the submitted form values into the single-row feature matrix for the model."""
    # float() rather than int() so decimal input such as '7.25' is accepted
    age=float(form['s_age'])
    fare=float(form['s_fare'])
    sibsp=int(form['s_sibsp'])
    parch=int(form['s_parch'])
    isfemale=1 if form['s_gender']=='Female' else 0

    row=[age, sibsp, parch, fare, isfemale]
    row+=_one_hot_flags(form['s_class'], _CLASS_LEVELS)
    row+=_one_hot_flags(form['s_cabin'], _CABIN_LEVELS)
    # Only the first letter identifies the port. The old check compared that single
    # letter with 'Unknown', so embarked_Unknown could never be set.
    row+=_one_hot_flags(form['s_embarked'][:1], _EMBARKED_INITIALS)
    # 'Rev.' now sets its own flag (it used to set title_Master by mistake)
    row+=_one_hot_flags(form['s_title'], _TITLE_LEVELS)
    return [row]


def _plot_as_base64(prob_pct):
    """Draw the mean-rate vs. passenger bar chart and return it as a base64 PNG string."""
    img=io.BytesIO()
    with plt.xkcd():
        fig=plt.figure()
        try:
            plt.bar(range(2), [tasa_media, prob_pct],
                    align='center', color=['y', 'b'], alpha=0.5)
            plt.xticks(range(2), ['Tasa Supervivencia media', 'Pasajero'])
            plt.axhline(tasa_media, color='r')
            plt.ylim([0,100])
            plt.ylabel('Probabilidad Supervivencia')
            plt.title('¿Sobrevivirá tu pasajero? \n '+'¡{:.2f}% de probabilidad!'.format(prob_pct))
            plt.savefig(img, format='png')
        finally:
            # pyplot keeps every figure alive until it is closed; without this
            # each request would leave one more figure in memory
            plt.close(fig)
    return base64.b64encode(img.getvalue()).decode()


@app.route("/", methods=['POST', 'GET'])
def main():
    """Render the form; on POST also predict the survival probability and plot it.

    GET: renders index.html with the default field values and no plot.
    POST: reads the s_* form fields, encodes them as a feature row, asks the
    model for the survival probability and renders index.html with the
    submitted values plus an inline base64 PNG.
    """
    if request.method!='POST':
        # default parameters
        return render_template('index.html',
            model_results='',
            model_plot='',
            s_embarked=EMBARKED,
            s_fare=FARE,
            s_age=AGE,
            s_gender=GENDER,
            s_title=TITLE,
            s_class=CLASS,
            s_cabin=CABIN,
            s_sibsp=SIBSP,
            s_parch=PARCH)

    campos={campo: request.form[campo] for campo in _FORM_FIELDS}

    # prediction
    y_prob=logreg.predict_proba(_build_pasajero(campos))

    # plot
    plot_url=_plot_as_base64(y_prob[0][1]*100)

    return render_template('index.html',
        model_results='',
        model_plot=Markup('<img src="data:image/png;base64,{}">'.format(plot_url)),
        **campos)
    
    

# solo en local
# Starts Flask's built-in server only when the file is run directly (local use)
if __name__=='__main__':
    app.run(debug=False)
    

## Código HTML