# Imports and Functions

In [501]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from IPython.display import display

## Colors

In [502]:
# Terminal ASCII colors
WHITE = '\033[39m'; CYAN  = '\033[36m'; ORANGE = '\033[93m';

# Palettes
URBAN_PALETTE_CONTINUOUS = {
    'cyan'    : '#1696d2',
    'gray'    : '#d2d2d2',
    'black'   : '#000000',
    'yellow'  : '#fdbf11',
    'magenta' : '#ec008b',
    'green'   : '#55b748',
    'red'     : '#db2b27',
}
URBAN_PALETTE_CATEGORICAL = {
    'cyan'    : '#1696d2',
    'gray'    : '#d2d2d2',
    'magenta' : '#ec008b',
    'yellow'  : '#fdbf11',
    'dark'    : '#332d2f',
    'ocean'   : '#0a4c6a',
}
BLUE_SHADE = ["#CFE8F3","#A2D4EC","#73BFE2","#46ABDB","#1696D2","#12719E","#0A4C6A","#062635"]
YELLOW_SHADE = ["#FFF2CF","#FCE39E","#FDD870","#FCCB41","#FDBF11","#E88E2D","#CA5800","#843215"]

## Classification Metrics

### Precision-recall curve

In [503]:
def plot_precision_recall_curve(y_train, y_proba_train, y_test, y_proba_test):
    
    # Get precisions, recalls and thresholds
    train_precisions, train_recalls, train_thresholds = precision_recall_curve(y_train, y_proba_train)
    test_precisions, test_recalls, test_thresholds = precision_recall_curve(y_test, y_proba_test)
    # Create Precision-Recall curve figure
    fig = go.Figure()
    # Add Precision lines
    fig.add_trace(go.Scatter(x=train_thresholds, y=train_precisions[:-1], name='Train Precision', line_color=YELLOW_SHADE[5]))
    fig.add_trace(go.Scatter(x=test_thresholds, y=test_precisions[:-1], name='Test Precision', line_color=BLUE_SHADE[5]))
    # Add Recall lines
    fig.add_trace(go.Scatter(x=train_thresholds, y=train_recalls[:-1], name='Train Recall', line_color=YELLOW_SHADE[-1]) )
    fig.add_trace(go.Scatter(x=test_thresholds, y=test_recalls[:-1], name='Test Recall', line_color=BLUE_SHADE[-1]) )
    # Layout update
    fig.update_layout(title_text='Precision-Recall curve', width=900, height=500)
    
    return fig

### ROC_AUC curve

In [504]:
def plot_roc_auc_curve(y_train, y_proba_train, y_test, y_proba_test):
    
    train_fpr, train_tpr, _ = roc_curve(y_train, y_proba_train)
    test_fpr, test_tpr, _ = roc_curve(y_test, y_proba_test)

    fig = go.Figure()

    fig.add_traces([
        go.Scatter(
            x=train_fpr, y=train_tpr, fill='tozeroy',
            name=f'Train dataset AUC={auc(train_fpr, train_tpr):.3f}',
            line=dict(width=2, color=URBAN_PALETTE_CATEGORICAL['yellow']), 
            stackgroup='one'
        ),
        go.Scatter(
            x=test_fpr, y=test_tpr, fill='tozeroy',
            name=f'Test dataset AUC={auc(test_fpr, test_tpr):.3f}',
            line=dict(width=2, color=URBAN_PALETTE_CATEGORICAL['cyan']),
            
        )
    ]).update_layout(
        title_text='ROC AUC Curve',
        xaxis_title='False Positive Rate',
        yaxis_title='True Positive Rate',
        width=700, height=500)

    # Add curve line
    fig.add_shape(
        type='line', line=dict(dash='dash'),
        x0=0, x1=1, y0=0, y1=1
    )
    # Layout update
    fig.update_yaxes(scaleanchor='x', scaleratio=1)
    fig.update_xaxes(constrain='domain')
    
    return fig

### Confusion Matrix

In [505]:
def plot_confusion_matrix(y_train, y_pred_train, y_test, y_pred_test, labels=None):
    
    if not labels:
        labels = y_train.unique().tolist()
    # Calculate confusion matrix
    train_matrix = confusion_matrix(y_train, y_pred_train)
    test_matrix = confusion_matrix(y_test, y_pred_test)
    
    # Create subplots
    cf_matrix = make_subplots(rows=1, cols=2, shared_xaxes=True, horizontal_spacing=0.2,
                       subplot_titles=('Train Dataset', 'Test Dataset'))
    cf_matrix.add_trace(
        # Plot Train CF Matrix
        go.Heatmap(
            z=train_matrix,
            x=labels, y=labels,
            text=train_matrix,
            texttemplate="%{text}", textfont={"size":14},
            hovertemplate='Predicted Label: %{x}<br>True Label: %{y}<br>#: %{z}<extra></extra>',
            colorscale=YELLOW_SHADE,
            colorbar_x=0.4), 
            row=1, col=1
    ).add_trace(
        # Plot Test CF Matrix
        go.Heatmap(
            z=test_matrix,
            x=labels, y=labels,
            text=test_matrix,
            texttemplate="%{text}", textfont={"size":14},
            hovertemplate='Predicted Label: %{x}<br>True Label: %{y}<br>#: %{z}<extra></extra>',
            colorscale=BLUE_SHADE,
            colorbar_x=1), 
            row=1, col=2
    ).update_layout(width=900, height=500, title_text='Confusion Matrix')
    
    return cf_matrix

### Display classification report

In [506]:
def display_clasification_report(y_train, y_pred_train, y_test, y_pred_test):
    train_report = pd.DataFrame(classification_report(y_train, y_pred_train, output_dict=True)).T
    test_report =pd.DataFrame(classification_report(y_test, y_pred_test, output_dict=True)).T
    report = pd.concat([train_report, test_report], axis=1, keys=['Train dataset', 'Test dataset']).apply(round, ndigits=3)
    return report

### Metrics all in one

In [507]:
def metrics_report(y_train, y_test, y_pred_train, y_pred_test, y_proba_train=None, y_proba_test=None, labels=None, plot_roc=True, plot_pr=True): 
    display(display_clasification_report(y_train, y_pred_train, y_test, y_pred_test))
    plot_confusion_matrix(y_train, y_pred_train, y_test, y_pred_test, labels=labels).show()
    if plot_roc:
        plot_roc_auc_curve(y_train, y_proba_train[:,1], y_test, y_proba_test[:,1]).show()
    if plot_pr:
        plot_precision_recall_curve(y_train, y_proba_train[:,1], y_test, y_proba_test[:,1]).show()

# Exercícios

## Utilizando o dataset do `titanic`, faça os itens a seguir:

## __A)__  Prepare os dados para ser treinados nos modelos;

In [508]:
df = sns.load_dataset('titanic')
df.drop(['alive', 'deck', 'pclass'], axis=1, inplace=True)
df.head()

Unnamed: 0,survived,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,embark_town,alone
0,0,male,22.0,1,0,7.25,S,Third,man,True,Southampton,False
1,1,female,38.0,1,0,71.2833,C,First,woman,False,Cherbourg,False
2,1,female,26.0,0,0,7.925,S,Third,woman,False,Southampton,True
3,1,female,35.0,1,0,53.1,S,First,woman,False,Southampton,False
4,0,male,35.0,0,0,8.05,S,Third,man,True,Southampton,True


In [509]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   sex          891 non-null    object  
 2   age          714 non-null    float64 
 3   sibsp        891 non-null    int64   
 4   parch        891 non-null    int64   
 5   fare         891 non-null    float64 
 6   embarked     889 non-null    object  
 7   class        891 non-null    category
 8   who          891 non-null    object  
 9   adult_male   891 non-null    bool    
 10  embark_town  889 non-null    object  
 11  alone        891 non-null    bool    
dtypes: bool(2), category(1), float64(2), int64(3), object(4)
memory usage: 65.5+ KB


In [510]:
numeric_features = df.drop('survived', axis=1).select_dtypes(include=np.number).columns
numeric_features

Index(['age', 'sibsp', 'parch', 'fare'], dtype='object')

In [511]:
ordinal_features = df.select_dtypes(include='category').columns
ordinal_features

Index(['class'], dtype='object')

In [512]:
categorical_features = df.select_dtypes(include='object').columns
categorical_features

Index(['sex', 'embarked', 'who', 'embark_town'], dtype='object')

In [513]:
# Split data
from sklearn.model_selection import train_test_split
X = df.drop('survived', axis=1) 
y = df['survived']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42)

In [514]:
# Preprocessing steps
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

numeric_features_pipeline = Pipeline([
    ('impute_num', SimpleImputer(strategy='mean')),
    ('std', StandardScaler())
])

categorical_features_pipeline = Pipeline([
    ('cat_imputer', SimpleImputer(strategy='constant', fill_value='unknow')),
    ('onehot', OneHotEncoder())
])

ordinal_features_pipeline = Pipeline([
    ('ord_imputer', SimpleImputer(strategy='constant', fill_value='unknow')),
    ('ord_enc', OrdinalEncoder())
])

ct_pipeline = ColumnTransformer([
    ('numeric_transformer', numeric_features_pipeline, numeric_features),
    ('categorical_transformer', categorical_features_pipeline, categorical_features),
    ('ordinal_transformer', ordinal_features_pipeline , ordinal_features),
])

## __B)__ Utilizando o modelo _Random Forest_, faça a predição dos sobreviventes seperando a base de treino e teste na proporção 80%/20%.

In [515]:
from sklearn.ensemble import RandomForestClassifier

# Build RandomForest pipeline and fit model
rf_pipeline = Pipeline([
    ('preprocess', ct_pipeline),
    ('rf', RandomForestClassifier(random_state=42))
]).fit(X_train, y_train)

In [516]:
# Make predictions
y_pred_train, y_pred_test = rf_pipeline.predict(X_train), rf_pipeline.predict(X_test)
y_proba_train, y_proba_test = rf_pipeline.predict_proba(X_train), rf_pipeline.predict_proba(X_test)

In [517]:
# Display Metrics
metrics_report(y_train, y_test, y_pred_train, y_pred_test, y_proba_train, y_proba_test)

Unnamed: 0_level_0,Train dataset,Train dataset,Train dataset,Train dataset,Test dataset,Test dataset,Test dataset,Test dataset
Unnamed: 0_level_1,precision,recall,f1-score,support,precision,recall,f1-score,support
0,0.98,0.993,0.986,439.0,0.838,0.891,0.863,110.0
1,0.989,0.967,0.978,273.0,0.806,0.725,0.763,69.0
accuracy,0.983,0.983,0.983,0.983,0.827,0.827,0.827,0.827
macro avg,0.984,0.98,0.982,712.0,0.822,0.808,0.813,179.0
weighted avg,0.983,0.983,0.983,712.0,0.826,0.827,0.825,179.0


## __C)__ Treine um modelo _Perceptron_ para a classificação dos sobreviventes na mesma proporção (80/20%).

In [518]:
from sklearn.linear_model import Perceptron

# Build RandomForest pipeline and fit model
ppn_pipeline = Pipeline([
    ('preprocess', ct_pipeline),
    ('ppn', Perceptron(random_state=42))
]).fit(X_train, y_train)

In [519]:
# Make predictions
y_pred_train, y_pred_test = perc_pipeline.predict(X_train), perc_pipeline.predict(X_test)

In [520]:
# Display Metrics
metrics_report(y_train, y_test, y_pred_train, y_pred_test, labels=['dead', 'survived'], plot_roc=False, plot_pr=False)

Unnamed: 0_level_0,Train dataset,Train dataset,Train dataset,Train dataset,Test dataset,Test dataset,Test dataset,Test dataset
Unnamed: 0_level_1,precision,recall,f1-score,support,precision,recall,f1-score,support
0,0.859,0.747,0.799,439.0,0.844,0.736,0.786,110.0
1,0.664,0.802,0.726,273.0,0.651,0.783,0.711,69.0
accuracy,0.768,0.768,0.768,0.768,0.754,0.754,0.754,0.754
macro avg,0.761,0.775,0.763,712.0,0.747,0.759,0.748,179.0
weighted avg,0.784,0.768,0.771,712.0,0.769,0.754,0.757,179.0


## __D)__ Treine uma rede neural simples com as seguintes características:
- Uma camada oculta com 18 neurônios e ativação ReLU;
- Uma camada oculta com 50 neurônios e ativação ReLU;
- Uma camada oculta com 5 neurônios e ativação ReLU;
- Camada de Saída com ativação Sigmoíde;  
Utilize como otimizador o `adam`, função de perda o `binary_corssentropy` e me´trica de acompanhamento a acurácia.

In [521]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation

In [522]:
# Transform data
X_train_transf = ct_pipeline.fit_transform(X_train, y_train)
X_test_transf = ct_pipeline.fit_transform(X_test, y_test)

In [523]:
# Network configuration
nn = Sequential()
# Input layer
nn.add(Dense(
    units=18,
    activation=Activation('relu'),
    input_dim=X_train_transf.shape[1])
)
# Hidden layers layer
for layer_length in [50, 5]:
    nn.add(Dense(
        units=layer_length,
        activation=Activation('relu'))
    )
# Output layer
nn.add(Dense(1, Activation('sigmoid')))

In [524]:
nn.compile(
    loss = 'binary_crossentropy',
    optimizer = 'adam',
    metrics = ['accuracy']
)
nn.summary()

Model: "sequential_13"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_53 (Dense)            (None, 18)                342       
                                                                 
 dense_54 (Dense)            (None, 50)                950       
                                                                 
 dense_55 (Dense)            (None, 5)                 255       
                                                                 
 dense_56 (Dense)            (None, 1)                 6         
                                                                 
Total params: 1,553
Trainable params: 1,553
Non-trainable params: 0
_________________________________________________________________


In [532]:
hist = nn.fit(
    X_train_transf, y_train,
    validation_data=(X_test_transf, y_test),
    epochs=100, verbose=1)

In [None]:
# loss e val_loss
loss = hist.history['loss']
val_loss = hist.history['val_loss']
epochs = range(len(loss))

In [526]:
from tensorflow import keras
from scikeras.wrappers import KerasClassifier

def get_model(meta):
    n_features_in_ = meta['n_features_in_']
    X_shape_ = meta['X_shape_']
    n_classes_ = meta['n_classes_']

    model = Sequential()
    # Input layer
    model.add(Dense(n_features_in_, 'relu', input_shape=X_shape_[1:]))
   # Hidden layers
    [model.add(Dense(layer_length, 'relu')) for layer_length in [50,5]]
    # Output layer
    model.add(Dense(1, 'sigmoid'))
    return model

clf = KerasClassifier(
    get_model,
    epochs=100,
    loss='binary_crossentropy', 
    optimizer='adam', 
    metrics=['accuracy']
)

nn_pipeline = Pipeline([
    ('preprocess', ct_pipeline),
    ('nn', clf)
]).fit(X_train, y_train)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


In [527]:
y_pred_train = nn_pipeline.predict(X_train)
y_pred_test = nn_pipeline.predict(X_test)



In [528]:
y_proba_train = nn_pipeline.predict_proba(X_train)
y_proba_test = nn_pipeline.predict_proba(X_test)



In [529]:
# Display Metrics
metrics_report(y_train, y_test, y_pred_train, y_pred_test, y_proba_train, y_proba_test, labels=['dead', 'survived'])

Unnamed: 0_level_0,Train dataset,Train dataset,Train dataset,Train dataset,Test dataset,Test dataset,Test dataset,Test dataset
Unnamed: 0_level_1,precision,recall,f1-score,support,precision,recall,f1-score,support
0,0.869,0.936,0.901,439.0,0.811,0.9,0.853,110.0
1,0.883,0.773,0.824,273.0,0.807,0.667,0.73,69.0
accuracy,0.874,0.874,0.874,0.874,0.81,0.81,0.81,0.81
macro avg,0.876,0.855,0.863,712.0,0.809,0.783,0.792,179.0
weighted avg,0.874,0.874,0.872,712.0,0.81,0.81,0.806,179.0


## __E)__ Qual o modelo teve o melhor desempenho?

In [530]:
## 

In [531]:
import session_info
session_info.show()