# Imports and Functions

In [360]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns; sns.set()
from IPython.display import display
from plotly.subplots import make_subplots
from sklearn.metrics import (
    auc,
    classification_report,
    confusion_matrix,
    precision_recall_curve,
    roc_curve,
)

## Colors

In [333]:
# Escape sequences for coloured terminal output
WHITE = '\033[39m'
CYAN = '\033[36m'
ORANGE = '\033[93m'

# Named hex colours, one mapping per palette
URBAN_PALETTE_CONTINUOUS = dict(
    cyan='#1696d2',
    gray='#d2d2d2',
    black='#000000',
    yellow='#fdbf11',
    magenta='#ec008b',
    green='#55b748',
    red='#db2b27',
)
URBAN_PALETTE_CATEGORICAL = dict(
    cyan='#1696d2',
    gray='#d2d2d2',
    magenta='#ec008b',
    yellow='#fdbf11',
    dark='#332d2f',
    ocean='#0a4c6a',
)

# Eight-entry hex lists; the plotting helpers in this notebook index into them
# for line colours and pass them whole as heatmap colour scales.
BLUE_SHADE = [
    "#CFE8F3", "#A2D4EC", "#73BFE2", "#46ABDB",
    "#1696D2", "#12719E", "#0A4C6A", "#062635",
]
YELLOW_SHADE = [
    "#FFF2CF", "#FCE39E", "#FDD870", "#FCCB41",
    "#FDBF11", "#E88E2D", "#CA5800", "#843215",
]

## Classification Metrics

### Precision-recall curve

In [331]:
def plot_precision_recall_curve(y_train, y_proba_train, y_test, y_proba_test):
    """Plot precision and recall against the decision threshold for both splits.

    Parameters
    ----------
    y_train, y_test
        True labels, forwarded to ``precision_recall_curve``.
    y_proba_train, y_proba_test
        Scores forwarded to ``precision_recall_curve`` (``metrics_report``
        passes column 1 of the predicted-probability arrays).

    Returns
    -------
    plotly.graph_objects.Figure
        Four lines: train/test precision and train/test recall.
    """
    # Get precisions, recalls and thresholds
    train_precisions, train_recalls, train_thresholds = precision_recall_curve(y_train, y_proba_train)
    test_precisions, test_recalls, test_thresholds = precision_recall_curve(y_test, y_proba_test)

    # (x, y, legend name, colour) for each line, in legend order
    lines = (
        (train_thresholds, train_precisions, 'Train Precision', YELLOW_SHADE[5]),
        (test_thresholds, test_precisions, 'Test Precision', BLUE_SHADE[5]),
        (train_thresholds, train_recalls, 'Train Recall', YELLOW_SHADE[-1]),
        (test_thresholds, test_recalls, 'Test Recall', BLUE_SHADE[-1]),
    )

    fig = go.Figure()
    for thresholds, values, name, color in lines:
        # The final precision/recall entry has no matching threshold, so it is
        # dropped to keep x and y the same length.
        fig.add_trace(go.Scatter(x=thresholds, y=values[:-1], name=name, line_color=color))

    # Axis titles make the figure readable on its own
    fig.update_layout(
        title_text='Precision-Recall curve',
        xaxis_title='Threshold',
        yaxis_title='Score',
        width=900, height=500,
    )

    return fig

### ROC_AUC curve

In [368]:
def plot_roc_auc_curve(y_train, y_proba_train, y_test, y_proba_test):
    """Draw the train and test ROC curves on one figure, with AUC in the legend.

    Parameters
    ----------
    y_train, y_test
        True labels, forwarded to ``roc_curve``.
    y_proba_train, y_proba_test
        Scores forwarded to ``roc_curve``.

    Returns
    -------
    plotly.graph_objects.Figure
    """
    fpr_train, tpr_train, _ = roc_curve(y_train, y_proba_train)
    fpr_test, tpr_test, _ = roc_curve(y_test, y_proba_test)

    # Filled area traces; only the train trace carries a stack group
    train_trace = go.Scatter(
        x=fpr_train,
        y=tpr_train,
        fill='tozeroy',
        name=f'Train dataset AUC={auc(fpr_train, tpr_train):.3f}',
        line={'width': 2, 'color': URBAN_PALETTE_CATEGORICAL['yellow']},
        stackgroup='one',
    )
    test_trace = go.Scatter(
        x=fpr_test,
        y=tpr_test,
        fill='tozeroy',
        name=f'Test dataset AUC={auc(fpr_test, tpr_test):.3f}',
        line={'width': 2, 'color': URBAN_PALETTE_CATEGORICAL['cyan']},
    )

    fig = go.Figure()
    fig.add_traces([train_trace, test_trace])
    fig.update_layout(
        title_text='ROC AUC Curve',
        xaxis_title='False Positive Rate',
        yaxis_title='True Positive Rate',
        width=700,
        height=500,
    )

    # Dashed diagonal from (0, 0) to (1, 1)
    fig.add_shape(type='line', line={'dash': 'dash'}, x0=0, x1=1, y0=0, y1=1)

    # Tie the y scale to the x scale at a 1:1 ratio and constrain x to its domain
    fig.update_yaxes(scaleanchor='x', scaleratio=1)
    fig.update_xaxes(constrain='domain')

    return fig

### Confusion Matrix

In [362]:
def plot_confusion_matrix(y_train, y_pred_train, y_test, y_pred_test, labels=None):
    """Plot train and test confusion matrices side by side as heatmaps.

    Parameters
    ----------
    y_train, y_test
        True labels for each split.
    y_pred_train, y_pred_test
        Predicted labels for each split.
    labels : sequence, optional
        Display names for the classes, given in sorted class order. When
        omitted (or empty) the sorted class values themselves are used.

    Returns
    -------
    plotly.graph_objects.Figure
        A 1x2 subplot figure (train on the left, test on the right).

    Raises
    ------
    ValueError
        If ``labels`` does not have one entry per class.
    """
    # One sorted class list shared by both matrices. Taking it from every
    # input keeps the two matrices the same shape and in the same order, and
    # keeps the tick labels aligned with the matrix rows/columns.
    classes = np.unique(np.concatenate([
        np.asarray(values).ravel()
        for values in (y_train, y_pred_train, y_test, y_pred_test)
    ]))

    # `is None` / `len` instead of truthiness so array-like labels are accepted
    if labels is None or len(labels) == 0:
        labels = classes.tolist()
    elif len(labels) != len(classes):
        raise ValueError(
            f'labels has {len(labels)} entries but the data contains '
            f'{len(classes)} classes: {classes.tolist()}'
        )

    # Calculate confusion matrices with an explicit, shared class order
    train_matrix = confusion_matrix(y_train, y_pred_train, labels=classes)
    test_matrix = confusion_matrix(y_test, y_pred_test, labels=classes)

    # Create subplots
    cf_matrix = make_subplots(rows=1, cols=2, shared_xaxes=True, horizontal_spacing=0.2,
                              subplot_titles=('Train Dataset', 'Test Dataset'))

    # (matrix, colour scale, colour-bar x position, subplot column)
    panels = (
        (train_matrix, YELLOW_SHADE, 0.4, 1),
        (test_matrix, BLUE_SHADE, 1, 2),
    )
    for matrix, colorscale, colorbar_x, col in panels:
        cf_matrix.add_trace(
            go.Heatmap(
                z=matrix,
                x=labels, y=labels,
                text=matrix,
                texttemplate="%{text}", textfont={"size":14},
                hovertemplate='Predicted Label: %{x}<br>True Label: %{y}<br>#: %{z}<extra></extra>',
                colorscale=colorscale,
                colorbar_x=colorbar_x),
            row=1, col=col
        )
    cf_matrix.update_layout(width=900, height=500, title_text='Confusion Matrix')

    return cf_matrix

### Display classification report

In [357]:
def display_clasification_report(y_train, y_pred_train, y_test, y_pred_test):
    """Build one table holding the train and test classification reports.

    Each split's ``classification_report`` dict is turned into a transposed
    DataFrame; the two are placed side by side under the column groups
    'Train dataset' and 'Test dataset', and values are rounded to 3 digits.

    Returns
    -------
    pandas.DataFrame
    """
    split_pairs = ((y_train, y_pred_train), (y_test, y_pred_test))
    per_split = [
        pd.DataFrame(classification_report(y_true, y_pred, output_dict=True)).T
        for y_true, y_pred in split_pairs
    ]
    combined = pd.concat(per_split, axis=1, keys=['Train dataset', 'Test dataset'])
    return combined.apply(round, ndigits=3)

### Metrics all in one

In [385]:
def metrics_report(y_train, y_test, y_pred_train, y_pred_test, y_proba_train=None, y_proba_test=None,
                   plot_roc=True, plot_pr=True, labels=('dead', 'survived')):
    """Display the classification report, confusion matrix and optional curves.

    Parameters
    ----------
    y_train, y_test
        True labels for each split.
    y_pred_train, y_pred_test
        Predicted labels for each split.
    y_proba_train, y_proba_test : optional
        Predicted probabilities indexed as ``[:, 1]`` for the curve plots.
        Required when ``plot_roc`` or ``plot_pr`` is true.
    plot_roc, plot_pr : bool
        Whether to show the ROC and the precision/recall figures.
    labels : sequence or None
        Class display names forwarded to ``plot_confusion_matrix``. Defaults
        to the labels this function previously hard-coded.

    Raises
    ------
    ValueError
        If a curve is requested but a probability array is missing.
    """
    needs_proba = plot_roc or plot_pr
    # Fail before anything is displayed, with a message that names the fix,
    # instead of a TypeError from indexing None half-way through the report.
    if needs_proba and (y_proba_train is None or y_proba_test is None):
        raise ValueError(
            'y_proba_train and y_proba_test are required when plot_roc or plot_pr is True; '
            'pass plot_roc=False and plot_pr=False for models without predicted probabilities'
        )

    display(display_clasification_report(y_train, y_pred_train, y_test, y_pred_test))

    cm_labels = list(labels) if labels is not None else None
    plot_confusion_matrix(y_train, y_pred_train, y_test, y_pred_test, labels=cm_labels).show()

    if plot_roc:
        plot_roc_auc_curve(y_train, y_proba_train[:,1], y_test, y_proba_test[:,1]).show()
    if plot_pr:
        plot_precision_recall_curve(y_train, y_proba_train[:,1], y_test, y_proba_test[:,1]).show()

# Exercícios

## Utilizando o dataset do `titanic`, faça os itens a seguir:

### __A)__  Prepare os dados para serem utilizados no treinamento dos modelos;

In [57]:
# Load seaborn's Titanic sample and discard three columns in a single expression
df = sns.load_dataset('titanic').drop(columns=['alive', 'deck', 'pclass'])
df.head()

Unnamed: 0,survived,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,embark_town,alone
0,0,male,22.0,1,0,7.25,S,Third,man,True,Southampton,False
1,1,female,38.0,1,0,71.2833,C,First,woman,False,Cherbourg,False
2,1,female,26.0,0,0,7.925,S,Third,woman,False,Southampton,True
3,1,female,35.0,1,0,53.1,S,First,woman,False,Southampton,False
4,0,male,35.0,0,0,8.05,S,Third,man,True,Southampton,True


In [58]:
# Column dtypes and non-null counts, used to decide how each column is preprocessed
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   sex          891 non-null    object  
 2   age          714 non-null    float64 
 3   sibsp        891 non-null    int64   
 4   parch        891 non-null    int64   
 5   fare         891 non-null    float64 
 6   embarked     889 non-null    object  
 7   class        891 non-null    category
 8   who          891 non-null    object  
 9   adult_male   891 non-null    bool    
 10  embark_town  889 non-null    object  
 11  alone        891 non-null    bool    
dtypes: bool(2), category(1), float64(2), int64(3), object(4)
memory usage: 65.5+ KB


In [88]:
# Numeric predictor columns (target excluded). The bool columns `adult_male`
# and `alone` are not selected here, as the output below shows.
numeric_features = df.drop('survived', axis=1).select_dtypes(include=np.number).columns
numeric_features

Index(['age', 'sibsp', 'parch', 'fare'], dtype='object')

In [69]:
# Columns stored with pandas `category` dtype are treated as ordinal features
ordinal_features = df.select_dtypes(include='category').columns
ordinal_features

Index(['class'], dtype='object')

In [70]:
# Columns stored as `object` dtype are treated as nominal (one-hot encoded) features.
# NOTE(review): `embarked` and `embark_town` both land here and look like the
# same information in two encodings - confirm both are wanted.
categorical_features = df.select_dtypes(include='object').columns
categorical_features

Index(['sex', 'embarked', 'who', 'embark_town'], dtype='object')

In [None]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split repeatable
from sklearn.model_selection import train_test_split

X, y = df.drop(columns='survived'), df['survived']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
# Preprocessing steps
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Numeric columns: fill missing values with the column mean, then standardise
numeric_features_pipeline = Pipeline([
    ('impute_num', SimpleImputer(strategy='mean')),
    ('std', StandardScaler())
])

# Nominal columns: fill missing values with a placeholder, then one-hot encode.
# handle_unknown='ignore' encodes a category that was absent from the training
# split as all zeros instead of raising at predict time.
categorical_features_pipeline = Pipeline([
    ('cat_imputer', SimpleImputer(strategy='constant', fill_value='unknow')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Ordinal columns: take the level order from each column's pandas categorical
# dtype rather than from the encoder's default ordering. Values outside those
# levels (including the imputer's placeholder) are encoded as -1.
ordinal_categories = [df[column].cat.categories.tolist() for column in ordinal_features]
ordinal_features_pipeline = Pipeline([
    ('ord_imputer', SimpleImputer(strategy='constant', fill_value='unknow')),
    ('ord_enc', OrdinalEncoder(categories=ordinal_categories,
                               handle_unknown='use_encoded_value', unknown_value=-1))
])

# Route each feature group to its pipeline.
# NOTE(review): the bool columns `adult_male` and `alone` are in none of the
# three feature lists, so no transformer receives them - confirm that is intended.
ct_pipeline = ColumnTransformer([
    ('numeric_transformer', numeric_features_pipeline, numeric_features),
    ('categorical_transformer', categorical_features_pipeline, categorical_features),
    ('ordinal_transformer', ordinal_features_pipeline , ordinal_features),
])

### __B)__ Utilizando o modelo _Random Forest_, faça a predição dos sobreviventes separando a base em treino e teste na proporção 80%/20%.

In [377]:
from sklearn.ensemble import RandomForestClassifier

# Preprocessing followed by a seeded random forest, fitted on the training split
rf_pipeline = Pipeline(steps=[
    ('preprocess', ct_pipeline),
    ('rf', RandomForestClassifier(random_state=42)),
])
rf_pipeline = rf_pipeline.fit(X_train, y_train)

In [378]:
# Class predictions for both splits
y_pred_train = rf_pipeline.predict(X_train)
y_pred_test = rf_pipeline.predict(X_test)
# Class probabilities for both splits (used by the curve plots)
y_proba_train = rf_pipeline.predict_proba(X_train)
y_proba_test = rf_pipeline.predict_proba(X_test)

In [379]:
# Display Metrics: report table, confusion matrix, ROC and precision/recall figures
metrics_report(y_train, y_test, y_pred_train, y_pred_test, y_proba_train, y_proba_test)

Unnamed: 0_level_0,Train dataset,Train dataset,Train dataset,Train dataset,Test dataset,Test dataset,Test dataset,Test dataset
Unnamed: 0_level_1,precision,recall,f1-score,support,precision,recall,f1-score,support
0,0.976,0.993,0.984,444.0,0.841,0.857,0.849,105.0
1,0.988,0.959,0.973,268.0,0.792,0.77,0.781,74.0
accuracy,0.98,0.98,0.98,0.98,0.821,0.821,0.821,0.821
macro avg,0.982,0.976,0.979,712.0,0.816,0.814,0.815,179.0
weighted avg,0.98,0.98,0.98,712.0,0.821,0.821,0.821,179.0


### __C)__ Treine um modelo _Perceptron_ para a classificação dos sobreviventes na mesma proporção (80/20%).

In [380]:
from sklearn.linear_model import Perceptron

# Build Perceptron pipeline and fit model (same preprocessing as the random forest)
perc_pipeline = Pipeline([
    ('preprocess', ct_pipeline),
    ('perceptron', Perceptron(random_state=42))
]).fit(X_train, y_train)

In [384]:
# Class predictions for both splits
y_pred_train = perc_pipeline.predict(X_train)
y_pred_test = perc_pipeline.predict(X_test)

In [386]:
# Display Metrics: the curve plots are switched off because no probabilities are passed here
metrics_report(y_train, y_test, y_pred_train, y_pred_test, plot_roc=False, plot_pr=False)

Unnamed: 0_level_0,Train dataset,Train dataset,Train dataset,Train dataset,Test dataset,Test dataset,Test dataset,Test dataset
Unnamed: 0_level_1,precision,recall,f1-score,support,precision,recall,f1-score,support
0,0.853,0.743,0.794,444.0,0.857,0.743,0.796,105.0
1,0.649,0.787,0.712,268.0,0.693,0.824,0.753,74.0
accuracy,0.76,0.76,0.76,0.76,0.777,0.777,0.777,0.777
macro avg,0.751,0.765,0.753,712.0,0.775,0.784,0.775,179.0
weighted avg,0.776,0.76,0.763,712.0,0.789,0.777,0.778,179.0


### __D)__ Treine uma rede neural simples com as seguintes características:
- Uma camada oculta com 18 neurônios e ativação ReLU;
- Uma camada oculta com 50 neurônios e ativação ReLU;
- Uma camada oculta com 5 neurônios e ativação ReLU;
- Camada de saída com ativação sigmoide;  
Utilize como otimizador o `adam`, como função de perda a `binary_crossentropy` e como métrica de acompanhamento a acurácia.

### __E)__ Qual modelo teve o melhor desempenho?

In [None]:
## 