# Multi-layer perceptron (Deep Learning) with scikit-learn

In [None]:
from sklearn.model_selection import train_test_split
from deep_learn.metrics import compute_metrics
import pandas as pd

## Load in data

In [None]:
# Read the synthetic income dataset. index_col=None means no CSV column is
# promoted to the DataFrame index.
mlp_data = pd.read_csv('data/synthetic_income_data.csv', index_col=None)

In [None]:
from deep_learn.styler import style_dataframe as sdf

In [None]:
sdf(mlp_data.head())

In [None]:
# Separate the regression target ('income') from the predictor columns.
y = mlp_data['income']
X = mlp_data.drop(columns=['income'])

## Prepare train and test set splits

In [None]:
# Hold out SPLIT_RATIO (20%) of the rows for testing; the fixed seed makes the
# split reproducible between runs.
RANDOM_SEED = 42
SPLIT_RATIO = 0.2
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=SPLIT_RATIO,
    random_state=RANDOM_SEED,
)

## Scale the data

In [None]:
from sklearn.preprocessing import StandardScaler
# Learn the scaling statistics from the training split only, then apply that
# same fitted transformation to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Build `MLPRegressor`

In [None]:
from sklearn.neural_network import MLPRegressor # MLPClassifier for classification

In [None]:
help(MLPRegressor)

In [None]:
# Baseline network: two hidden ReLU layers (64 then 32 units) trained with Adam.
mlp_sklearn = MLPRegressor(
    # --- architecture ---
    activation='relu',
    hidden_layer_sizes=(64, 32),
    # --- optimisation ---
    solver='adam',
    learning_rate_init=0.001,
    alpha=0.0001,  # L2 regularisation strength
    max_iter=500,
    shuffle=True,
    # --- reproducibility / logging ---
    random_state=RANDOM_SEED,
    verbose=True,  # print progress messages while fitting
)

In [None]:
mlp_sklearn.fit(X_train_scaled, y_train)

In [None]:
preds = mlp_sklearn.predict(X_test_scaled)

In [None]:
print(preds, y_test)

In [None]:
compute_metrics(y_test, preds)

## Hyperparameter tune our model for better performance

In [None]:
from sklearn.metrics import make_scorer
def mse(y_true, y_pred):
    """Return the mean squared error between targets and predictions.

    Both inputs are converted to float NumPy arrays before subtracting, so
    values are always compared by position. Subtracting pandas objects
    directly aligns them on their index labels instead, which yields NaN
    when the two indexes differ, and plain Python lists cannot be
    subtracted at all.

    Args:
        y_true: Array-like of observed target values.
        y_pred: Array-like of predicted values, in the same order as y_true.

    Returns:
        The mean of the squared element-wise differences.
    """
    # Local import: numpy is not imported before this cell in the notebook.
    import numpy as np

    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    return np.mean((actual - predicted) ** 2)

In [None]:
# greater_is_better=False makes the scorer sign-flip the MSE, so that model
# selection (which maximises the score) ends up minimising the error.
mse_scorer = make_scorer(mse, greater_is_better=False)

In [None]:
# Search space for the grid search: 3 x 3 x 2 x 2 = 36 parameter combinations.
param_grid = dict(
    hidden_layer_sizes=[(64, 32), (128, 64), (64, 64)],
    alpha=[0.0001, 0.001, 0.01],
    learning_rate_init=[0.001, 0.01],
    activation=['relu', 'tanh'],
)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# The search is fitted on the raw X_train, so feature scaling has to live
# inside the searched estimator. With a Pipeline, every CV fold fits its own
# StandardScaler on that fold's training part, and best_estimator_ scales its
# inputs itself when predicting on raw features or after being saved to disk.
# Note: grid.best_params_ keys carry the 'mlp__' step prefix as a result.
grid = GridSearchCV(
    estimator=Pipeline([
        ('scaler', StandardScaler()),
        ('mlp', MLPRegressor(
            solver='adam',
            max_iter=500,
            shuffle=True,
            random_state=RANDOM_SEED,
            verbose=False,
        )),
    ]),
    # Parameters of a pipeline step are addressed as '<step>__<param>'.
    param_grid={f'mlp__{name}': values for name, values in param_grid.items()},
    scoring=mse_scorer,
    cv=3,
    n_jobs=-1,
)

In [None]:
grid.fit(X_train, y_train)

## Get the best model parameters

In [None]:
import numpy as np
print("Best Params:", grid.best_params_)
# mse_scorer was built with greater_is_better=False, so best_score_ holds the
# negated MSE; flip the sign so the printed number matches its "MSE" label.
print("Best Score (MSE):", -grid.best_score_)

In [None]:
# Take the estimator selected by the grid search and predict on the held-out
# test features. X_test is passed unscaled, the same representation that
# grid.fit() received. This rebinds `preds`, replacing the baseline model's
# predictions from earlier in the notebook.
best_model = grid.best_estimator_
preds = best_model.predict(X_test)

In [None]:
compute_metrics(y_test, preds)

## Save best model

In [None]:
import joblib
import os
# Persist the tuned estimator under ./models, creating the folder if needed.
model_dir = 'models'
os.makedirs(model_dir, exist_ok=True)
joblib.dump(
    best_model,
    os.path.join(model_dir, 'mlp_sklearn_best_model.joblib'),
)


## Visualize performance

In [None]:
import matplotlib.pyplot as plt
def actual_vs_predicted_plot(
    y_test,
    y_pred,
    figsize=(6, 6),
    alpha=0.5,
    point_col='grey',
    line_col='black',
    x_label='Actual',
    y_label='Predicted',
    title='Actual vs. Predicted',
    show_grid=True,
    line_style='--',
    line_width=1,
):
    """Scatter predicted against actual values with a y = x reference line.

    Args:
        y_test: Actual target values; must provide ``.min()`` and ``.max()``.
        y_pred: Predicted values, in the same order as ``y_test``.
        figsize: Figure size passed to ``plt.figure``.
        alpha: Opacity of the scatter points.
        point_col: Colour of the scatter points.
        line_col: Colour of the reference line.
        x_label: Label for the x axis.
        y_label: Label for the y axis.
        title: Plot title.
        show_grid: Passed to ``plt.grid`` to switch the grid on or off.
        line_style: Line style of the reference line.
        line_width: Line width of the reference line.
    """
    plt.figure(figsize=figsize)
    plt.scatter(y_test, y_pred, color=point_col, alpha=alpha)

    # The reference line spans the range of the actual values only.
    diagonal = [y_test.min(), y_test.max()]
    plt.plot(diagonal, diagonal,
             lw=line_width, linestyle=line_style, color=line_col)

    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.title(title)
    plt.grid(show_grid)
    plt.show()

In [None]:
actual_vs_predicted_plot(y_test, preds)

In [None]:
def dist_of_residuals(y_test, y_pred, 
                      figsize=(6, 6), alpha=0.7,
                      bins=30, hist_col='grey', edge_col='black',
                      line_col='black', line_width=1, line_style='--',
                      x_label='Residuals', y_label='Frequency',
                      title='Distribution of Residuals', 
                      show_grid=True):
    global residuals
    residuals = y_test - y_pred
    plt.figure(figsize=figsize)
    plt.hist(residuals, bins=bins, alpha=alpha, color=hist_col, edgecolor=edge_col)
    plt.axvline(x=0, color=line_col, linestyle=line_style, lw=line_width)
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.title(title)
    plt.grid(show_grid)
    plt.show()

In [None]:
dist_of_residuals(y_test, preds, line_width=2)

In [None]:
import scipy.stats as stats
import matplotlib.pyplot as plt

def qq_plot(residuals, figsize=(6, 6), 
            line_color='red', point_color='blue', 
            line_style='-', point_size=20,
            grid=True, title='Q-Q Plot of Residuals',
            font_size=12):
    """Draw a normal Q-Q plot of the residuals with its fitted line.

    Args:
        residuals: Sample values to compare against a normal distribution.
        figsize: Figure size passed to ``plt.figure``.
        line_color: Colour of the fitted line.
        point_color: Colour of the sample points.
        line_style: Format string for the fitted line (e.g. ``'-'``, ``'--'``).
        point_size: Marker size of the sample points.
        grid: Whether to draw a dashed background grid.
        title: Plot title, drawn two points larger than ``font_size``.
        font_size: Font size of the axis labels.
    """
    plt.figure(figsize=figsize)
    # probplot returns the theoretical quantiles (osm) and ordered sample
    # values (osr) as NumPy arrays, plus the least-squares line through them.
    # The third fit value (correlation coefficient) is not used here.
    (osm, osr), (slope, intercept, _) = stats.probplot(residuals, dist="norm")

    # Plot the Q-Q line. osm is already an ndarray, so no numpy conversion
    # (and no dependency on an `np` import from another cell) is needed.
    plt.plot(osm, slope * osm + intercept, line_style, color=line_color, label='Q-Q Line')

    # Plot the actual residuals
    plt.scatter(osm, osr, color=point_color, s=point_size, alpha=0.6, label='Residuals')

    plt.title(title, fontsize=font_size + 2)
    plt.xlabel("Theoretical Quantiles", fontsize=font_size)
    plt.ylabel("Ordered Values", fontsize=font_size)
    if grid:
        plt.grid(True, linestyle='--', alpha=0.5)
    plt.legend()
    plt.tight_layout()
    plt.show()


In [None]:
qq_plot(residuals, point_color='grey', 
        line_color='black', 
        line_style='--', 
        point_size=50)

## Save visualization functions to a module

In [None]:
%%writefile deep_learn/regression_viz.py
import scipy.stats as stats
import matplotlib.pyplot as plt

def dist_of_residuals(y_test, y_pred, 
                      figsize=(6, 6), alpha=0.7,
                      bins=30, hist_col='grey', edge_col='black',
                      line_col='black', line_width=1, line_style='--',
                      x_label='Residuals', y_label='Frequency',
                      title='Distribution of Residuals', 
                      show_grid=True):
    """Plot a histogram of the residuals and return them.

    The residuals are computed as ``y_test - y_pred`` and returned so that
    importing code can pass them on (for example to ``qq_plot``).

    Args:
        y_test: Actual target values.
        y_pred: Predicted values, in the same order as ``y_test``.
        figsize: Figure size passed to ``plt.figure``.
        alpha: Opacity of the histogram bars.
        bins: Number of histogram bins.
        hist_col: Fill colour of the bars.
        edge_col: Edge colour of the bars.
        line_col: Colour of the vertical line drawn at zero.
        line_width: Width of the vertical line.
        line_style: Style of the vertical line.
        x_label: Label for the x axis.
        y_label: Label for the y axis.
        title: Plot title.
        show_grid: Passed to ``plt.grid`` to switch the grid on or off.

    Returns:
        The residuals, ``y_test - y_pred``.
    """
    # NOTE: the module-level `residuals` global is kept only for backward
    # compatibility. It lives in this module's namespace, not the caller's,
    # so new code should use the return value instead.
    global residuals
    residuals = y_test - y_pred
    plt.figure(figsize=figsize)
    plt.hist(residuals, bins=bins, alpha=alpha, color=hist_col, edgecolor=edge_col)
    # Vertical marker at zero, i.e. where prediction equals the actual value.
    plt.axvline(x=0, color=line_col, linestyle=line_style, lw=line_width)
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.title(title)
    plt.grid(show_grid)
    plt.show()
    return residuals

def actual_vs_predicted_plot(
    y_test,
    y_pred,
    figsize=(6, 6),
    alpha=0.5,
    point_col='grey',
    line_col='black',
    x_label='Actual',
    y_label='Predicted',
    title='Actual vs. Predicted',
    show_grid=True,
    line_style='--',
    line_width=1,
):
    """Scatter predicted against actual values with a y = x reference line.

    Args:
        y_test: Actual target values; must provide ``.min()`` and ``.max()``.
        y_pred: Predicted values, in the same order as ``y_test``.
        figsize: Figure size passed to ``plt.figure``.
        alpha: Opacity of the scatter points.
        point_col: Colour of the scatter points.
        line_col: Colour of the reference line.
        x_label: Label for the x axis.
        y_label: Label for the y axis.
        title: Plot title.
        show_grid: Passed to ``plt.grid`` to switch the grid on or off.
        line_style: Line style of the reference line.
        line_width: Line width of the reference line.
    """
    plt.figure(figsize=figsize)
    plt.scatter(y_test, y_pred, color=point_col, alpha=alpha)

    # The reference line spans the range of the actual values only.
    diagonal = [y_test.min(), y_test.max()]
    plt.plot(diagonal, diagonal,
             lw=line_width, linestyle=line_style, color=line_col)

    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.title(title)
    plt.grid(show_grid)
    plt.show()

def qq_plot(residuals, figsize=(6, 6), 
            line_color='red', point_color='blue', 
            line_style='-', point_size=20,
            grid=True, title='Q-Q Plot of Residuals',
            font_size=12):
    """Draw a normal Q-Q plot of the residuals with its fitted line.

    Args:
        residuals: Sample values to compare against a normal distribution.
        figsize: Figure size passed to ``plt.figure``.
        line_color: Colour of the fitted line.
        point_color: Colour of the sample points.
        line_style: Format string for the fitted line (e.g. ``'-'``, ``'--'``).
        point_size: Marker size of the sample points.
        grid: Whether to draw a dashed background grid.
        title: Plot title, drawn two points larger than ``font_size``.
        font_size: Font size of the axis labels.
    """
    plt.figure(figsize=figsize)
    # probplot returns the theoretical quantiles (osm) and ordered sample
    # values (osr) as NumPy arrays, plus the least-squares line through them.
    # The third fit value (correlation coefficient) is not used here.
    (osm, osr), (slope, intercept, _) = stats.probplot(residuals, dist="norm")

    # Plot the Q-Q line. This module does not import numpy, and osm is
    # already an ndarray, so the arithmetic is done on it directly.
    plt.plot(osm, slope * osm + intercept, line_style, color=line_color, label='Q-Q Line')

    # Plot the actual residuals
    plt.scatter(osm, osr, color=point_color, s=point_size, alpha=0.6, label='Residuals')

    plt.title(title, fontsize=font_size + 2)
    plt.xlabel("Theoretical Quantiles", fontsize=font_size)
    plt.ylabel("Ordered Values", fontsize=font_size)
    if grid:
        plt.grid(True, linestyle='--', alpha=0.5)
    plt.legend()
    plt.tight_layout()
    plt.show()