# Problema 2: Zona de destino (Clasificación múltiple)

#### Carga de librerías

In [None]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.model_selection import GroupShuffleSplit
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import callbacks
from keras.wrappers.scikit_learn import KerasClassifier

#### Configuración de figuras

In [None]:
# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and
# later releases dropped the old names, so use whichever one this install ships.
grid_style = 'seaborn-whitegrid'
if grid_style not in plt.style.available:
    grid_style = 'seaborn-v0_8-whitegrid'
plt.style.use(grid_style)
plt.rc('figure', autolayout=True)

plt.rc('font', size=10)          # controls default text sizes (?)
plt.rc('axes', titlesize=20)     # fontsize of the graph title
plt.rc('axes', labelsize=24)     # fontsize of the x and y axes titles
plt.rc('xtick', labelsize=17.5)    # fontsize of the tick labels
plt.rc('ytick', labelsize=17.5)    # fontsize of the tick labels
plt.rc('legend', fontsize=22.5)    # legend fontsize
plt.rc('figure', titlesize=50)     # fontsize of the figure-level title
plt.rc('animation', html='html5')

#### Carga de datos

In [None]:
# Read the train and test CSVs; the first column of each file becomes the index.
df, df_test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('2_train.csv', '2_test.csv')
)

# Preview the first rows of the training frame.
df.head()

In [None]:
# Shift the destination identifiers down by one in both frames
# (presumably 1-based ids -> 0-based class indices; confirm against the raw data).
# NOTE: this mutates the frames in place, so running the cell twice shifts twice.
for frame in (df, df_test):
    for shifted_col in ('destination_station_num', 'destination_group'):
        frame[shifted_col] -= 1

#### Procesamiento de datos para el ingreso a la red neuronal

In [None]:
# Separate the target column from the features, for both train and test frames.
X = df.copy()
y = X.pop('destination_group')

X_test = df_test.copy()
y_test = X_test.pop('destination_group')

# Columns fed to the network. Columns not listed in either group are not
# passed to any transformer below.
features_num = ['origin_time', 'n_viajes'] + ['s'+str(i) for i in range(1,73)]
features_cat = ['origin_date_name', 'origin_month', 'origin_station_num']

transformer_num = make_pipeline(
    SimpleImputer(strategy="constant"), # fill missing values with a constant
    StandardScaler(),                   # standardisation
)
transformer_cat = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="NA"), # fill missing values with "NA"
    # The former `sparse=True` argument is omitted: it was the default, and the
    # keyword was renamed to `sparse_output` (and removed) in newer scikit-learn.
    OneHotEncoder(handle_unknown='ignore'),
)
preprocessor = make_column_transformer(
    (transformer_num, features_num),
    (transformer_cat, features_cat),
)

# Ordered (NOT random) split: with shuffle=False the first 75% of the rows go to
# training and the last 25% to validation; random_state has no effect here.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.75, random_state=22, shuffle=False)

# Shapes before preprocessing.
print('shape = ', X_train.shape)
print('valid = ', X_valid.shape)

# Fit the preprocessing on the training rows only, then apply it unchanged to
# the validation and test rows.
# NOTE(review): the one-hot branch is sparse, so the combined output may be a
# scipy sparse matrix depending on its density -- confirm Keras gets what it expects.
X_train = preprocessor.fit_transform(X_train)
X_valid = preprocessor.transform(X_valid)
X_test = preprocessor.transform(X_test)

# Shapes after preprocessing (one-hot encoding changes the column count).
print('shape = ', X_train.shape)
print('valid = ', X_valid.shape)

# Number of input features, as expected by the first Keras layer.
input_shape = [X_train.shape[1]]
print("Input shape: {}".format(input_shape))

#### Grid Search

In [None]:
# Model factory exposing the hyperparameters to be tuned
def create_model(optimizer='adam', activation='sigmoid', hidden_layers=2, hidden_size=16, n_classes=8):
    """Build and compile a fully connected softmax classifier.

    The input width is taken from the notebook-level ``input_shape`` variable.

    Parameters
    ----------
    optimizer : str
        Optimizer identifier passed to ``model.compile``.
    activation : str
        Activation used by every hidden layer. ``'relu'`` is paired with the
        ``'he_normal'`` initializer, anything else with ``'glorot_normal'``.
    hidden_layers : int
        Number of hidden Dense layers.
    hidden_size : int
        Number of units in each hidden layer.
    n_classes : int
        Number of units in the softmax output layer (default 8, the value
        that was previously hard-coded).

    Returns
    -------
    keras.Sequential
        Model compiled with categorical cross-entropy loss and the accuracy metric.
    """
    model = keras.Sequential()

    # Input layer: batch-normalise the raw features.
    model.add(layers.BatchNormalization(input_shape=input_shape))

    # Choose the weight initializer according to the activation.
    if activation == 'relu':
        initializer = 'he_normal'
    else:
        initializer = 'glorot_normal'

    # Hidden layers.
    for _ in range(hidden_layers):
        model.add(layers.Dense(hidden_size, activation=activation, kernel_initializer=initializer))

    # Output layer: one probability per class.
    model.add(layers.Dense(n_classes, activation='softmax'))

    model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return model

# scikit-learn compatible wrapper used as the estimator for GridSearchCV
modelCV = KerasClassifier(build_fn=create_model, verbose=0)

In [None]:
# Grid of hyperparameter values; every combination is evaluated.
param_grid = {
    'hidden_layers': [1, 2],
    'hidden_size': [32, 64, 128],
    'activation': ['relu', 'tanh'],
    'optimizer': ['rmsprop', 'sgd', 'adam'],
    'batch_size': [10**4],
    'epochs': [10],
}

# Exhaustive search scored by accuracy (cross-validation settings left at their defaults).
grid = GridSearchCV(
    estimator=modelCV,
    param_grid=param_grid,
    scoring='accuracy',
)
grid_result = grid.fit(X_train, y_train)

In [None]:
# Report the best cross-validated score, then display the winning parameters.
best_accuracy = grid_result.best_score_
print('Best accuracy:', best_accuracy)
grid_result.best_params_

#### Entrenamiento de red

In [None]:
def _one_hot(labels, n_classes=8):
    """Return `labels` one-hot encoded with `n_classes` columns.

    Input that already is a 2-D array with `n_classes` columns is returned
    unchanged, so re-running this cell does not encode the targets a second
    time (which would produce rank-3 arrays).
    """
    labels = np.asarray(labels)
    if labels.ndim == 2 and labels.shape[1] == n_classes:
        return labels
    return keras.utils.to_categorical(labels, n_classes)

# One-hot encode the integer class labels for the categorical cross-entropy loss.
y_train = _one_hot(y_train)
y_valid = _one_hot(y_valid)
y_test = _one_hot(y_test)

In [None]:
# Network: normalised input, two (Dense tanh -> BatchNormalization) stages,
# and an 8-unit softmax output.
model = keras.Sequential()
model.add(layers.BatchNormalization(input_shape=input_shape))

for _ in range(2):
    model.add(layers.Dense(128, activation='tanh', kernel_initializer='glorot_normal'))
    model.add(layers.BatchNormalization())

model.add(layers.Dense(8, activation='softmax'))

# Compile with RMSprop, categorical cross-entropy and categorical accuracy.
model.compile(
    optimizer='rmsprop',
    loss=keras.losses.CategoricalCrossentropy(),
    metrics=[keras.metrics.CategoricalAccuracy()],
)

In [None]:
# Early stopping: give up after 20 epochs without an improvement of at least
# 0.01 in the monitored quantity (Keras default), then restore the best weights.
early_stopping = keras.callbacks.EarlyStopping(
    min_delta=0.01,
    patience=20,
    restore_best_weights=True,
)

# Training options kept together so they are easy to tweak.
fit_options = dict(
    validation_data=(X_valid, y_valid),
    batch_size=10**4,
    epochs=100,
    callbacks=[early_stopping],
    verbose=0,
)

# Train silently; the per-epoch metrics are kept in `history`.
history = model.fit(X_train, y_train, **fit_options)

#### Guardar modelo

In [None]:
# Persist the trained network to "model2.h5" (path relative to the working directory).
model.save("model2.h5")

#### Cargar modelo

In [None]:
# Reload the saved network through the same `tensorflow.keras` API that built
# and saved it (`keras` is imported at the top of the notebook), instead of
# importing `load_model` from the standalone `keras` package mid-notebook.
model = keras.models.load_model("model2.h5")

#### Respuesta predicha

In [None]:
# Raw per-class scores produced by the network for every test row.
y_pred_continuous = model.predict(X_test)

# Turn the scores into one-hot predictions: 1 at the highest-scoring class, 0 elsewhere.
top_class = y_pred_continuous.argmax(axis=1)
y_pred = np.zeros_like(y_pred_continuous)
np.put_along_axis(y_pred, top_class[:, np.newaxis], 1, axis=1)

# Accuracy on the test set
accuracy_score(y_test, y_pred)

#### Matriz de confusión

In [None]:
# Number of observations in each heatmap cell, formatted as '(n=<count>)'.

# Integer class index of every observed / predicted one-hot row.
y_test_num = list(y_test.argmax(axis=1))
y_pred_num = list(y_pred.argmax(axis=1))

# (observed, predicted) pair per test row.
y_tup = list(zip(y_test_num, y_pred_num))

# Count all pairs in a single pass instead of rescanning the whole list for
# each of the 64 cells; fixed labels guarantee an 8x8 result.
pair_counts = confusion_matrix(y_test_num, y_pred_num, labels=list(range(8)))

# n[i][j] is the annotation for observed zone i / predicted zone j.
n = [['(n=' + str(int(count)) + ')' for count in row] for row in pair_counts]

In [None]:
# Row-normalised confusion matrix (each observed zone sums to 100%).
# `labels` pins the matrix to 8x8 even if some zone never appears in the test
# data, so it always matches the 8x8 annotations in `n` and the tick labels.
zone_labels = list(range(8))
cf_matrix = confusion_matrix(
    y_test.argmax(axis=1), y_pred.argmax(axis=1),
    labels=zone_labels, normalize='true',
)

fig, ax = plt.subplots(figsize=(12,9))

# Two heatmaps on the same axes so each cell carries two annotations:
# the percentage (anchored at the bottom) and the raw count (anchored at the top).
sns.heatmap(cf_matrix, annot=cf_matrix, annot_kws={'size': 18, 'va': 'bottom'}, fmt='.0%', cmap='Blues', vmin=0, vmax=1, cbar=False, ax=ax)
sns.heatmap(cf_matrix, annot=n, annot_kws={'size': 18, 'va': 'top'}, fmt="", cmap='Blues', vmin=0, vmax=1, cbar=False, ax=ax)

ax.tick_params(axis='y', labelrotation=0)

ax.set_xlabel('Zona predicha')
ax.set_ylabel('Zona observada')

# Show zones with 1-based numbering (labels were shifted to 0-based earlier).
ax.set_xticklabels(range(1, 9, 1))
ax.set_yticklabels([str(x)+' ' for x in range(1, 9, 1)]);