In [61]:
# Use with NHL_Model_Data_Transform_v5.py
import pandas as pd
import numpy as np
import seaborn as sns
import os
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras


rng = 69

In [63]:
#df = pd.read_csv(r"C:\Users\zchodan\OneDrive - Franklin Templeton\Documents\Python\NHL_data\NHL_Data_Over_Under_Transformed.csv")
df = pd.read_csv(r"C:\Users\zchodan\OneDrive - Franklin Templeton\Documents\Python\Kaggle\Insurance Premiums\Transformed.csv")

In [65]:
from sklearn.model_selection import train_test_split

# Split into 80% training data and 20% testing data

X = df.drop('Premium Amount', axis=1)
Y = df['Premium Amount'].copy()

X_train_initial, X_test, Y_train_initial, Y_test = train_test_split(X, Y, test_size=0.2, random_state=rng)

# Now split the training data into training and validation sets
X_train, X_val, Y_train, Y_val = train_test_split(X_train_initial, Y_train_initial, test_size=0.25, random_state=42)


Y_log_train = np.log1p(Y_train)
Y_log_val = np.log1p(Y_val)
Y_log_test = np.log1p(Y_test)


In [67]:
# Pipeline constructor used to run transformation steps in order
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

num_pipeline = Pipeline([
    ('std_scaler', StandardScaler()),
])

X_train_prepared= num_pipeline.fit_transform(X_train)
X_valid_prepared= num_pipeline.fit_transform(X_val)
X_test_prepared= num_pipeline.fit_transform(X_test)


In [69]:
from keras import backend as K
from sklearn.metrics import mean_squared_log_error

#Define a custom RMSLE metric
def rmsle(y_true, y_pred):  
    # To avoid log(0), clip values to a small positive constant (epsilon)
    epsilon = tf.keras.backend.epsilon()

    # Apply log transformation
    #y_true_log = tf.math.log1p(y_true)  # Log-transform the true values
    #y_pred_log = tf.math.log1p(y_pred)  # Log-transform the clipped predicted values
    
    # RMSLE formula
    return tf.sqrt(tf.reduce_mean(tf.square(y_true - y_pred)))

In [71]:
n_epoch = 5

In [73]:
def build_model(n_neurons=50, n_hidden=3, dropout_rate=0.4, learning_rate=1e-3, input_shape=[X_train.shape[1]]):
    model = keras.models.Sequential()
    model.add(keras.layers.InputLayer(shape=input_shape))
    # Hidden layer with droupout
    for layer in range(n_hidden):
        model.add(keras.layers.Dense(n_neurons,activation='relu'))
        model.add(keras.layers.Dropout(dropout_rate))
    # Output layer    
    model.add(keras.layers.Dense(1, activation='relu'))
    
    # Compile the model
    optimizer = keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer,loss='mean_squared_logarithmic_error',metrics=[rmsle])
    return model

In [75]:
from scikeras.wrappers import KerasRegressor

# Seems like you need to specify some items again when using the scikeras wrapper
keras_reg = KerasRegressor(model=build_model, n_hidden=1, n_neurons=30, learning_rate=3e-3)


In [77]:
from scipy.stats import reciprocal
from sklearn.model_selection import RandomizedSearchCV

param_distribs = {
    'model__n_hidden': [0,1,2,3,4,5],
    'model__dropout_rate': [0,.1,.2,.3,.4],
    'model__n_neurons': np.arange(10,60),
    'model__learning_rate' : reciprocal(1e-5, 1e-2)
}

rnd_search_cv = RandomizedSearchCV(keras_reg, param_distribs, n_iter=2, cv=2, scoring='neg_root_mean_squared_log_error',
                                   error_score = 'raise')

early_stopping_cb = keras.callbacks.EarlyStopping(patience = 5, restore_best_weights = True)
rnd_search_cv.fit(X_train_prepared,Y_log_train, epochs=n_epoch,
                 validation_data = (X_valid_prepared, Y_log_val),
                 callbacks=[early_stopping_cb])

## Loss = MSLE score for training data
## rmsle = Difference between true and predicted value on a log scale for training data
## val_loss = MLSE for validation data, lower than training loss indicates performing well
## val_rmsle = Difference between true and predicted value on a log scale for validation data

Epoch 1/5
[1m3217/3217[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 2ms/step - loss: 0.2402 - rmsle: 2.2208 - val_loss: 0.0439 - val_rmsle: 1.2531
Epoch 2/5
[1m3217/3217[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2ms/step - loss: 0.0565 - rmsle: 1.4684 - val_loss: 0.0439 - val_rmsle: 1.2484
Epoch 3/5
[1m3217/3217[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 3ms/step - loss: 0.0481 - rmsle: 1.3340 - val_loss: 0.0439 - val_rmsle: 1.2548
Epoch 4/5
[1m3217/3217[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 2ms/step - loss: 0.0461 - rmsle: 1.2859 - val_loss: 0.0439 - val_rmsle: 1.2526
Epoch 5/5
[1m3217/3217[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2ms/step - loss: 0.0442 - rmsle: 1.2586 - val_loss: 0.0439 - val_rmsle: 1.2474
[1m3217/3217[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 914us/step
Epoch 1/5
[1m3217/3217[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 3ms/step - loss: 0.1196 - rmsle: 1.9545 - val_loss: 0.043

In [79]:
print(rnd_search_cv.best_params_)
print(rnd_search_cv.best_score_)

{'model__dropout_rate': 0.2, 'model__learning_rate': 0.0026216795774944017, 'model__n_hidden': 5, 'model__n_neurons': 28}
-0.20950393528902916


In [29]:
# print(rnd_search_cv.best_params_)
# print(rnd_search_cv.best_score_)

{'model__dropout_rate': 0.2, 'model__learning_rate': 0.00013539892924516292, 'model__n_hidden': 3, 'model__n_neurons': 20}
-1.2623520857805182


In [81]:
best_model = rnd_search_cv.best_estimator_.model

In [83]:
from scikeras.wrappers import KerasRegressor

# Seems like you need to specify some items again when using the scikeras wrapper
keras_reg = KerasRegressor(model=best_model, epochs=n_epoch, batch_size=5, verbose=1, random_state=42,
                           loss='mean_squared_logarithmic_error',optimizer=keras.optimizers.Adam(),metrics=[rmsle])

# Stop early if model is not getting better after # of patience epochs, restore to best model
early_stopping_cb = keras.callbacks.EarlyStopping(patience = 10, restore_best_weights = True)
#checkpoint_cb = keras.callbacks.ModelCheckpoint("Regression_Sequential.keras", save_best_only = True)

keras_reg.fit(X_train_prepared, Y_log_train, validation_data = (X_valid_prepared, Y_log_valid),
             callbacks=[early_stopping_cb])


Epoch 1/5
[1m41171/41171[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m64s[0m 2ms/step - loss: 0.0812 - rmsle: 1.6245 - val_loss: 0.0440 - val_rmsle: 1.2046
Epoch 2/5
[1m41171/41171[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m74s[0m 2ms/step - loss: 0.0457 - rmsle: 1.2210 - val_loss: 0.0439 - val_rmsle: 1.1950
Epoch 3/5
[1m41171/41171[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m59s[0m 1ms/step - loss: 0.0446 - rmsle: 1.1978 - val_loss: 0.0439 - val_rmsle: 1.1837
Epoch 4/5
[1m41171/41171[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 1ms/step - loss: 0.0444 - rmsle: 1.1915 - val_loss: 0.0439 - val_rmsle: 1.1848
Epoch 5/5
[1m41171/41171[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m63s[0m 2ms/step - loss: 0.0444 - rmsle: 1.1905 - val_loss: 0.0439 - val_rmsle: 1.1847


In [85]:
from sklearn.model_selection import cross_val_score

# Assuming X and y are your feature matrix and target vector
scores = cross_val_score(keras_reg, X_train_prepared, Y_train, cv=5, scoring='neg_root_mean_squared_log_error',verbose=0)

# Convert scores to positive values
rmlse_scores = -scores
print(f'RMLSE Scores: {rmlse_scores}')
print(f'Mean RMLSE: {np.mean(rmlse_scores)}')

Epoch 1/5
[1m32937/32937[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 958us/step - loss: 2.4158 - rmsle: 921.9337
Epoch 2/5
[1m32937/32937[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 939us/step - loss: 1.6220 - rmsle: 891.2092
Epoch 3/5
[1m32937/32937[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 1ms/step - loss: 1.6133 - rmsle: 889.5029
Epoch 4/5
[1m32937/32937[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 1ms/step - loss: 1.6068 - rmsle: 889.0779
Epoch 5/5
[1m32937/32937[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 1ms/step - loss: 1.6037 - rmsle: 888.1721
[1m8235/8235[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 800us/step
Epoch 1/5
[1m32937/32937[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 1ms/step - loss: 2.4231 - rmsle: 919.9672
Epoch 2/5
[1m31834/32937[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m1s[0m 1ms/step - loss: 1.6282 - rmsle: 889.5683

KeyboardInterrupt: 

In [48]:
X_train_pred = keras_reg.predict(X_train_prepared)
X_test_pred = keras_reg.predict(X_test_prepared)

# Evaluate the model
from sklearn.metrics import mean_squared_error, mean_absolute_error

mse = mean_squared_error(Y_train, X_train_pred)
mae = mean_absolute_error(Y_train, X_train_pred)

print(f'Train MSE: {mse}') # Sensitive to outliers
print(f'Train MAE: {mae}')

mse = mean_squared_error(Y_test, X_test_pred)
mae = mean_absolute_error(Y_test, X_test_pred)

print(f'Test MSE: {mse}') # Sensitive to outliers
print(f'Test MAE: {mae}')

Train MSE: 5.344163711089202
Train MAE: 1.8470409416088096
Test MSE: 5.315119719057888
Test MAE: 1.879280097918077


In [49]:
#df_Predict = pd.read_csv(r"C:\Users\zchodan\OneDrive - Franklin Templeton\Documents\Python\NHL_data\NHL_Data_Over_Under_Predict.csv")
df_Predict = pd.read_csv(r"C:\Users\zanec\OneDrive\Documents\Python\NHL_data\NHL_Data_Over_Under_Predict.csv")

In [50]:


y_pred = np.expm1(model.predict(X_test))  # Inverse of log1p (to get the original scale)

NameError: name 'keras_clf' is not defined