In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn.metrics as metrics
import matplotlib.ticker as mtick
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.calibration import calibration_curve, CalibrationDisplay
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler

import pickle

sns.set()

import notebook_util as n_util
from notebook_util import *

In [2]:
# Import comet_ml at the top of your file
from comet_ml import Experiment

# Create an experiment with your api key
experiment = Experiment(
    api_key="UGYDiy3HENiE7Y3dqoMAVIgG2",
    project_name="custom-models",
    workspace="ift6758a-a22-g3-projet",
)

# Report multiple hyperparameters using a dictionary:
hyper_params = {
    "learning_rate": 0.0003,
    "batch_size": 100,
    "num_epochs": 25,
    "momentum": 0.5,
}
experiment.log_parameters(hyper_params)


COMET INFO: Experiment is live on comet.com https://www.comet.com/ift6758a-a22-g3-projet/custom-models/d4e108382dd04f3dbce68a425a4c6486



Prep Data

In [3]:
df_filtered = prep_data()

df_filtered = prep_dummie(df_filtered)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['emptyNet'] = df_filtered['emptyNet'].fillna(0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['strength'] = df_filtered['strength'].fillna('Even')


In [4]:
train, test = train_test_split(df_filtered, test_size=0.33, random_state=42)
print(train.shape)
print(test.shape)

(206814, 36)
(101864, 36)


In [5]:
x_train = train[n_util.feature]#.to_numpy().reshape(-1, columns_count)

y_train = train['isGoal']#.to_numpy()

x_train, y_train = RandomOverSampler().fit_resample(x_train, y_train)

x_test = test[n_util.feature]#.to_numpy().reshape(-1, columns_count)

y_test = test['isGoal']#.to_numpy()

In [6]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class Net(nn.Module):
    
    def __init__(self):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(35, 512)
        self.fc2 = nn.Linear(512, 256)
        self.fc3 = nn.Linear(256, 1)
        
    def forward(self, x):
        x = self.fc1(x)
        x = F.dropout(x, p=0.1)
        x = F.relu(x)

        x = self.fc2(x)
        x = F.dropout(x, p=0.1)
        x = F.relu(x)

        x = self.fc3(x)
        x = F.sigmoid(x)
        
        return x
    
net = Net()

In [7]:
batch_size = hyper_params["batch_size"]
num_epochs = hyper_params["num_epochs"]
learning_rate = hyper_params["learning_rate"]
momentum = hyper_params["momentum"]
batch_no = len(x_train) // batch_size

criterion = nn.BCELoss()
optimizer = torch.optim.SGD(net.parameters(), lr=learning_rate, momentum=momentum)

In [8]:
x_train_t = torch.tensor(x_train.to_numpy(dtype=np.float32), dtype = torch.float32)
y_train_t = torch.tensor(y_train.values, dtype = torch.float32)

x_test_t = torch.tensor(x_test.to_numpy(dtype=np.float32), dtype = torch.float32)
y_test_t = torch.tensor(y_test.values, dtype = torch.float32)

In [9]:
from sklearn.utils import shuffle
from torch.autograd import Variable

for epoch in range(num_epochs):
    if epoch % 5 == 0:
        print('Epoch {}'.format(epoch+1))
    x_train_t, y_train_t = shuffle(x_train_t, y_train_t)
    # Mini batch learning
    loss_sum = 0
    for i in range(batch_no):
        start = i * batch_size
        end = start + batch_size
        x_var = Variable(torch.FloatTensor(x_train_t[start:end]))
        y_var = Variable(torch.FloatTensor(y_train_t[start:end]))
        # Forward + Backward + Optimize
        optimizer.zero_grad()
        ypred_var = net(x_var)
        loss =criterion(ypred_var, y_var[:,None])
        loss.backward()
        loss_sum = loss_sum + loss
        #print(loss)
        optimizer.step()
    print('Loss Sum: ', loss_sum/batch_no)

Epoch 1




Loss Sum:  tensor(3.3487, grad_fn=<DivBackward0>)
Loss Sum:  tensor(0.6904, grad_fn=<DivBackward0>)
Loss Sum:  tensor(0.6888, grad_fn=<DivBackward0>)
Loss Sum:  tensor(0.6872, grad_fn=<DivBackward0>)
Loss Sum:  tensor(0.6853, grad_fn=<DivBackward0>)
Epoch 6
Loss Sum:  tensor(0.6830, grad_fn=<DivBackward0>)
Loss Sum:  tensor(0.6807, grad_fn=<DivBackward0>)
Loss Sum:  tensor(0.6781, grad_fn=<DivBackward0>)
Loss Sum:  tensor(0.6763, grad_fn=<DivBackward0>)
Loss Sum:  tensor(0.6737, grad_fn=<DivBackward0>)
Epoch 11
Loss Sum:  tensor(0.6709, grad_fn=<DivBackward0>)
Loss Sum:  tensor(0.6672, grad_fn=<DivBackward0>)
Loss Sum:  tensor(0.6641, grad_fn=<DivBackward0>)
Loss Sum:  tensor(0.6621, grad_fn=<DivBackward0>)
Loss Sum:  tensor(0.6601, grad_fn=<DivBackward0>)
Epoch 16
Loss Sum:  tensor(0.6584, grad_fn=<DivBackward0>)
Loss Sum:  tensor(0.6567, grad_fn=<DivBackward0>)
Loss Sum:  tensor(0.6554, grad_fn=<DivBackward0>)
Loss Sum:  tensor(0.6536, grad_fn=<DivBackward0>)
Loss Sum:  tensor(0.6526

In [10]:
# Evaluate the model
test_var = Variable(torch.FloatTensor(x_test_t), requires_grad=True)
with torch.no_grad():
    result = net(test_var)
values = torch.round(result[:, 0])

num_right = np.sum(values.data.numpy().astype(int) == y_test)
print('Num Right', num_right)
accuracy = num_right / len(y_test_t)
print('Accuracy {:.2f}'.format(accuracy))

Num Right 59436
Accuracy 0.58


In [11]:
from torchmetrics import ConfusionMatrix
from torchmetrics.classification import BinaryF1Score

target_m = torch.tensor(y_test.to_numpy()).to(torch.int)
pred_m = torch.tensor(values).to(torch.int)

confmat = ConfusionMatrix(num_classes=2)
confmat(target_m, pred_m)

  pred_m = torch.tensor(values).to(torch.int)


tensor([[52970,  2744],
        [39684,  6466]])

In [12]:
metric = BinaryF1Score()
print('F1 Pytorch')
f1 = metric(result[:, 0], target_m).item()
print(f1)
from sklearn.metrics import f1_score
f1 = f1_score(target_m.numpy(), values.numpy().astype(int), average='macro')
print('F1 Macro')
print(f1)

F1 Pytorch
0.2335982620716095
F1 Macro
0.47381681870231745


In [13]:
pickle.dump(net, open("../models/MLP2.sav", 'wb'))
experiment.log_model("MLP2", "../models/MLP2.sav")
experiment.log_metric("f1", f1)
experiment.log_metric("accuracy", accuracy)
experiment.end()

COMET INFO: ---------------------------
COMET INFO: Comet.ml Experiment Summary
COMET INFO: ---------------------------
COMET INFO:   Data:
COMET INFO:     display_summary_level : 1
COMET INFO:     url                   : https://www.comet.com/ift6758a-a22-g3-projet/custom-models/d4e108382dd04f3dbce68a425a4c6486
COMET INFO:   Metrics [count] (min, max):
COMET INFO:     accuracy    : 0.5834838608340532
COMET INFO:     f1          : 0.47381681870231745
COMET INFO:     loss [9388] : (0.5503088235855103, 58.038204193115234)
COMET INFO:   Parameters:
COMET INFO:     batch_size    : 100
COMET INFO:     learning_rate : 0.0003
COMET INFO:     momentum      : 0.5
COMET INFO:     num_epochs    : 25
COMET INFO:   Uploads:
COMET INFO:     conda-environment-definition : 1
COMET INFO:     conda-info                   : 1
COMET INFO:     conda-specification          : 1
COMET INFO:     environment details          : 1
COMET INFO:     filename                     : 1
COMET INFO:     git metadata      