In [None]:
import os
from collections import Counter

# The star import comes first so the explicit imports below always win on any name clash.
from fastai.tabular.all import *
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sqlalchemy import create_engine
from sqlalchemy.sql import text

In [None]:
# Pick the Metal (MPS) backend when PyTorch reports it as available, otherwise the CPU.
mps_ready = torch.backends.mps.is_available()
selected_device = torch.device("mps" if mps_ready else "cpu")
print("Using Apple Silicon" if mps_ready else "MPS not available, using CPU")

In [None]:
# The connection URL can be supplied through the SYAS_DB_URL environment variable so that
# real credentials never have to be written into the notebook. The previous
# local-development URL is kept as the fallback, so existing setups keep working.
db_url = os.environ.get(
    "SYAS_DB_URL",
    "postgresql+psycopg2://admin:admin@localhost:5432/SYAS",
)
engine = create_engine(db_url)
sql_query = text("SELECT * FROM matches_values")
# The context manager returns the connection to the pool as soon as the frame is loaded.
with engine.connect() as conn:
    df = pd.read_sql(sql_query, conn)
print(df.head())

In [None]:
# Expand every ';'-separated "*list" column into 0/1 indicator columns named
# "<column>_<value>". Spaces are stripped first and empty strings become NaN, so rows
# without any value get all-zero indicators.
list_cols = [col for col in df.columns if col.endswith('list')]
dummy_frames = []
for col in list_cols:
    cleaned = df[col].str.replace(' ', '', regex=False).replace('', np.nan)
    dummy_frames.append(
        cleaned.str.get_dummies(sep=';').astype(int).add_prefix(col + '_')
    )
    print(col)
# Concatenate once instead of once per column: this avoids re-copying the whole frame on
# every iteration while producing the same column order (original columns, then dummies).
df = pd.concat([df.drop(columns=list_cols), *dummy_frames], axis=1)



In [None]:
# All numeric columns are treated as continuous inputs. The subset whose name does not
# contain 'list' (i.e. not one of the indicator columns derived from a "*list" column)
# is kept separately in `not_one_hot`.
numeric_frame = df.select_dtypes(include='number')
cont_names = list(numeric_frame.columns)
not_one_hot = [name for name in cont_names if 'list' not in name]
not_one_hot

In [None]:
dep_var = 'match_status'
# Guard against target leakage: `cont_names` was built from every numeric column, so a
# numeric target would otherwise be fed to the model as an input feature (and be listed
# for normalisation). These filters are no-ops when the target is not numeric.
cont_names = [name for name in cont_names if name != dep_var]
not_one_hot = [name for name in not_one_hot if name != dep_var]
# Everything that is neither continuous nor the target is treated as categorical.
non_categorical = set(cont_names) | {dep_var}
cat_names = [col for col in df.columns if col not in non_categorical]
cat_names

In [None]:
class SelectiveNormalize(TabularProc):
    """Standardise only the continuous columns listed in `norm_cols`.

    Statistics (mean and population standard deviation) are computed on the training
    split in `setups`; every other continuous column is passed through unchanged.
    """
    order = 20 # Run after FillMissing

    def __init__(self, norm_cols):
        # Names of the continuous columns to standardise.
        self.norm_cols = norm_cols

    def setups(self, to:Tabular):
        """Record per-column mean and std of `norm_cols` from the training split."""
        self.means = to.train[self.norm_cols].mean()
        # The small epsilon keeps constant columns from causing a division by zero.
        self.stds = to.train[self.norm_cols].std(ddof=0) + 1e-7

    def encodes(self, to:Tabular):
        """Replace `norm_cols` with (x - mean) / std and write the result back to `to`."""
        # `to.conts` hands back a newly selected DataFrame, so writing into it directly
        # (e.g. `to.conts.loc[...] = ...`) would be lost. Work on a copy and assign it
        # back through the `conts` setter, mirroring `decodes`.
        conts = to.conts.copy()
        # Whole-column assignment lets integer columns become float without dtype clashes.
        conts[self.norm_cols] = (conts[self.norm_cols] - self.means) / self.stds
        to.conts = conts
        return to

    def decodes(self, to:Tabular):
        """Undo `encodes`: map `norm_cols` back to their original scale."""
        conts = to.conts.copy()
        conts[self.norm_cols] = (conts[self.norm_cols] * self.stds) + self.means
        to.conts = conts
        return to

In [None]:
procs = [Categorify, FillMissing, SelectiveNormalize(not_one_hot)]
# `TabularDataLoaders.from_df` has no `valid_pct` / `seed` parameters: it always builds a
# default, unseeded `RandomSplitter()` unless `valid_idx` is given. Build the 80/20 split
# explicitly so the requested fraction and seed are actually honoured and the validation
# set is reproducible across runs.
train_idx, valid_idx = RandomSplitter(valid_pct=0.2, seed=42)(range_of(df))
dls = TabularDataLoaders.from_df(
    df,
    path='.',
    procs=procs,
    cat_names=cat_names,
    cont_names=cont_names,
    y_names=dep_var,
    valid_idx=list(valid_idx),
    device=selected_device,
    y_block=CategoryBlock
)

In [None]:
# Display the target vocabulary held by the DataLoaders (reused later as the
# confusion-matrix display labels).
dls.vocab

In [None]:
# Display a sample batch as a visual sanity check of the prepared data.
dls.show_batch()

In [None]:
# Inverse-frequency ("balanced") class weights computed from the training labels:
#   weight_c = n_samples / (n_classes * n_samples_in_class_c)
# The loop below indexes `counts` by 0..n_classes-1, i.e. it expects the stored labels to
# be integer class codes.
train_y = dls.train_ds.items[dep_var]
counts = Counter(train_y)
num_classes = len(counts)
total_samples = sum(counts.values())
weights = [total_samples / (num_classes * counts[i]) for i in range(num_classes)]
class_weights = torch.tensor(weights, dtype=torch.float32).to(dls.device)
# manual_weights = torch.tensor([25.0, 0.54], dtype=torch.float32).to(dls.device)
weighted_loss_func = nn.CrossEntropyLoss(weight=class_weights)
def squeezed_loss_func(preds, targs, **kwargs):
    """Class-weighted cross-entropy that accepts column-vector targets.

    preds:  raw model outputs, one row per sample.
    targs:  class indices; a trailing singleton dimension is flattened away.
    kwargs: forwarded to `torch.nn.functional.cross_entropy` (e.g. `reduction='none'`).
    """
    # `flatten()` (rather than `squeeze()`) keeps a batch of size 1 as a 1-element vector
    # instead of collapsing it to a 0-d tensor, which cross-entropy would reject.
    # Going through the functional form lets extra keyword arguments such as `reduction`
    # take effect; the weights are read from `weighted_loss_func` so both stay in sync.
    return nn.functional.cross_entropy(
        preds, targs.flatten(), weight=weighted_loss_func.weight, **kwargs
    )
print(f"Calculated Weights (for class 0, then 1): {class_weights}")

In [None]:
# Build the tabular learner with the class-weighted loss; F1 is reported with class index
# 0 as the positive label. Then run the learning-rate finder with the `valley` and `slide`
# suggestion functions and show what it proposes.
learn = tabular_learner(
    dls,
    loss_func=squeezed_loss_func,
    metrics=F1Score(pos_label=0),
)
suggestions = learn.lr_find(suggest_funcs=(valley, slide))

suggestions

In [None]:
# Train for 2 epochs with the one-cycle schedule. Both callbacks track the validation
# `f1_score`: one stops training after 2 epochs without improvement, the other saves the
# model when the monitored score improves.
# NOTE(review): lr_max is presumably taken from the lr_find suggestions -- confirm.
training_callbacks = [
    EarlyStoppingCallback(monitor='f1_score', patience=2),
    SaveModelCallback(monitor='f1_score'),
]
learn.fit_one_cycle(2, lr_max=0.001445, wd=.1, cbs=training_callbacks)

In [None]:
# Predict on the validation set (ds_idx=1) and plot the confusion matrix.
preds, targs = learn.get_preds(ds_idx=1)
predicted_classes = preds.argmax(dim=1)
# Pin the label set to every class index in the vocabulary: without `labels=` the matrix
# only covers classes that occur in the targets or predictions, and would then no longer
# line up with `display_labels=dls.vocab`.
class_ids = list(range(len(dls.vocab)))
# Targets are flattened so scikit-learn receives a 1-D label vector.
cm = confusion_matrix(targs.flatten(), predicted_classes, labels=class_ids)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=dls.vocab)
disp.plot(cmap=plt.cm.Blues)
plt.show()