<a href="https://colab.research.google.com/github/asvskartheek/CS-F320/blob/master/training_notebooks/Training_Zivy_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
! pip install -q datasets setfit optuna

In [None]:
import pandas as pd
from datasets import load_dataset, Dataset
from setfit import SetFitModel, TrainingArguments, Trainer, SetFitTrainer
from sklearn.model_selection import train_test_split
import torch
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, roc_curve, auc
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

In [None]:
# Notebook-wide configuration constants.
SEED = 0          # random seed
TEST_SIZE = 500   # number of rows in the held-out test split
# Use the GPU when torch can see one, otherwise fall back to the CPU.
DEVICE = 'cpu' if not torch.cuda.is_available() else 'cuda'

In [None]:
# Load the labelled messages (export noted as 18 Apr, 4:19pm - 3203 rows), drop the rows
# whose category is NaN (messages skipped during labelling) and keep only the used columns.
df = (
    pd.read_csv("/content/drive/MyDrive/Zivy Files/labelled_17_apr.csv")
    .dropna(subset=['category'], how='all')
    .loc[:, ['body', 'category', 'channel_name', 'previous_messages', 'future_messages']]
)
df.head()

In [None]:
# Class balance of the full labelled dataset; the title keeps the chart readable on its own.
df['category'].value_counts().sort_index().plot(
    kind='bar', rot=0, ylabel='count', title='All labelled messages: category'
);

In [None]:
# Stage 1 data preparation: binary target, 1 for 'FYI' / 'Action Items' and 0 otherwise.
df_stage1 = df.copy()
df_stage1['important'] = df_stage1['category'].isin(['FYI', 'Action Items']).astype(int)

# Hold out TEST_SIZE rows (stratified on category) as the test split, then take 10% of
# the remainder (stratified on the binary target) as the Stage 1 validation split.
# TEST_SIZE / SEED come from the configuration cell instead of repeating the literals.
df_train, df_s1_test = train_test_split(df_stage1, test_size=TEST_SIZE, stratify=df_stage1['category'], random_state=SEED)
df_s1_train, df_s1_val = train_test_split(df_train, test_size=0.1, stratify=df_train['important'], random_state=SEED)

# Stage 2 data preparation: only the important messages from the training pool.
# .copy() detaches the frame from df_train so later column writes do not target a slice.
df_stage2 = df_train[df_train['category'].isin(['FYI', 'Action Items'])].copy()
df_s2_train, df_s2_val = train_test_split(df_stage2, test_size=0.1, stratify=df_stage2['category'], random_state=SEED)

len(df_s1_train), len(df_s1_val), len(df_s1_test), len(df_s2_train), len(df_s2_val) # S1 - train, val & test, S2 - train, val

## Stage 1

In [None]:
# Category balance of the Stage 1 training split.
df_s1_train['category'].value_counts().sort_index().plot(
    kind='bar', rot=0, ylabel='count', title='Stage 1 train split: category'
);

In [None]:
# Binary target balance (1 = important, 0 = not important) of the Stage 1 training split.
df_s1_train['important'].value_counts().sort_index().plot(
    kind='bar', rot=0, ylabel='count', title='Stage 1 train split: important flag'
);

In [None]:
# Binary target balance (1 = important, 0 = not important) of the held-out test split.
df_s1_test['important'].value_counts().sort_index().plot(
    kind='bar', rot=0, ylabel='count', title='Test split: important flag'
);

In [None]:
# Category balance of the held-out test split.
df_s1_test['category'].value_counts().sort_index().plot(
    kind='bar', rot=0, ylabel='count', title='Test split: category'
);

In [None]:
# Binary target balance of the held-out test split.
# NOTE(review): this repeats an earlier cell that plots the same column of df_s1_test;
# confirm whether a different split (e.g. validation) was intended here.
df_s1_test['important'].value_counts().sort_index().plot(
    kind='bar', rot=0, ylabel='count', title='Test split: important flag'
);

In [None]:
# Wrap each Stage 1 split in a `datasets.Dataset`, dropping the pandas index.
s1_ds = {
    split_name: Dataset.from_pandas(split_df, preserve_index=False)
    for split_name, split_df in (
        ('train', df_s1_train),
        ('val', df_s1_val),
        ('test', df_s1_test),
    )
}

In [None]:
# Stage 1: binary "important vs. not important" SetFit classifier.
# `s1_ds` is built in the preceding cell, so it is not rebuilt here.

# `labels` is indexed by the integer class id. The `important` target is 1 for important
# messages and 0 otherwise, so index 0 must be "Not Important" and index 1 "Important".
s1_model = SetFitModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2",
                                       labels=["Not Important", "Important"],
                                       device=DEVICE)
s1_args = TrainingArguments(
    num_iterations=20, # pair-generation iterations for the contrastive stage (see SetFit docs)
    seed=SEED,
    eval_steps=500,
    load_best_model_at_end=True,
    num_epochs=(2, 16), # (embedding stage, classification stage); candidates for the embedding stage: 2, 4, 8
    batch_size=(32, 32) # emb batch size, classification stage batch size.
)
s1_trainer = Trainer(
    model=s1_model,
    args=s1_args,
    train_dataset=s1_ds['train'],
    eval_dataset=s1_ds['val'],
    column_mapping={'body': 'text', 'important': 'label'}
)
s1_trainer.train()

In [None]:
# Load trained model
import joblib

# Replace whatever `s1_model` is in memory with the copy stored on Drive.
s1_model = joblib.load('/content/drive/MyDrive/Zivy Files/s1_model_24_apr.model')

# Rebuild the Stage 1 splits as Dataset objects (pandas index dropped).
s1_ds = dict(
    train=Dataset.from_pandas(df_s1_train, preserve_index=False),
    val=Dataset.from_pandas(df_s1_val, preserve_index=False),
    test=Dataset.from_pandas(df_s1_test, preserve_index=False),
)

In [None]:
# On validation split
import plotly.express as px

# Column 1 holds the score for class id 1, i.e. `important == 1`.
prob_preds = s1_model.predict_proba(s1_ds['val']['body'])[:,1]
true_labels = s1_ds['val']['important']

fpr, tpr, thresholds = roc_curve(true_labels, prob_preds)

# The histogram of scores compared to true labels
fig_hist = px.histogram(
    x=prob_preds, color=true_labels, nbins=50,
    labels=dict(color='True Labels', x='Score')
)

fig_hist.show()


# Evaluating model performance at various thresholds.
# Stored under its own name: binding this frame to `df` would overwrite the labelled
# dataset and break any re-run of the data-preparation cells.
s1_roc_df = pd.DataFrame({
    'False Positive Rate': fpr,
    'True Positive Rate': tpr
}, index=thresholds)
s1_roc_df.index.name = "Thresholds"
s1_roc_df.columns.name = "Rate"

fig_thresh = px.line(
    s1_roc_df, title='TPR and FPR at every threshold',
    width=700, height=500
)

fig_thresh.update_yaxes(scaleanchor="x", scaleratio=1)
fig_thresh.update_xaxes(range=[0, 1], constrain='domain')
fig_thresh.show()

In [None]:
# On test split
s1_threshold = 0.1875164  # threshold chosen from ROC

# Ground truth and class-id-1 scores for the held-out test bodies.
true_labels = s1_ds['test']['important']
prob_preds = s1_model.predict_proba(s1_ds['test']['body'])[:,1]
# A score at or above the threshold is predicted as 1.
binary_preds = (prob_preds.numpy() >= s1_threshold).astype(int)

report = classification_report(true_labels, binary_preds)
cf_matrix = confusion_matrix(true_labels, binary_preds)

print(f"\n--- CLASSIFICATION REPORT ---\n{report}")
print("\n --- CONFUSION MATRIX ---\n")
disp = ConfusionMatrixDisplay(
    confusion_matrix=cf_matrix,
    display_labels=["Not Important", "Important"],
)
disp.plot()

#### Hyper-parameter Search

In [None]:
def hyperparameter_search_s1_function(trial):
    """Return the Stage 1 hyper-parameter choices for one search trial.

    Only ``num_epochs`` is searched: `trial.suggest_categorical` picks it
    from 2, 4 or 8, and the pick is returned under the key "num_epochs".
    """
    epoch_choices = [2, 4, 8]
    suggested_epochs = trial.suggest_categorical("num_epochs", epoch_choices)
    return {"num_epochs": suggested_epochs}

In [None]:
def make_model(params=None):
    """Build a fresh, untrained Stage 1 SetFit model.

    `params` is accepted so the function can be used as a model-init callback,
    but it is not used: every call returns the same configuration.

    `labels` is indexed by integer class id. The Stage 1 target is 1 for
    important messages and 0 otherwise, so "Not Important" comes first.
    """
    return SetFitModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2",
                                       labels=["Not Important", "Important"],
                                       device=DEVICE)

In [None]:
# Fresh Stage 1 model and trainer with a fixed epoch count of 8.
# `labels` is indexed by integer class id: the `important` target is 1 for important
# messages and 0 otherwise, so "Not Important" must come first.
s1_model = SetFitModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2",
                                       labels=["Not Important", "Important"],
                                       device=DEVICE)
s1_args = TrainingArguments(
    num_iterations=20, # pair-generation iterations for the contrastive stage (see SetFit docs)
    seed=SEED,
    eval_steps=100,
    load_best_model_at_end=True,
    num_epochs=8,
    batch_size=(32, 32) # emb batch size, classification stage batch size.
)
s1_trainer = Trainer(
    model=s1_model,
    args=s1_args,
    train_dataset=s1_ds['train'],
    eval_dataset=s1_ds['val'],
    column_mapping={'body': 'text', 'important': 'label'}
)

In [None]:
# Run training with the `s1_trainer` configured in the previous cell.
s1_trainer.train()

In [None]:
from optuna.visualization.matplotlib import plot_param_importances

# `best` was referenced here without ever being defined, so this cell raised NameError.
# Run the search that `make_model` and `hyperparameter_search_s1_function` were written
# for; the search needs a trainer built with `model_init` rather than a fixed model.
s1_search_trainer = Trainer(
    model_init=make_model,
    args=s1_args,
    train_dataset=s1_ds['train'],
    eval_dataset=s1_ds['val'],
    column_mapping={'body': 'text', 'important': 'label'}
)
best = s1_search_trainer.hyperparameter_search(
    direction="maximize",
    hp_space=hyperparameter_search_s1_function,
    n_trials=5,  # the space has three epoch choices; raise or lower to taste
)

# `best.backend` is the underlying Optuna study.
plot_param_importances(best.backend);

## Stage 2

In [None]:
# Stage 2: "Action Items" vs. "FYI" classifier, trained on important messages only.
s2_ds = {
    'train': Dataset.from_pandas(df_s2_train, preserve_index=False),
    'val': Dataset.from_pandas(df_s2_val, preserve_index=False)
}

# The `category` strings are used as the label column; `labels` fixes their order
# (index 0 = "Action Items", index 1 = "FYI").
s2_model = SetFitModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2",
                                       labels=["Action Items", "FYI"],
                                       device=DEVICE)
s2_args = TrainingArguments(
    num_iterations=20,
    seed=SEED,  # shared constant from the configuration cell instead of a repeated literal
    evaluation_strategy='steps',
    eval_steps=500,
    load_best_model_at_end=True,
    num_epochs=(2, 16),
    batch_size=(32, 32) # emb batch size, classification stage batch size.
)
s2_trainer = Trainer(
    model=s2_model,
    args=s2_args,
    train_dataset=s2_ds['train'],
    eval_dataset=s2_ds['val'],
    column_mapping={'body': 'text', 'category': 'label'}
)

s2_trainer.train()

In [None]:
import joblib

# Replace whatever `s2_model` is in memory with the copy stored on Drive.
s2_model = joblib.load('/content/drive/MyDrive/Zivy Files/s2_model_24_apr.model')

# Rebuild the Stage 2 splits as Dataset objects (pandas index dropped).
s2_ds = dict(
    train=Dataset.from_pandas(df_s2_train, preserve_index=False),
    val=Dataset.from_pandas(df_s2_val, preserve_index=False),
)

In [None]:
# Stage 2 scores on the validation bodies; column 1 is kept.
# NOTE(review): presumably the "FYI" score given labels=["Action Items", "FYI"] - confirm.
prob_preds = s2_model.predict_proba(s2_ds['val']['body'])[:,1]

In [None]:
# Store the scores as a plain numpy array so pandas builds a numeric column (other cells
# call `.numpy()` on these predictions, i.e. they are torch tensors). `.assign` returns a
# new frame, so the write never targets a slice and re-running the cell is safe.
df_s2_val = df_s2_val.assign(prob_pred=torch.as_tensor(prob_preds).detach().cpu().numpy())

In [None]:
# Validation rows labelled "Action Items" whose stored score exceeds 0.8.
df_s2_val.loc[lambda frame: (frame['category'] == "Action Items") & (frame['prob_pred'] > 0.8)]

In [None]:
# Imported here as well so the cell does not depend on an earlier cell having run.
import plotly.express as px

# Column 1 of the Stage 2 probabilities; under the mapping below class id 1 is 'FYI'.
prob_preds = s2_model.predict_proba(s2_ds['val']['body'])[:,1]
true_labels = s2_ds['val']['category']
mapping = {'Action Items': 0, 'FYI': 1}
mapped_list = [mapping[item] for item in true_labels]

fpr, tpr, thresholds = roc_curve(mapped_list, prob_preds)

# The histogram of scores compared to true labels
fig_hist = px.histogram(
    x=prob_preds, color=mapped_list, nbins=50,
    labels=dict(color='True Labels', x='Score')
)

fig_hist.show()


# Evaluating model performance at various thresholds.
# Stored under its own name: binding this frame to `df` would overwrite the labelled
# dataset and break any re-run of the data-preparation cells.
s2_roc_df = pd.DataFrame({
    'False Positive Rate': fpr,
    'True Positive Rate': tpr
}, index=thresholds)
s2_roc_df.index.name = "Thresholds"
s2_roc_df.columns.name = "Rate"

fig_thresh = px.line(
    s2_roc_df, title='TPR and FPR at every threshold',
    width=700, height=500
)

fig_thresh.update_yaxes(scaleanchor="x", scaleratio=1)
fig_thresh.update_xaxes(range=[0, 1], constrain='domain')
fig_thresh.show()

In [None]:
s2_threshold = 0.1579706  # threshold chosen from ROC
# try different thresholds to increase recall of action items.

# Ground truth as class ids (Action Items -> 0, FYI -> 1).
true_labels = s2_ds['val']['category']
mapping = {'Action Items': 0, 'FYI': 1}
mapped_list = [mapping[item] for item in true_labels]

# Column-1 scores; a score at or above the threshold is predicted as class id 1.
prob_preds = s2_model.predict_proba(s2_ds['val']['body'])[:,1]
binary_preds = (prob_preds.numpy() >= s2_threshold).astype(int)

report = classification_report(mapped_list, binary_preds)
cf_matrix = confusion_matrix(mapped_list, binary_preds)

print(f"\n--- CLASSIFICATION REPORT ---\n{report}")
print("\n --- CONFUSION MATRIX ---\n")
disp = ConfusionMatrixDisplay(
    confusion_matrix=cf_matrix,
    display_labels=["Action Items", "FYI"],
)
disp.plot()

## Ensemble

In [None]:
def ensemble_predictor(message_body):
    """Classify one message with the two-stage ensemble.

    Stage 1: if the index-1 score of `s1_model` is below `s1_threshold`,
    the message is 'Not Important'. Stage 2 (only reached otherwise): the
    index-1 score of `s2_model` at or above `s2_threshold` yields 'FYI',
    anything lower yields 'Action Items'.

    Reads the notebook-level `s1_model`, `s2_model`, `s1_threshold` and
    `s2_threshold`.
    """
    important_score = s1_model.predict_proba([message_body]).numpy()[0][1]
    if not important_score >= s1_threshold:
        return 'Not Important'

    # Index 1 of the Stage 2 output is compared against the Stage 2 threshold.
    fyi_score = s2_model.predict_proba([message_body]).numpy()[0][1]
    return 'FYI' if fyi_score >= s2_threshold else 'Action Items'

In [None]:
# Evaluate the two-stage ensemble on the full held-out test split.
# The class order is passed explicitly so the matrix rows/columns always line up with
# the display labels, even if a class is absent from both truth and predictions.
category_labels = ["Action Items", "FYI", "Not Important"]

y_true = df_s1_test['category'].values
y_pred = [ensemble_predictor(body) for body in df_s1_test['body']]

report = classification_report(y_true, y_pred, labels=category_labels, zero_division=0)

print(f"\n--- CLASSIFICATION REPORT ---\n{report}")
cf_matrix = confusion_matrix(y_true, y_pred, labels=category_labels)
print("\n --- CONFUSION MATRIX ---\n")
disp = ConfusionMatrixDisplay(confusion_matrix=cf_matrix, display_labels=category_labels)
disp.plot()

In [None]:
# Performance on bot messages
bot_channels = ['deployments', 'form-publish-alerts',
                'form-generated-alerts', 'shortcut-updates',
                'demo-form-filled-alerts', 'sentry-errors', 'watchdog_engine',
                'aws-updates', 'ph-reviews', 'new-user-alerts', 'dc-sentry', 'dc-alerts']
eval_df_bot = df_s1_test[df_s1_test['channel_name'].isin(bot_channels)]

# There are no true "Action Items" among the bot messages in this frame. Without an
# explicit `labels=` the matrix would shrink to the classes actually present and no
# longer match the three display labels, so the class order is always passed.
category_labels = ["Action Items", "FYI", "Not Important"]

y_true = eval_df_bot['category'].values
y_pred = [ensemble_predictor(body) for body in eval_df_bot['body']]

report = classification_report(y_true, y_pred, labels=category_labels, zero_division=0)

print(f"\n--- CLASSIFICATION REPORT ---\n{report}")
cf_matrix = confusion_matrix(y_true, y_pred, labels=category_labels)
print("\n --- CONFUSION MATRIX ---\n")
disp = ConfusionMatrixDisplay(confusion_matrix=cf_matrix, display_labels=category_labels)
disp.plot()

In [None]:
# Performance on non-bot messages
bot_channels = ['deployments', 'form-publish-alerts',
                'form-generated-alerts', 'shortcut-updates',
                'demo-form-filled-alerts', 'sentry-errors', 'watchdog_engine',
                'aws-updates', 'ph-reviews', 'new-user-alerts', 'dc-sentry', 'dc-alerts']
eval_df_non_bot = df_s1_test[~df_s1_test['channel_name'].isin(bot_channels)]

# The class order is passed explicitly so the matrix always has one row/column per
# display label, whichever classes happen to occur in this subset.
category_labels = ["Action Items", "FYI", "Not Important"]

y_true = eval_df_non_bot['category'].values
y_pred = [ensemble_predictor(body) for body in eval_df_non_bot['body']]

report = classification_report(y_true, y_pred, labels=category_labels, zero_division=0)

print(f"\n--- CLASSIFICATION REPORT ---\n{report}")
cf_matrix = confusion_matrix(y_true, y_pred, labels=category_labels)
print("\n --- CONFUSION MATRIX ---\n")
disp = ConfusionMatrixDisplay(confusion_matrix=cf_matrix, display_labels=category_labels)
disp.plot()

In [None]:
# Training Loss plot
# Each entry of the model card's `eval_lines_list` provides a 'Step' and a 'Training Loss'.
eval_lines = vars(s1_model)['model_card_data']['eval_lines_list']
steps = [line['Step'] for line in eval_lines]
losses = [line['Training Loss'] for line in eval_lines]

plt.plot(steps, losses)
plt.xlabel("Steps")  # X-axis label
plt.ylabel("Train Loss")  # Y-axis label
plt.show()

In [None]:
import joblib

# Move each model to the CPU, then write it to Drive with joblib.
s1_model = s1_model.to('cpu')
joblib.dump(s1_model, '/content/drive/MyDrive/Zivy Files/s1_model_24_apr.model')

s2_model = s2_model.to('cpu')
joblib.dump(s2_model, '/content/drive/MyDrive/Zivy Files/s2_model_24_apr.model')

# Hyper-parameter tuning

In [None]:
! pip install -q optuna

# New Model Architecture

In [None]:
! pip install -q pytorch-lightning sentence-transformers

In [None]:
# sentence-transformers is already installed by the quiet `pip install` cell just above,
# so the second, noisy install is dropped.
from sentence_transformers import SentenceTransformer

# Sentence encoder instance created once at notebook level.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

In [None]:
from typing import List
import numpy as np
import pytorch_lightning as pl
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset, DataLoader
from ast import literal_eval
from torch import nn
from torch.nn import functional as F
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint

class MessageClassificationDataset(Dataset):
    """Dataset yielding one flattened context embedding and class id per message.

    Each item concatenates the embedding of the message body, up to
    `n_previous_msgs` previous messages and up to `n_future_msgs` future
    messages (zero-padded when fewer are available) into a single vector.

    Args:
        df: frame with 'body', 'previous_messages', 'future_messages' and the
            target column; the two context columns hold lists of strings.
        emb_model: a sentence-transformers model name, or an already loaded
            encoder exposing `encode` and `get_sentence_embedding_dimension`.
            Encoders loaded by name are cached and shared between datasets.
        output_field_name: column holding the category string.
    """

    # Category string -> integer class id used as the training target.
    LABEL_TO_ID = {
        'Action Items': 0,
        "FYI": 1,
        "Not Important": 2
    }
    # Encoders loaded by name, keyed by that name, so several datasets share one instance.
    _encoder_cache = {}

    def __init__(self, df: pd.DataFrame, emb_model: "str | SentenceTransformer" = 'all-MiniLM-L6-v2', output_field_name='category'):
        self.data = df
        # Honour the `emb_model` argument instead of silently reading a notebook global.
        self.embedding_model = self._resolve_encoder(emb_model)
        self.embedding_size = self.embedding_model.get_sentence_embedding_dimension()
        self.n_previous_msgs = 3
        self.n_future_msgs = 1
        self.output_field_name = output_field_name

    @classmethod
    def _resolve_encoder(cls, emb_model):
        """Return `emb_model` itself, or the cached encoder loaded from its name."""
        if not isinstance(emb_model, str):
            return emb_model
        if emb_model not in cls._encoder_cache:
            cls._encoder_cache[emb_model] = SentenceTransformer(emb_model)
        return cls._encoder_cache[emb_model]

    def __len__(self):
        return len(self.data)

    def _padded_embeddings(self, str_list: List[str], out_rows: int):
        """Encode `str_list` into an (out_rows, embedding_size) array.

        Missing rows are left as zeros. When more than `out_rows` strings are
        given only the first `out_rows` are used, so the result always has
        the fixed shape the classifier expects.
        """
        padded_embeddings = np.zeros((out_rows, self.embedding_size))
        if not str_list:
            return padded_embeddings  # all-zero array

        # NOTE(review): keeps the first `out_rows` entries - confirm this matches the
        # ordering of the context lists (oldest-first vs. newest-first).
        texts = list(str_list)[:out_rows]
        if texts:
            # One batched encode call instead of one call per string.
            padded_embeddings[:len(texts)] = np.asarray(self.embedding_model.encode(texts))
        return padded_embeddings

    def __getitem__(self, index):
        row = self.data.iloc[index]
        message_embs = self._padded_embeddings([row['body']], 1)
        previous_embs = self._padded_embeddings(row['previous_messages'], self.n_previous_msgs)
        future_embs = self._padded_embeddings(row['future_messages'], self.n_future_msgs)
        # Row order: message, previous messages, future messages.
        embedding = np.concatenate((message_embs, previous_embs, future_embs), axis=0)
        y = [self.LABEL_TO_ID[row[self.output_field_name]]]
        item = {
            'embedding': torch.from_numpy(embedding.reshape(-1,)).float(),
            'category': torch.as_tensor(y)
        }

        return item


class MessageClassificationDataModule(pl.LightningDataModule):
    """Loads the labelled CSV and serves train / val / test dataloaders.

    The CSV is read once in the constructor, rows without a category are
    dropped, the stringified context lists are parsed, and the data is split
    80/20 into train+val / test and then 90/10 into train / val, both
    stratified on category.

    Args:
        csv_path: path of the labelled CSV file.
        batch_size: batch size used by all three dataloaders.
        seed: random state for both splits.
    """

    def __init__(self, csv_path: str, batch_size: int = 32, seed: int = 0):
        super().__init__()
        df = pd.read_csv(csv_path)
        df = df.dropna(subset=['category'], how='all')  # NaN categories come from messages skipped during labelling
        # .copy() so the column assignments below act on an independent frame.
        df = df[['body', 'category', 'channel_name', 'previous_messages', 'future_messages']].copy()
        df['important'] = df['category'].isin(['FYI', 'Action Items']).astype(int)
        # The context columns are stored as Python-literal strings; parse them into lists.
        df['previous_messages'] = df['previous_messages'].apply(literal_eval)
        df['future_messages'] = df['future_messages'].apply(literal_eval)
        df_train, df_test = train_test_split(df, test_size=0.2, stratify=df['category'], random_state=seed)
        df_train, df_val = train_test_split(df_train, test_size=0.1, stratify=df_train['category'], random_state=seed)

        self.train = MessageClassificationDataset(df_train) #72%
        self.val = MessageClassificationDataset(df_val) # 8%
        self.test = MessageClassificationDataset(df_test)# 20%

        self.batch_size = batch_size

    def train_dataloader(self):
        # Reshuffle every epoch; validation and test keep a fixed order.
        return DataLoader(self.train, batch_size=self.batch_size, shuffle=True)

    def val_dataloader(self):
        return DataLoader(self.val, batch_size=self.batch_size)

    def test_dataloader(self):
        return DataLoader(self.test, batch_size=self.batch_size)

class MessageClassifier(nn.Module):
    """Three-layer MLP over the flattened message-context embedding.

    Args:
        emb_size: dimension of a single message embedding.
        n_messages: number of embeddings concatenated per input (default 5).
        hidden_size: width of the two hidden layers (default 64).
        n_classes: number of output logits (default 3).

    The input to `forward` has shape (batch, n_messages * emb_size); the
    output holds raw logits of shape (batch, n_classes).
    """

    def __init__(self, emb_size, n_messages=5, hidden_size=64, n_classes=3):
        super(MessageClassifier, self).__init__()
        self.fc1 = nn.Linear(n_messages * emb_size, hidden_size)  # dense layer 1
        self.fc2 = nn.Linear(hidden_size, hidden_size)  # dense layer 2
        self.fc3 = nn.Linear(hidden_size, n_classes)  # output layer

    def forward(self, x):
        x = torch.relu(self.fc1(x))  # ReLU activation in dense layers
        x = torch.relu(self.fc2(x))
        # No activation on the output: callers apply softmax / cross-entropy themselves.
        return self.fc3(x)

class LitMessageClassifier(pl.LightningModule):
    """Lightning wrapper that trains `model` with cross-entropy and AdamW.

    Args:
        model: module mapping a batch of embeddings to class logits.
        learning_rate: learning rate passed to AdamW.

    Batches are dicts with an 'embedding' tensor and a 'category' tensor of
    integer class ids.
    """

    def __init__(self, model, learning_rate=1e-3):
        super().__init__()
        self.model = model
        self.learning_rate = learning_rate

    def _cross_entropy_step(self, batch, log_name):
        """Compute the cross-entropy loss for one batch and log it as `log_name`."""
        logits = self.model(batch['embedding'])
        # Targets are flattened to 1-D before being passed to cross_entropy.
        targets = batch['category'].reshape(-1,)
        loss = F.cross_entropy(logits, targets)
        self.log(log_name, loss, prog_bar=True)
        return loss

    def training_step(self, batch, batch_idx):
        return self._cross_entropy_step(batch, 'train_loss')

    def validation_step(self, batch, batch_idx):
        return self._cross_entropy_step(batch, 'val_loss')

    def configure_optimizers(self):
        return torch.optim.AdamW(self.model.parameters(), lr=self.learning_rate)

# Seed before building the model so weight initialisation is reproducible.
# NOTE(review): an earlier attempt was commented out after a CuBLAS
# "Deterministic behavior was enabled ..." RuntimeError. That error is raised when
# deterministic algorithms are switched on (e.g. `Trainer(deterministic=True)` or
# `torch.use_deterministic_algorithms(True)`); seeding alone is not expected to trigger
# it. If it reappears, set CUBLAS_WORKSPACE_CONFIG=:4096:8 before starting the kernel.
pl.seed_everything(0, workers=True)

dm = MessageClassificationDataModule(csv_path="/content/drive/MyDrive/Zivy Files/labelled_17_apr.csv")
model = LitMessageClassifier(
    model=MessageClassifier(emb_size=dm.train.embedding_size),
    learning_rate=0.002754228703338169
)
early_stopping = EarlyStopping('val_loss', patience=5)
checkpoint_callback = ModelCheckpoint(
    monitor='val_loss',
    save_top_k=1,
    mode='min',
    filename='best_model'  # Lightning appends ".ckpt" itself
)

# "auto" selects the GPU when one is available and otherwise runs on the CPU.
trainer = pl.Trainer(accelerator="auto", devices=1, callbacks=[checkpoint_callback, early_stopping])
trainer.fit(model=model, datamodule=dm)

In [None]:
from sklearn.metrics import classification_report

# Evaluate the in-memory model on the test split.
# NOTE(review): this uses the weights left after `trainer.fit`, not the best checkpoint
# tracked by `checkpoint_callback` - confirm which one should be reported.
y_true = []
y_pred = []
model.eval()
with torch.no_grad():
    for batch in dm.test_dataloader():
        # Move inputs to wherever the model lives so the cell works on CPU and GPU.
        logits = model.model(batch['embedding'].to(model.device))
        # The arg-max of the logits is the same class the softmax top-1 would pick.
        y_pred.extend(logits.argmax(dim=1).cpu().tolist())
        y_true.extend(batch['category'].reshape(-1,).tolist())

# Class ids follow the dataset's mapping: 0 Action Items, 1 FYI, 2 Not Important.
print(classification_report(
    y_true, y_pred,
    labels=[0, 1, 2],
    target_names=['Action Items', 'FYI', 'Not Important'],
    zero_division=0,
))

In [None]:
# Learning-rate finder run on the existing trainer, model and datamodule.
lr_tuner = pl.tuner.tuning.Tuner(trainer=trainer)
lr_finder = lr_tuner.lr_find(model=model, datamodule=dm)

# Raw results of the sweep
print(lr_finder.results)

# Plot the sweep with the suggested point marked
fig = lr_finder.plot(suggest=True)
fig.show()

# Take the finder's suggestion (or pick a point from the plot instead)
new_lr = lr_finder.suggestion()

In [None]:
# Display the learning rate returned by `lr_finder.suggestion()` in the previous cell.
new_lr