# imports

In [10]:
import json
import numpy as np
import pandas as pd
from pathlib import Path

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, precision_recall_fscore_support, f1_score, recall_score, precision_score, accuracy_score
import warnings
warnings.simplefilter('ignore')

from imblearn.ensemble import BalancedBaggingClassifier, BalancedRandomForestClassifier, RUSBoostClassifier
import seaborn as sns
from collections import Counter

# import re
# import unicodedata
# from nltk.corpus import stopwords
# from nltk.tokenize import word_tokenize

# import nltk
# nltk.download('stopwords')
# nltk.download('punkt')
from nltk.corpus import stopwords

from tqdm import trange, tqdm

from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.model_selection import train_test_split
import torch


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Getting data

In [11]:
def get_data():
    root = "./drive/MyDrive/inf554/"
    path_to_training = Path(root + "training")
    path_to_test = Path(root + "test")

    def flatten(list_of_list):
        return [item for sublist in list_of_list for item in sublist]
    #####
    # training and test sets of transcription ids
    #####
    training_set = ['ES2002', 'ES2005', 'ES2006', 'ES2007', 'ES2008', 'ES2009', 'ES2010', 'ES2012', 'ES2013', 'ES2015', 'ES2016', 'IS1000', 'IS1001', 'IS1002', 'IS1003', 'IS1004', 'IS1005', 'IS1006', 'IS1007', 'TS3005', 'TS3008', 'TS3009', 'TS3010', 'TS3011', 'TS3012']
    training_set = flatten([[m_id+s_id for s_id in 'abcd'] for m_id in training_set])
    training_set.remove('IS1002a')
    training_set.remove('IS1005d')
    training_set.remove('TS3012c')

    test_set = ['ES2003', 'ES2004', 'ES2011', 'ES2014', 'IS1008', 'IS1009', 'TS3003', 'TS3004', 'TS3006', 'TS3007']
    test_set = flatten([[m_id+s_id for s_id in 'abcd'] for m_id in test_set])


    #####
    # text_baseline: utterances are embedded with SentenceTransformer, then train a classifier.
    #####

    y_training = []
    graph = []
    last_graph_size = 0
    with open(root + "training_labels.json", "r") as file:
        training_labels = json.load(file)

    X_training_original = []
    for transcription_id in tqdm(training_set):
        with open(path_to_training / f"{transcription_id}.json", "r") as file:
            transcription = json.load(file)

        with open(path_to_training / f"{transcription_id}.txt", 'r') as file:
            relations = file.readlines()

        graph.extend([(lambda x: (int(x[0]) + last_graph_size, int(x[2]) + last_graph_size, x[1])) (rel.split(' ')) for rel in relations])

        last_graph_size += len(transcription)


        for utterance in transcription:
            X_training_original.append(utterance["speaker"] + ": " + utterance["text"])

        y_training += training_labels[transcription_id]
    return X_training_original, y_training, graph


# Data

In [12]:
X, y, rel = get_data()

100%|██████████| 97/97 [00:00<00:00, 133.56it/s]


In [13]:
sentences, labels = X, y

In [14]:
categories = {e:i for i, e in enumerate(set(e[2] for e in rel))}; categories

{'Explanation': 0,
 'Narration': 1,
 'Contrast': 2,
 'Continuation': 3,
 'Q-Elab': 4,
 'Background': 5,
 'Correction': 6,
 'Elaboration': 7,
 'Comment': 8,
 'Acknowledgement': 9,
 'Conditional': 10,
 'Question-answer_pair': 11,
 'Parallel': 12,
 'Result': 13,
 'Alternation': 14,
 'Clarification_question': 15}

# Model

In [15]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Train

In [None]:
# Tokenize the sentences
inputs = tokenizer(sentences, return_tensors='pt', padding=True, truncation=True)

# Convert labels to tensor
labels = torch.tensor(labels)


In [20]:
inputs

{'input_ids': tensor([[  101,  7610,  1024,  ...,     0,     0,     0],
        [  101,  7610,  1024,  ...,     0,     0,     0],
        [  101,  7610,  1024,  ...,     0,     0,     0],
        ...,
        [  101,  2033,  1024,  ...,     0,     0,     0],
        [  101, 21318,  1024,  ...,     0,     0,     0],
        [  101,  2033,  1024,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

In [23]:
from torch.utils.data import Dataset, DataLoader, random_split

# Define a custom dataset
class SentenceDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

# Create the dataset
dataset = SentenceDataset(inputs, labels.tolist())

# Split the dataset into training and validation sets
train_size = int(0.8 * len(dataset))  # 80% for training
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

# Create data loaders for training and validation
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)


In [None]:
# Move the model to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

# Define the loss function and optimizer
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

# Training loop
model.train()
for epoch in trange(10):
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

    # Print the training loss
    print(f'Epoch: {epoch+1}, Training Loss: {loss.item()}')


  0%|          | 0/10 [00:00<?, ?it/s]

In [None]:

# Evaluation loop
model.eval()
val_preds = []
val_labels = []
with torch.no_grad():
    for batch in val_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        preds = torch.argmax(outputs.logits, dim=-1)
        val_preds.extend(preds.cpu().numpy())
        val_labels.extend(labels.cpu().numpy())

# Calculate F1-score for the validation set
val_f1 = f1_score(val_labels, val_preds)
print(f'Validation F1 Score: {val_f1}')
