In [1]:
import pandas as pd

# Load the dataset.
# The path uses forward slashes: the original backslash form contained the
# invalid escape sequence "\d" and only resolved on Windows.
file_path = 'data/dataset_for_topic_labeling.xlsx'
df = pd.read_excel(file_path)

# Keep only the rows that
#   - have a topic label,
#   - have an empty 'reply' column,
#   - are not labelled '3. Studyplan',
#   - have a non-missing text.
keep_mask = (
    df['topic'].notna()
    & df['reply'].isna()
    & (df['topic'] != '3. Studyplan')
    & df['text'].notna()
)

# Build the filtered frame in one step with a fresh 0..n-1 index, so that
# re-running the cell always yields the same result.
df_filtered = df[keep_mask].reset_index(drop=True)

In [2]:
from sklearn.model_selection import train_test_split

# Drop the rows whose 'text' value is missing
df_filtered = df_filtered.loc[df_filtered['text'].notna()]

# Model input (message text) and target (topic label)
X, y = df_filtered['text'], df_filtered['topic']

# Encode the topics as integer ids, numbered in the order returned by y.unique()
id2label = dict(enumerate(y.unique()))
label2id = {label: idx for idx, label in id2label.items()}
y = y.map(label2id)

# Stratified 80/20 split into training and test sets (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [4]:
# Display the integer-id -> topic-label mapping (the inverse of label2id)
id2label

{0: '1. Project/assignment/homework',
 1: '2. Exam/oral exam/mid term',
 2: '4. Deadline/important dates',
 3: '5. Grades/marks/results',
 4: '6. Materials/recordings',
 5: '7. class information/class sessions',
 6: '8. Other'}

## Loading the model

### BERT

In [5]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch
# Directory that stores both the saved BERT model and its tokenizer.
# Forward slashes replace the original backslashes, which formed the invalid
# escape sequence "\B" and only resolved on Windows.
path = 'trained_models/BERT/BERT_model_tokenizer'

# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained(path)

# Load the BERT sequence-classification model
model = BertForSequenceClassification.from_pretrained(path)

# Set the model in evaluation mode
model.eval()


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

### DistilBERT

In [7]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import torch

# Load the DistilBERT tokenizer.
# Paths use forward slashes: the original backslash form contained the invalid
# escape sequences "\D" and "\d" and only resolved on Windows.
tokenizer = DistilBertTokenizer.from_pretrained('trained_models/DistilBERT/distilBERT_tokenizer')

# Load the DistilBERT sequence-classification model
model = DistilBertForSequenceClassification.from_pretrained('trained_models/DistilBERT/distilBERT_model')

# Set the model in evaluation mode
model.eval()


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

### MobileBERT

In [10]:
from datasets import Dataset, load_metric
from transformers import MobileBertTokenizer, MobileBertForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback, DataCollatorWithPadding
import torch
import numpy as np

# Load the saved MobileBERT tokenizer.
# Paths use forward slashes: the original backslash form contained the invalid
# escape sequence "\M" and only resolved on Windows.
tokenizer = MobileBertTokenizer.from_pretrained('trained_models/MobileBERT/MobileBERT_tokenizer')

# Load the saved MobileBERT sequence-classification model; num_labels is taken
# from the label mapping so it matches the number of topic classes.
model = MobileBertForSequenceClassification.from_pretrained('trained_models/MobileBERT/MobileBERT_model', num_labels=len(label2id))

# Set the model in evaluation mode
model.eval()

MobileBertForSequenceClassification(
  (mobilebert): MobileBertModel(
    (embeddings): MobileBertEmbeddings(
      (word_embeddings): Embedding(30522, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 512)
      (token_type_embeddings): Embedding(2, 512)
      (embedding_transformation): Linear(in_features=384, out_features=512, bias=True)
      (LayerNorm): NoNorm()
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): MobileBertEncoder(
      (layer): ModuleList(
        (0-23): 24 x MobileBertLayer(
          (attention): MobileBertAttention(
            (self): MobileBertSelfAttention(
              (query): Linear(in_features=128, out_features=128, bias=True)
              (key): Linear(in_features=128, out_features=128, bias=True)
              (value): Linear(in_features=512, out_features=128, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): MobileBertSelfOutput(
              (dense): Linear(in_fe

### TinyBERT

In [19]:
from datasets import Dataset, load_metric
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback, DataCollatorWithPadding
import torch
import numpy as np

# Load the saved TinyBERT tokenizer.
# Paths use forward slashes: the original backslash form contained the invalid
# escape sequence "\T" and only resolved on Windows.
tokenizer = BertTokenizer.from_pretrained('trained_models/TinyBERT/TinyBERT_tokenizer_1')

# Load the saved TinyBERT sequence-classification model; num_labels is taken
# from the label mapping so it matches the number of topic classes.
model = BertForSequenceClassification.from_pretrained('trained_models/TinyBERT/TinyBERT_model_1', num_labels=len(label2id))

# Set the model in evaluation mode
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 312, padding_idx=0)
      (position_embeddings): Embedding(512, 312)
      (token_type_embeddings): Embedding(2, 312)
      (LayerNorm): LayerNorm((312,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-3): 4 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=312, out_features=312, bias=True)
              (key): Linear(in_features=312, out_features=312, bias=True)
              (value): Linear(in_features=312, out_features=312, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=312, out_features=312, bias=True)
              (LayerNorm): LayerNorm((312,), eps=1e-12, e

### ALBERT

In [3]:
from datasets import Dataset, load_metric
from transformers import AlbertTokenizer, AlbertForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback, DataCollatorWithPadding
import torch
import numpy as np

# Load the saved ALBERT tokenizer.
# Paths use forward slashes: the original backslash form contained the invalid
# escape sequence "\A" and only resolved on Windows.
tokenizer = AlbertTokenizer.from_pretrained('trained_models/ALBERT/ALBERT_tokenizer')

# Load the saved ALBERT sequence-classification model; num_labels is taken
# from the label mapping so it matches the number of topic classes.
model = AlbertForSequenceClassification.from_pretrained('trained_models/ALBERT/ALBERT_model', num_labels=len(label2id))

# Set the model in evaluation mode
model.eval()

  from .autonotebook import tqdm as notebook_tqdm


AlbertForSequenceClassification(
  (albert): AlbertModel(
    (embeddings): AlbertEmbeddings(
      (word_embeddings): Embedding(30000, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0, inplace=False)
    )
    (encoder): AlbertTransformer(
      (embedding_hidden_mapping_in): Linear(in_features=128, out_features=768, bias=True)
      (albert_layer_groups): ModuleList(
        (0): AlbertLayerGroup(
          (albert_layers): ModuleList(
            (0): AlbertLayer(
              (full_layer_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (attention): AlbertAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768,

## How to use: Tokenize and predict

In [6]:
def tokenize_function(texts):
    """Encode ``texts`` with the notebook-level ``tokenizer``.

    Padding and truncation are enabled with a 512-token cap, and the result
    is requested as PyTorch tensors (``return_tensors="pt"``).
    """
    encode_options = {
        "padding": True,
        "truncation": True,
        "max_length": 512,
        "return_tensors": "pt",
    }
    return tokenizer(texts, **encode_options)

# Encode every test text in a single batch
test_texts = X_test.tolist()
inputs = tokenize_function(test_texts)

# Forward pass with gradient tracking disabled
with torch.no_grad():
    outputs = model(**inputs)
logits = outputs.logits

# Predicted class id = index of the largest logit in each row
predictions = logits.argmax(dim=-1)

In [7]:
from sklearn.metrics import classification_report, precision_recall_fscore_support, accuracy_score, confusion_matrix, f1_score
# Evaluate the model on the test set.
# scikit-learn expects array-likes, so convert the predicted ids from a torch
# tensor to a NumPy array on the CPU first.
y_pred = predictions.cpu().numpy()

# Pass the class ids explicitly, in id order, together with their names.
# Without `labels`, classification_report derives the classes from the data,
# so a class missing from both y_test and the predictions would leave
# target_names with the wrong number of names.
class_ids = sorted(id2label)
class_names = [id2label[class_id] for class_id in class_ids]

accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='macro')
report = classification_report(y_test, y_pred, labels=class_ids, target_names=class_names)

# Print the results
print(f"Accuracy: {accuracy}")
print(f"F1 Score: {f1}")
print("Classification Report:")
print(report)

Accuracy: 0.6962457337883959
F1 Score: 0.6508339247160325
Classification Report:
                                     precision    recall  f1-score   support

     1. Project/assignment/homework       0.79      0.70      0.74        33
         2. Exam/oral exam/mid term       0.61      0.65      0.63        52
        4. Deadline/important dates       0.75      0.30      0.43        10
            5. Grades/marks/results       0.83      0.65      0.73        23
            6. Materials/recordings       0.70      0.85      0.77       100
7. class information/class sessions       0.65      0.67      0.66        39
                           8. Other       0.75      0.50      0.60        36

                           accuracy                           0.70       293
                          macro avg       0.73      0.62      0.65       293
                       weighted avg       0.70      0.70      0.69       293



### Example with just one text

In [17]:
def tokenize_function(texts):
    """Encode ``texts`` with the notebook-level ``tokenizer``.

    Padding and truncation are enabled with a 512-token cap, and the result
    is requested as PyTorch tensors (``return_tensors="pt"``).
    """
    encode_options = {
        "padding": True,
        "truncation": True,
        "max_length": 512,
        "return_tensors": "pt",
    }
    return tokenizer(texts, **encode_options)

# Single message to classify
text = "I'm having trouble with the exam, can you help me?"

# Encode the message with the tokenizer helper
inputs = tokenize_function(text)

# Forward pass with gradient tracking disabled
with torch.no_grad():
    outputs = model(**inputs)
logits = outputs.logits

# Predicted class id = index of the largest logit
prediction = logits.argmax(dim=-1)

In [18]:
# Translate the predicted class id back into its topic label before printing
predicted_label = id2label[prediction.item()]

print("text = ", text)
print("prediction = ", predicted_label)

text =  I'm having trouble with the exam, can you help me?
prediction =  2. Exam/oral exam/mid term
