In [51]:
import pandas as pd

# Load the dataset.
# The path uses forward slashes: the previous 'data\d...' literal relied on
# '\d' being an unrecognised escape sequence (a SyntaxWarning on recent Python
# versions) and only resolved on Windows. Forward slashes work on every OS.
file_path = 'data/dataset_for_topic_labeling.xlsx'
df = pd.read_excel(file_path)

# Keep only the rows that
#   - have a topic label,
#   - have no value in the 'reply' column,
#   - are not labelled '3. Studyplan',
#   - have a non-missing 'text'.
# A single combined mask selects exactly the rows the step-by-step filtering
# selected, and leaves `df` itself untouched so this cell can be re-run safely.
rows_to_keep = (
    df['topic'].notna()
    & df['reply'].isna()
    & (df['topic'] != '3. Studyplan')
    & df['text'].notna()
)

# Renumber the surviving rows 0..n-1 (the old index is discarded).
df_filtered = df[rows_to_keep].reset_index(drop=True)

In [52]:
from sklearn.model_selection import train_test_split

# Drop any row whose 'text' is missing (defensive re-check before splitting)
df_filtered = df_filtered.dropna(subset=['text'])

# Features are the raw texts, targets are the topic names
X, y = df_filtered['text'], df_filtered['topic']

# Map each topic name to an integer id, numbered in order of first appearance
# in the data, and build the reverse lookup (id -> topic name).
label2id = {topic: position for position, topic in enumerate(y.unique())}
id2label = dict(enumerate(label2id))
y = y.map(label2id)

# Stratified 80/20 train/test split with a fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [53]:
# Display the integer-id -> topic-name mapping used to decode predictions
id2label

{0: '1. Project/assignment/homework',
 1: '2. Exam/oral exam/mid term',
 2: '4. Deadline/important dates',
 3: '5. Grades/marks/results',
 4: '6. Materials/recordings',
 5: '7. class information/class sessions',
 6: '8. Other'}

## Loading the model

In [54]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import torch

# Load the DistilBERT tokenizer saved in a local directory.
# Forward slashes are used on purpose: the earlier backslash literals contained
# the unrecognised escapes '\D' and '\d' (a SyntaxWarning on recent Python
# versions) and were only valid directory paths on Windows.
tokenizer = DistilBertTokenizer.from_pretrained('conversation_models/DistilBERT/distilBERT_tokenizer')

# Load the DistilBERT sequence-classification model saved in a local directory
model = DistilBertForSequenceClassification.from_pretrained('conversation_models/DistilBERT/distilBERT_model')

# Switch the model to evaluation mode (turns off the Dropout layers).
# eval() returns the model, so leaving it as the last expression also
# displays the architecture as the cell output.
model.eval()


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

## How to use: Tokenize and predict

In [55]:
def tokenize_function(texts):
    """Tokenize one text or a list of texts into PyTorch tensors.

    Sequences are padded to the longest one in the call and truncated to at
    most 512 tokens. The result can be unpacked straight into the model
    with ``model(**encoded)``.
    """
    encoded = tokenizer(
        texts,
        padding=True,
        truncation=True,
        max_length=512,
        return_tensors="pt",
    )
    return encoded

# Tokenize the whole test set in a single call
inputs = tokenize_function(X_test.tolist())

# Inference only: no gradients are needed for the forward pass
with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits

# The predicted class id is the index of the largest logit in each row
predictions = logits.argmax(dim=-1)

In [56]:
# The metric functions were never imported anywhere in the notebook, so this
# cell failed with NameError on a fresh kernel; import them where they are used.
from sklearn.metrics import accuracy_score, classification_report, f1_score

# scikit-learn works on array-likes: hand it a NumPy array, not a torch tensor
y_pred = predictions.detach().cpu().numpy()

# Class ids in ascending order and their display names in the same order, so
# every row of the report is guaranteed to carry the right topic name.
class_ids = list(range(len(id2label)))
class_names = [id2label[class_id] for class_id in class_ids]

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='macro')
report = classification_report(y_test, y_pred, labels=class_ids, target_names=class_names)

# Print the results
print(f"Accuracy: {accuracy}")
print(f"F1 Score: {f1}")
print("Classification Report:")
print(report)

Accuracy: 0.7133105802047781
F1 Score: 0.6788494393191067
Classification Report:
                                     precision    recall  f1-score   support

     1. Project/assignment/homework       0.77      0.70      0.73        33
         2. Exam/oral exam/mid term       0.60      0.75      0.67        52
        4. Deadline/important dates       0.80      0.40      0.53        10
            5. Grades/marks/results       0.84      0.70      0.76        23
            6. Materials/recordings       0.75      0.83      0.79       100
7. class information/class sessions       0.65      0.56      0.60        39
                           8. Other       0.73      0.61      0.67        36

                           accuracy                           0.71       293
                          macro avg       0.73      0.65      0.68       293
                       weighted avg       0.72      0.71      0.71       293



### Example with just one text

In [57]:
# Classify a single sentence.
# tokenize_function is the helper already defined in the batch-prediction cell
# above; it accepts a single string as well as a list, so it is reused here
# instead of being defined a second time (a duplicate definition silently
# shadows the first one and the two copies can drift apart).
text = "I'm having trouble with the exam, can you help me?"

# Tokenize the text
inputs = tokenize_function(text)

# Inference only: no gradients are needed for the forward pass
with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits

# The predicted class id is the index of the largest logit
prediction = torch.argmax(logits, dim=-1)

In [58]:
# Decode the predicted class id back to its topic name and show it
# next to the input sentence.
predicted_topic = id2label[prediction.item()]
print("text = ", text)
print("prediction = ", predicted_topic)

text =  I'm having trouble with the exam, can you help me?
prediction =  2. Exam/oral exam/mid term
