In [1]:
import numpy as np
import pandas as pd
import sklearn
from tqdm import tqdm
from math import log
from sklearn.model_selection import train_test_split

D:\Anaconda\envs\pytorch\lib\site-packages\numpy\.libs\libopenblas.EL2C6PLE4ZYW3ECEVIV3OXXGRN2NRFM2.gfortran-win_amd64.dll
D:\Anaconda\envs\pytorch\lib\site-packages\numpy\.libs\libopenblas.GK7GX5KEQ4F6UYO3P26ULGBQYHGQO7J4.gfortran-win_amd64.dll


In [2]:
# load data
nyt_path = r"C:\Users\DELL\Desktop\DSC 253\HW-1\HW-1\nyt.csv"
ag_path = r"C:\Users\DELL\Desktop\DSC 253\HW-1\HW-1\ag.csv"
nyt_df = pd.read_csv(nyt_path)
ag_df = pd.read_csv(ag_path)

In [3]:
nyt_df.head()

Unnamed: 0,text,label
0,(reuters) - carlos tevez sealed his move to ju...,sports
1,if professional pride and strong defiance can ...,sports
2,"palermo, sicily — roberta vinci beat top-seede...",sports
3,spain's big two soccer teams face a pair of it...,sports
4,the argentine soccer club san lorenzo complete...,sports


In [4]:
# download nltk packages and get stopwords
import nltk
nltk.download('punkt') 
nltk.download('stopwords') 
nltk.download('punkt_tab')

from nltk.corpus import stopwords
stop = set(stopwords.words('english'))
print(stop)

{'some', 'but', "she's", 'wasn', 'their', 'a', 'at', 'has', "should've", 'under', 'with', 're', 'an', "shan't", 'off', 'during', 'after', 'as', "you'll", 'yourselves', 'yours', 'his', 'are', 'who', 'when', 'yourself', "couldn't", 'didn', 'that', 'aren', 'needn', 'weren', 'than', 'same', 'where', 'just', 'couldn', 'ourselves', 't', "won't", 'haven', 'not', 'very', 'which', 'being', 'o', 'we', 'doing', 'again', 'then', 'own', 'himself', 'between', 'while', 'him', "shouldn't", "weren't", 'and', 'over', 'won', 'don', 'to', 'it', 'i', 'he', 'its', 'the', "didn't", "wouldn't", 'if', 'both', 'my', 'most', 'before', 'below', 'will', 'those', 'isn', 'up', 'about', 'y', 'wouldn', 'there', 'each', 'myself', 'd', 'into', 'too', "you're", 'because', 'through', 'hasn', 'had', 'hadn', 'ours', 'this', 'on', 'so', 'whom', 'does', "hasn't", 'ma', "don't", 'more', 'only', 'them', "haven't", 'did', "it's", "hadn't", 'mightn', 'further', "you've", 'against', 'our', 'was', 'can', 'll', 'such', "that'll", 'y

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [5]:
# define preprocessing function

from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.stem import PorterStemmer 

ps = PorterStemmer() 

# return a list of tokens
def pre_processing_by_nltk(doc, stemming = True, need_sent = False):
    # step 1: get sentences
    sentences = sent_tokenize(doc)
    # step 2: get tokens
    tokens = []
    for sent in sentences:
        words = word_tokenize(sent)
        # step 3: stemming
        if stemming:
            words = [ps.stem(word) for word in words if word.lower() not in stop]
        else:
            words = [word for word in words if word.lower() not in stop]
        if need_sent:
            tokens.append(words)
        else:
            tokens += words
    return [w.lower() for w in tokens]

In [7]:
# Compute token frequency over the corpus, prepare for the next step

from tqdm import tqdm
from collections import defaultdict

DF = defaultdict(float)
for doc in tqdm(nyt_df.text):
    tokens = pre_processing_by_nltk(doc)
    for token in set(tokens):
        DF[token] += 1

100%|████████████████████████████████████████████████████████████████████████████| 11519/11519 [02:43<00:00, 70.56it/s]


In [8]:
# Construct IDF and vocabulary dictionary

IDF, vocab = dict(), dict()
for token in DF:
    if DF[token] < 50:
        # this becomes an unk
        pass
    else:
        vocab[token] = len(vocab) # gives an ID to the token
        IDF[token] = 1 + log(len(nyt_df.text) / DF[token])

# Add unk token to vocab
IDF['<UNK>'] = 1
vocab['<UNK>'] = len(vocab)
print(len(DF), len(vocab))

123888 5987


In [31]:
# define preprocessing function

def tfidf_feature_extractor(doc, vocab, IDF):
    tokens = pre_processing_by_nltk(doc)
    for i, token in enumerate(tokens):
        if token not in vocab:
            tokens[i] = '<UNK>'
    TF = defaultdict(int)
    for token in tokens:
        TF[token] += 1
    x = [0] * len(vocab)
    for token in set(tokens):
        tfidf = log(TF[token] + 1) * IDF[token]
        token_id = vocab[token]
#         print(token, TF[token], IDF[token])
        x[token_id] = tfidf # this will be a dense matrix
    return x

In [57]:
# Extract feature for binary-x1, Frequency-x2, TF-IDF-x3

x1,x2,x3 = [],[],[]
for doc in tqdm(nyt_df.text):
    tokens = pre_processing_by_nltk(doc)
    v1,v2 = [0]*len(vocab),[0]*len(vocab)
    for token in tokens:
        if token not in vocab:
            v1[-1] = 1
            v2[-1] += 1
        else:
            i = vocab[token]
            v1[i] = 1
            v2[i] += 1
    v3 = tfidf_feature_extractor(doc, vocab, IDF)
    x1.append(v1)
    x2.append(v2)
    x3.append(v3)

100%|████████████████████████████████████████████████████████████████████████████| 11519/11519 [05:26<00:00, 35.30it/s]


In [11]:
y = nyt_df.label

In [59]:
# Split dataset
x1_train,x1_test_val,y1_train,y1_test_val = train_test_split(x1,y,test_size=0.2,random_state=42)
x1_test,x1_val,y1_test,y1_val = train_test_split(x1_test_val,y1_test_val,test_size=0.5,random_state=42)

x2_train,x2_test_val,y2_train,y2_test_val = train_test_split(x2,y,test_size=0.2,random_state=42)
x2_test,x2_val,y2_test,y2_val = train_test_split(x2_test_val,y2_test_val,test_size=0.5,random_state=42)

x3_train,x3_test_val,y3_train,y3_test_val = train_test_split(x3,y,test_size=0.2,random_state=42)
x3_test,x3_val,y3_test,y3_val = train_test_split(x3_test_val,y3_test_val,test_size=0.5,random_state=42)

In [13]:
# Train model for (a) Binary
from sklearn.linear_model import LogisticRegressionCV
clf1 = LogisticRegressionCV(cv=5,
                           scoring='accuracy',
                           random_state=42,
                           n_jobs=-1,
                           verbose=3,
                           max_iter=300).fit(x1_train, y1_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:  1.7min remaining:  2.6min
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  1.7min finished


In [14]:
# Train model for (b) Frequency
clf2 = LogisticRegressionCV(cv=5,
                           scoring='accuracy',
                           random_state=42,
                           n_jobs=-1,
                           verbose=3,
                           max_iter=300).fit(x2_train, y2_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:  7.4min remaining: 11.0min
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  7.5min finished
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [60]:
# Train model for (c) TF-IDF
clf3 = LogisticRegressionCV(cv=5,
                           scoring='accuracy',
                           random_state=42,
                           n_jobs=-1,
                           verbose=3,
                           max_iter=300).fit(x3_train, y3_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:  2.1min remaining:  3.2min
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  2.3min finished


In [32]:
# Analyze function
from sklearn.metrics import f1_score,accuracy_score
def analyze(clf,x_test,y_test):
    y_pred = clf.predict(x_test)
    f1_macro = f1_score(y_test,y_pred,average='macro')
    f1_micro = f1_score(y_test,y_pred,average='micro')
    accuracy = accuracy_score(y_test,y_pred)
    print(f"accuracy={accuracy:.3f}, macro f1={f1_macro:.3f}, micro f1={f1_micro:.3f}")
    return accuracy,f1_macro,f1_micro

In [163]:
q1_res = pd.DataFrame(columns=['Accuracy','Macro F1','Micro F1'])

In [164]:
q1_res.loc['Binary']=analyze(clf1,x1_test,y1_test)

accuracy=0.9835069444444444, macro f1=0.961933235988707, micro f1=0.9835069444444444


In [165]:
q1_res.loc['Frequency']=analyze(clf2,x2_test,y2_test)

accuracy=0.9869791666666666, macro f1=0.9700064581976999, micro f1=0.9869791666666666


In [166]:
q1_res.loc['TF-IDF']=analyze(clf3,x3_test,y3_test)

accuracy=0.9826388888888888, macro f1=0.9587116733191015, micro f1=0.9826388888888888


In [167]:
q1_res

Unnamed: 0,Accuracy,Macro F1,Micro F1
Binary,0.983507,0.961933,0.983507
Frequency,0.986979,0.970006,0.986979
TF-IDF,0.982639,0.958712,0.982639


**Analysis:**

All three methods achieve pretty well performance (Above 98% accuracy). Besides, we can also observe that both Frequency and TF-IDF features slightly outperform Binary feature, because they encode more information: Frequency and TF-IDF take into account the number of occurance of a word, whereas Binary feature only considers the presence of word in a document.
Also, the fact that Frequency has slightly better performance than TF-IDF suggests that more complicated feature does not always imply better results.

# 2.Word2Vec

In [6]:
# using publicly available pre-trained Glove embeddings as word vector
from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

glove_file = r"C:\Users\DELL\Desktop\DSC 253\glove.6B\glove.6B.100d.txt"
tmp_file = get_tmpfile("test_word2vec.txt")

_ = glove2word2vec(glove_file, tmp_file)

model = KeyedVectors.load_word2vec_format(tmp_file)

  _ = glove2word2vec(glove_file, tmp_file)


In [7]:
# Train Word2Vec (using gensim package) on AGNews/NYT text data
import pandas as pd
import re
import numpy as np
from nltk.tokenize import word_tokenize,sent_tokenize
from nltk.stem import PorterStemmer 
from gensim.models import Word2Vec
import nltk

nltk.download('punkt')

nltk.download('stopwords') # <--- this is new
from nltk.corpus import stopwords
stop = set(stopwords.words('english'))
ps = PorterStemmer() 

def pre_processing_by_nltk(doc, stemming = True, need_sent = False):
    # step 1: get sentences
    sentences = sent_tokenize(doc)
    # step 2: get tokens
    tokens = []
    for sent in sentences:
        words = word_tokenize(sent)
        # step 3 (optional): stemming
        if stemming:
            words = [ps.stem(word) for word in words if word.lower() not in stop]
        else:
            words = [word for word in words if word.lower() not in stop]
        if need_sent:
            tokens.append(words)
        else:
            tokens += words
    return [w.lower() for w in tokens]


ag_path = r"C:\Users\DELL\Desktop\DSC 253\HW-1\HW-1\ag.csv"
nyt_path = r"C:\Users\DELL\Desktop\DSC 253\HW-1\HW-1\nyt.csv"
ag_df = pd.read_csv(ag_path)
ag_df['tokens'] = ag_df['text'].apply(pre_processing_by_nltk)
nyt_df = pd.read_csv(nyt_path)
nyt_df['tokens'] = nyt_df['text'].apply(pre_processing_by_nltk)

word2vec_model_ag = Word2Vec(sentences=ag_df['tokens'], vector_size=100, window=5, min_count=2, sg=1, workers=4)
word2vec_model_nyt = Word2Vec(sentences=nyt_df['tokens'], vector_size=100, window=5, min_count=2, sg=1, workers=4)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
# Function to compute document vectors by averaging word vectors
def document_vector(tokens,word2vec_model):
    valid_words = [word for word in tokens if word in word2vec_model.wv]
    if valid_words:
        return np.mean(word2vec_model.wv[valid_words], axis=0)
    else:
        return np.zeros(word2vec_model.vector_size)

def document_vector_ag(tokens):
    return document_vector(tokens,word2vec_model_ag)

def document_vector_nyt(tokens):
    return document_vector(tokens,word2vec_model_nyt)
    
def document_vector_glove(tokens):
    valid_words = [word for word in tokens if word in model]
    if valid_words:
        return np.mean(model[valid_words], axis=0)
    else:
        return np.zeros(model.vector_size)
    
# Compute document vectors for all documents
nyt_df['doc_vector_ag'] = nyt_df['tokens'].apply(document_vector_ag)
nyt_df['doc_vector_nyt'] = nyt_df['tokens'].apply(document_vector_nyt)
nyt_df['doc_vector_glove'] = nyt_df['tokens'].apply(document_vector_glove)

# Extract features and labels for further use
x_ag = np.vstack(nyt_df['doc_vector_ag'].values)
x_nyt = np.vstack(nyt_df['doc_vector_nyt'].values)
x_glove = np.vstack(nyt_df['doc_vector_glove'].values)

In [12]:
X1_train,X1_test_val,Y1_train,Y1_test_val = train_test_split(x_glove,y,test_size=0.2,random_state=42)
X1_test,X1_val,Y1_test,Y1_val = train_test_split(X1_test_val,Y1_test_val,test_size=0.5,random_state=42)

X2_train,X2_test_val,Y2_train,Y2_test_val = train_test_split(x_ag,y,test_size=0.2,random_state=42)
X2_test,X2_val,Y2_test,Y2_val = train_test_split(X2_test_val,Y2_test_val,test_size=0.5,random_state=42)

X3_train,X3_test_val,Y3_train,Y3_test_val = train_test_split(x_nyt,y,test_size=0.2,random_state=42)
X3_test,X3_val,Y3_test,Y3_val = train_test_split(X3_test_val,Y3_test_val,test_size=0.5,random_state=42)

In [14]:
from sklearn.linear_model import LogisticRegressionCV
CLF1 = LogisticRegressionCV(cv=5,
                           scoring='accuracy',
                           random_state=42,
                           n_jobs=-1,
                           verbose=3,
                           max_iter=300).fit(X1_train, Y1_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    6.5s remaining:    9.9s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.0s finished


In [15]:
CLF2 = LogisticRegressionCV(cv=5,
                           scoring='accuracy',
                           random_state=42,
                           n_jobs=-1,
                           verbose=3,
                           max_iter=300).fit(X2_train, Y2_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    6.4s remaining:    9.7s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    6.7s finished


In [16]:
CLF3 = LogisticRegressionCV(cv=5,
                           scoring='accuracy',
                           random_state=42,
                           n_jobs=-1,
                           verbose=3,
                           max_iter=300).fit(X3_train, Y3_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    6.5s remaining:    9.8s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    6.7s finished


In [27]:
q2_df = pd.DataFrame(columns=['Accuracy','Macro F1','Micro F1'])
q2_df.loc['Glove'] = analyze(CLF1,X1_test,Y1_test)

accuracy=0.980, macro f1=0.952, micro f1=0.980


In [28]:
q2_df.loc['Word2Vec AGNews'] = analyze(CLF2,X2_test,Y2_test)

accuracy=0.979, macro f1=0.952, micro f1=0.979


In [29]:
q2_df.loc['Word2Vec NYT'] = analyze(CLF3,X3_test,Y3_test)

accuracy=0.984, macro f1=0.961, micro f1=0.984


In [30]:
q2_df

Unnamed: 0,Accuracy,Macro F1,Micro F1
Glove,0.980035,0.952107,0.980035
Word2Vec AGNews,0.979167,0.951996,0.979167
Word2Vec NYT,0.984375,0.961177,0.984375


**Analysis:**

(a) Word2Vec trained on NYT performs the best across all metrics. This is expected since the Word2Vec model was trained specifically on the NYT dataset, which matches the domain of the test set. The word vectors are highly tailored to this particular dataset, making them more effective in capturing the nuances of the text. 
Pre-trained GloVe embeddings also perform very well, but slightly below (iii). GloVe embeddings are trained on a large, diverse corpus, which allows them to generalize well across a wide range of tasks. However, because they are not specifically tuned to the NYT dataset, they fall slightly behind the NYT-specific Word2Vec vectors in performance.
Word2Vec trained on AGNews shows slightly lower performance compared to (i) and (iii). Since the model is trained on a different domain (AGNews) than the NYT test set, the word vectors may not capture the context and language patterns as effectively for this task. Therefore, its performance is slightly lower compared to both the general-purpose GloVe embeddings and the NYT-specific Word2Vec embeddings.

(b) Disadvantages: Firstly, averaging does not preserve the word order or capture important syntactic structures, which can be crucial for understanding the meaning of a sentence.Besides, averaging treats all words equally, without accounting for the varying importance of words in a document. Rare or contextually significant words are given the same weight as common or less important ones.
Idea to Overcome:
For the second disadvantage, one idea to overcome is to use a weighted averaging approach where each word's vector is weighted by its importance in the document, such as TF-IDF. In this case, words that are more relevant to the document have a higher influence on the final vector. This can balance the contribution of frequent words versus rare, but contextually important words.

# 3.BERT

In [89]:
import torch
from torch.utils.data import DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from transformers import get_scheduler
from datasets import load_dataset

# Load pretrained tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(nyt_df.label.unique()))

Downloading vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [139]:
# Convert label from string to int
label_dic = dict()
for i,label in enumerate(nyt_df.label.unique()):
    label_dic[label] = i
nyt_df['label'] = nyt_df['label'].apply(lambda x:label_dic[x])

In [140]:
# Tokenization
from datasets import Dataset

dataset = Dataset.from_pandas(nyt_df[['text','label']])
def tokenize_function(df):
    return tokenizer(df['text'], padding='max_length', truncation=True, max_length=64)

tokenized_datasets_raw = dataset.map(tokenize_function, batched=True)

  0%|          | 0/12 [00:00<?, ?ba/s]

In [141]:
# Split Dataset into train, test, validation
from datasets import DatasetDict
train_testvalid = tokenized_datasets_raw.train_test_split(test_size=0.2)
test_valid = train_testvalid['test'].train_test_split(test_size=0.5)
tokenized_datasets = DatasetDict({
'train': train_testvalid['train'],
'test': test_valid['test'],
'valid': test_valid['train']})

In [145]:
print(tokenized_datasets['train'].features)

{'text': Value(dtype='string', id=None), 'label': Value(dtype='int64', id=None), 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None), 'token_type_ids': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None), 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}


In [None]:
# Create Dataloader for the convience of training
tokenized_datasets.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
train_dataloader = DataLoader(tokenized_datasets['train'], batch_size=16, shuffle=True)
test_dataloader = DataLoader(tokenized_datasets['test'], batch_size=16)

# use GPU if available
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

# Optimizer and scheduler setup
optimizer = AdamW(model.parameters(), lr=1e-5)
num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(name='linear', optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

# Training loop
model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}  # Move batch to device
        outputs = model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'], labels=batch['label'])
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()


In [None]:
# Evaluation function to compute accuracy, macro F1, and micro F1
from sklearn.metrics import accuracy_score, f1_score
def evaluate(model, dataloader):
    model.eval()
    true_labels, predictions = [], []

    with torch.no_grad():
        for batch in dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'], labels=batch['label'])
            logits = outputs.logits
            preds = torch.argmax(logits, dim=-1)
            predictions.extend(preds.cpu().numpy())
            true_labels.extend(batch['label'].cpu().numpy())

    accuracy = accuracy_score(true_labels, predictions)
    macro_f1 = f1_score(true_labels, predictions, average='macro')
    micro_f1 = f1_score(true_labels, predictions, average='micro')

    return accuracy, macro_f1, micro_f1

# Evaluate the model on the test set
accuracy, macro_f1, micro_f1 = evaluate(model, test_dataloader)

print(f"Accuracy: {accuracy:.3f}")
print(f"Macro F1-Score: {macro_f1:.3f}")
print(f"Micro F1-Score: {micro_f1:.3f}")