In [None]:
from google.colab import drive

# Mount Google Drive at /content/gdrive; the dataset CSV and the saved model
# checkpoint used by later cells are read from paths under this mount point.
drive.mount("/content/gdrive")

Mounted at /content/gdrive


In [None]:
!pip install transformers
!pip install -U -q PyDrive
!pip install sentencepiece
!pip install nlpaug

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.29.2-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m49.4 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m23.8 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m80.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.29.2
Looking in in

### Imports

In [None]:
# Import required libraries
import torch
import pandas as pd
import numpy as np
import re
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, f1_score
import random
from tqdm.auto import tqdm
import random
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw

In [None]:
import torch

# This notebook needs a CUDA runtime; stop right here if none is attached.
assert torch.cuda.is_available()

# Handle that later cells use to place the model and tensors on the GPU.
device = torch.device("cuda")

# Report which accelerator(s) the runtime exposes.
n_gpu = torch.cuda.device_count()
device_name = torch.cuda.get_device_name()
print(f"Found device: {device_name}, n_gpu: {n_gpu}")


Found device: Tesla T4, n_gpu: 1


In [None]:
def set_seed(seed):
  """Seed the Python, NumPy and PyTorch random generators with one value.

  Args:
    seed: Integer seed handed to `random`, `numpy.random` and `torch`
      (and to every CUDA device when CUDA is available).
  """
  for seeder in (random.seed, np.random.seed, torch.manual_seed):
    seeder(seed)
  if not torch.cuda.is_available():
    return
  torch.cuda.manual_seed_all(seed)

# Fix the seed once for the whole notebook.
set_seed(42)

In [None]:
!pip install transformers
!pip install -U -q PyDrive

from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
# Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)
print('success!')

import os
import zipfile

# Download helper functions file
helper_file = drive.CreateFile({'id': '16HW-z9Y1tM3gZ_vFpJAuwUDohz91Aac-'})
helper_file.GetContentFile('helpers.py')
print('helper file downloaded! (helpers.py)')



Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
success!
helper file downloaded! (helpers.py)


### Data download and preprocessing

In [None]:
import re
import string
import nltk
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')

# Built once when the cell runs rather than on every clean_text() call:
# the function is applied to every review, and neither object depends on the input.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))


def clean_text(text):
    """Normalise one review for tokenisation.

    Steps, in order: lowercase, strip ASCII punctuation, strip digits,
    drop English stop words, collapse repeated spaces.

    Punctuation is removed before stop-word filtering, so a contraction such
    as "don't" becomes "dont" and is kept (visible in the recorded outputs).

    Args:
        text (str): The raw review text.

    Returns:
        str: The cleaned text, tokens separated by single spaces.
    """
    # Convert all text to lowercase
    text = text.lower()

    # Remove punctuation
    text = text.translate(_PUNCTUATION_TABLE)

    # Remove numbers
    text = re.sub(r'\d+', '', text)

    # Remove stopwords
    tokens = nltk.word_tokenize(text)
    tokens = [token for token in tokens if token not in _ENGLISH_STOPWORDS]
    text = ' '.join(tokens)

    # Remove extra whitespaces
    text = re.sub(' +', ' ', text)

    return text

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
from helpers import tokenize_and_format, flat_accuracy

# How much of the dataset to use
data_size = 0.2

# Numerical encoding of the sentiment labels
sentiment_map = {'positive': 0, 'negative': 1}

# Read the reviews, keep only the text/label columns and take a reproducible sample.
df = (
    pd.read_csv('/content/gdrive/MyDrive/NLP Project/data/IMDB Dataset.csv')
    .rename(columns={'review':'text'})
    [['text', 'sentiment']]
    .sample(frac=data_size, random_state=42)
)

# Normalise the review text and encode the labels as integers.
df['text'] = df['text'].apply(clean_text)
df['sentiment'] = df['sentiment'].replace(sentiment_map)

# Drop reviews whose text is empty (or whitespace only) after cleaning.
empty_rows = df[df['text'].apply(lambda x: isinstance(x, str) and len(x.strip()) == 0)]
df = df.drop(empty_rows.index)

### Train/val/test splits

In [None]:
def _split_70_20_10(frame):
  """Split one frame into (train, holdout, val, test) with fixed seeds.

  The holdout is 30% of `frame`; 33% of that holdout becomes the test part
  and the remainder the validation part.
  """
  train_part, holdout = train_test_split(frame, test_size=0.3, random_state=42)
  val_part, test_part = train_test_split(holdout, test_size=0.33, random_state=42)
  return train_part, holdout, val_part, test_part


# Separate the dataset into two subsets based on the sentiment labels,
# shuffling each subset reproducibly.
positive_reviews = df[df['sentiment'] == sentiment_map['positive']].sample(frac=1, random_state=42)
negative_reviews = df[df['sentiment'] == sentiment_map['negative']].sample(frac=1, random_state=42)

# Split each class separately so every split keeps the class balance (roughly 70/20/10).
train_pos, val_pos_test_pos, val_pos, test_pos = _split_70_20_10(positive_reviews)
train_neg, val_neg_test_neg, val_neg, test_neg = _split_70_20_10(negative_reviews)

# Recombine the per-class parts into the final training, validation and test sets.
train_set = pd.concat([train_pos, train_neg], ignore_index=True)
val_set = pd.concat([val_pos, val_neg], ignore_index=True)
test_set = pd.concat([test_pos, test_neg], ignore_index=True)

In [None]:
# Tokenize the training split (test and validation are handled in the next cells).
texts, labels = train_set.text.values, train_set.sentiment.values

# tokenize_and_format() comes from helpers.py; presumably it returns one id
# tensor and one attention-mask tensor per input text - confirm in helpers.py.
input_ids, attention_masks = tokenize_and_format(texts)

# Concatenate the per-example tensors along dim 0 and wrap the labels as a tensor.
labels_train = torch.tensor(labels)
attention_masks_train = torch.cat(attention_masks, dim=0)
input_ids_train = torch.cat(input_ids, dim=0)

# Show the first review next to its token ids as a sanity check.
first_text, first_ids = texts[0], input_ids[0]
print('Original: ', first_text)
print('Token IDs:', first_ids)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Original:  saw mill valley film festival hard believe ms bloms directorial debut beautifully paced performed large cast characters could anne tyler novel ie layered back story potential futures false notes surprising bursts humor amidst selfinflicted anxiety real earthshattering dilemmas saw best youth recognize well drawn characters small moments even story moves briskly along really hope gets distribution usa live fairly sophisticated film market yet rarely get swedish films kind
Token IDs: tensor([[  101,  2387,  4971,  3028,  2143,  2782,  2524,  2903,  5796,  1038,
         21297,  2015, 21635,  2834, 17950, 13823,  2864,  2312,  3459,  3494,
          2071,  4776,  7482,  3117, 29464, 21323,  2067,  2466,  4022, 17795,
          6270,  3964, 11341, 19239,  8562, 17171,  2969,  2378, 29301,  2098,
         10089,  2613,  3011,  7377, 19567, 21883,  2015,  2387,  2190,  3360,
          6807,  2092,  4567,  3494,  2235,  5312,  2130,  2466,  5829, 28022,
          2135,  2247,  2428

In [None]:
# Tokenize the test split.
texts, labels = test_set.text.values, test_set.sentiment.values

# tokenize_and_format() comes from helpers.py; presumably it returns one id
# tensor and one attention-mask tensor per input text - confirm in helpers.py.
input_ids, attention_masks = tokenize_and_format(texts)

# Concatenate the per-example tensors along dim 0 and wrap the labels as a tensor.
labels_test = torch.tensor(labels)
attention_masks_test = torch.cat(attention_masks, dim=0)
input_ids_test = torch.cat(input_ids, dim=0)

# Show the first review next to its token ids as a sanity check.
first_text, first_ids = texts[0], input_ids[0]
print('Original: ', first_text)
print('Token IDs:', first_ids)

Original:  one movies dont require brain thinking funny time pass forgot next hour really surprised john abrahams acting usually playing gangster like character emotionless faceso playing complete opposite successfullyby managing shine amongst comic geniuses paresh rawal akshaye kumar also quite surprised akshayes girls roles dont require much talent mostly moaning akshayes dissapearenceto girls surprised managed establish actual persona could differentiate good thing also majority songs goodit colourful fun boring sunday evening sure lighten mood
Token IDs: tensor([[  101,  2028,  5691,  2123,  2102,  5478,  4167,  3241,  6057,  2051,
          3413,  9471,  2279,  3178,  2428,  4527,  2198,  8181,  2015,  3772,
          2788,  2652, 20067,  2066,  2839,  7603,  3238,  5344,  2080,  2652,
          3143,  4500,  5147,  3762,  6605, 12342,  5921,  5021, 11067,  2229,
         11968,  9953,  6315,  2389, 17712,  7377,  6672,  9600,  2036,  3243,
          4527, 17712,  7377, 23147,  30

In [None]:
# Tokenize the validation split.
texts, labels = val_set.text.values, val_set.sentiment.values

# tokenize_and_format() comes from helpers.py; presumably it returns one id
# tensor and one attention-mask tensor per input text - confirm in helpers.py.
input_ids, attention_masks = tokenize_and_format(texts)

# Concatenate the per-example tensors along dim 0 and wrap the labels as a tensor.
labels_val = torch.tensor(labels)
attention_masks_val = torch.cat(attention_masks, dim=0)
input_ids_val = torch.cat(input_ids, dim=0)

# Show the first review next to its token ids as a sanity check.
first_text, first_ids = texts[0], input_ids[0]
print('Original: ', first_text)
print('Token IDs:', first_ids)

Original:  govind nihalanis directorial venture vijay tendulkars novel brilliant om puri plays inspector velankar forced protect underworld rama shetty played brilliantly sadahiv amrapurkar govind nihlans talked movie good classic film smita patil plays female lead opposite om puri naseeruddin shah brilliant cameo role although sadashiv amrapurkar scenes movie dominates movie sadashiv amrapurkars acting debutom puri national award film best actor filmfare award winner best filmstorysupporting actorsadashiv amrapurkar
Token IDs: tensor([[  101, 18079, 22254,  9152, 19531,  8977, 21635,  6957, 17027,  7166,
          5313,  6673,  2015,  3117,  8235, 18168, 16405,  3089,  3248,  7742,
          2310,  5802,  6673,  3140,  4047, 13607, 14115,  2016, 15353,  2209,
          8235,  2135,  6517,  4430, 12848,  2572,  2527,  5311,  6673, 18079,
         22254,  9152,  7317,  6962,  5720,  3185,  2204,  4438,  2143, 15488,
          6590,  6986,  4014,  3248,  2931,  2599,  4500, 18168, 16405,

In [None]:
# Record the size of the full frame and of each split; later cells reuse the counts.
total = len(df)
num_train, num_val, num_test = (len(split) for split in (train_set, val_set, test_set))

print('Train Set Size: ', num_train)
print('Validation Set Size: ', num_val)
print('Test Set Size: ', num_test)

Train Set Size:  6999
Validation Set Size:  2010
Test Set Size:  991


In [None]:
# Build each dataset as a list of (input_ids, attention_mask, label) 3-tuples,
# one tuple per example, so a DataLoader can batch them directly.
train_dataset = [
  (input_ids_train[i], attention_masks_train[i], labels_train[i])
  for i in range(num_train)
]

val_dataset = [
  (input_ids_val[i], attention_masks_val[i], labels_val[i])
  for i in range(num_val)
]

test_dataset = [
  (input_ids_test[i], attention_masks_test[i], labels_test[i])
  for i in range(num_test)
]


### Training

In [None]:
from transformers import BertForSequenceClassification, AdamW, BertConfig

# Training hyper-parameters used to size the learning-rate schedule below.
# They mirror the values used by the fine-tuning loop (batch size 32, 2 epochs).
batch_size = 32
num_epochs = 2

# Define the BERT model for sequence classification
# NOTE(review): the model definition was missing from this cell, so
# `model.cuda()` raised NameError on a fresh kernel. It is reconstructed here
# from the recorded output, which names the bert-base-uncased checkpoint;
# confirm whether the original passed any further from_pretrained options.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2,  # binary sentiment: 0 = positive, 1 = negative
)

# Tell pytorch to run this model on the GPU.
model.cuda()

# Set the optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

# The scheduler is stepped once per batch, so its length must be
# (batches per epoch) * (epochs). Counting examples instead of batches made
# the schedule far too long, so the learning rate barely decayed.
steps_per_epoch = (len(train_dataset) + batch_size - 1) // batch_size
total_steps = steps_per_epoch * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)


Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [None]:
# Fine-tuning loop.
# It is switched off by default because the evaluation cells load a previously
# saved checkpoint; set RUN_FINETUNING = True to retrain and overwrite it.
# (Previously the loop was disabled by wrapping it in a string literal.)
RUN_FINETUNING = False

if RUN_FINETUNING:
  import os

  # Where the best checkpoint (lowest validation loss) is written.
  path = '/content/gdrive/MyDrive/NLP Project/models/BERT_SA_clean100'
  os.makedirs(path, exist_ok=True)  # torch.save does not create directories

  model.to(device)
  train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)
  val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=32, shuffle=False)
  best_val_loss = float("inf")
  num_epochs = 2
  train_loss = []  # mean training loss per epoch
  val_loss = []    # mean validation loss per epoch

  for epoch in range(num_epochs):
    # Train the model
    model.train()
    epoch_loss = 0

    train_loop = tqdm(train_loader, desc=f'Training Epoch {epoch+1}', leave=True)

    for batch in train_loop:
      inputs = {'input_ids': batch[0].to(device),
                'attention_mask': batch[1].to(device),
                'labels': batch[2].to(device)}
      optimizer.zero_grad()
      outputs = model(**inputs)
      loss = outputs[0]  # with labels supplied, the first output is the loss

      loss.backward()
      optimizer.step()
      scheduler.step()  # the schedule advances once per batch

      epoch_loss += loss.item()
      train_loop.set_postfix(loss=loss.item())

    epoch_loss /= len(train_loader)
    train_loss.append(epoch_loss)

    # Evaluate the model on the validation set
    model.eval()
    val_preds = []
    val_labels = []
    epoch_val_loss = 0

    with torch.no_grad():
      for batch in val_loader:
        inputs = {'input_ids': batch[0].to(device),
                  'attention_mask': batch[1].to(device),
                  'labels': batch[2].to(device)}

        outputs = model(**inputs)
        loss = outputs[0]
        epoch_val_loss += loss.item()

        logits = outputs[1]
        preds = torch.argmax(logits, axis=1)
        val_preds.extend(preds.cpu().numpy())
        val_labels.extend(batch[2].cpu().numpy())

    epoch_val_loss /= len(val_loader)
    val_loss.append(epoch_val_loss)  # previously this list was never filled

    # Keep only the checkpoint with the lowest validation loss so far.
    if epoch_val_loss < best_val_loss:
      best_val_loss = epoch_val_loss
      torch.save(model.state_dict(), path+'/model_parameters.pth')

    # Compute the evaluation metrics
    val_accuracy = accuracy_score(val_labels, val_preds)
    val_report = classification_report(val_labels, val_preds, target_names=['positive', 'negative'])

    # Print the results for the current epoch.
    # epoch_loss is already the per-batch mean; it used to be divided by
    # len(train_loader) a second time here, under-reporting the training loss.
    print('Epoch:', epoch+1, ', Training Loss:', epoch_loss, ', Validation Loss:', epoch_val_loss, ', Validation Accuracy:', val_accuracy)
    print('Validation Classification Report:')
    print(val_report)

'# Define the training loop\nmodel.to(device)\ntrain_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)\nval_loader = torch.utils.data.DataLoader(val_dataset, batch_size=32, shuffle=False)\nbest_val_loss = float("inf")\nnum_epochs = 2\ntrain_loss = []\nval_loss = []\nfor epoch in range(num_epochs):\n  # Train the model\n  model.train()\n  epoch_loss = 0\n\n  train_loop = tqdm(train_loader, desc=f\'Training Epoch {epoch+1}\', leave=True)\n\n  for batch in train_loop:\n      \n    inputs = {\'input_ids\': batch[0].to(device),\n              \'attention_mask\': batch[1].to(device),\n              \'labels\': batch[2].to(device)}\n    optimizer.zero_grad()\n    outputs = model(**inputs)\n    loss = outputs[0]\n\n    loss.backward()\n    optimizer.step()\n    scheduler.step()\n\n    epoch_loss += loss.item()\n    train_loop.set_postfix(loss=loss.item())\n\n  epoch_loss /= len(train_loader)\n  train_loss.append(epoch_loss)\n\n  # Evaluate the model on the valida

## Evaluation

In [None]:
#Load model for evaluations, comment for finetuning

path = '/content/gdrive/MyDrive/NLP Project/models/BERT_SA_clean100'
# map_location remaps the stored tensors onto the active device, so the
# checkpoint also loads when it was saved from a different device.
model.load_state_dict(torch.load(path+'/model_parameters.pth', map_location=device))

<All keys matched successfully>

In [None]:
# Move input tensors to the same device as the model
input_ids_test = input_ids_test.to(device)
attention_masks_test = attention_masks_test.to(device)

# Set model to evaluation mode
model.eval()

# Generate predictions
with torch.no_grad():
    outputs = model(input_ids=input_ids_test, attention_mask=attention_masks_test)
    logits = outputs.logits

# Apply softmax to obtain probabilities
probs = torch.softmax(logits, dim=1)
preds = torch.argmax(probs, dim=1)

# Move predictions and labels back to CPU for evaluation.
# The labels go into a separate name: overwriting `labels_test` with a NumPy
# array made this cell fail on re-run (arrays have no `.to(device)`).
preds = preds.detach().cpu().numpy()
labels_test_np = labels_test.cpu().numpy()

# Calculate accuracy
accuracy = accuracy_score(labels_test_np, preds)
print(f'Test Accuracy: {accuracy:.4f}')

# Calculate F1 score
f1 = f1_score(labels_test_np, preds, average='weighted')
print(f'F1 Score: {f1:.4f}')











Test Accuracy: 0.8032
F1 Score: 0.8032


### Evaluation on Noisy Data

In [None]:
# Noise funcs

# Edit operations that can be requested from the random character-level and
# word-level augmenters, respectively.
char_action = ['insert', 'substitute', 'delete', 'swap']
word_action = ['substitute', 'delete', 'swap']


def get_action(type):
  """Pick a random edit action for the given augmentation level.

  Args:
    type: "char" to draw from `char_action`, "word" to draw from `word_action`.

  Returns:
    One action name from the matching list, or None for any other `type`.
  """
  if type=="char":
    pool = char_action
  elif type=="word":
    pool = word_action
  else:
    return None
  return random.choice(pool)


def augment_tweet(tweet, p=0.7):
    """
    Augment a tweet with character-level or word-level noise.

    Exactly one augmenter is applied: with probability `p` a randomly chosen
    character-level augmenter, otherwise a randomly chosen word-level one.

    Args:
        tweet (str): The original tweet.
        p (float): The probability of applying the char level augmentation.

    Returns:
        str: The augmented tweet.
    """
    # Draw the random edit actions up front, in the same order as before
    # (char first, then word), so the sequence of `random` calls is unchanged.
    char_edit = get_action("char")
    word_edit = get_action("word")

    # Factories instead of instances: only the augmenter that is actually
    # selected gets constructed, rather than all seven on every call.
    char_augmenters = [
        lambda: nac.OcrAug(),
        lambda: nac.KeyboardAug(aug_char_p=0.2, aug_word_p=0.2, include_special_char=False),
        lambda: nac.RandomCharAug(action=char_edit, aug_char_p=0.1, aug_word_p=0.1),
    ]
    word_augmenters = [
        lambda: naw.SpellingAug(),
        lambda: naw.SplitAug(),
        lambda: naw.SynonymAug(),
        lambda: naw.RandomWordAug(aug_p=0.2, action=word_edit),
    ]

    # Randomly apply a character-level or word-level augmentation with probability p
    pool = char_augmenters if random.random() < p else word_augmenters
    aug = random.choice(pool)()
    augmented_tweet = aug.augment(tweet)

    # Recent nlpaug releases return a list of strings from augment() even for
    # a single input; unwrap it so callers always get the documented str.
    if isinstance(augmented_tweet, (list, tuple)):
        augmented_tweet = augmented_tweet[0] if augmented_tweet else tweet

    return augmented_tweet

def add_noise(df, augmentation_percentage, task, random_state=None):
  """Inject noise into a random subset of the rows of `df`.

  The frame is modified in place and also returned.

  Args:
    df: DataFrame with a 'text' column.
    augmentation_percentage: Fraction (0-1) of rows whose text is augmented.
    task: "sentiment_analysis" applies augment_tweet() to the sampled rows;
      "question_answering" is not implemented yet and returns `df` unchanged.
    random_state: Optional seed forwarded to `DataFrame.sample` so the choice
      of rows is reproducible. The default (None) keeps the previous behavior.

  Returns:
    The same DataFrame object that was passed in.

  Raises:
    ValueError: If `task` is not one of the supported names (previously the
      function silently returned None in that case).
  """
  if task=="sentiment_analysis":
    # Sample the requested fraction of rows
    augment_indices = df.sample(frac=augmentation_percentage, random_state=random_state).index

    # Apply the augment_tweet function to each tweet in the sampled rows
    for index in augment_indices:
        df.loc[index, 'text'] = augment_tweet(df.loc[index, 'text'])

    return df

  if task=="question_answering":

    # TODO - noise functions for QA

    return df

  raise ValueError(
      f"Unknown task {task!r}; expected 'sentiment_analysis' or 'question_answering'"
  )

In [None]:
import random

import torch
from sklearn.metrics import f1_score

# Draw the fraction of rows to corrupt (between 5% and 15%) reproducibly.
random.seed(42)
random_noise = random.uniform(0.05, 0.15)
print(random_noise)

# Download data for Testing
df_test = pd.read_csv('/content/gdrive/MyDrive/NLP Project/data/IMDB Dataset.csv')
df_test = df_test.rename(columns={'review':'text'})
df_test = df_test[['text', 'sentiment']]

# How much of the dataset to use
data_size = 0.2
df_test = df_test.sample(frac=data_size, random_state=42)

df_test['text'] = df_test['text'].apply(clean_text)

# Convert the sentiment labels into numerical values
sentiment_map = {'positive': 0, 'negative': 1}
df_test['sentiment'] = df_test['sentiment'].replace(sentiment_map)

# Find and delete any empty rows
empty_rows = df_test[df_test['text'].apply(lambda x: isinstance(x, str) and len(x.strip()) == 0)]
df_test.drop(empty_rows.index, inplace=True)

# Corrupt a random subset of the cleaned reviews before splitting.
df_test = add_noise(df_test, augmentation_percentage=random_noise, task="sentiment_analysis")

# Separate the dataset into two subsets based on the sentiment labels
positive_reviews = df_test[df_test['sentiment'] == sentiment_map['positive']]
negative_reviews = df_test[df_test['sentiment'] == sentiment_map['negative']]

# Shuffle each of the two subsets randomly
positive_reviews = positive_reviews.sample(frac=1, random_state=42)
negative_reviews = negative_reviews.sample(frac=1, random_state=42)

print(len(positive_reviews), len(negative_reviews))
# Divide each subset into training, validation, and test sets with a 70/20/10 ratio
train_pos, val_pos_test_pos = train_test_split(positive_reviews, test_size=0.3, random_state=42)
val_pos, test_pos = train_test_split(val_pos_test_pos, test_size=0.33, random_state=42)

train_neg, val_neg_test_neg = train_test_split(negative_reviews, test_size=0.3, random_state=42)
val_neg, test_neg = train_test_split(val_neg_test_neg, test_size=0.33, random_state=42)

# Merge the corresponding subsets from each sentiment back together to form the final training, validation, and test sets
train_set = pd.concat([train_pos, train_neg], ignore_index=True)
val_set = pd.concat([val_pos, val_neg], ignore_index=True)
test_set = pd.concat([test_pos, test_neg], ignore_index=True)

# For test

texts = test_set.text.values
labels = test_set.sentiment.values

### tokenize_and_format() is a helper function provided in helpers.py ###
input_ids, attention_masks = tokenize_and_format(texts)

# Convert the lists into tensors.
input_ids_test = torch.cat(input_ids, dim=0)
attention_masks_test = torch.cat(attention_masks, dim=0)
labels_test = torch.tensor(labels)

# Size the dataset from the split built in this cell rather than from the
# `num_test` left behind by an earlier cell.
test_dataset=[]
for i in range(len(test_set)):
  test_dataset.append((input_ids_test[i], attention_masks_test[i], labels_test[i]))

# Specify device
# NOTE: this rebinds the notebook-wide `device` to the CPU, so the model and
# all tensors below run on the CPU.
device = torch.device('cpu')

# Load model for evaluations, comment for finetuning.
# (A pasted duplicate of this section, which set a garbage `path`, loaded the
# checkpoint without map_location and ran the forward pass twice, was removed.)
path = '/content/gdrive/MyDrive/NLP Project/models/BERT_SA_clean100'
model.load_state_dict(torch.load(path+'/model_parameters.pth', map_location=device))

model.to(device)
input_ids_test = input_ids_test.to(device)
attention_masks_test = attention_masks_test.to(device)
labels_test = labels_test.to(device)

# Set model to evaluation mode
model.eval()

# Generate predictions
with torch.no_grad():
    outputs = model(input_ids=input_ids_test, attention_mask=attention_masks_test)
    logits = outputs.logits

# Apply softmax to obtain probabilities
probs = torch.softmax(logits, dim=1)
preds = torch.argmax(probs, dim=1)

# Move predictions and labels back to CPU for evaluation
preds = preds.detach().cpu().numpy()
labels_test = labels_test.cpu().numpy()

# Calculate accuracy
accuracy = (preds == labels_test).mean()
print(f'Test Accuracy: {accuracy:.4f}')

# Calculate F1 score
f1 = f1_score(labels_test, preds, average='weighted')
print(f'F1 Score: {f1:.4f}')

0.11394267984578837
5039 4961
Test Accuracy: 0.7760
F1 Score: 0.7753
