# Fine-tuned ALBERT Model for Constructiveness Detection in Steam Reviews
## *Sentiment Analysis of Videogame Reviews on the Platform “Steam” with a Focus on the Detection and Classification of <b>Constructiveness</b>*
---
### <u>NOTEBOOK **3**/5</u>: This notebook handles the fine-tuning process using the filtered, preprocessed and annotated Steam reviews. In this run, the *xlnet-base-cased* checkpoint is used.

In [None]:
# Package Installs
!pip install -U pip
!pip install pandas numpy datasets transformers accelerate scikit-learn tensorboard
# Installs TensorFlow from the NVIDIA repo
!pip install nvidia-pyindex
!pip install nvidia-tensorflow[horovod]
# PyTorch with CUDA 12.4 support
!pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124
# Specific version of protobuf (Ensures compatibility with the installed tensorflow version)
!pip uninstall -y protobuf
!pip install protobuf==3.20.*
!pip install -q wandb

In [None]:
# Imports
import pandas as pd
#from google.colab import files
import IPython
import io
import os
from datasets import Dataset, DatasetDict
import sklearn as sk
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding, pipeline, EarlyStoppingCallback
from sklearn.model_selection import train_test_split
import tensorflow as tf
from datetime import datetime
import wandb
import torch, gc
import time

In [None]:
import sys

# Flag that automatically configures the notebook depending on whether it runs
# in Google Colab (the "google.colab" module is already loaded there) or locally
RUNNING_IN_GOOGLE_COLAB = "google.colab" in sys.modules
if RUNNING_IN_GOOGLE_COLAB:
    print("Running in Google Colab")
else:
    print("Running locally")

In [None]:
# Logs into the Weights & Biases account so that the training run started
# further below (wandb.init / report_to "wandb") can be recorded there.
wandb.login()

In [None]:
# Environment variables read by the tokenizers library and by Weights & Biases
os.environ.update(
    {
        "TOKENIZERS_PARALLELISM": "false",
        "WANDB_DISABLED": "false",
        "WANDB_MODE": "online",
    }
)

In [None]:
# System Information
# Prints all sorts of system information (CPU, GPU, RAM, CUDA Installed, CUDA Version, RAM) about the google colab runtime
print("\033[1m" + "GPU Information" + "\033[0m")
print("- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ")
!nvidia-smi
print()
print("\033[1m" + "CPU Information" + "\033[0m")
print("- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ")
!cat /proc/cpuinfo
print()
print("\033[1m" + "Memory Information" + "\033[0m")
print("- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ")
!cat /proc/meminfo
print()
print("\033[1m" + "NVidia CUDA Information" + "\033[0m")
print("- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ")
!nvcc --version
print()
print("\033[1m"+ "CUDA Installation Check"+ "\033[0m")
print("- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ")

import torch
torch.cuda.is_available()
try:
  print("CUDA Installation Check Result: " + "\033[3m" + torch.cuda.get_device_name(0) + "\033[3m")
except:
  if RUNNING_IN_GOOGLE_COLAB:
    print("No GPU found. You might be connected to a CPU runtime in Google Colab.")
  else:
    print("No GPU found.")

In [None]:
# Displays a Transformer attention architecture diagram (hosted image) inline.
# NOTE(review): the notebook fine-tunes the "xlnet-base-cased" checkpoint, so this
# picture is a generic illustration rather than that model's exact architecture.
from IPython.display import HTML
url = "https://i.ibb.co/YNzJM69/Attention-diagram-transformer.webp"
HTML(f'<img src="{url}" width="1000"/>')

In [None]:
# Sets seeds for reproducibility
def set_seed(seed):
    """Seed every random number generator used in this notebook.

    Seeds Python's `random` module, NumPy's global generator and PyTorch
    (CPU and all CUDA devices), and exports `PYTHONHASHSEED`.

    Args:
        seed (int): The seed value to apply to all generators.
    """
    # Local imports keep the function self-contained; the notebook's import
    # cell does not import `random` or `numpy`.
    import random

    import numpy as np

    os.environ["PYTHONHASHSEED"] = str(seed)
    random.seed(seed)
    # NumPy only accepts seeds in [0, 2**32 - 1]; fold larger/negative values into that range
    np.random.seed(seed % (2**32))
    torch.manual_seed(seed)
    # Safe to call without a GPU: the call is silently ignored when CUDA is unavailable
    torch.cuda.manual_seed_all(seed)

In [None]:
# Imports and loads the filtered, preprocessed, training-csv to use for the fine-tuning process.
# In Google Colab the file is uploaded through the browser; locally it is read from disk.
if RUNNING_IN_GOOGLE_COLAB:
    from google.colab import files  # only importable inside Google Colab

    print("Choose .csv to upload for the fine-tuning process...")
    uploaded = files.upload()
    # files.upload() returns a mapping of file name -> file content; use the first uploaded file
    filename = next(iter(uploaded))
    annotations_df = pd.read_csv(io.BytesIO(uploaded[filename]))
else:
    annotations_df = pd.read_csv("/home/samuel/kaggle/preprocessed_annotations.csv")
annotations_df.head()

Choose .csv to upload for the fine-tuning process...


Unnamed: 0,id,game,review,author_playtime_at_review,voted_up,votes_up,votes_funny,earnesty_choice,token_count
0,1,Among Us,This game can suck my balls before I play it a...,6,False,1,0,0,22
1,2,Among Us,Very fun little party game! Even better with f...,11,True,0,0,1,24
2,3,Among Us,if you're lonely don't bother but if you're no...,40,True,2,1,0,27
3,4,Among Us,fun and anoyying,80,True,0,0,0,5
4,5,Among Us,when impostor is sus...,51,True,0,0,0,10


In [None]:
# Builds the model input by concatenating the review with its metadata into one string:
#   Review: {review}, Playtime: {author_playtime_at_review}, Voted Up: {voted_up}, Upvotes: {votes_up}, Votes Funny: {votes_funny}
# The string is stored in the "text" column; a new dataframe is then created from "text"
# and "earnesty_choice", with the latter renamed to "label".
def _format_review_row(row):
    """Render one annotated review row as the single string used for training."""
    return (
        f'Review: {row["review"]}, '
        f'Playtime: {row["author_playtime_at_review"]}, '
        f'Voted Up: {row["voted_up"]}, '
        f'Upvotes: {row["votes_up"]}, '
        f'Votes Funny: {row["votes_funny"]}'
    )


annotations_df["text"] = annotations_df.apply(_format_review_row, axis=1)
steam_reviews_dataset_df = (
    annotations_df[["text", "earnesty_choice"]]
    .rename(columns={"earnesty_choice": "label"})
)
steam_reviews_dataset_df.head()

Unnamed: 0,text,label
0,Review: This game can suck my balls before I p...,0
1,Review: Very fun little party game! Even bette...,1
2,Review: if you're lonely don't bother but if y...,0
3,"Review: fun and anoyying, Playtime: 80, Voted ...",0
4,"Review: when impostor is sus..., Playtime: 51,...",0


In [None]:
# training_df.to_csv("training.csv", index=False)
# files.download("training.csv")

In [None]:
# Splitting the dataset into 80% train, 10% dev and 10% test datasets:
# first 20% of the rows are held out, then that hold-out is cut in half.
train_df, test_dev_df = train_test_split(
    steam_reviews_dataset_df,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)
dev_df, test_df = train_test_split(
    test_dev_df,
    test_size=0.5,
    shuffle=True,
    random_state=42,
)

In [None]:
# Converts each pandas split into a Hugging Face Dataset.
# preserve_index=False prevents "from_pandas" from adding the dataframe index as an
# extra "__index_level_0__" column.
train_dataset, dev_dataset, test_dataset = (
    Dataset.from_pandas(split_df, preserve_index=False)
    for split_df in (train_df, dev_df, test_df)
)

In [None]:
#train_dataset_csv = train_df.to_csv("./kaggle/train.csv", index=False)
#dev_dataset_csv = dev_dataset.to_csv("./kaggle/dev.csv", index=False)
#test_dataset_csv = test_dataset.to_csv("./kaggle/test.csv", index=False)

In [None]:
# Bundles the three splits in a single DatasetDict so they can be processed together
steam_review_dataset_dict = DatasetDict(
    train=train_dataset,
    dev=dev_dataset,
    test=test_dataset,
)
print(steam_review_dataset_dict)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 1168
    })
    dev: Dataset({
        features: ['text', 'label'],
        num_rows: 146
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 147
    })
})


In [None]:
# Shows the first training example as a sanity check of the "text"/"label" format
steam_review_dataset_dict["train"][0]

{'text': 'Review: Nice coop game, Playtime: 19, Voted Up: True, Upvotes: 0, Votes Funny: 0',
 'label': 0}

In [None]:
# Hugging Face checkpoint to fine-tune. The same identifier is reused further below for
# the model, the output directory name and the Weights & Biases project name.
checkpoint = "xlnet-base-cased"
# Loads the tokenizer that belongs to the checkpoint
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# tokenizer.pad_token = tokenizer.eos_token  # Used for GPT2 Model



In [None]:
# Tokenize function which tokenizes the steam review text
def tokenize_function(batch, max_length=512):
    """Tokenize the "text" column of a batch of examples.

    The tokenizer of this checkpoint defines no maximum length, so
    `truncation=True` on its own is ignored (the tokenizer warns and falls back
    to no truncation). An explicit `max_length` makes the truncation effective.

    Args:
        batch: Mapping with a "text" entry holding the review strings of the batch.
        max_length (int): Maximum number of tokens kept per example.

    Returns:
        The tokenizer output for the batch.
    """
    return tokenizer(batch["text"], truncation=True, max_length=max_length)

In [None]:
# Applies tokenize_function to every split of the DatasetDict, one batch of rows at a time
tokenized_datasets = steam_review_dataset_dict.map(tokenize_function, batched=True)

Map:   0%|                                      | 0/1168 [00:00<?, ? examples/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Map: 100%|█████████████████████████| 1168/1168 [00:00<00:00, 8347.54 examples/s]
Map: 100%|███████████████████████████| 146/146 [00:00<00:00, 8071.60 examples/s]
Map: 100%|███████████████████████████| 147/147 [00:00<00:00, 5911.78 examples/s]


In [None]:
# Collator that pads the examples of a batch using the tokenizer and returns PyTorch tensors
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")

In [None]:
# Loads the pretrained checkpoint with a sequence-classification head for 2 labels.
# The head's weights are not part of the checkpoint and are newly initialized
# (see the warning printed below), so the model has to be fine-tuned before use.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
# model.config.pad_token_id = model.config.eos_token_id  # Used for GPT2 Model

Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.bias', 'logits_proj.weight', 'sequence_summary.summary.bias', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def compute_metrics(pred):
    """Compute macro-averaged precision, recall and F1 plus accuracy for an evaluation pass.

    Args:
        pred: Object exposing `label_ids` (the true labels) and `predictions`
            (per-class scores; the arg-max over the last axis is the predicted class).

    Returns:
        dict: Scores under the keys "precision", "recall", "acc" and "f1".
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    macro = {"average": "macro"}
    return {
        "precision": precision_score(y_true, y_pred, **macro),
        "recall": recall_score(y_true, y_pred, **macro),
        "acc": accuracy_score(y_true, y_pred),
        "f1": f1_score(y_true, y_pred, **macro),
    }

In [None]:
def plot_confusion_matrix(labels, preds, model_name):
    """Plot the confusion matrix of the binary classifier as an annotated heatmap.

    Args:
        labels: True class ids (0 or 1) of the evaluated examples.
        preds: Predicted class ids (0 or 1), aligned with `labels`.
        model_name (str): Name shown in the plot title.
    """
    # Imported locally because the notebook's import cell provides none of these
    # names; without them the function raised a NameError on its first call.
    import matplotlib.pyplot as plt
    import seaborn as sns
    from sklearn.metrics import confusion_matrix

    class_names = ["Class 0", "Class 1"]
    # labels=[0, 1] keeps the matrix 2x2 (matching the two tick labels) even when
    # one of the classes does not occur in `labels`/`preds`.
    cm = confusion_matrix(labels, preds, labels=[0, 1])
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names)
    plt.xlabel("Predicted")
    plt.ylabel("True")
    plt.title(f"Confusion Matrix for {model_name}")
    plt.show()

In [None]:
# Hyperparameters and output locations of the fine-tuning run
batch_size = 2
model_name = f"{checkpoint}-finetuned-steam-reviews"
log_dir = "/home/samuel/kaggle/bert_finetuning_logs/fit"
training_args = TrainingArguments(
    output_dir=f'/home/samuel/kaggle/finetuned_models/{model_name}',
    num_train_epochs=50,
    # Mixed-precision training needs a CUDA device; fall back to full precision on a
    # CPU-only runtime instead of failing when the arguments are created.
    fp16=torch.cuda.is_available(),
    seed=42,  # Seed for reproducibility
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # Evaluate and save once per epoch; both strategies have to match so that the
    # best checkpoint can be restored at the end of training.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    disable_tqdm=False,
    logging_steps=10,
    logging_dir=log_dir,
    log_level="info",
    report_to=["wandb", "tensorboard"],
)

In [None]:
# Early stopping with a patience of 10 evaluations
early_stopping = EarlyStoppingCallback(early_stopping_patience=10)

# Creates the Trainer from the training arguments: trains on the "train" split,
# evaluates on the "dev" split and reports the scores of compute_metrics
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["dev"],
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
Using auto half precision backend


In [None]:
# Used for xlnet model
# for name, param in trainer.model.named_parameters():
#    if not param.is_contiguous():
#        param.data = param.data.contiguous()

In [None]:
import shutil

# Clears any logs from previous runs.
# The logs are written to `log_dir` (an absolute path), so its parent directory - the one
# TensorBoard is pointed at - is removed. A path relative to the current working directory
# would only hit the right folder when the notebook happens to be started from there.
# ignore_errors=True mirrors `rm -rf`: a missing directory is not an error.
shutil.rmtree(os.path.dirname(log_dir), ignore_errors=True)

# Initializes a new Weights & Biases Run
current_time = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
run = wandb.init(
    name=f"steam-reviews-finetuning-run-{current_time}",
    project=f'{checkpoint}-finetuned-steam-reviews',
    sync_tensorboard=True
)

# Runs the finetuning/training process
trainer.train()

In [None]:
# Saves the model (config, weights and tokenizer files) to the Trainer's output directory
trainer.save_model()

Saving model checkpoint to /home/samuel/kaggle/finetuned_models/xlnet-base-cased-finetuned-steam-reviews
Configuration saved in /home/samuel/kaggle/finetuned_models/xlnet-base-cased-finetuned-steam-reviews/config.json
Model weights saved in /home/samuel/kaggle/finetuned_models/xlnet-base-cased-finetuned-steam-reviews/model.safetensors
tokenizer config file saved in /home/samuel/kaggle/finetuned_models/xlnet-base-cased-finetuned-steam-reviews/tokenizer_config.json
Special tokens file saved in /home/samuel/kaggle/finetuned_models/xlnet-base-cased-finetuned-steam-reviews/special_tokens_map.json


In [None]:
# Evaluates the model on the held-out test split and closes the Weights & Biases run.
# The metrics are kept in a variable and placed last in the cell so that they are
# displayed; as a bare statement in the middle of the cell they were silently discarded.
test_metrics = trainer.evaluate(eval_dataset=tokenized_datasets["test"])
run.finish()
test_metrics

In [None]:
# Loads the TensorBoard Jupyter extension and opens the dashboard on the training logs
# Tensorboard Dashboard also available at http://localhost:6006/
# (Sometimes only shows up at localhost URL, not in Jupyter Notebook)
%load_ext tensorboard
%tensorboard --logdir /home/samuel/kaggle/bert_finetuning_logs/