<a href="https://colab.research.google.com/github/cathieG/CSCI420/blob/main/Movie_Review.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# Install & Import Libraries

!pip install --upgrade transformers datasets scikit-learn -q
!pip install evaluate
import torch
from transformers import AlbertTokenizer, AlbertForSequenceClassification, TrainingArguments, Trainer
from datasets import load_dataset
import evaluate
from sklearn.metrics import accuracy_score, f1_score

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [4]:
# Load Tokenizer & Model
# A single checkpoint name drives both objects so they always match.
# num_labels=2 attaches a two-way classification head to the checkpoint.

model_name = "albert-base-v2"
model = AlbertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
)
tokenizer = AlbertTokenizer.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/47.4M [00:00<?, ?B/s]

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
# Load Dataset
# Fetches every split published under the "imdb" dataset name; only the
# "train" split is used by the cells below.

dataset = load_dataset(path="imdb")

README.md:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [7]:
# Tokenize + Format Dataset

def tokenize_function(examples, max_length=512):
    """Tokenize a batch of reviews into fixed-length model inputs.

    Args:
        examples: Batch mapping passed in by ``Dataset.map(batched=True)``;
            only the ``"text"`` column is read.
        max_length: Length every sequence is truncated/padded to. Defaults to
            512 (assumed to match the checkpoint's maximum input length —
            confirm if the checkpoint changes).

    Returns:
        The tokenizer output for the batch.

    Why ``padding="max_length"`` instead of ``padding=True``: ``True`` pads only
    to the longest sequence in whichever chunk ``map`` hands to this function,
    so rows coming from different chunks can end up with different lengths.
    Later cells batch rows with default collation, which needs equal-length
    rows. Padding to a fixed length removes that dependency on chunk contents.
    """
    return tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=max_length,
    )

# Tokenize every split in batches, then drop the raw "text" column since the
# model only consumes the tokenizer's outputs and the label.
tokenized_dataset = (
    dataset
    .map(tokenize_function, batched=True)
    .remove_columns(["text"])
)
# Return PyTorch tensors when rows are indexed.
tokenized_dataset.set_format("torch")

In [10]:
# Hold out 20% of the tokenized "train" split; the seed fixes the shuffle.
# Note that `test_dataset` is carved from the training split, not from the
# dataset's own "test" split.
split_dataset = tokenized_dataset["train"].train_test_split(
    test_size=0.2,
    seed=42,
)
train_dataset, test_dataset = split_dataset["train"], split_dataset["test"]

In [13]:
from collections import Counter

# Read the label columns straight from the split datasets. These names were
# previously undefined in the notebook, so the cell failed on a fresh kernel.
train_labels = train_dataset["label"]
test_labels = test_dataset["label"]

# Convert labels to Python ints and count
# (int() makes the values hashable Counter keys whether they are tensors or ints).
train_counts = Counter(int(label) for label in train_labels)
test_counts = Counter(int(label) for label in test_labels)

# Print distributions
print(f"Training label distribution: {train_counts}")
print(f"Test label distribution: {test_counts}")


Training label distribution: Counter({0: 10006, 1: 9994})
Test label distribution: Counter({1: 2506, 0: 2494})


In [14]:
import torch
from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score, f1_score

# Baseline: score the model on the held-out split BEFORE fine-tuning, so the
# fine-tuned numbers further down have a reference point.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# Make evaluation mode explicit so dropout is disabled no matter which mode an
# earlier (or re-run) cell left the model in.
model.eval()

# Expose only the columns the forward pass and the metrics need, as tensors.
test_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

batch_size = 16
eval_dataloader = DataLoader(test_dataset, batch_size=batch_size)

all_preds = []
all_labels = []

# No gradients are needed for scoring; skipping them saves memory and time.
with torch.no_grad():
    for batch in eval_dataloader:
        inputs = {
            'input_ids': batch['input_ids'].to(device),
            'attention_mask': batch['attention_mask'].to(device)
        }

        outputs = model(**inputs)

        # Predicted class = index of the largest logit.
        predicted_labels = torch.argmax(outputs.logits, dim=-1).cpu().numpy()
        all_preds.extend(predicted_labels)
        all_labels.extend(batch['label'].cpu().numpy())

baseline_accuracy = accuracy_score(all_labels, all_preds)
baseline_f1 = f1_score(all_labels, all_preds)

print(f" Baseline Accuracy: {baseline_accuracy:.4f}")
print(f" Baseline F1-Score: {baseline_f1:.4f}")

 Baseline Accuracy: 0.5014
 Baseline F1-Score: 0.0281


In [15]:
from transformers import TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, f1_score

# Metric callback handed to the Trainer
def compute_metrics(eval_pred):
    """Turn raw evaluation outputs into accuracy and F1.

    Args:
        eval_pred: Pair that unpacks to ``(predictions, labels)``; the
            predicted class is the argmax over the last axis of
            ``predictions``.

    Returns:
        Dict with keys ``"accuracy"`` and ``"f1"``.
    """
    scores, references = eval_pred
    predicted_classes = scores.argmax(axis=-1)

    results = {}
    results["accuracy"] = accuracy_score(references, predicted_classes)
    results["f1"] = f1_score(references, predicted_classes)
    return results

# Fine-tuning configuration, grouped by concern.
training_args = TrainingArguments(
    # Where checkpoints and logs are written; no external experiment tracker.
    output_dir="./results",
    logging_dir="./logs",
    logging_steps=50,
    report_to="none",
    # Optimisation hyper-parameters.
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate and checkpoint once per epoch, then restore the checkpoint with
    # the best F1 when training finishes.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
)

# Wire the model, data and metric callback into a Trainer.
# The held-out split is evaluated at the cadence set in `training_args`.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=test_dataset,
    train_dataset=train_dataset,
)

# Run fine-tuning; as the cell's last expression, the training summary is
# displayed as the cell output.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.242,0.31074,0.8836,0.870782
2,0.1444,0.235155,0.9392,0.939321
3,0.1148,0.291182,0.941,0.94044


TrainOutput(global_step=3750, training_loss=0.17958637606302896, metrics={'train_runtime': 1346.4783, 'train_samples_per_second': 44.561, 'train_steps_per_second': 2.785, 'total_flos': 1433886105600000.0, 'train_loss': 0.17958637606302896, 'epoch': 3.0})

In [16]:
from transformers import AlbertTokenizer

# Write the model and its tokenizer into the same folder so the directory can
# be reloaded as a unit.
save_dir = "./sentiment_model"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

('./sentiment_model/tokenizer_config.json',
 './sentiment_model/special_tokens_map.json',
 './sentiment_model/spiece.model',
 './sentiment_model/added_tokens.json')

In [17]:
import shutil
from google.colab import files

# Bundle the saved model directory into sentiment_model.zip in the working dir.
shutil.make_archive(
    base_name="sentiment_model",
    format="zip",
    root_dir="./sentiment_model",
)

# Ask the Colab front end to download the archive to the local machine.
files.download("sentiment_model.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [21]:
# We are asked to show training instances and the data used:
from transformers import AlbertTokenizer
from random import sample

# Decode a few random training rows back to text for inspection.
# The `tokenizer` loaded at the top of the notebook is reused: it is the same
# "albert-base-v2" tokenizer that produced `train_dataset`, so reloading it
# here would be redundant.

# Sample row indices rather than list(train_dataset): converting the whole
# split to a Python list just to pick 5 rows is slow and memory-hungry.
for row_index in sample(range(len(train_dataset)), 5):
    example = train_dataset[row_index]
    input_ids = example['input_ids']
    label = example['label']

    # Rows may come back as tensors; decode() is given a plain list of ids.
    if hasattr(input_ids, 'tolist'):
        input_ids = input_ids.tolist()

    decoded_text = tokenizer.decode(input_ids, skip_special_tokens=True)

    print("Text:", decoded_text)
    # int() makes the comparison a plain Python bool for tensor labels too.
    print("Label:", "Positive" if int(label) == 1 else "Negative")
    print("-----")


Text: birthday girl doesn't know what it wants to be - is it a comedy,, is it a drama...it just doesn't know. what could have been a very funny or touching film ends up in no-man's land. the premise is original enough to have warranted a script full of interesting scenarios but hardly delivers any and ends up petering out. this is a real shame if you look at the cast - it's very solid all the way through but they don't get the chance to shine. very disappointing.
Label: Negative
-----
Text: it's always nice to see angela bassett getting to do a role that she can really sink her teeth into. she is at times intense, funny and even sexy in her role as lena, a "colored" woman forced to make a home on a desolate mudbank just outside of cape town, south africa. danny glover is also good in a not entirely sympathetic role as her partner, boesman. willie jonah gives a finely nuanced performance as the stranger that discovers boesman and lena's new living area. it's not often that you get a cha