In [None]:
# Install required libraries
%pip install datasets
%pip install huggingface
%pip install evaluate

In [None]:
# Import required libraries

import os
import pandas as pd
import evaluate

from datasets import load_dataset
from transformers import GPT2Tokenizer
from transformers import GPT2ForSequenceClassification
from transformers import TrainingArguments, Trainer
import numpy as np

In [None]:
# Turn off the Weights & Biases (WandB) integration, which is only used for logging
os.environ.update({"WANDB_DISABLED": "true"})

In [None]:
# Load the tweet sentiment dataset from the Hugging Face Hub
dataset = load_dataset("mteb/tweet_sentiment_extraction")

# Convert the train split to a pandas DataFrame for inspection, using the
# dataset's own conversion instead of iterating it row by row from Python
df = dataset["train"].to_pandas()

In [None]:
# Preview the first five rows of the training DataFrame
df.head(n=5)

In [None]:
# Load the GPT-2 tokenizer used to turn the tweet text into token ids
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# GPT-2 ships without a padding token, so reuse the end-of-sequence token
tokenizer.pad_token = tokenizer.eos_token

# Upper bound on tokens per example. Without an explicit max_length,
# padding="max_length" pads every tweet to the model's maximum input length
# (1024 positions for GPT-2), which wastes memory and compute on padding.
# 128 is assumed to be enough for tweet-length text; longer inputs are
# truncated. Raise it if your texts are longer.
MAX_LENGTH = 128


def tokenize_function(examples):
    """Tokenize the "text" field of a batch of examples.

    Args:
        examples: A batch from the dataset exposing a "text" entry
            (``dataset.map`` is called with ``batched=True`` below).

    Returns:
        The tokenizer output, padded and truncated to ``MAX_LENGTH`` tokens.
    """
    return tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=MAX_LENGTH,
    )


# Apply the tokenizer to every split of the dataset, one batch at a time
tokenized_datasets = dataset.map(tokenize_function, batched=True)

In [None]:
# Build the TRAIN and EVAL partitions: shuffle each split with a fixed seed,
# then keep the first 1000 rows of the shuffled result
shuffled_train = tokenized_datasets["train"].shuffle(seed=42)
shuffled_test = tokenized_datasets["test"].shuffle(seed=42)

small_train_dataset = shuffled_train.select(range(1000))
small_eval_dataset = shuffled_test.select(range(1000))

In [None]:
# Due to resource limitations, keep only a small fragment (the first 100 rows)
# of each partition for illustration
first_hundred = range(100)

small_train_dataset = small_train_dataset.select(first_hundred)
small_eval_dataset = small_eval_dataset.select(first_hundred)

In [None]:
# Load pretrained GPT-2 with a sequence-classification head for 3 labels
model = GPT2ForSequenceClassification.from_pretrained("gpt2", num_labels=3)

# The classification head reads the hidden state of the last non-padding token,
# which it can only locate when the config defines pad_token_id. GPT-2 defines
# none by default, so the head would read the final (padding) position and
# batches larger than 1 would raise an error. The tokenizer in this notebook
# pads with the EOS token, so register the same id on the model.
model.config.pad_token_id = model.config.eos_token_id

In [None]:
# Accuracy metric used to score the model's predictions
metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Score one evaluation pass with the accuracy metric.

    Args:
        eval_pred: A pair that unpacks into ``(logits, labels)``.

    Returns:
        The result of ``metric.compute`` for the arg-max predictions
        against the reference labels.
    """
    raw_scores, gold_labels = eval_pred
    # The predicted class is the index of the largest logit along the last axis
    predicted_classes = np.argmax(raw_scores, axis=-1)
    return metric.compute(predictions=predicted_classes, references=gold_labels)

In [None]:
# Training parameters and objects

training_args = TrainingArguments(
    output_dir="test_trainer",       # where checkpoints and outputs are written
    per_device_train_batch_size=1,   # small batches to fit limited memory
    per_device_eval_batch_size=1,    # same reduction for evaluation
    gradient_accumulation_steps=4,   # accumulate 4 steps per optimizer update
    # The string "none" turns off every logging integration. Passing the
    # Python value None does not: it falls back to the library default.
    report_to="none",
)

# Periodic evaluation during training is left at its default; the notebook
# calls trainer.evaluate() explicitly before and after training instead.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    compute_metrics=compute_metrics,
)

In [None]:
# Baseline: measure the model's performance before any fine-tuning
metrics_before_finetuning = trainer.evaluate()
metrics_before_finetuning

In [None]:
# Fine-tune the model on the training partition
train_output = trainer.train()
train_output

In [None]:
# Measure the model's performance again now that it has been fine-tuned
metrics_after_finetuning = trainer.evaluate()
metrics_after_finetuning