<a href="https://colab.research.google.com/github/adewale-codes/NLP/blob/main/NLP_coursework.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Installing required libraries

In [None]:
!pip install datasets
!pip install evaluate

Importing required libraries

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizerFast, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import numpy as np
import evaluate

Loading the dataset

In [None]:
# Read the reviews CSV from Google Drive.
# NOTE(review): assumes Drive is already mounted at /content/drive — no cell in this
# notebook mounts it; confirm before running on a fresh runtime.
df = pd.read_csv("/content/drive/MyDrive/NLP/Reviews.csv")

# Quick look at what was loaded: the column index first, then the first rows.
print("Dataset columns:", df.columns)
print("First few rows:", df.head(), sep="\n")

Using the score column as the rating column

In [None]:
# Name of the DataFrame column holding the rating that sentiment labels are derived from.
rating_column = "Score"

Mapping the score column to sentiment labels: 0 = negative (score ≤ 2), 1 = neutral (score of 3), 2 = positive (score ≥ 4)

In [None]:
def map_sentiment(score):
    """Translate a numeric review score into a sentiment class id.

    Args:
        score: The rating to classify.

    Returns:
        2 (positive) when ``score`` is 4 or higher, 0 (negative) when it is
        2 or lower, and 1 (neutral) for anything else. A value that fails both
        comparisons (e.g. NaN) therefore also ends up as 1.
    """
    # Guard clauses: handle the two extremes first, everything left is neutral.
    if score >= 4:
        return 2
    if score <= 2:
        return 0
    return 1

Creating a new column called label using the mapping function

In [None]:
# Compute the sentiment class (0, 1 or 2) of every row from its rating,
# then store it in the 'label' column that the later cells select for training.
sentiment_labels = df[rating_column].apply(map_sentiment)
df['label'] = sentiment_labels

Dropping rows with missing review text (the review text is stored in the `Text` column)

In [None]:
# Keep only the rows that actually have a value in the 'Text' column.
has_review_text = df['Text'].notna()
df = df[has_review_text]

Split the dataset into training and testing sets

In [None]:
# Hold out 20% of the rows as the test set; the fixed random_state makes the
# same split come out on every run.
split_frames = train_test_split(df, test_size=0.2, random_state=42)
train_df, test_df = split_frames

Converted the pandas DataFrames to Hugging Face Datasets

In [None]:
# Convert both pandas splits to Hugging Face Datasets. Each frame's row index is
# reset (and discarded) first, so the Dataset is built from a clean 0..n-1 index.
train_dataset, test_dataset = (
    Dataset.from_pandas(split_df.reset_index(drop=True))
    for split_df in (train_df, test_df)
)

Initialized a BERT tokenizer

In [None]:
# Load the tokenizer that belongs to the 'bert-base-uncased' checkpoint. It is used
# both to tokenize the datasets and to encode the single example review at the end.
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

Tokenized the dataset using the column called Text

In [None]:
def tokenize_function(example):
    """Tokenize the ``Text`` field of ``example`` with the notebook-level ``tokenizer``.

    Args:
        example: Mapping with a ``"Text"`` entry (a batch of reviews when used
            with ``Dataset.map(..., batched=True)``).

    Returns:
        Whatever the tokenizer returns for that text, with every sequence
        truncated and padded to a fixed length of 128 tokens.
    """
    review_text = example["Text"]
    # Fixed-length encoding: pad short reviews, cut long ones, both to 128 tokens.
    encoding_options = dict(padding="max_length", truncation=True, max_length=128)
    return tokenizer(review_text, **encoding_options)

In [None]:
# Run the tokenizer over both splits, batch by batch (train first, then test).
train_dataset, test_dataset = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, test_dataset)
)

Set the format for PyTorch tensors

In [None]:
# Expose only the columns the model consumes, returned as PyTorch tensors.
model_input_columns = ['input_ids', 'attention_mask', 'label']
for tokenized_split in (train_dataset, test_dataset):
    tokenized_split.set_format(type='torch', columns=model_input_columns)

Load a pre-trained BERT model for sequence classification with 3 labels: negative, neutral, and positive

In [None]:
# Start from the pre-trained 'bert-base-uncased' checkpoint, configured for 3 output
# labels — one per class id produced by map_sentiment (0, 1, 2).
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)

Define the evaluation metric which is accuracy using evaluate.load

In [None]:
# Load the "accuracy" metric from the `evaluate` library; compute_metrics calls its
# compute() method on the model's predictions.
accuracy_metric = evaluate.load("accuracy")

In [None]:
def compute_metrics(eval_pred):
    """Score a ``(logits, labels)`` pair with the notebook-level ``accuracy_metric``.

    Args:
        eval_pred: A pair that unpacks into the model logits and the reference labels.

    Returns:
        The result of ``accuracy_metric.compute`` for the predicted class ids,
        where each prediction is the index of the largest logit on the last axis.
    """
    raw_scores, gold_labels = eval_pred
    # Highest-scoring class per example.
    predicted_ids = np.argmax(raw_scores, axis=-1)
    return accuracy_metric.compute(references=gold_labels, predictions=predicted_ids)

Defined training arguments

In [None]:
# The per-epoch evaluation keyword was renamed from `evaluation_strategy` to
# `eval_strategy` in newer transformers releases, and the old spelling was later
# removed. The library version is not pinned in this notebook, so pick whichever
# keyword the installed TrainingArguments actually accepts.
import inspect

_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
_eval_strategy_key = (
    "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./results",             # where checkpoints are written
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    save_strategy="epoch",              # checkpoint once per epoch ...
    logging_dir="./logs",
    logging_steps=50,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",   # the key returned by compute_metrics
    report_to=[],                       # no reporting integrations requested
    **{_eval_strategy_key: "epoch"},    # ... and evaluate once per epoch as well
)

Initialized the Trainer

In [None]:
# Bundle the model, hyper-parameters, datasets and metric function into one Trainer.
# NOTE(review): the held-out test split is also the eval_dataset, and training_args
# asks for the best model by accuracy on it — confirm this is intended, since it makes
# the final test accuracy optimistic.
trainer_config = dict(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_config)

Fine tuning the model

In [None]:
# Fine-tune the model with the settings in training_args (3 epochs, with an
# evaluation and a checkpoint at the end of each epoch).
trainer.train()

Evaluating the model on the test dataset

In [None]:
# Evaluate on the Trainer's eval_dataset (the test split passed when the Trainer
# was built) and print the returned metrics.
eval_result = trainer.evaluate()
print("Evaluation results:", eval_result)

Testing the model on an example review

In [None]:
import torch

# One hand-written review used as a quick smoke test of the fine-tuned model.
example_review = "I absolutely hate this product, it below all my expectations!"

# Encode it with the same fixed-length settings used for the training data.
inputs = tokenizer(example_review, return_tensors="pt", truncation=True, padding="max_length", max_length=128)

# The input tensors must live on the same device as the model.
inputs = {k: v.to(model.device) for k, v in inputs.items()}

# Inference only: put the model in evaluation mode explicitly (instead of relying on
# an earlier cell having left it that way) and skip gradient tracking for the forward pass.
model.eval()
with torch.no_grad():
    outputs = model(**inputs)

# Index of the largest logit = predicted class id for the single review in the batch.
predicted_class = int(outputs.logits.argmax(dim=-1)[0].item())

# Class ids follow map_sentiment: 0 negative, 1 neutral, 2 positive.
sentiment_mapping = {0: "Negative", 1: "Neutral", 2: "Positive"}
print(f"Review: {example_review}")
print(f"Predicted Sentiment: {sentiment_mapping[predicted_class]}")

# Because training takes a long time, I saved the checkpoints. Instead of running everything from the beginning, you can resume from the third-epoch checkpoint and save time. The cells below do that.

Importing the libraries

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizerFast, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import numpy as np
import evaluate
import torch

Loading the dataset

In [None]:
# Location of the reviews CSV on Google Drive.
# NOTE(review): assumes Drive is already mounted at /content/drive — confirm, since no
# cell in this notebook mounts it.
reviews_csv_path = "/content/drive/MyDrive/NLP/Reviews.csv"
df = pd.read_csv(reviews_csv_path)

# Show the available columns and a preview of the first rows.
print("Dataset columns:", df.columns)
print("First few rows:")
print(df.head())

Using the score column as the rating column

In [None]:
# Column of the reviews DataFrame that contains the rating to convert into a sentiment label.
rating_column = "Score"

Mapping the score column to sentiment labels: 0 = negative (score ≤ 2), 1 = neutral (score of 3), 2 = positive (score ≥ 4)

In [None]:
def map_sentiment(score):
    """Return the sentiment class id for a review score.

    Args:
        score: The rating to classify.

    Returns:
        2 for a score of at least 4 (positive), 0 for a score of at most 2
        (negative), otherwise 1 (neutral). Values for which both comparisons
        are false, such as NaN, also yield 1.
    """
    # Same definition as in the first part of the notebook, repeated so this
    # section can run on its own.
    return 2 if score >= 4 else (0 if score <= 2 else 1)

Splitting the dataset into training and testing sets

In [None]:
# Recreate the preprocessing of the original training run so this section works on a
# fresh kernel: the 'label' column is selected by set_format() further down, and rows
# without review text are dropped exactly as in the first part of the notebook.
df['label'] = df[rating_column].apply(map_sentiment)
df = df.dropna(subset=['Text'])

# Same 80/20 split and seed as the original run, applied to the same preprocessed rows,
# so the resumed checkpoint is trained and evaluated on the same partition.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Convert both splits to Hugging Face Datasets, discarding the old pandas row index.
train_dataset = Dataset.from_pandas(train_df.reset_index(drop=True))
test_dataset = Dataset.from_pandas(test_df.reset_index(drop=True))

Tokenization and Formatting

In [None]:
# Load the tokenizer for the 'bert-base-uncased' checkpoint; it tokenizes the datasets,
# encodes the example review, and is saved next to the model at the end.
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

In [None]:
def tokenize_function(example):
    """Encode the ``Text`` entry of ``example`` using the notebook-level ``tokenizer``.

    Args:
        example: Mapping with a ``"Text"`` entry (a batch of reviews when called
            through ``Dataset.map(..., batched=True)``).

    Returns:
        The tokenizer's output, with each sequence truncated and padded to
        exactly 128 tokens.
    """
    fixed_length = 128
    return tokenizer(
        example["Text"],
        padding="max_length",
        truncation=True,
        max_length=fixed_length,
    )

In [None]:
# Tokenize both splits in batches (train first, then test) ...
train_dataset, test_dataset = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, test_dataset)
)

# ... then expose only the columns the model consumes, as PyTorch tensors.
for tokenized_split in (train_dataset, test_dataset):
    tokenized_split.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

Model and Trainer Setup

In [None]:
# Build the 3-label classification model from the base 'bert-base-uncased' checkpoint.
# The Trainer below is constructed with this object and then resumes from the saved checkpoint.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)

Defined evaluation metric using evaluate.load

In [None]:
# Accuracy metric object from the `evaluate` library, used by compute_metrics below.
accuracy_metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Compute accuracy from a ``(logits, labels)`` pair.

    Args:
        eval_pred: A pair that unpacks into the model logits and the reference labels.

    Returns:
        The result of ``accuracy_metric.compute`` for the predicted class ids,
        each prediction being the index of the largest logit on the last axis.
    """
    raw_scores, gold_labels = eval_pred
    predicted_ids = np.argmax(raw_scores, axis=-1)
    return accuracy_metric.compute(references=gold_labels, predictions=predicted_ids)

In [None]:
# The per-epoch evaluation keyword was renamed from `evaluation_strategy` to
# `eval_strategy` in newer transformers releases, and the old spelling was later
# removed. The library version is not pinned in this notebook, so pick whichever
# keyword the installed TrainingArguments actually accepts.
import inspect

_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
_eval_strategy_key = (
    "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./results",             # where checkpoints are written
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    save_strategy="epoch",              # checkpoint once per epoch ...
    logging_dir="./logs",
    logging_steps=50,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",   # the key returned by compute_metrics
    report_to=[],                       # no reporting integrations requested
    **{_eval_strategy_key: "epoch"},    # ... and evaluate once per epoch as well
)

In [None]:
# Assemble the Trainer from the model, hyper-parameters, datasets and metric function.
# NOTE(review): the test split is used as eval_dataset, so it also drives best-model
# selection — confirm this is intended.
trainer_config = dict(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_config)

Checkpoint path

In [None]:
# Checkpoint directory from an earlier run, stored on Google Drive
# (presumably the checkpoint saved at global step 85269 — confirm).
# NOTE(review): training_args.output_dir is "./results", not this Drive folder, so the
# checkpoint was presumably copied to Drive by hand; verify the path exists and that
# Drive is mounted before running.
checkpoint_path = "/content/drive/MyDrive/results/checkpoint-85269"
# Continue training from that checkpoint instead of starting from the base model.
trainer.train(resume_from_checkpoint=checkpoint_path)

Evaluating model after resuming training

In [None]:
# Evaluate the resumed model on the Trainer's eval_dataset (the test split) and
# print the returned metrics.
eval_result = trainer.evaluate()
print("Evaluation results:", eval_result)

Load the model from a checkpoint without further training

In [None]:
# Load the model weights directly from the checkpoint directory.
# Note: this only rebinds the name `model`; `trainer` keeps the model object it was
# constructed with, so later trainer.* calls do not use this newly loaded instance.
model = BertForSequenceClassification.from_pretrained(checkpoint_path)

Testing the model

In [None]:
# One hand-written review used as a quick smoke test of the loaded model.
example_review = "I absolutely hate this product, it below all my expectations!"

# Encode it with the same fixed-length settings used for the training data, then move
# the tensors to whatever device the model is on.
inputs = tokenizer(example_review, return_tensors="pt", truncation=True, padding="max_length", max_length=128)
inputs = {k: v.to(model.device) for k, v in inputs.items()}

# Inference only: make evaluation mode explicit and skip gradient tracking
# for the forward pass.
model.eval()
with torch.no_grad():
    outputs = model(**inputs)

# Index of the largest logit = predicted class id for the single review in the batch.
predicted_class = int(outputs.logits.argmax(dim=-1)[0].item())

# Class ids follow map_sentiment: 0 negative, 1 neutral, 2 positive.
sentiment_mapping = {0: "Negative", 1: "Neutral", 2: "Positive"}
print(f"Review: {example_review}")
print(f"Predicted Sentiment: {sentiment_mapping[predicted_class]}")

Saving the model to create backend

In [None]:
# Write the Trainer's model and the tokenizer into the same folder so the backend
# can load both from one place.
export_dir = "./saved_model"
trainer.save_model(export_dir)

tokenizer.save_pretrained(export_dir)
