In [1]:
import pandas as pd
import numpy as np
import re
import string
import torch
from transformers import RobertaTokenizerFast, RobertaForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from google.colab import drive

In [2]:
drive.mount("/content/drive", force_remount=True)

Mounted at /content/drive


In [3]:
df = pd.read_csv('/content/drive/My Drive/thesis/english_reviews.csv', parse_dates=['date'])

In [4]:
df

Unnamed: 0,asin,name,rating,date,verified,title,body,helpfulVotes
0,B0000SX2UC,Janet,3,2005-10-11,False,"Def not best, but not worst",I had the Samsung A600 for awhile which is abs...,1.0
1,B0000SX2UC,Luke Wyatt,1,2004-01-07,False,Text Messaging Doesn't Work,Due to a software issue between Nokia and Spri...,17.0
2,B0000SX2UC,Brooke,5,2003-12-30,False,Love This Phone,"This is a great, reliable phone. I also purcha...",5.0
3,B0000SX2UC,amy m. teague,3,2004-03-18,False,"Love the Phone, BUT...!","I love the phone and all, because I really did...",1.0
4,B0000SX2UC,tristazbimmer,4,2005-08-28,False,"Great phone service and options, lousy case!",The phone has been great for every purpose it ...,1.0
...,...,...,...,...,...,...,...,...
59850,B081H6STQQ,jande,5,2019-08-16,False,"Awesome Phone, but finger scanner is a big mis...",I love the camera on this phone. The screen is...,1.0
59851,B081H6STQQ,2cool4u,5,2019-09-14,False,Simply Amazing!,I've been an Xperia user for several years and...,1.0
59852,B081H6STQQ,simon,5,2019-07-14,False,"great phon3, but many bugs need to fix. still ...",buy one more for my cousin,0.0
59853,B081TJFVCJ,Tobiasz Jedrysiak,5,2019-12-24,True,Phone is like new,Product looks and works like new. Very much re...,0.0


In [5]:
# Ensure NLTK resources are downloaded
import nltk
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [6]:
# Set of stopwords
stop_words = set(stopwords.words('english'))

In [7]:
# Preprocessing function
def preprocess_text(text):
    """
    Preprocesses the input text by converting it to lowercase, removing URLs, HTML tags, punctuation, numbers,
    and stopwords, and tokenizing the text.
    """
    if pd.isna(text):
        return ""
    text = text.lower()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'<.*?>', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'\d+', '', text)
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(tokens)

In [8]:
# Tokenize data function
def tokenize_data(tokenizer, texts):
    """
    Tokenizes the texts using the provided tokenizer.
    """
    encodings = tokenizer(texts, truncation=True, padding=True)
    return encodings

In [9]:
# SentimentDataset class
class SentimentDataset(torch.utils.data.Dataset):
    """
    Custom dataset class for handling tokenized inputs and corresponding labels.
    """
    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        if self.labels is not None:
            return len(self.labels)
        return len(self.encodings['input_ids'])

In [11]:
# Preprocess the text
df['cleaned_body'] = df['body'].apply(preprocess_text)

In [12]:
# Convert ratings to zero-indexed labels
df['label'] = df['rating'] - 1

In [13]:
# Create lists of reviews and their corresponding labels
X = df['cleaned_body'].tolist()
y = df['label'].tolist()

In [14]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42, stratify=y)

In [15]:
# Load the tokenizer
tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [16]:
# Tokenize the data
train_encodings = tokenize_data(tokenizer, X_train)
test_encodings = tokenize_data(tokenizer, X_test)

In [17]:
# Create the datasets
train_dataset = SentimentDataset(train_encodings, y_train)
test_dataset = SentimentDataset(test_encodings, y_test)

In [18]:
pip install transformers[torch]



In [19]:
pip install accelerate -U



In [20]:
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.1,
    logging_dir='./logs',
    logging_steps=10,
    eval_strategy="epoch",  # Use eval_strategy instead of evaluation_strategy
    save_strategy="epoch",  # Ensure save_strategy matches eval_strategy
    save_total_limit=1,
    load_best_model_at_end=True,
    fp16=True,  # Use mixed precision
    dataloader_num_workers=4,  # Use multiple workers for data loading
    gradient_accumulation_steps=2,  # Accumulate gradients to effectively use larger batch sizes
    learning_rate=2e-5,  # Set learning rate
    lr_scheduler_type='linear',  # Use linear learning rate scheduler
    optim='adamw_torch'  # Use AdamW optimizer
)

In [21]:
# Load pre-trained RoBERTa model
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=5)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]  # Add early stopping callback
)

In [23]:
trainer.train()

  self.pid = os.fork()


Epoch,Training Loss,Validation Loss
0,0.7572,0.791285
2,0.7184,0.743087
4,0.4922,0.802005


  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()


TrainOutput(global_step=7480, training_loss=0.683274338685255, metrics={'train_runtime': 6910.5975, 'train_samples_per_second': 34.645, 'train_steps_per_second': 1.082, 'total_flos': 6.297574868990362e+16, 'train_loss': 0.683274338685255, 'epoch': 4.998329435349148})

In [24]:
model_save_path = '/content/drive/My Drive/thesis'
model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)

('/content/drive/My Drive/thesis/tokenizer_config.json',
 '/content/drive/My Drive/thesis/special_tokens_map.json',
 '/content/drive/My Drive/thesis/vocab.json',
 '/content/drive/My Drive/thesis/merges.txt',
 '/content/drive/My Drive/thesis/added_tokens.json',
 '/content/drive/My Drive/thesis/tokenizer.json')

In [25]:
# Tokenize the entire dataset
full_encodings = tokenize_data(tokenizer, df['cleaned_body'].tolist())

In [26]:
# Create a dataset for the entire dataset
full_dataset = SentimentDataset(full_encodings)

In [27]:
# Predict sentiment scores on the entire dataset
full_predictions = trainer.predict(full_dataset)

  self.pid = os.fork()


In [28]:
# Add the full predictions to the DataFrame
df['predicted_sentiment'] = np.argmax(full_predictions.predictions, axis=1) + 1
df['normalized_predicted_sentiment'] = (df['predicted_sentiment'] - 1) / 4.0

In [29]:
# Normalize ratings
df['normalized_rating'] = (df['rating'] - 1) / 4

In [31]:
# Calculate consistency scores for the entire dataset
df['consistency_score'] = 10 * (1 - np.abs(df['normalized_rating'] - df['normalized_predicted_sentiment']))

In [35]:
df[['rating','label','predicted_sentiment','normalized_predicted_sentiment','normalized_rating','consistency_score']]

Unnamed: 0,rating,label,predicted_sentiment,normalized_predicted_sentiment,normalized_rating,consistency_score
0,3,2,3,0.50,0.50,10.0
1,1,0,1,0.00,0.00,10.0
2,5,4,5,1.00,1.00,10.0
3,3,2,4,0.75,0.50,7.5
4,4,3,3,0.50,0.75,7.5
...,...,...,...,...,...,...
59850,5,4,5,1.00,1.00,10.0
59851,5,4,5,1.00,1.00,10.0
59852,5,4,5,1.00,1.00,10.0
59853,5,4,5,1.00,1.00,10.0


In [38]:
df.to_csv('/content/drive/My Drive/thesis/df_consistency_result.csv', index=False)