In [3]:
import pandas as pd
from transformers import RobertaTokenizer

# Load the data into a pandas dataframe
df = pd.read_csv('Reviews(13).csv')


## Correcting the spelling of one misspelled label
df['Label'] = df['Label'].str.replace('negative', 'Negative')
df['Label'] = df['Label'].str.replace('neutral', 'Neutral')
df['Label'] = df['Label'].str.replace('positive', 'Positive')
df['Label'] = df['Label'].str.replace('Netural', 'Neutral')
df['Label'] = df['Label'].str.replace(' Neutral', 'Neutral')

df['Label'].unique()

df['Label'].value_counts()

# Initialize the tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# Tokenize the text and convert it to numerical vectors
encoded_texts = [tokenizer.encode(text, max_length=512, truncation=True) for text in df['review']]

# Pad the sequences to a uniform length
padded_texts = [text + [0]*(512-len(text)) for text in encoded_texts]

# Convert the labels to numerical values
label_map = {'Positive': 0, 'Neutral': 1, 'Negative': 2}
labels = [label_map[label] for label in df['Label']]


# Splitting the data


In [4]:
from sklearn.model_selection import train_test_split

# Split the data into training, validation, and test sets
train_texts, test_texts, train_labels, test_labels = train_test_split(padded_texts, labels, test_size=0.2, random_state=42)
train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=0.2, random_state=42)


# Train the RoBERTa model:
 

In [5]:
import torch
from torch.utils.data import TensorDataset, DataLoader
from transformers import RobertaForSequenceClassification, AdamW

# Initialize the RoBERTa model and the classification layer
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=3)

# Define the training parameters
batch_size = 16
epochs = 5
learning_rate = 2e-5

# Convert the data into PyTorch tensors and create a DataLoader
train_dataset = TensorDataset(torch.tensor(train_texts), torch.tensor(train_labels))
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Initialize the optimizer and the loss function
optimizer = AdamW(model.parameters(), lr=learning_rate)
loss_fn = torch.nn.CrossEntropyLoss()

# Train the model
for epoch in range(epochs):
    for batch in train_loader:
        optimizer.zero_grad()
        inputs, labels = batch
        outputs = model(inputs)[0]
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()


Downloading pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.weight', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classi

# Train the RoBERTa model


In [None]:
# import torch
# from torch.utils.data import TensorDataset, DataLoader
# from transformers import RobertaForSequenceClassification, AdamW

# # Initialize the RoBERTa model and the classification layer
# model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=3)

# # Define the training parameters
# batch_size = 16
# epochs = 5
# learning_rate = 2e-5

# # Convert the data into PyTorch tensors and create a DataLoader
# train_dataset = TensorDataset(torch.tensor(train_texts), torch.tensor(train_labels))
# train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# # Initialize the optimizer and the loss function
# optimizer = AdamW(model.parameters(), lr=learning_rate)
# loss_fn = torch.nn.CrossEntropyLoss()

# # Train the model
# for epoch in range(epochs):
#     for batch in train_loader:
#         optimizer.zero_grad()
#         inputs, labels = batch
#         outputs = model(inputs)[0]
#         loss = loss_fn(outputs, labels)
#         loss.backward()
#         optimizer.step()


# Evaluate the RoBERTa model

In [7]:
from sklearn.metrics import accuracy_score, f1_score, classification_report

# Convert the validation data into PyTorch tensors and create a DataLoader
val_dataset = TensorDataset(torch.tensor(val_texts), torch.tensor(val_labels))
val_loader = DataLoader(val_dataset, batch_size=batch_size)

# Evaluate the model on the validation set
with torch.no_grad():
    val_preds = []
    val_true = []
    for batch in val_loader:
        inputs, labels = batch
        outputs = model(inputs)[0]
        _, preds = torch.max(outputs, dim=1)
        val_preds.extend(preds.tolist())
        val_true.extend(labels.tolist())

# Compute the evaluation metrics
val_acc = accuracy_score(val_true, val_preds)
val_f1 = f1_score(val_true, val_preds, average='macro')
val_report = classification_report(val_true, val_preds)


In [8]:
print (val_acc)
print (val_f1)
print (val_report)

0.9450127877237852
0.9242055590859263
              precision    recall  f1-score   support

           0       0.99      0.96      0.97      1908
           1       0.95      0.79      0.86       570
           2       0.90      0.99      0.94      1432

    accuracy                           0.95      3910
   macro avg       0.94      0.91      0.92      3910
weighted avg       0.95      0.95      0.94      3910



# Get predictions

In [13]:
from transformers import pipeline

# Define the pipeline for text classification
classifier = pipeline('text-classification', model=model, tokenizer=tokenizer)

# Define some sample texts to classify
sample_texts = [
    "The doctor is amazing!",
    "The video consultation service was terrible.",
    "Average experience tha"
]

# Classify the sample texts
predictions = classifier(sample_texts, truncation=True)

# Print the predictions
for prediction in predictions:
    label = prediction['label']
    score = prediction['score']
    print(f"{label}: {score}")


LABEL_0: 0.942875862121582
LABEL_2: 0.7678003907203674
LABEL_1: 0.7025073170661926
