In [None]:
from google.colab import drive
# Mount Google Drive inside the Colab VM; force_remount refreshes a mount that
# may be left over from an earlier run of this cell.
drive_mount_point = "/content/drive/"
drive.mount(drive_mount_point, force_remount=True)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import pickle
import time

In [None]:
# Read the review dataset from the mounted Drive folder and print a summary
# of its columns, dtypes and non-null counts.
dataset_csv_path = '/content/drive/MyDrive/Mobile Prioritazitation/Final_dataset.csv'
df = pd.read_csv(dataset_csv_path)
df.info()

In [None]:
import numpy as np
from sklearn.preprocessing import LabelEncoder

# Replace the raw 'sentiment' values with integer class ids, in place.
# The fitted encoder is kept in `labelencoder` so the mapping can be inspected
# (or inverted) later.
labelencoder = LabelEncoder()
labelencoder.fit(df['sentiment'])
df['sentiment'] = labelencoder.transform(df['sentiment'])

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# Tick labels shown on the x-axis, one per class bar (priorities 1-5).
sentiment_categories = [1, 2, 3, 4, 5]

# Bar chart with the number of reviews per encoded class.
ax = sns.countplot(x='sentiment', data=df)

# Write each bar's count a few points above the top of the bar.
for bar in ax.patches:
    bar_height = bar.get_height()
    bar_centre = bar.get_x() + bar.get_width() / 2.
    ax.annotate(
        f'{bar_height:.0f}',
        (bar_centre, bar_height),
        ha='center',
        va='center',
        xytext=(0, 6),
        textcoords='offset points',
    )

# Relabel the ticks so they read 1..5, then title both axes.
ax.set_xticklabels(sentiment_categories)
ax.set_xlabel('Priority')
ax.set_ylabel('No. of Reviews')

plt.show()

In [None]:
pip install transformers

In [None]:
# ---- Hyperparameters -------------------------------------------------------
# Optimisation settings
learning_rate = 2e-5
batch_size = 16
epochs = 16
gradient_accumulation_steps = 16

# Task / input settings
num_labels = 5          # number of output classes of the classifier head
max_seq_length = 256    # reviews are tokenised to exactly this many tokens

# Architecture description.
# NOTE(review): attention_heads, hidden_size, hidden_layers, num_params and
# gradient_accumulation_steps are not referenced by any cell in the reviewed
# part of the notebook -- confirm whether they are purely informational.
attention_heads = 8
hidden_size = 768
hidden_layers = 6
num_params = 110_000_000

In [None]:
import torch
from transformers import BertTokenizer
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset
from torch.utils.data import RandomSampler
from torch.utils.data import DataLoader

In [None]:
# Build the model input text: title and body joined by a single space.
# Missing titles/bodies are treated as empty strings so the concatenation
# never produces NaN.
title_text = df['title'].fillna('')
body_text = df['body'].fillna('')
df['review'] = title_text + ' ' + body_text

In [None]:
# Split the dataset into training and testing sets (80/20, fixed seed so the
# split is reproducible across runs).
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Tokenize the training reviews with the BERT tokenizer.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# One batched call replaces the former per-row encode_plus loop (and its
# per-row debug print). `padding='max_length'` is the supported spelling of
# the deprecated `pad_to_max_length=True`; `truncation=True` explicitly cuts
# over-long reviews to `max_seq_length` instead of relying on the tokenizer's
# implicit fallback.
train_encodings = tokenizer(
    train_df['review'].tolist(),
    add_special_tokens=True,
    max_length=max_seq_length,
    padding='max_length',
    truncation=True,
    return_attention_mask=True,
    return_tensors='pt',
)
# Both tensors have one row per training review and `max_seq_length` columns.
train_input_ids = train_encodings['input_ids']
train_attention_masks = train_encodings['attention_mask']

# Encoded class ids of the training rows as a 1-D tensor.
train_labels = torch.tensor(train_df['sentiment'].to_numpy())

In [None]:
# Show how many training examples fall into each encoded class.
# bincount is indexed by class id, so label_counts[label] is that class's size.
label_counts = torch.bincount(train_labels)
unique_labels = torch.unique(train_labels)
for label in unique_labels:
    print(f"Label {label}: {label_counts[label]} occurrences")


In [None]:
# Persist the tokenised training tensors to Drive so a later session can
# reload them instead of re-tokenising.
train_pickle_targets = (
    ("/content/drive/MyDrive/Mobile Prioritazitation/train_input_ids.pickle", train_input_ids),
    ("/content/drive/MyDrive/Mobile Prioritazitation/train_attention_masks.pickle", train_attention_masks),
    ("/content/drive/MyDrive/Mobile Prioritazitation/train_labels.pickle", train_labels),
)
for pickle_path, payload in train_pickle_targets:
    with open(pickle_path, "wb") as pickle_file:
        pickle.dump(payload, pickle_file)

In [None]:
# Reload the cached training tensors from Drive, in the order
# (input ids, attention masks, labels).
# Only unpickle files this notebook wrote itself: pickle.load can execute
# arbitrary code from an untrusted file.
train_pickle_paths = (
    "/content/drive/MyDrive/Mobile Prioritazitation/train_input_ids.pickle",
    "/content/drive/MyDrive/Mobile Prioritazitation/train_attention_masks.pickle",
    "/content/drive/MyDrive/Mobile Prioritazitation/train_labels.pickle",
)
train_loaded = []
for pickle_path in train_pickle_paths:
    with open(pickle_path, "rb") as pickle_file:
        train_loaded.append(pickle.load(pickle_file))
train_input_ids, train_attention_masks, train_labels = train_loaded

In [None]:
# Sanity check: the three training tensors should agree on their first dimension.
for train_tensor in (train_input_ids, train_attention_masks, train_labels):
    print(train_tensor.shape)

In [None]:
# Bundle ids, masks and labels so that indexing yields one aligned example.
train_dataset = TensorDataset(train_input_ids, train_attention_masks, train_labels)
print(train_dataset)

# Training batches are drawn in random order.
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    sampler=train_sampler,
)

In [None]:
# Tokenize the input data using the BERT tokenizer for testing set
from torch.utils.data import SequentialSampler

In [None]:
# Tokenize the held-out test reviews with the same tokenizer and settings as
# the training set.
# One batched call replaces the former per-row encode_plus loop.
# `padding='max_length'` is the supported spelling of the deprecated
# `pad_to_max_length=True`; `truncation=True` explicitly cuts over-long
# reviews to `max_seq_length` instead of relying on an implicit fallback.
test_encodings = tokenizer(
    test_df['review'].tolist(),
    add_special_tokens=True,
    max_length=max_seq_length,
    padding='max_length',
    truncation=True,
    return_attention_mask=True,
    return_tensors='pt',
)
# Both tensors have one row per test review and `max_seq_length` columns.
test_input_ids = test_encodings['input_ids']
test_attention_masks = test_encodings['attention_mask']

# Encoded class ids of the test rows as a 1-D tensor.
test_labels = torch.tensor(test_df['sentiment'].to_numpy())

In [None]:

# Persist the tokenised test tensors to Drive so a later session can reload
# them instead of re-tokenising.
test_pickle_targets = (
    ("/content/drive/MyDrive/Mobile Prioritazitation/test_input_ids.pickle", test_input_ids),
    ("/content/drive/MyDrive/Mobile Prioritazitation/test_attention_masks.pickle", test_attention_masks),
    ("/content/drive/MyDrive/Mobile Prioritazitation/test_labels.pickle", test_labels),
)
for pickle_path, payload in test_pickle_targets:
    with open(pickle_path, "wb") as pickle_file:
        pickle.dump(payload, pickle_file)

In [None]:
# Reload the cached test tensors from Drive, in the order
# (input ids, attention masks, labels).
# Only unpickle files this notebook wrote itself: pickle.load can execute
# arbitrary code from an untrusted file.
test_pickle_paths = (
    "/content/drive/MyDrive/Mobile Prioritazitation/test_input_ids.pickle",
    "/content/drive/MyDrive/Mobile Prioritazitation/test_attention_masks.pickle",
    "/content/drive/MyDrive/Mobile Prioritazitation/test_labels.pickle",
)
test_loaded = []
for pickle_path in test_pickle_paths:
    with open(pickle_path, "rb") as pickle_file:
        test_loaded.append(pickle.load(pickle_file))
test_input_ids, test_attention_masks, test_labels = test_loaded

In [None]:
# Sanity check: the three test tensors should agree on their first dimension.
for test_tensor in (test_input_ids, test_attention_masks, test_labels):
    print(test_tensor.shape)

In [None]:
# Bundle the test tensors so that indexing yields one aligned example.
test_dataset = TensorDataset(test_input_ids, test_attention_masks, test_labels)

# Evaluation batches are served in dataset order (no shuffling).
test_sampler = SequentialSampler(test_dataset)
test_dataloader = DataLoader(
    dataset=test_dataset,
    batch_size=batch_size,
    sampler=test_sampler,
)

In [None]:
from torch.utils.data import TensorDataset
from torch.utils.data import RandomSampler
from torch.utils.data import DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from transformers.optimization import get_linear_schedule_with_warmup

In [None]:
# Tokenize the "validation" reviews.
# NOTE(review): this set is built from the full `df`, so it contains every
# training row as well as every test row. Metrics computed on it are not
# held-out metrics -- confirm that this is intended.
# One batched call replaces the former per-row encode_plus loop.
# `padding='max_length'` is the supported spelling of the deprecated
# `pad_to_max_length=True`; `truncation=True` explicitly cuts over-long
# reviews to `max_seq_length` instead of relying on an implicit fallback.
val_encodings = tokenizer(
    df['review'].tolist(),
    add_special_tokens=True,
    max_length=max_seq_length,
    padding='max_length',
    truncation=True,
    return_attention_mask=True,
    return_tensors='pt',
)
# Both tensors have one row per review in `df` and `max_seq_length` columns.
val_input_ids = val_encodings['input_ids']
val_attention_masks = val_encodings['attention_mask']

# Encoded class ids of all rows in `df` as a 1-D tensor.
val_labels = torch.tensor(df['sentiment'].to_numpy())

In [None]:
# Persist the tokenised validation tensors to Drive so a later session can
# reload them instead of re-tokenising.
val_pickle_targets = (
    ("/content/drive/MyDrive/Mobile Prioritazitation/val_input_ids.pickle", val_input_ids),
    ("/content/drive/MyDrive/Mobile Prioritazitation/val_attention_masks.pickle", val_attention_masks),
    ("/content/drive/MyDrive/Mobile Prioritazitation/val_labels.pickle", val_labels),
)
for pickle_path, payload in val_pickle_targets:
    with open(pickle_path, "wb") as pickle_file:
        pickle.dump(payload, pickle_file)

In [None]:
# Reload the cached validation tensors from Drive, in the order
# (input ids, attention masks, labels).
# Only unpickle files this notebook wrote itself: pickle.load can execute
# arbitrary code from an untrusted file.
val_pickle_paths = (
    "/content/drive/MyDrive/Mobile Prioritazitation/val_input_ids.pickle",
    "/content/drive/MyDrive/Mobile Prioritazitation/val_attention_masks.pickle",
    "/content/drive/MyDrive/Mobile Prioritazitation/val_labels.pickle",
)
val_loaded = []
for pickle_path in val_pickle_paths:
    with open(pickle_path, "rb") as pickle_file:
        val_loaded.append(pickle.load(pickle_file))
val_input_ids, val_attention_masks, val_labels = val_loaded

In [None]:
# Sanity check: the three validation tensors should agree on their first dimension.
for val_tensor in (val_input_ids, val_attention_masks, val_labels):
    print(val_tensor.shape)

In [None]:
# Bundle the validation tensors so that indexing yields one aligned example.
val_dataset = TensorDataset(val_input_ids, val_attention_masks, val_labels)

# Evaluation batches are served in dataset order (no shuffling).
val_sampler = SequentialSampler(val_dataset)
val_dataloader = DataLoader(
    dataset=val_dataset,
    batch_size=batch_size,
    sampler=val_sampler,
)

In [None]:
import torch.nn as nn

In [None]:
# Load the pre-trained BERT encoder with a sequence-classification head that
# has `num_labels` outputs.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)

# Move the model to the GPU (if available)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Step bookkeeping. `epochs` comes from the hyperparameter cell; it is no
# longer re-assigned here, so the scheduler and the training loop can never
# disagree about the number of epochs.
train_steps = len(train_dataloader)   # optimiser steps per epoch
eval_steps = len(val_dataloader)      # batches in the validation loader
total_steps = train_steps * epochs    # optimiser steps over the whole run

# Optimiser: PyTorch's AdamW replaces the deprecated `transformers.AdamW`.
# weight_decay=0.0 is passed explicitly because torch defaults to 0.01 while
# the transformers implementation defaulted to 0.0. The learning rate now
# comes from the `learning_rate` hyperparameter instead of a repeated literal.
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, eps=1e-8, weight_decay=0.0)

# Linear decay of the learning rate to zero over the run, with no warm-up.
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Stand-alone loss object. The training loop below uses the loss returned by
# the model itself (it is called with `labels=`), so this is only needed if a
# loss is computed manually from logits.
loss_fn = nn.CrossEntropyLoss()

In [None]:
# Epoch numbers (1-based) used as the x-axis of the training plots.
x = list(range(1, epochs+1))
losses = []      # mean training loss per epoch (Python floats)
accuracies = []  # training accuracy per epoch (0-d tensors; the plotting
                 # cells call .cpu().detach().item() on each entry)

# Initialize the timer for training
start_time = time.time()

# Train the model
model.train()
for epoch in range(epochs):
    epoch_loss = 0.0     # sum of per-batch losses in this epoch
    epoch_correct = 0    # correct predictions in this epoch (becomes a tensor)
    epoch_samples = 0    # examples actually seen in this epoch

    for step, batch in enumerate(train_dataloader):
        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)
        labels = batch[2].to(device)

        model.zero_grad()

        # Passing `labels` makes the model return the classification loss
        # alongside the logits.
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        epoch_loss += loss.item()
        epoch_correct += torch.sum(torch.argmax(logits, dim=1) == labels)
        epoch_samples += labels.size(0)

        loss.backward()
        # Clip the global gradient norm to 1.0 before the optimiser step.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

    # Nothing to report if the dataloader produced no batches.
    if epoch_samples == 0:
        continue

    # End-of-epoch metrics. The loss is averaged exactly once (previously the
    # stored value was divided by the step count twice), and the accuracy is
    # divided by the number of examples seen, so a smaller final batch no
    # longer deflates it.
    epoch_loss /= (step + 1)
    epoch_acc = epoch_correct / epoch_samples
    print(f'Epoch [{epoch+1}/{epochs}], Step [{step+1}/{train_steps}], Loss: {epoch_loss:.4f}, Accuracy: {epoch_acc:.4f}')

    # Store the metric values for the plots
    losses.append(epoch_loss)
    accuracies.append(epoch_acc)

# Calculate the training time
training_time = time.time() - start_time

# Print the training time
print("Training Time: {:.4f} seconds".format(training_time))

In [None]:
# Two side-by-side panels: training loss (left) and training accuracy (right).
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))
loss_ax, accuracy_ax = ax

# Accuracies are stored as 0-d tensors; convert them to plain floats for plotting.
accuracy_values = [acc.cpu().detach().item() for acc in accuracies]

# Left panel: loss per epoch
loss_ax.plot(x[:len(losses)], losses, color="red", label="Loss")
loss_ax.set_xlabel("Epoch")
loss_ax.set_ylabel(" Training Loss")
loss_ax.legend()

# Right panel: accuracy per epoch
accuracy_ax.plot(x[:len(accuracies)], accuracy_values, color="blue", label="Accuracy")
accuracy_ax.set_xlabel("Epoch")
accuracy_ax.set_ylabel("Training Accuracy")
accuracy_ax.legend()

# Display the plots
plt.tight_layout()
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Single-panel view: training loss and accuracy per epoch on shared axes.
# `losses` holds floats; `accuracies` holds 0-d tensors from the training loop.

x = range(1, len(losses) + 1)  # 1-based epoch numbers
accuracy_values = [acc.cpu().detach().item() for acc in accuracies]

fig, ax = plt.subplots(figsize=(8, 4))

# Draw both curves; the legend tells them apart.
ax.plot(x[:len(losses)], losses, color="red", label="Loss")
ax.plot(x[:len(accuracies)], accuracy_values, color="blue", label="Accuracy")

ax.set_xlabel("Epoch")
ax.set_ylabel("Training Accuracy and Loss Value")
ax.legend()

# Display the plot
plt.tight_layout()
plt.show()

In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Put the model in inference mode and start from empty evaluation accumulators.
model.eval()
predictions, true_labels = [], []
eval_loss = eval_correct = 0


In [None]:
import matplotlib.pyplot as plt

# Running metrics recorded after every evaluation batch (used by the plots below)
iteration_accuracy = []   # cumulative accuracy over all examples seen so far
iteration_loss = []       # mean batch loss over all batches seen so far

# Evaluate the model on the test dataset
model.eval()
predictions = []
true_labels = []
eval_loss = 0
eval_correct = 0
# Initialize the timer for testing
start_time = time.time()

# Evaluation of Model.
# The held-out `test_dataloader` is used here: `val_dataloader` is built from
# the full dataset (training rows included), which would inflate the metrics
# reported as "Test" below.
with torch.no_grad():
    for eval_step, batch in enumerate(test_dataloader, start=1):
        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)
        labels = batch[2].to(device)

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        logits = outputs.logits
        batch_predictions = torch.argmax(logits, dim=1)

        eval_loss += loss.item()
        eval_correct += torch.sum(batch_predictions == labels).item()

        predictions.extend(batch_predictions.tolist())
        true_labels.extend(labels.tolist())

        # Running metrics. The loss is averaged over the evaluation batches
        # seen so far (previously it was divided by the stale `step` left over
        # from the training loop). The accuracy comes from the running count
        # of correct predictions, not from rescoring every earlier prediction
        # on each batch.
        iteration_accuracy.append(eval_correct / len(true_labels))
        iteration_loss.append(eval_loss / eval_step)

# Calculate the testing time
testing_time = time.time() - start_time

# Compute the accuracy, precision, recall, and F1 score (support-weighted averages)
acc = accuracy_score(true_labels, predictions)
precision, recall, f1, _ = precision_recall_fscore_support(true_labels, predictions, average='weighted')

# Print the testing time
print("Testing Time: {:.2f} seconds".format(testing_time))

In [None]:
import matplotlib.pyplot as plt

# Two side-by-side panels of the running evaluation metrics: accuracy (left)
# and loss (right), one point per evaluation batch.

iterations = range(1, len(iteration_accuracy) + 1)  # 1-based batch numbers

fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))
accuracy_ax, loss_ax = ax

# Left panel: running accuracy
line_accuracy = accuracy_ax.plot(iterations, iteration_accuracy, label='Testing Accuracy')[0]
accuracy_ax.set_xlabel("Iteration")
accuracy_ax.set_ylabel("Testing Accuracy")
accuracy_ax.legend(handles=[line_accuracy], loc='upper right')

# Right panel: running loss
line_loss = loss_ax.plot(iterations, iteration_loss, color='red', label='Testing Loss')[0]
loss_ax.set_xlabel("Iteration")
loss_ax.set_ylabel("Testing Loss")
loss_ax.legend(handles=[line_loss], loc='upper left')

# Display the plots
plt.tight_layout()
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Single-panel view of the running evaluation accuracy and loss, one point per
# evaluation batch, drawn on shared axes.

iterations = range(1, len(iteration_accuracy) + 1)  # 1-based batch numbers

fig, ax = plt.subplots(figsize=(8, 4))

# Draw both curves; the legend tells them apart.
eval_curves = (
    (iteration_accuracy, 'Testing Accuracy', 'blue'),
    (iteration_loss, 'Testing Loss', 'red'),
)
for curve_values, curve_label, curve_color in eval_curves:
    ax.plot(iterations, curve_values, label=curve_label, color=curve_color)

ax.set_xlabel("Iteration")
ax.set_ylabel("Accuracy and Loss Value")
ax.legend()

# Display the plot
plt.tight_layout()
plt.show()

In [None]:
# Headline metrics from the evaluation cell (precision/recall/F1 are the
# support-weighted averages computed there), rounded to four decimals.
metric_lines = [
    f'Test Accuracy: {acc:.4f}',
    f'Test Precision: {precision:.4f}',
    f'Test Recall: {recall:.4f}',
    f'Test F1 Score: {f1:.4f}',
]
print('\n'.join(metric_lines))

from sklearn.metrics import confusion_matrix

# Confusion matrix of true vs. predicted class ids
# (rows: true labels, columns: predictions).
confusion = confusion_matrix(y_true=true_labels, y_pred=predictions)
print(confusion)

In [None]:
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, classification_report


# Per-class precision/recall/F1 table for the fine-tuned classifier, with the
# five classes shown as star ratings.
target_names = ['1 Star', '2 Star', '3 Star','4 Star','5 Star']
report_text = classification_report(true_labels, predictions, target_names=target_names)
print(report_text)

# Confusion matrix, first as raw numbers ...
cm = confusion_matrix(true_labels, predictions)
print("Confusion matrix:\n", cm)

# ... then as an annotated heatmap (rows: true labels, columns: predictions).
heatmap_ax = sns.heatmap(cm, annot=True, fmt='g', linewidths=.2, cmap='RdBu')
heatmap_ax.set_xlabel('Predicted label')
heatmap_ax.set_ylabel('True label')
plt.show()

In [None]:
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [None]:
# Precision, recall and F1 under two averaging schemes:
#   macro - unweighted mean of the per-class scores
#   micro - computed from the pooled counts over all classes
metric_functions = (precision_score, recall_score, f1_score)

precision_macro, recall_macro, f1_macro = [
    metric_fn(true_labels, predictions, average='macro') for metric_fn in metric_functions
]
precision_micro, recall_micro, f1_micro = [
    metric_fn(true_labels, predictions, average='micro') for metric_fn in metric_functions
]

# Print metrics (`acc` is the overall accuracy from the evaluation cell)
output_lines = [
    f'Accuracy: {acc}',
    'Macro Metrics:',
    f'Precision: {precision_macro}',
    f'Recall: {recall_macro}',
    f'F1-Score: {f1_macro}',
    'Micro Metrics:',
    f'Precision: {precision_micro}',
    f'Recall: {recall_micro}',
    f'F1-Score: {f1_micro}',
]
print('\n'.join(output_lines))

In [None]:
# Per-class breakdown of the evaluation results.
# `true_labels` / `predictions` are the lists filled by the evaluation cell.

# Classification report as a dict: one entry per class name, plus the
# 'accuracy', 'macro avg' and 'weighted avg' summary entries.
target_names = ['1', '2', '3', '4', '5']
report = classification_report(true_labels, predictions, target_names=target_names, output_dict=True)

# Calculate accuracy separately
accuracy = accuracy_score(true_labels, predictions)

# classification_report assigns `target_names` positionally to the sorted
# label values, so rebuild that name -> label-id mapping here.
y_true = np.asarray(true_labels)
y_pred = np.asarray(predictions)
class_ids = dict(zip(target_names, np.unique(np.concatenate([y_true, y_pred]))))

# Print accuracy, precision, recall, f1-score and support for each class
for label, metrics in report.items():
    if label == 'accuracy':
        continue  # scalar entry, printed once at the end
    print(f'Class: {label}')
    if label in class_ids:
        # One-vs-rest accuracy: the fraction of examples on which "is this
        # class" agrees between truth and prediction. The previous value,
        # support * recall, was the count of correctly predicted examples of
        # the class, not an accuracy. The averaged rows have no class id and
        # therefore no class accuracy.
        is_true_class = y_true == class_ids[label]
        is_pred_class = y_pred == class_ids[label]
        class_accuracy = np.mean(is_true_class == is_pred_class)
        print(f'Class Accuracy: {class_accuracy}')
    print(f'Precision: {metrics["precision"]}')
    print(f'Recall: {metrics["recall"]}')
    print(f'F1-score: {metrics["f1-score"]}')
    print(f'Support: {metrics["support"]}')


# Print overall accuracy
print(f'Accuracy: {accuracy}')