In [37]:
import torch
import torch.nn as nn
from torch. optim import AdamW
from transformers import BertTokenizer, BertModel, BertForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from torch.utils.data import DataLoader, TensorDataset, random_split, SequentialSampler
import pandas as pd
import numpy as np
from data_preprocessing import *

In [5]:
data = pd.read_excel(open('Consolidated.xlsx', 'rb'), sheet_name='Consolidated')
data

Unnamed: 0,S.No,Filepath,Foldername,Filename,Hotel,Review,Target
0,1,positive_polarity\truthful_from_TripAdvisor,fold1,t_hilton_1.txt,hilton,We stay at Hilton for 4 nights last march. It ...,True Positive
1,2,positive_polarity\truthful_from_TripAdvisor,fold1,t_hilton_10.txt,hilton,This is a stunning hotel in an excellent locat...,True Positive
2,3,positive_polarity\truthful_from_TripAdvisor,fold1,t_hilton_11.txt,hilton,Staying at this hotel was one of the high poin...,True Positive
3,4,positive_polarity\truthful_from_TripAdvisor,fold1,t_hilton_12.txt,hilton,"went to chicago for a week in may, decided to ...",True Positive
4,5,positive_polarity\truthful_from_TripAdvisor,fold1,t_hilton_13.txt,hilton,We stayed here from Nov. 30 to Dec 2 and had a...,True Positive
...,...,...,...,...,...,...,...
3195,3196,negative_polarity\deceptive_from_MTurk,fold5,d_palmer_5.txt,palmer,"Overall, the hotel was okay. Though I did have...",False Negative
3196,3197,negative_polarity\deceptive_from_MTurk,fold5,d_palmer_6.txt,palmer,I was not pleased with my recent stay at the P...,False Negative
3197,3198,negative_polarity\deceptive_from_MTurk,fold5,d_palmer_7.txt,palmer,Our visit started off on the wrong foot when w...,False Negative
3198,3199,negative_polarity\deceptive_from_MTurk,fold5,d_palmer_8.txt,palmer,"Though grand and having a brand, this hotel se...",False Negative


In [7]:
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
reviews = data["Review"].tolist()
labels = data['Target'].apply(lambda x:1 if x==1 else 0).tolist()

encoded_data = tokenizer(reviews, padding= True, truncation= True, return_tensors='pt', max_length=128)
input_ids= encoded_data['input_ids']
attention_mask= encoded_data['attention_mask']
labels = torch.tensor(labels)



dataset = TensorDataset(input_ids, attention_mask, labels)

In [41]:
# split the dataset into training and testing set 
demo_size = int(0.1 * len(dataset))

train_size = int(0.7 * len(dataset))
test_size = int(0.2 * len(dataset))
demo_dataset, train_dataset, test_dataset = random_split(dataset, [demo_size, train_size, test_size])

In [42]:
#Create dataloader for training and testing set with automatic padding.test_size

train_dataloader = DataLoader(train_dataset, batch_size=8, sampler=SequentialSampler(train_dataset)) 
test_dataloader = DataLoader(test_dataset, batch_size=8, sampler=SequentialSampler(test_dataset)) 

# Define the optimiser 
optimizer = AdamW(model.parameters(), lr=5e-5)

In [44]:
# training loop with gradiend accumulation

num_epochs = 15
accumulation_steps = 4
for epoch in range(num_epochs):
    model.train()
    total_loss = 0 
    for step, batch in enumerate(train_dataloader):
        inputs = {'input_ids': batch[0], 'attention_mask': batch[1],
                'labels': batch[2]}
        outputs = model(**inputs)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
    
    if (step+1 )% accumulation_steps ==  0:
        optimizer.step()
        optimizer.zero_grad()


In [None]:
model.eval()
all_predictions = []
all_labels = []
with torch.no_grad():
    total_correct = 0
    total_samples = 0
    for batch in test_dataloader:
        inputs = {'input_ids': batch[0], 'attention_mask': batch [1], 'labels':batch [2]}
        outputs = model(**inputs)
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=1)
        total_correct += (predictions == batch[2]).sum().item()
        total_samples += batch[2].size(0)
        accuracy = total_correct / total_samples
        all_predictions.extend(predictions.cpu().numpy())
        all_labels.extend(batch[2].cpu().numpy())
# Print confusion matrix
cm = confusion_matrix(all_labels, all_predictions)
print(f'Epoch {epoch + 1}/{num_epochs}, Confusion Matrix:')
print (cm)
# Print Accuracy and Loss
accuracy = total_correct / total_samples
print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {total_loss}, Test Accuracy: {accuracy * 100:.2f}%')

Epoch 15/15, Confusion Matrix:
[[640]]
Epoch 15/15, Loss: 2.995848383754492, Test Accuracy: 100.00%
