In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder
import random
import time
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
# Batches and the model are moved to this device by the cells below.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Raw CSVs have no header row; the two columns are the class index and the review text.
TRAIN_CSV = '/content/drive/MyDrive/train.csv'
TEST_CSV = '/content/drive/MyDrive/test.csv'


def _load_reviews(csv_path):
    """Read one review CSV and return a frame with `Review` and `Sentiment` columns.

    The review text is un-escaped (doubled quotes and literal backslash-n
    sequences), and the numeric class index is mapped to a string label:
    class 1 becomes 'Negative', every other class becomes 'Positive'.
    """
    frame = pd.read_csv(csv_path, header=None, names=['Class', 'Review'])
    # regex=False makes both replacements literal on every pandas version.
    # With regex=True the pattern '\\n' is the regex `\n`, which matches a real
    # newline rather than the two-character escape sequence, so nothing changes.
    frame['Review'] = (
        frame['Review']
        .str.replace('""', '"', regex=False)
        .str.replace('\\n', '\n', regex=False)
    )
    # Convert class index to negative and positive labels
    frame['Sentiment'] = frame['Class'].apply(lambda x: 'Negative' if x == 1 else 'Positive')
    return frame.drop('Class', axis=1)


train_df = _load_reviews(TRAIN_CSV)
test_df = _load_reviews(TEST_CSV)

# Work on a prefix of each file to keep tokenization and fine-tuning time manageable.
training_val = 50000
testing_val = 2500
test_data = test_df[:testing_val]
train_subset = train_df[:training_val]
X = train_subset.Review.values
y = train_subset.Sentiment.values
# Hold out 10% of the training prefix for validation; fixed seed for a repeatable split.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=2020)

In [None]:
# Load the tokenizer that belongs to the 'bert-base-uncased' checkpoint.
# It is used as a module-level global by the tokenization code in later cells.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [10]:
# Create a function to tokenize a set of texts
def preprocessing_for_bert(data, max_len=256):
    """Tokenize a collection of texts into fixed-length BERT inputs.

    Uses the module-level `tokenizer`.

    Args:
        data: Iterable of strings (list, NumPy array, pandas Series, ...).
        max_len: Length every sequence is padded or truncated to, including
            the `[CLS]` and `[SEP]` special tokens. Defaults to 256.

    Returns:
        A pair `(input_ids, attention_masks)` of tensors, each with one row
        per input text and `max_len` columns.
    """
    input_ids = []
    attention_masks = []
    for sent in data:
        encoded_sent = tokenizer(
            sent,
            add_special_tokens=True,      # Add `[CLS]` and `[SEP]`
            max_length=max_len,           # Target length for padding/truncation
            padding='max_length',         # Replaces the deprecated `pad_to_max_length=True`
            truncation=True,              # Explicitly cut texts longer than `max_len`
            return_tensors='pt',          # Return PyTorch tensors of shape (1, max_len)
            return_attention_mask=True,   # Also return the padding mask
        )
        input_ids.append(encoded_sent['input_ids'])
        attention_masks.append(encoded_sent['attention_mask'])
    # Stack the per-sentence (1, max_len) tensors into (num_texts, max_len).
    input_ids = torch.cat(input_ids)
    attention_masks = torch.cat(attention_masks)
    return input_ids, attention_masks

# Encode the reviews that are actually used downstream and record the longest
# encoded length in each set.
# NOTE(review): every sequence is padded/truncated to 256 tokens here, so these
# maxima will always come out as 256; drop the padding if the intent is to
# measure real review lengths.
encoded_train_df = [
    tokenizer.encode(sent, add_special_tokens=True, max_length=256,
                     padding='max_length', truncation=True)
    for sent in X
]
max_len_train = max(len(sent) for sent in encoded_train_df)
# Iterate over the review texts: looping over the DataFrame itself yields its
# column names ('Review', 'Sentiment'), not the rows. `test_data` is the test
# subset that is evaluated later, mirroring how `X` is the training subset.
encoded_test_df = [
    tokenizer.encode(sent, add_special_tokens=True, max_length=256,
                     padding='max_length', truncation=True)
    for sent in test_data.Review.values
]
max_len_test = max(len(sent) for sent in encoded_test_df)
max_len = max(max_len_train, max_len_test)

# Token ids of the first review as a flat Python list (input_ids only, batch axis squeezed away).
# Not referenced again in the visible cells.
token_ids = list(preprocessing_for_bert([X[0]])[0].squeeze().numpy())
# Tokenize the training and validation splits into (input_ids, attention_mask) tensors.
train_inputs, train_masks = preprocessing_for_bert(X_train)
val_inputs, val_masks = preprocessing_for_bert(X_val)

# Turn the string sentiment labels into integer class ids. The encoder is fitted
# on the training labels only and then re-used for the validation labels.
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_val_encoded = label_encoder.transform(y_val)

batch_size = 32

# Label tensors
train_labels = torch.tensor(y_train_encoded)
val_labels = torch.tensor(y_val_encoded)

# Datasets: each item is (input_ids, attention_mask, label)
train_data = TensorDataset(train_inputs, train_masks, train_labels)
val_data = TensorDataset(val_inputs, val_masks, val_labels)

# Training batches are drawn in random order; validation keeps the original order
train_sampler = RandomSampler(train_data)
val_sampler = SequentialSampler(val_data)

train_dataloader = DataLoader(train_data, batch_size=batch_size, sampler=train_sampler)
val_dataloader = DataLoader(val_data, batch_size=batch_size, sampler=val_sampler)

In [None]:
class BertClassifier(nn.Module):
    """BERT encoder followed by a small feed-forward classification head.

    Args:
        freeze_bert: When True, the BERT weights are frozen and only the
            classification head is trained.
        hidden_size: Width of the head's hidden layer (default 50).
        num_labels: Number of output classes (default 2).
        dropout: Dropout probability applied inside the head (default 0.5).
        pretrained_name: Checkpoint passed to `BertModel.from_pretrained`
            (default 'bert-base-uncased').
    """

    def __init__(self, freeze_bert=False, hidden_size=50, num_labels=2,
                 dropout=0.5, pretrained_name='bert-base-uncased'):
        super().__init__()
        # Instantiate BERT model
        self.bert = BertModel.from_pretrained(pretrained_name)
        # Take the encoder width from the loaded config instead of hard-coding
        # 768, so other checkpoints work without touching this class.
        encoder_dim = self.bert.config.hidden_size
        # One-hidden-layer feed-forward classifier
        self.classifier = nn.Sequential(
            nn.Linear(encoder_dim, hidden_size),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_size, num_labels),
        )
        # Freeze the BERT model
        if freeze_bert:
            for param in self.bert.parameters():
                param.requires_grad = False

    def forward(self, input_ids, attention_mask):
        """Return the classification logits for a batch.

        Args:
            input_ids: Token-id tensor, forwarded to BERT as `input_ids`.
            attention_mask: Mask tensor, forwarded to BERT as `attention_mask`.

        Returns:
            Output of the classification head applied to the first-position
            (`[CLS]`) vector of BERT's first output.
        """
        outputs = self.bert(input_ids=input_ids,
                            attention_mask=attention_mask)
        # outputs[0] is indexed as [batch, position, feature]; keep position 0 only.
        last_hidden_state_cls = outputs[0][:, 0, :]
        logits = self.classifier(last_hidden_state_cls)
        return logits

In [18]:
# Number of fine-tuning epochs. It has to exist before the scheduler is built,
# because the scheduler decays the learning rate over `batches * epochs` steps;
# without this assignment the cell raises NameError on a fresh kernel.
epochs = 2

loss_fn = nn.CrossEntropyLoss()
bert_classifier = BertClassifier(freeze_bert=False)
bert_classifier.to(device)
optimizer = AdamW(bert_classifier.parameters(), lr=5e-5, eps=1e-8)
# One scheduler step is taken per training batch (see `train`), so the total
# number of steps is batches-per-epoch times epochs.
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=total_steps)

def train(model, train_dataloader, val_dataloader=None, epochs=4, evaluation=False):
    """Fine-tune `model` and print a running loss table.

    Relies on the module-level `loss_fn`, `optimizer`, `scheduler` and `device`,
    and on `evaluate` when `evaluation` is enabled.

    Args:
        model: Module called as `model(input_ids, attention_mask)` returning logits.
        train_dataloader: Yields `(input_ids, attention_mask, labels)` batches.
        val_dataloader: Batches in the same format, used when `evaluation` is truthy.
        epochs: Number of passes over `train_dataloader`.
        evaluation: When truthy, run `evaluate` on `val_dataloader` after each epoch.

    Raises:
        ValueError: If `evaluation` is requested without a `val_dataloader`.
    """
    # Fail before any training happens instead of crashing after a full epoch.
    if evaluation and val_dataloader is None:
        raise ValueError("evaluation=True requires a val_dataloader")

    num_batches = len(train_dataloader)
    for epoch_i in range(epochs):
        print(f"{'Epoch':^7} | {'Batch':^7} | {'Train Loss':^12} | {'Val Loss':^10} | {'Val Acc':^9}")
        print("-" * 70)
        # total_loss covers the whole epoch; batch_loss/batch_counts cover only
        # the batches since the last progress line.
        total_loss, batch_loss, batch_counts = 0, 0, 0
        model.train()
        for step, batch in enumerate(train_dataloader):
            batch_counts += 1
            b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)
            model.zero_grad()
            logits = model(b_input_ids, b_attn_mask)
            loss = loss_fn(logits, b_labels)
            batch_loss += loss.item()
            total_loss += loss.item()
            loss.backward()
            # Clip the gradient norm to 1.0 before the update.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
            # Report every 20 batches and once more on the final batch.
            if (step % 20 == 0 and step != 0) or (step == num_batches - 1):
                print(f"{epoch_i + 1:^7} | {step:^7} | {batch_loss / batch_counts:^12.6f}")
                batch_loss, batch_counts = 0, 0
        avg_train_loss = total_loss / num_batches
        print("-" * 70)
        if evaluation:
            val_loss, val_accuracy = evaluate(model, val_dataloader)
            print(f"{epoch_i + 1:^7} | {'-':^7} | {avg_train_loss:^12.6f} | {val_loss:^10.6f} | {val_accuracy:^9.2f}")
            print("-" * 70)
            print("\n")

def evaluate(model, val_dataloader):
    """Compute the mean loss and accuracy of `model` over `val_dataloader`.

    Relies on the module-level `loss_fn` and `device`.

    Both metrics are averaged per example, so a smaller final batch does not
    get the same weight as a full one.

    Args:
        model: Module called as `model(input_ids, attention_mask)` returning logits.
        val_dataloader: Yields `(input_ids, attention_mask, labels)` batches.

    Returns:
        `(val_loss, val_accuracy)` where `val_accuracy` is a percentage in
        [0, 100]. Both are NaN when the dataloader yields no examples.
    """
    model.eval()
    loss_sum = 0.0
    correct = 0
    seen = 0
    for batch in val_dataloader:
        b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)
        with torch.no_grad():
            logits = model(b_input_ids, b_attn_mask)
            loss = loss_fn(logits, b_labels)
        n_examples = b_labels.size(0)
        # `loss_fn` is created as nn.CrossEntropyLoss() (mean over the batch),
        # so scale by the batch size to recover the per-batch loss sum.
        loss_sum += loss.item() * n_examples
        preds = torch.argmax(logits, dim=1).flatten()
        correct += (preds == b_labels).sum().item()
        seen += n_examples
    if seen == 0:
        return float("nan"), float("nan")
    val_loss = loss_sum / seen
    val_accuracy = 100.0 * correct / seen
    return val_loss, val_accuracy

In [None]:
# Fine-tune for two epochs, printing validation loss/accuracy after each one.
# Keep this value in step with the `epochs` used to size the LR scheduler's
# `num_training_steps`, otherwise the schedule and the loop disagree.
epochs = 2
train(
    model=bert_classifier,
    train_dataloader=train_dataloader,
    val_dataloader=val_dataloader,
    epochs=epochs,
    evaluation=True,
)

In [21]:
def bert_predict(model, test_dataloader):
    """Run `model` over `test_dataloader` and return class probabilities.

    Relies on the module-level `device`. Only the first two tensors of each
    batch (input ids and attention mask) are used, so batches may or may not
    carry labels.

    Returns:
        NumPy array with one row per example, holding the softmax of the
        logits along dim 1.
    """
    model.eval()
    logit_chunks = []
    for batch in test_dataloader:
        # Move the batch to the device and keep just (input_ids, attention_mask).
        ids, mask = tuple(tensor.to(device) for tensor in batch)[:2]
        with torch.no_grad():
            batch_logits = model(ids, mask)
        logit_chunks.append(batch_logits)
    stacked_logits = torch.cat(logit_chunks, dim=0)
    return F.softmax(stacked_logits, dim=1).cpu().numpy()

In [None]:
# Predicted class probabilities for the validation set.
probs = bert_predict(bert_classifier, val_dataloader)
# Column 1 is the probability of the class encoded as 1; predict it at >= 0.5.
preds = probs[:, 1]
y_pred = np.where(preds >= 0.5, 1, 0)
# Score against the encoded validation labels. `val_dataloader` uses a
# SequentialSampler, so predictions come back in the same order as `y_val_encoded`.
# (`y_true` is only assigned in the test cell further down and is not defined here.)
accuracy = accuracy_score(y_val_encoded, y_pred)
print(f'Accuracy: {accuracy*100:.2f}%')

In [None]:
def set_seed(seed_value=42):
    """Seed Python, NumPy and PyTorch (CPU and all CUDA devices) RNGs."""
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)


def initialize_model(epochs=4, dataloader=None):
    """Create a fresh classifier together with its optimizer and LR scheduler.

    Args:
        epochs: Number of epochs the scheduler should span.
        dataloader: Dataloader that will be trained on; its length sets the
            number of scheduler steps per epoch. Defaults to the module-level
            `train_dataloader`.

    Returns:
        `(model, optimizer, scheduler)` with the model already moved to `device`.
    """
    if dataloader is None:
        dataloader = train_dataloader
    model = BertClassifier(freeze_bert=False)
    model.to(device)
    model_optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-8)
    model_scheduler = get_linear_schedule_with_warmup(model_optimizer,
                                                      num_warmup_steps=0,
                                                      num_training_steps=len(dataloader) * epochs)
    return model, model_optimizer, model_scheduler


# Concatenate the train set and the validation set
full_train_data = torch.utils.data.ConcatDataset([train_data, val_data])
full_train_sampler = RandomSampler(full_train_data)
full_train_dataloader = DataLoader(full_train_data, sampler=full_train_sampler, batch_size=32)
# Train the Bert Classifier on the entire training data
set_seed(42)
# Rebinding the module-level `optimizer` and `scheduler` matters: `train` reads
# them as globals, so they must belong to the freshly created model. The
# scheduler is sized from the full dataloader, which has more batches per epoch.
bert_classifier, optimizer, scheduler = initialize_model(epochs=2, dataloader=full_train_dataloader)
train(bert_classifier, full_train_dataloader, epochs=2)

In [None]:
# Tokenize the held-out test reviews and wrap them in an ordered dataloader.
test_inputs, test_masks = preprocessing_for_bert(test_data.Review)
test_dataset = TensorDataset(test_inputs, test_masks)
test_sampler = SequentialSampler(test_dataset)
test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=32)
probs = bert_predict(bert_classifier, test_dataloader)
# NOTE(review): the validation cell predicts class 1 at >= 0.5, while this cell
# requires > 0.9 -- confirm the stricter cut-off is intentional.
threshold = 0.9

preds = np.where(probs[:, 1] > threshold, 1, 0)
y_true = test_data.Sentiment
# Re-use the encoder fitted on the training labels. Refitting it here would
# rebuild the label -> id mapping from whatever labels the test subset contains.
y_true = label_encoder.transform(y_true)
accuracy = accuracy_score(y_true, preds)
print(f"Accuracy: {accuracy:.2f}")
conf_matrix = confusion_matrix(y_true, preds)
print("Confusion Matrix:")
print(conf_matrix)