In [8]:
import pandas as pd
import numpy as np
import itertools
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
    classification_report,
)
import seaborn as sns
from sklearn.model_selection import cross_val_score, KFold
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Import necessary libraries
import numpy as np
import pandas as pd
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from collections import defaultdict
from textwrap import wrap
#
# Torch ML libraries
import transformers
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup, RobertaTokenizer, RobertaModel
import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
#from tqdm import tqdm as tqdm
# Misc.
import warnings
# Suppress every warning issued through the `warnings` module for the rest of
# the session (this includes deprecation notices from the libraries above).
warnings.filterwarnings('ignore')

import torch
# Ask PyTorch to release cached, currently unused CUDA memory left over from
# earlier runs in the same kernel.
torch.cuda.empty_cache()
import gc
# Force a garbage-collection pass; note that nothing is explicitly deleted
# before this call.
gc.collect()

# Candidate input CSVs (paths relative to the notebook's working directory).
# Only `a` is read in this cell; `b` and `c` are defined but not loaded here.
a = "../IMDB-Dataset-MideindTranslate-Processed.csv"
b = "../IMDB-Dataset-GoogleTranslate-Processed.csv"
c = "../IMDB-Dataset-Processed.csv"

# Load the selected dataset; later code reads its `review` and `sentiment` columns.
df = pd.read_csv(a)
def convert(sentiment):
    """Encode a sentiment label as an integer.

    Returns 1 when ``sentiment`` equals the string 'positive' and 0 for any
    other value.
    """
    return 1 if sentiment == 'positive' else 0

# Replace the string labels with integers via convert() (1 = 'positive', 0 = otherwise).
df['sentiment'] = df.sentiment.apply(convert)
# Take the first 50 rows only; reviews are cast to a NumPy unicode array.
review, sentiment = df["review"].values.astype('U')[0:50], df["sentiment"][0:50]

# Hold out 33% of that 50-row sample, with a fixed seed for a repeatable split.
x_train, x_test, y_train, y_test = train_test_split(
    review, sentiment, test_size=0.33, random_state=42
)

# Set initial variables and constants
%config InlineBackend.figure_format='retina'


# Random seed for reproducibility (NumPy and PyTorch global generators)
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

# Select the compute device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# NOTE(review): the next line discards the CUDA detection above and forces
# CPU execution — confirm this override is intentional.
device = "cpu"

# Quick sanity output: chosen device, frame shape, first rows.
print(device)
print(df.shape)
print(df.head())


# Set the model name (Hugging Face Hub identifier)
MODEL_NAME = 'mideind/IceBERT'

# Build the tokenizer for MODEL_NAME (a RobertaTokenizer).
# NOTE(review): the recorded training output still warns that truncation was
# not explicitly activated at encode time, so the `truncation=True` passed
# here apparently does not enable it for later encode calls — confirm.
tokenizer = RobertaTokenizer.from_pretrained(MODEL_NAME, truncation=True)


cpu
(50000, 3)
   Unnamed: 0                                             review  sentiment
0           0  gagnrýnendunum hafa nefndur hafa horft bara oz...          1
1           1  dásamlegur lítill framleiðsla kvikmyndatækni m...          1
2           2  finna dásamlegur líða eyða tíma heitri sumarhe...          1
3           3  eiginlega fjölskylda lítill strákur jake vera ...          0
4           4  ást tíma peninga petter mattei sjónrænn glæsil...          1


Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.03M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/595k [00:00<?, ?B/s]

In [10]:
# Load a standalone pretrained encoder for MODEL_NAME.
# NOTE(review): in the visible cells this instance is only used to print its
# hidden size; SentimentClassifier loads its own separate copy of the encoder.
bert_model = RobertaModel.from_pretrained(MODEL_NAME)

# Build the Sentiment Classifier class 
class SentimentClassifier(nn.Module):
    """Pretrained RoBERTa encoder followed by dropout and a linear head.

    The encoder is loaded from ``MODEL_NAME``; the head maps the encoder's
    pooled output (width ``config.hidden_size``) to ``n_classes`` logits.
    """

    def __init__(self, n_classes):
        """Load the encoder and build the dropout + linear output layers.

        Args:
            n_classes: Number of output logits produced per input sequence.
        """
        super().__init__()
        encoder = RobertaModel.from_pretrained(MODEL_NAME)
        self.bert = encoder
        self.drop = nn.Dropout(p=0.3)
        self.out = nn.Linear(encoder.config.hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        """Return the classification logits for a batch.

        Args:
            input_ids: Token id tensor forwarded to the encoder.
            attention_mask: Attention mask tensor forwarded to the encoder.

        Returns:
            The output of the linear head applied to the dropped-out pooled
            encoder output.
        """
        # With return_dict=False the encoder returns a plain tuple; only its
        # second element (the pooled output) is used.
        _sequence_output, pooled = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=False,
        )
        regularised = self.drop(pooled)
        logits = self.out(regularised)
        return logits
    
# Human-readable label names; list position is the class index
# (0 = negative, 1 = positive), matching the integers produced by convert().
class_names = ['negative', 'positive']

# Token length passed to the tokenizer as max_length; every encoded review is
# padded to this length.
MAX_LEN = 512

class GPReviewDataset(Dataset):
    """Map-style dataset that tokenizes one review per item.

    Args:
        reviews: Sequence of review texts, indexable by integer position.
        targets: Sequence of integer labels aligned with ``reviews``.
        tokenizer: Object exposing a Hugging Face style ``encode_plus`` method.
        max_len: Length every encoded review is padded or truncated to.
    """

    def __init__(self, reviews, targets, tokenizer, max_len):
        self.reviews = reviews
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of reviews."""
        return len(self.reviews)

    def __getitem__(self, item):
        """Encode the review at position ``item``.

        Returns:
            dict with keys ``review_text`` (str), ``input_ids`` and
            ``attention_mask`` (flattened tensors from the tokenizer), and
            ``targets`` (scalar ``torch.long`` tensor).
        """
        review = str(self.reviews[item])
        target = self.targets[item]

        # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`,
        # and `truncation=True` makes the cut-off at `max_len` explicit instead
        # of relying on the tokenizer's warn-and-fallback behaviour.
        encoding = self.tokenizer.encode_plus(
            review,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'review_text': review,
            # The tokenizer returns a leading batch dimension of 1 with
            # return_tensors='pt'; flatten() drops it so batches stack cleanly.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'targets': torch.tensor(target, dtype=torch.long)
        }

# 80/10/10 split: carve off 20% of the rows, then halve that hold-out into
# validation and test. The same seed keeps both splits repeatable.
df_train, df_test = train_test_split(df, test_size=0.2, random_state=RANDOM_SEED)
df_val, df_test = train_test_split(df_test, test_size=0.5, random_state=RANDOM_SEED)

# Keep only the first 50 rows of each split (small debugging-sized subsets).
# Each split is truncated exactly once; the test split was previously sliced twice.
df_train = df_train.iloc[:50]
df_val = df_val.iloc[:50]
df_test = df_test.iloc[:50]

print(df_train.shape, df_val.shape, df_test.shape)



def create_data_loader(df, tokenizer, max_len, batch_size, shuffle=False):
    """Wrap a review DataFrame in a ``DataLoader`` of tokenized batches.

    Args:
        df: DataFrame with a ``review`` column (text) and a ``sentiment``
            column (integer labels).
        tokenizer: Tokenizer handed to ``GPReviewDataset``.
        max_len: Length each encoded review is padded to.
        batch_size: Number of reviews per batch.
        shuffle: When True, the loader reshuffles the rows every epoch
            (useful for the training split). Defaults to False, which keeps
            the original sequential order.

    Returns:
        A ``DataLoader`` over a ``GPReviewDataset`` built from ``df``.
    """
    ds = GPReviewDataset(
        reviews=df.review.to_numpy(),
        targets=df.sentiment.to_numpy(),
        tokenizer=tokenizer,
        max_len=max_len
    )

    return DataLoader(
        ds,
        batch_size=batch_size,
        shuffle=shuffle,
        # Load in the main process (no worker subprocesses).
        num_workers=0
    )



# Create train, val and test data loaders (all share the same batch size)
BATCH_SIZE = 8
train_data_loader = create_data_loader(df_train, tokenizer, MAX_LEN, BATCH_SIZE)
val_data_loader = create_data_loader(df_val, tokenizer, MAX_LEN, BATCH_SIZE)
test_data_loader = create_data_loader(df_test, tokenizer, MAX_LEN, BATCH_SIZE)

# Instantiate the classifier (one logit per class name) and move it to the device
model = SentimentClassifier(len(class_names))
model = model.to(device)
# Number of hidden units
# NOTE(review): this is read from the separately loaded `bert_model`, not from `model.bert`.
print(bert_model.config.hidden_size)
# Number of training epochs
EPOCHS = 10

# Optimizer: the AdamW imported from `transformers` at the top of the notebook
# NOTE(review): `correct_bias` is a keyword of that implementation — confirm
# before swapping in a different AdamW.
optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)

# The scheduler is stepped once per batch, so the schedule length is
# batches-per-epoch times the number of epochs.
total_steps = len(train_data_loader) * EPOCHS

# Linear schedule with zero warmup steps over `total_steps`
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

# Set the loss function (cross-entropy over the class logits)
loss_fn = nn.CrossEntropyLoss().to(device)

# Function for a single training iteration
def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler, n_examples):
    """Run one optimisation pass over ``data_loader``.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            and returning per-class logits.
        data_loader: Iterable of batches; each batch is a mapping with
            ``input_ids``, ``attention_mask`` and ``targets`` tensors.
        loss_fn: Callable ``loss_fn(outputs, targets)`` returning a scalar loss.
        optimizer: Optimizer over ``model``'s parameters.
        device: Device the batch tensors are moved to.
        scheduler: Learning-rate scheduler, stepped once per batch.
        n_examples: Denominator used for the accuracy.

    Returns:
        ``(accuracy, mean_loss)`` where ``accuracy`` is a 0-dim double tensor
        (correct predictions / ``n_examples``) and ``mean_loss`` is the mean of
        the per-batch losses (NaN when the loader yields no batches).
    """
    model = model.train()
    losses = []
    # Tensor accumulator, so `.double()` below also works when the loader is
    # empty (a plain int 0 has no `.double()`).
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)

    for d in data_loader:
        input_ids = d["input_ids"].to(device)
        attention_mask = d["attention_mask"].to(device)
        targets = d["targets"].to(device)

        # Clear gradients before the forward/backward pass so that gradients
        # left over from an interrupted or earlier run cannot leak into this step.
        optimizer.zero_grad()

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        # Predicted class = index of the largest logit.
        _, preds = torch.max(outputs, dim=1)
        loss = loss_fn(outputs, targets)
        correct_predictions += torch.sum(preds == targets)
        losses.append(loss.item())

        # Backward prop
        loss.backward()

        # Clip the global gradient norm, then update weights and learning rate.
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()

    mean_loss = np.mean(losses) if losses else np.float64("nan")
    return correct_predictions.double() / n_examples, mean_loss

def eval_model(model, data_loader, loss_fn, device, n_examples):
    """Evaluate ``model`` on ``data_loader`` without computing gradients.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            and returning per-class logits.
        data_loader: Iterable of batches; each batch is a mapping with
            ``input_ids``, ``attention_mask`` and ``targets`` tensors.
        loss_fn: Callable ``loss_fn(outputs, targets)`` returning a scalar loss.
        device: Device the batch tensors are moved to.
        n_examples: Denominator used for the accuracy.

    Returns:
        ``(accuracy, mean_loss)`` where ``accuracy`` is a 0-dim double tensor
        (correct predictions / ``n_examples``) and ``mean_loss`` is the mean of
        the per-batch losses (NaN when the loader yields no batches).
    """
    # Evaluation mode (e.g. disables dropout in the classifier).
    model = model.eval()

    losses = []
    # Tensor accumulator, so `.double()` below also works when the loader is
    # empty (a plain int 0 has no `.double()`).
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)

    with torch.no_grad():
        for d in data_loader:
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            targets = d["targets"].to(device)

            # Get model outputs
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )

            # Predicted class = index of the largest logit.
            _, preds = torch.max(outputs, dim=1)
            loss = loss_fn(outputs, targets)

            correct_predictions += torch.sum(preds == targets)
            losses.append(loss.item())

    mean_loss = np.mean(losses) if losses else np.float64("nan")
    return correct_predictions.double() / n_examples, mean_loss

Some weights of RobertaModel were not initialized from the model checkpoint at mideind/IceBERT and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


(50, 3) (50, 3) (50, 3)


Some weights of RobertaModel were not initialized from the model checkpoint at mideind/IceBERT and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


768


In [11]:
%%time



# Per-epoch metrics, keyed by 'train_acc', 'train_loss', 'val_acc', 'val_loss'.
history = defaultdict(list)
# Best validation accuracy seen so far; used to decide when to checkpoint.
best_accuracy = 0

for epoch in range(EPOCHS):
    
    # Show details 
    print(f"Epoch {epoch + 1}/{EPOCHS}")
    print("-" * 10)
    
    # One full pass over the training loader (updates weights and LR schedule).
    train_acc, train_loss = train_epoch(
        model,
        train_data_loader,
        loss_fn,
        optimizer,
        device,
        scheduler,
        len(df_train)
    )
    
    print(f"Train loss {train_loss} accuracy {train_acc}")
    
    # Get model performance (accuracy and loss) on the validation split
    val_acc, val_loss = eval_model(
        model,
        val_data_loader,
        loss_fn,
        device,
        len(df_val)
    )
    
    print(f"Val   loss {val_loss} accuracy {val_acc}")
    print()
    
    # Record this epoch's metrics.
    # NOTE(review): the accuracies are stored exactly as returned by
    # train_epoch / eval_model (tensors, per their definitions in this notebook).
    history['train_acc'].append(train_acc)
    history['train_loss'].append(train_loss)
    history['val_acc'].append(val_acc)
    history['val_loss'].append(val_loss)
    
    # Checkpoint the weights whenever validation accuracy strictly improves
    if val_acc > best_accuracy:
        torch.save(model.state_dict(), 'best_model_state.bin')
        best_accuracy = val_acc

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Epoch 1/10
----------
Train loss 0.7483303887503487 accuracy 0.34
Val   loss 0.6735145790236337 accuracy 0.74

Epoch 2/10
----------
Train loss 0.6936612725257874 accuracy 0.5
Val   loss 0.644334214074271 accuracy 0.68

Epoch 3/10
----------
Train loss 0.6233732785497393 accuracy 0.62
Val   loss 0.6997210213116237 accuracy 0.5

Epoch 4/10
----------
Train loss 0.5775763945920127 accuracy 0.78
Val   loss 0.7649777701922825 accuracy 0.5

Epoch 5/10
----------
Train loss 0.5241912603378296 accuracy 0.78
Val   loss 0.7044559078557151 accuracy 0.58

Epoch 6/10
----------
Train loss 0.33086344812597546 accuracy 0.86
Val   loss 0.7828562131949833 accuracy 0.54

Epoch 7/10
----------
Train loss 0.2075540923646518 accuracy 0.94
Val   loss 0.923834102494376 accuracy 0.6

Epoch 8/10
----------
Train loss 0.10265254974365234 accuracy 0.98
Val   loss 1.1798559938158308 accuracy 0.6

Epoch 9/10
----------
Train loss 0.08359540386923722 accuracy 0.98
Val   loss 1.4061129902090346 accuracy 0.58

Epoch