In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score, KFold
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import numpy as np
import pandas as pd
from pylab import rcParams
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from collections import defaultdict
from textwrap import wrap
from transformers import  AdamW, get_linear_schedule_with_warmup, RobertaTokenizer, RobertaModel
import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader

# Random seed for reproducibilty
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

# Set GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# device = "cpu"

pdf_src = '../IMDB-Dataset-GoogleTranslate-proccessed-nefnir.csv'

df = pd.read_csv(pdf_src)
df.drop(['Unnamed: 0'], axis=1, inplace=True)

def convert(sentiment):
    return 1 if sentiment == 'positive' else 0

df['sentiment'] = df.sentiment.apply(convert)
df['review'] = df.review.apply(lambda x: x.replace("_NEG", ""))
df['review']
# df = df[0:100]

df_train, df_test = train_test_split(df, test_size=0.3, random_state=RANDOM_SEED)
df_val, df_test = train_test_split(df_test, test_size=0.5, random_state=RANDOM_SEED)

print(df_train.shape, df_val.shape, df_test.shape)

(35000, 2) (7500, 2) (7500, 2)


In [5]:
from transformers import RobertaModel
MODEL_NAME = 'mideind/IceBERT'
model = RobertaModel.from_pretrained(MODEL_NAME)


# Build the Sentiment Classifier class 
class SentimentClassifier(nn.Module):
    
    # Constructor class 
    def __init__(self, n_classes, dropout=0.3):
        super(SentimentClassifier, self).__init__()
        
        self.bert = RobertaModel.from_pretrained(MODEL_NAME)
        
        # The dimensionality of the output from RoBERTa (or most transformers) is defined by hidden_size.
        hidden_size = self.bert.config.hidden_size
        
        # Adding an additional dense layer
        self.fc1 = nn.Linear(hidden_size, hidden_size)
        
        # Batch normalization layer
        self.bn1 = nn.BatchNorm1d(hidden_size)
        
        # Second dense layer (can be thought of as the main classification layer)
        self.fc2 = nn.Linear(hidden_size, n_classes)
        
        # Dropout and Activation
        self.dropout = nn.Dropout(dropout)
        self.activation = nn.GELU()  # Using GELU activation function
    
    def forward(self, input_ids, attention_mask):
        _, pooled_output = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=False
        )
        
        # Pass through the additional dense layer
        x = self.fc1(pooled_output)
        x = self.bn1(x)
        x = self.activation(x)
        x = self.dropout(x)
        
        # Classification layer
        x = self.fc2(x)
        return x

class GPReviewDataset(Dataset):
    # Constructor Function 
    def __init__(self, reviews, targets, tokenizer, max_len):
        self.reviews = reviews
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_len = max_len
    
    # Length magic method
    def __len__(self):
        return len(self.reviews)
    
    # get item magic method
    def __getitem__(self, item):
        review = str(self.reviews[item])
        target = self.targets[item]
        
        # Encoded format to be returned 
        encoding = self.tokenizer.encode_plus(
            review,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        
        return {
            'review_text': review,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'targets': torch.tensor(target, dtype=torch.long)
        }

def create_data_loader(df, tokenizer, max_len, batch_size):
    ds = GPReviewDataset(
        reviews=df.review.to_numpy(),
        targets=df.sentiment.to_numpy(),
        tokenizer=tokenizer,
        max_len=max_len
    )
    
    return DataLoader(
        ds,
        batch_size=batch_size,
        num_workers=0
    )

class_names = ['negative', 'positive']

MAX_LEN = 512

# Create train, test and val data loaders
BATCH_SIZE = 8

tokenizer = RobertaTokenizer.from_pretrained(MODEL_NAME, truncation=True)
train_data_loader = create_data_loader(df_train, tokenizer, MAX_LEN, BATCH_SIZE)
val_data_loader = create_data_loader(df_val, tokenizer, MAX_LEN, BATCH_SIZE)
test_data_loader = create_data_loader(df_test, tokenizer, MAX_LEN, BATCH_SIZE)

# Instantiate the model and move to classifier
model = SentimentClassifier(len(class_names))
model = model.to(device)

EPOCHS = 20

# Optimizer Adam 
optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)

total_steps = len(train_data_loader) * EPOCHS

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

# Set the loss function 
loss_fn = nn.CrossEntropyLoss().to(device)

# Function for a single training iteration
def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler, n_examples):
    model = model.train()
    losses = []
    correct_predictions = 0
    
    for d in data_loader:
        input_ids = d["input_ids"].to(device)
        attention_mask = d["attention_mask"].to(device)
        targets = d["targets"].to(device)
        
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        _, preds = torch.max(outputs, dim=1)
        loss = loss_fn(outputs, targets)
        correct_predictions += torch.sum(preds == targets)
        losses.append(loss.item())
        
        # Backward prop
        loss.backward()
        
        # Gradient Descent
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
    
    return correct_predictions.double() / n_examples, np.mean(losses)

def eval_model(model, data_loader, loss_fn, device, n_examples):
    model = model.eval()
    
    losses = []
    correct_predictions = 0
    
    with torch.no_grad():
        for d in data_loader:
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            targets = d["targets"].to(device)
            
            # Get model ouptuts
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )
            
            _, preds = torch.max(outputs, dim=1)
            loss = loss_fn(outputs, targets)
            
            correct_predictions += torch.sum(preds == targets)
            losses.append(loss.item())
            
    return correct_predictions.double() / n_examples, np.mean(losses)

Some weights of RobertaModel were not initialized from the model checkpoint at mideind/IceBERT and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at mideind/IceBERT and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
%%time

history = defaultdict(list)
best_accuracy = 0

for epoch in range(EPOCHS):
    
    # Show details 
    print(f"Epoch {epoch + 1}/{EPOCHS}")
    print("-" * 10)
    
    train_acc, train_loss = train_epoch(
        model,
        train_data_loader,
        loss_fn,
        optimizer,
        device,
        scheduler,
        len(df_train)
    )
    
    print(f"Train loss {train_loss} accuracy {train_acc}")
    
    # Get model performance (accuracy and loss)
    val_acc, val_loss = eval_model(
        model,
        val_data_loader,
        loss_fn,
        device,
        len(df_val)
    )
    
    print(f"Val   loss {val_loss} accuracy {val_acc}")
    print()
    
    history['train_acc'].append(train_acc)
    history['train_loss'].append(train_loss)
    history['val_acc'].append(val_acc)
    history['val_loss'].append(val_loss)
    
    # If we beat prev performance
    if val_acc > best_accuracy:
        torch.save(model.state_dict(), 'best_model_state.bin')
        best_accuracy = val_acc

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Epoch 1/20
----------




Train loss 0.47964975738631827 accuracy 0.7930285714285714
Val   loss 0.35038818171156494 accuracy 0.862

Epoch 2/20
----------
Train loss 0.3961344937263855 accuracy 0.8755428571428572
Val   loss 0.3825044009001997 accuracy 0.8776

Epoch 3/20
----------
Train loss 0.337473666373321 accuracy 0.9006857142857143
Val   loss 0.3978350280532375 accuracy 0.8834666666666667

Epoch 4/20
----------
Train loss 0.2970845261532281 accuracy 0.9220857142857143
Val   loss 0.5094548761409008 accuracy 0.8873333333333334

Epoch 5/20
----------
Train loss 0.2590922630402659 accuracy 0.9384571428571429
Val   loss 0.6090223911123525 accuracy 0.8877333333333334

Epoch 6/20
----------
Train loss 0.25600300157457323 accuracy 0.9411142857142857
Val   loss 0.6371232640834514 accuracy 0.8770666666666667

Epoch 7/20
----------
Train loss 0.20773192472390364 accuracy 0.9588857142857142
Val   loss 0.8415468631335972 accuracy 0.882

Epoch 8/20
----------
Train loss 0.17354056268952223 accuracy 0.9668
Val   loss 0.85

In [7]:
model.load_state_dict(torch.load("best_model_state.bin", map_location=torch.device('cpu')))

def predict(txt):
    encoded_review = tokenizer.encode_plus(
        txt,
        max_length=MAX_LEN,
        add_special_tokens=True,
        return_token_type_ids=False,
        pad_to_max_length=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    input_ids = encoded_review['input_ids'].to(device)
    attention_mask = encoded_review['attention_mask'].to(device)

    output = model(input_ids, attention_mask)
    _, prediction = torch.max(output, dim=1)

    print(output)
    print(f'Review text: {txt}')
    print(f'Sentiment  : {class_names[prediction]}')

predict("ég hata þessa mynd")
predict("eg elska þessa mynd")

def get_predictions(model, data_loader):
    model = model.eval()

    review_texts = []
    predictions = []
    prediction_probs = []
    real_values = []

    with torch.no_grad():
        for d in data_loader:
            texts = d["review_text"]
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            targets = d["targets"].to(device)

            # Get outouts
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )
            _, preds = torch.max(outputs, dim=1)

            review_texts.extend(texts)
            predictions.extend(preds)
            prediction_probs.extend(outputs)
            real_values.extend(targets)

    predictions = torch.stack(predictions).cpu()
    prediction_probs = torch.stack(prediction_probs).cpu()
    real_values = torch.stack(real_values).cpu()

    return review_texts, predictions, prediction_probs, real_values

y_review_texts, y_pred, y_pred_probs, y_test = get_predictions(
    model,
    test_data_loader
)


print(classification_report(y_test, y_pred, target_names=class_names))

tensor([[-7.6588,  7.4040]], device='cuda:0', grad_fn=<AddmmBackward0>)
Review text: ég hata þessa mynd
Sentiment  : positive
tensor([[-7.1974,  6.9845]], device='cuda:0', grad_fn=<AddmmBackward0>)
Review text: eg elska þessa mynd
Sentiment  : positive
              precision    recall  f1-score   support

    negative       0.89      0.90      0.89      3763
    positive       0.89      0.89      0.89      3737

    accuracy                           0.89      7500
   macro avg       0.89      0.89      0.89      7500
weighted avg       0.89      0.89      0.89      7500



In [7]:
def convert_example_to_feature(review):
    return tokenizer.encode_plus(
        review,
        add_special_tokens=True,
        max_length=MAX_LEN,
        return_token_type_ids=False,
        pad_to_max_length=True,
        return_attention_mask=True
    )

def map_example_to_dict(input_ids, attention_masks, label):
  return {
      "input_ids": input_ids,
      "attention_mask": attention_masks,
  }, label

def encode_examples(ds):
  # prepare list, so that we can build up final TensorFlow dataset from slices.
  input_ids_list = []
  attention_mask_list = []
  label_list = []
  for review, label in ds.to_numpy():
    bert_input = convert_example_to_feature(review)
    input_ids_list.append(bert_input['input_ids'])
    attention_mask_list.append(bert_input['attention_mask'])
    label_list.append([label])
  return tf.data.Dataset.from_tensor_slices((input_ids_list, attention_mask_list, label_list)).map(map_example_to_dict)

from transformers import TFRobertaForSequenceClassification

import tensorflow as tf
learning_rate = 2e-5
ds_train_encoded = encode_examples(df_train).shuffle(10).batch(6)
ds_test_encoded = encode_examples(df_test).batch(6)

model = TFRobertaForSequenceClassification.from_pretrained(MODEL_NAME)
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate, epsilon=1e-08)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

model.fit(ds_train_encoded, epochs=EPOCHS, validation_data=ds_test_encoded)



Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFRobertaForSequenceClassification: ['roberta.embeddings.position_ids']
- This IS expected if you are initializing TFRobertaForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFRobertaForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predicti

Epoch 1/2
Epoch 2/2


<keras.src.callbacks.History at 0x7f4ea955bd90>

In [17]:


test_sentence = "þetta er mjög skemmtileg mynd"

predict_input = tokenizer.encode(test_sentence,truncation=True,padding=True,return_tensors="tf")
tf_output = model.predict(predict_input)[0]
tf_prediction = tf.nn.softmax(tf_output, axis=1)
labels = ['Negative','Positive'] #(0:negative, 1:positive)
print(tf_prediction)
label = tf.argmax(tf_prediction, axis=1)
label = label.numpy()
print(labels[label[0]])


tf.Tensor([[0.50161415 0.49838588]], shape=(1, 2), dtype=float32)
Negative
