# Dataset Labelling

In [None]:
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

analyzer = SentimentIntensityAnalyzer()


def get_vader_sentiment(text):
    
    # Get polarity scores
    scores = analyzer.polarity_scores(text)
    # Get the compound score
    compound = scores['compound']
    
    # 3. Set thresholds and classify
    if compound >= 0.05:
        return 'positive'
    elif compound <= -0.05:
        return 'negative'
    else:
        return 'neutral'
    
df_vader = pd.read_csv('fate_merged.csv')
df_vader['sentiment_vader'] = df_vader['review'].apply(get_vader_sentiment)
df_vader.to_csv('fate_labelled_vader.csv', index=False)

In [1]:
import pandas as pd
from transformers import pipeline

classifier = pipeline("zero-shot-classification", 
                      model="facebook/bart-large-mnli",
                      device=0) # Change to device=-1 to force CPU

# 2. Define your candidate labels
candidate_labels = ['positive', 'negative', 'neutral']

df = pd.read_csv('fate_merged.csv')

# 3. Apply to your reviews
# Note: This can be slow if run one by one. It's much faster to pass a list.
reviews_list = df['review'].tolist()
results = classifier(reviews_list, candidate_labels)

df_labelled = pd.DataFrame(results)
df_labelled.to_csv('fate_labelled_bart.csv', index=False)

  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Device set to use cuda:0


# Preprocessing

In [27]:
def convert_sentiment_to_number(sentiment):
    
    if sentiment == 'negative':
        return 0
    elif sentiment == 'neutral':
        return 1
    else:
        return 2

In [None]:
import pandas as pd
import ast
from nltk.corpus import stopwords

df_bart = pd.read_csv('fate_labelled_bart.csv')
df_bart.head(1)

df_bart['label'] = df_bart['labels'].apply(lambda x: convert_sentiment_to_number(ast.literal_eval(x)[0]))
df_bart['confidence'] = df_bart['scores'].apply(lambda x: ast.literal_eval(x)[0])
df_bart.drop(columns=['labels', 'scores'])

Unnamed: 0,sequence,label,confidence
0,The game was good but sadly runs very poorly o...,0,0.837493
1,Account linking when? If there is a reason NA ...,0,0.879397
2,I played for a day then it stopped working. it...,0,0.958166
3,Played this a while back and already built a l...,0,0.756128
4,"First things first, I've been playing this gam...",0,0.969432
...,...,...,...
21306,&gt;Altria,2,0.560377
21307,Salt time,0,0.585777
21308,Rejoice,2,0.992136
21309,Love playing this game in Japanese. Hope Engli...,2,0.847753


In [None]:
from transformers import BertTokenizer

BERT_MODEL = 'bert-base-cased'

tokenizer = BertTokenizer.from_pretrained(BERT_MODEL)

print(tokenizer.sep_token, tokenizer.sep_token_id) # marker for ending of a sentence
print(tokenizer.cls_token, tokenizer.cls_token_id) # start of each sentence, so BERT knows we’re doing classification
print(tokenizer.pad_token, tokenizer.pad_token_id) # special token for padding
print(tokenizer.unk_token, tokenizer.unk_token_id) # tokens not found in training set 

[SEP] 102
[CLS] 101
[PAD] 0
[UNK] 100


In [30]:
# Store length of each review 
token_lens = []

# Iterate through the content slide
for txt in df_bart.sequence:
    tokens = tokenizer.encode(txt, max_length=512)
    token_lens.append(len(tokens))

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [39]:
import torch
from torch.utils.data import Dataset

MAX_LEN = 512

class GPReviewDataset(Dataset):
    # Constructor Function 
    def __init__(self, reviews, targets, tokenizer, max_len):
        self.reviews = reviews
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_len = max_len
    
    # Length magic method
    def __len__(self):
        return len(self.reviews)
    
    # get item magic method
    def __getitem__(self, item):
        review = str(self.reviews[item])
        target = self.targets[item]
        
        # Encoded format to be returned 
        encoding = self.tokenizer.encode_plus(
            review,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        
        return {
            'review_text': review,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'targets': torch.tensor(target, dtype=torch.long)
        }

In [40]:
from sklearn.model_selection import train_test_split
import numpy as np

RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

df_train, df_test = train_test_split(df_bart, test_size=0.2, random_state=RANDOM_SEED)
df_val, df_test = train_test_split(df_test, test_size=0.5, random_state=RANDOM_SEED)

print(df_train.shape, df_val.shape, df_test.shape)

(17048, 5) (2131, 5) (2132, 5)


In [41]:
from torch.utils.data import DataLoader

def create_data_loader(df, tokenizer, max_len, batch_size):
    ds = GPReviewDataset(
        reviews=df.sequence.to_numpy(),
        targets=df.label.to_numpy(),
        tokenizer=tokenizer,
        max_len=max_len
    )
    
    return DataLoader(
        ds,
        batch_size=batch_size,
        num_workers=0
    )

In [42]:
print(df_bart.sequence.to_numpy())

['The game was good but sadly runs very poorly on older devices making it almost unplayable.'
 "Account linking when? If there is a reason NA can't have it early, if we can ever have it, we need to know. Lasengle needs to put out a statement. They won't though since their community interaction/PR consists of scheduled social media posts, YouTube premieres which consistently show how ridiculously small their promotional budget is, and the occasional rushed livestream the two or three times a year they happen to have a panel at a Convention. I won't hold my breath."
 'I played for a day then it stopped working. it just shows a black screen after start up.'
 ... 'Rejoice'
 'Love playing this game in Japanese. Hope English version is just as good.'
 'PLAYED ON JP SERVER, BEEN WAITING SO LONG FOR THIS']


In [43]:
# Create train, test and val data loaders
BATCH_SIZE = 16
train_data_loader = create_data_loader(df_train, tokenizer, MAX_LEN, BATCH_SIZE)
val_data_loader = create_data_loader(df_val, tokenizer, MAX_LEN, BATCH_SIZE)
test_data_loader = create_data_loader(df_test, tokenizer, MAX_LEN, BATCH_SIZE)

In [44]:
# Examples 
data = next(iter(train_data_loader))
print(data.keys())

print(data['input_ids'].shape)
print(data['attention_mask'].shape)
print(data['targets'].shape)

dict_keys(['review_text', 'input_ids', 'attention_mask', 'targets'])
torch.Size([16, 512])
torch.Size([16, 512])
torch.Size([16])


In [45]:
from transformers import BertModel


bert_model = BertModel.from_pretrained(BERT_MODEL)

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


In [57]:
from torch import nn

# Build the Sentiment Classifier class 
class SentimentClassifier(nn.Module):
    
    # Constructor class 
    def __init__(self, n_classes):
        super(SentimentClassifier, self).__init__()
        self.bert = BertModel.from_pretrained(BERT_MODEL, return_dict=False)
        self.drop = nn.Dropout(p=0.3)
        self.out = nn.Linear(self.bert.config.hidden_size, n_classes)
    
    # Forward propagaion class
    def forward(self, input_ids, attention_mask):
        _, pooled_output = self.bert(
          input_ids=input_ids,
          attention_mask=attention_mask,
        )
        #  Add a dropout layer 
        output = self.drop(pooled_output)
        return self.out(output)

In [64]:
class_names = ['negative', 'neutral', 'positive']

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print(f"Device used: {device}")

# Instantiate the model and move to classifier
model = SentimentClassifier(len(class_names))
model = model.to(device)

Device used: cuda


In [59]:
from transformers import get_linear_schedule_with_warmup
from torch.optim import AdamW


# Number of iterations 
EPOCHS = 10

# Optimizer Adam 
optimizer = AdamW(model.parameters(), lr=2e-5)

total_steps = len(train_data_loader) * EPOCHS

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

# Set the loss function 
loss_fn = nn.CrossEntropyLoss().to(device)

In [60]:
# Function for a single training iteration
def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler, n_examples):
    model = model.train()
    losses = []
    correct_predictions = 0
    
    for d in data_loader:
        input_ids = d["input_ids"].to(device)
        attention_mask = d["attention_mask"].to(device)
        targets = d["targets"].to(device)
        
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        
        _, preds = torch.max(outputs, dim=1)
        loss = loss_fn(outputs, targets)
        correct_predictions += torch.sum(preds == targets)
        losses.append(loss.item())
        
        # Backward prop
        loss.backward()
        
        # Gradient Descent
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
    
    return correct_predictions.double() / n_examples, np.mean(losses)

In [61]:
def eval_model(model, data_loader, loss_fn, device, n_examples):
    model = model.eval()
    
    losses = []
    correct_predictions = 0
    
    with torch.no_grad():
        for d in data_loader:
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            targets = d["targets"].to(device)
            
            # Get model ouptuts
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )
            
            _, preds = torch.max(outputs, dim=1)
            loss = loss_fn(outputs, targets)
            
            correct_predictions += torch.sum(preds == targets)
            losses.append(loss.item())
            
    return correct_predictions.double() / n_examples, np.mean(losses)

In [62]:
from collections import defaultdict

history = defaultdict(list)
best_accuracy = 0

for epoch in range(EPOCHS):
    
    # Show details 
    print(f"Epoch {epoch + 1}/{EPOCHS}")
    print("-" * 10)
    
    train_acc, train_loss = train_epoch(
        model,
        train_data_loader,
        loss_fn,
        optimizer,
        device,
        scheduler,
        len(df_train)
    )
    
    print(f"Train loss {train_loss} accuracy {train_acc}")
    
    # Get model performance (accuracy and loss)
    val_acc, val_loss = eval_model(
        model,
        val_data_loader,
        loss_fn,
        device,
        len(df_val)
    )
    
    print(f"Val   loss {val_loss} accuracy {val_acc}")
    print()
    
    history['train_acc'].append(train_acc)
    history['train_loss'].append(train_loss)
    history['val_acc'].append(val_acc)
    history['val_loss'].append(val_loss)
    
    # If we beat prev performance
    if val_acc > best_accuracy:
        torch.save(model.state_dict(), 'best_model_state.bin')
        best_accuracy = val_acc

Epoch 1/10
----------
Train loss 0.4047917885545075 accuracy 0.8318864382918817
Val   loss 0.3298096629554656 accuracy 0.8700140778977006

Epoch 2/10
----------


KeyboardInterrupt: 

In [63]:
torch.cuda.is_available()

True