# 1. Imports

In [1]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
import torch
from transformers import TrainingArguments, Trainer
from transformers import BertTokenizer, BertForSequenceClassification
import pandas as pd
import torch

  from .autonotebook import tqdm as notebook_tqdm


# 2. Data 

In [2]:
seed = 42
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

In [3]:
df_train = pd.read_csv("twitter_training.csv", names=['Tweet ID', 'Borderlands entity', 'Sentiment', 'Tweet content'])
df_dev = pd.read_csv("twitter_validation.csv", names=['Tweet ID', 'Borderlands entity', 'Sentiment', 'Tweet content'])

In [4]:
df_train

Unnamed: 0,Tweet ID,Borderlands entity,Sentiment,Tweet content
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
...,...,...,...,...
74677,9200,Nvidia,Positive,Just realized that the Windows partition of my...
74678,9200,Nvidia,Positive,Just realized that my Mac window partition is ...
74679,9200,Nvidia,Positive,Just realized the windows partition of my Mac ...
74680,9200,Nvidia,Positive,Just realized between the windows partition of...


In [5]:
df_dev

Unnamed: 0,Tweet ID,Borderlands entity,Sentiment,Tweet content
0,3364,Facebook,Irrelevant,I mentioned on Facebook that I was struggling ...
1,352,Amazon,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...
2,8312,Microsoft,Negative,@Microsoft Why do I pay for WORD when it funct...
3,4371,CS-GO,Negative,"CSGO matchmaking is so full of closet hacking,..."
4,4433,Google,Neutral,Now the President is slapping Americans in the...
...,...,...,...,...
995,4891,GrandTheftAuto(GTA),Irrelevant,⭐️ Toronto is the arts and culture capital of ...
996,4359,CS-GO,Irrelevant,tHIS IS ACTUALLY A GOOD MOVE TOT BRING MORE VI...
997,2652,Borderlands,Positive,Today sucked so it’s time to drink wine n play...
998,8069,Microsoft,Positive,Bought a fraction of Microsoft today. Small wins.


In [6]:
df_train = df_train[['Tweet content','Sentiment']]
df_dev = df_dev[['Tweet content','Sentiment']]

In [7]:
df_train.Sentiment.unique()

array(['Positive', 'Neutral', 'Negative', 'Irrelevant'], dtype=object)

In [8]:
df_dev.Sentiment.unique()

array(['Irrelevant', 'Neutral', 'Negative', 'Positive'], dtype=object)

In [9]:
df1=len(df_train.Sentiment.unique())
df1

4

In [10]:
df2=len(df_dev.Sentiment.unique())
df2

4

In [11]:
label_mapping = {'Positive': 0, 'Neutral': 1, 'Negative': 2, 'Irrelevant': 3}
df_train.loc[:, 'Sentiment'] = df_train['Sentiment'].map(label_mapping)
df_dev.loc[:, 'Sentiment'] = df_dev['Sentiment'].map(label_mapping)

In [12]:
df_dev.Sentiment.unique()

array([3, 1, 2, 0], dtype=object)

In [13]:
df_train.Sentiment.unique()

array([0, 1, 2, 3], dtype=object)

In [14]:
df_train

Unnamed: 0,Tweet content,Sentiment
0,im getting on borderlands and i will murder yo...,0
1,I am coming to the borders and I will kill you...,0
2,im getting on borderlands and i will kill you ...,0
3,im coming on borderlands and i will murder you...,0
4,im getting on borderlands 2 and i will murder ...,0
...,...,...
74677,Just realized that the Windows partition of my...,0
74678,Just realized that my Mac window partition is ...,0
74679,Just realized the windows partition of my Mac ...,0
74680,Just realized between the windows partition of...,0


# 3.Model and tokenizer

In [15]:
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=len(label_mapping))

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# 4 .Tokenization

In [16]:
def tokenize_data(data, max_length):
    tweet_contents = data['Tweet content'].astype(str).tolist()
    print(f"Type of 'Tweet content': {type(tweet_contents)}")
    assert all(isinstance(tweet, str) for tweet in tweet_contents), "All entries in 'Tweet content' should be strings."
    
    return tokenizer(tweet_contents, truncation=True, padding=True, max_length=max_length, return_tensors='pt')

In [17]:
max_length = 128
train_data = tokenize_data(df_train, max_length)
dev_data = tokenize_data(df_dev, max_length)


Type of 'Tweet content': <class 'list'>
Type of 'Tweet content': <class 'list'>


# 5. Frezzing layers

In [18]:
for name, param in model.named_parameters():
     if name.startswith("bert.encoder.layer.0."): 
        param.requires_grad = False
        
for name, param in model.named_parameters():
     if name.startswith("bert.encoder.layer.1."): 
        param.requires_grad = False
        
for name, param in model.named_parameters():
     if name.startswith("bert.encoder.layer.2."): 
        param.requires_grad = False
for name, param in model.named_parameters():
     if name.startswith("bert.encoder.layer.3."): 
        param.requires_grad = False
        
for name, param in model.named_parameters():
     if name.startswith("bert.encoder.layer.8."): 
        param.requires_grad = False

for name, param in model.named_parameters():
     if name.startswith("bert.encoder.layer.9."): 
        param.requires_grad = False        

for name, param in model.named_parameters():
     if name.startswith("bert.encoder.layer.10."): 
        param.requires_grad = False       
        
for name, param in model.named_parameters():
     if name.startswith("bert.encoder.layer.11."): 
        param.requires_grad = False               
                  

In [19]:
for name, param in model.named_parameters():
     print(name, param.requires_grad)

bert.embeddings.word_embeddings.weight True
bert.embeddings.position_embeddings.weight True
bert.embeddings.token_type_embeddings.weight True
bert.embeddings.LayerNorm.weight True
bert.embeddings.LayerNorm.bias True
bert.encoder.layer.0.attention.self.query.weight False
bert.encoder.layer.0.attention.self.query.bias False
bert.encoder.layer.0.attention.self.key.weight False
bert.encoder.layer.0.attention.self.key.bias False
bert.encoder.layer.0.attention.self.value.weight False
bert.encoder.layer.0.attention.self.value.bias False
bert.encoder.layer.0.attention.output.dense.weight False
bert.encoder.layer.0.attention.output.dense.bias False
bert.encoder.layer.0.attention.output.LayerNorm.weight False
bert.encoder.layer.0.attention.output.LayerNorm.bias False
bert.encoder.layer.0.intermediate.dense.weight False
bert.encoder.layer.0.intermediate.dense.bias False
bert.encoder.layer.0.output.dense.weight False
bert.encoder.layer.0.output.dense.bias False
bert.encoder.layer.0.output.LayerNor

# 6  .Dataset

In [20]:
train_labels = torch.tensor(df_train['Sentiment'].tolist(), dtype=torch.long)
dev_labels = torch.tensor(df_dev['Sentiment'].tolist(), dtype=torch.long)


In [21]:
class TweetDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: val[idx].clone().detach() for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx].clone().detach()
        return item

    def __len__(self):
        return len(self.labels)


In [22]:
train_dataset = TweetDataset(train_data, train_labels)
dev_dataset = TweetDataset(dev_data, dev_labels)


# 7.Set up the training arguments and trainer

In [23]:
training_args = TrainingArguments(
    output_dir="./sentiment_model",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
)

In [24]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    compute_metrics=lambda p: {
        'accuracy': accuracy_score(p.predictions.argmax(axis=1), p.label_ids),
        'precision': precision_score(p.predictions.argmax(axis=1), p.label_ids, average='weighted'),
        'recall': recall_score(p.predictions.argmax(axis=1), p.label_ids, average='weighted'),
        'f1': f1_score(p.predictions.argmax(axis=1), p.label_ids, average='weighted'),
    },
)

# 8.Results

In [25]:
trainer.train()

Step,Training Loss
500,1.275
1000,1.1512
1500,1.0699
2000,1.0009
2500,0.9552
3000,0.9207
3500,0.9017
4000,0.868
4500,0.8281
5000,0.81


TrainOutput(global_step=28008, training_loss=0.4730204825242463, metrics={'train_runtime': 3324.9744, 'train_samples_per_second': 67.383, 'train_steps_per_second': 8.424, 'total_flos': 1.4737509516847104e+16, 'train_loss': 0.4730204825242463, 'epoch': 3.0})

In [26]:
results = trainer.evaluate()

In [27]:
print(results)

{'eval_loss': 0.21273788809776306, 'eval_accuracy': 0.965, 'eval_precision': 0.9653957009474403, 'eval_recall': 0.965, 'eval_f1': 0.9650327922077923, 'eval_runtime': 4.1501, 'eval_samples_per_second': 240.959, 'eval_steps_per_second': 30.12, 'epoch': 3.0}


In [28]:
print(results)

{'eval_loss': 0.21273788809776306, 'eval_accuracy': 0.965, 'eval_precision': 0.9653957009474403, 'eval_recall': 0.965, 'eval_f1': 0.9650327922077923, 'eval_runtime': 4.1501, 'eval_samples_per_second': 240.959, 'eval_steps_per_second': 30.12, 'epoch': 3.0}
