In [1]:
!pip install transformers
!pip install torch

[0m

In [2]:
# importing nessary packages
import numpy as np
import pandas as pd
import re, string
from bs4 import BeautifulSoup
import pickle
import emoji
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from tqdm import tqdm

import warnings
warnings.filterwarnings("ignore")

import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import torch.nn as nn

from transformers import DistilBertTokenizer, DistilBertForSequenceClassification

from transformers import AdamW, get_linear_schedule_with_warmup

import tensorflow as tf

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# Preprocessing utilities
stop_words = set(stopwords.words('english'))
contraction_map={"ain't":'is not',"aren't":'are not',"can't":'can not','cause':'because',"could've":'could have',"couldn't":'could not',"couldn't've":'could not have',"didn't":'did not',"doesn't":'does not',"don't":'do not',"hadn't":'had not',"hadn't've":'had not have',"hasn't":'has not',"haven't":'have not',"he'd":'he would',"he'd've":'he would have',"he'll":'he will',"he'll've":'he will have',"he's":'he is',"how'd":'how did',"how'd've":'how did have',"how'll":'how will',"how's":'how is',"I'd":'I would',"I'd've":'I would have',"I'll":'I will',"I'll've":'I will have',"I'm":'I am',"I've":'I have',"i'd":'i would',"i'd've":'i would have',"i'll":'i will',"i'll've":'i will have',"i'm":'i am',"i've":'i have',"isn't":'is not',"it'd":'it would',"it'd've":'it would have',"it'll":'it will',"it'll've":'it will have',"it's":'it is',"let's":'let us',"ma'am":'madam',"mayn't":'may not',"might've":'might have',"mightn't":'might not',"mightn't've":'might not have',"must've":'might have',"mustn't":'must not',"mustn't've":'must not have',"needn't":'need not',"needn't've":'need not have',"o'clock":'of the clock',"oughtn't":'ought not',"oughtn't've":'ought not have',"shan't":'shall not',"shall'n't":'shall not',"shan't've":'shall not have',"she'd":'she would',"she'd've":'she would have',"she'll":'she will',"she'll've":'she will have',"she's":'she is',"should've":'should have',"shouldn't":'should not',"shouldn't've":'should not have',"so've":'so have',"so's":'so as',"that'd":'that would',"that'd've":'that would have',"that's":'that is',"there'd":'there would',"there'd've":'there would have',"there's":'there is',"they'd":'they would',"they'd've":'they would have',"they'll":'they will',"they'll've":'they will have',"they're":'they are',"they've":'they have',"to've":'to have',"wasn't":'was not',"we'd":'we would',"we'd've":'we would have',"we'll":'we will',"we'll've":'we will have',"we're":'we are',"weren't":'were not',"what'll":'what will',"what'll've":'what will have',"what're":'what are',"what's":'what is',"what've":'what have',"when's":'when is',"when've":'when have',"where'd":'where did',"where's":'where is',"where've":'where have',"who'll":'who will',"who'll've":'who will have',"who's":'who is',"who've":'who have',"why's":'why is',"why've":'why have',"will've":'will have',"won't":'will not',"will't've":'will not have',"would've":'would have',"would't":'would not',"would't've":'would not have',"y'all":'you all',"y'all'd":'you all would',"y'all'd've":'you all would have',"y'all're":'you all are',"y'all've":'you have all',"you'd":'you would',"you'd've":'you would have',"you'll":'you will',"you'll've":'you will have',"you're":'you are',"you've":'you have'}

def expand_contractions(sent, mapping):
    pattern = re.compile('({})'.format('|'.join(mapping.keys())), flags=re.IGNORECASE|re.DOTALL)
    def expand_map(contraction):
        match = contraction.group(0)
        first_char = match[0]
        expansion = mapping.get(match) if mapping.get(match) else mapping.get(match.lower())
        expansion = first_char + expansion[1:]
        return expansion
    expand_sent = pattern.sub(expand_map, sent)
    return expand_sent

def split_hashtag(text):
    words = text.split(" ")
    text = ''
    for word in words:
        if(len(word)>0):
            if word[0]=='#':
                tmp=''
                pre=''
                for ch in word:
                    if ch!='#':
                        if (ch<='Z' and ch>='A') and not(pre<='Z' and pre>='A'):
                            tmp+=' '
                        tmp+=ch
                    pre=ch
                word = tmp
        text=text+' '+word
    return text

def replace_emoji(text):
    text = emoji.demojize(text)
    return text

def alpha_only(text):
    tmp=''
    for ch in text:
        if ch>='a' and ch<='z':
            tmp+=ch
        else:
            tmp+=' '
    return tmp

def text_cleaning(text):
    text = BeautifulSoup(text).get_text()
    text = re.sub(r"(?:https?\://)\S+", "", text)
    text=re.sub(r"(?:\@)\S+","user",text)
    text = re.sub('\n','',text)
    text = re.sub('\t','',text)
    text=split_hashtag(text)
    text=replace_emoji(text)
    text=text.lower()
    text = text.replace("’","'")
    text=expand_contractions(text,contraction_map)
    text=alpha_only(text)
    text = ' '.join(text.split())
    return text

def remove_stopwords(tokens):
    return [word for word in tokens if word.lower() not in stop_words]

def df_cleaner(df):
    tqdm.pandas(desc="Basic Text Cleaning: ", postfix=None)
    df['text'] = df['text'].progress_apply(text_cleaning)
    tqdm.pandas(desc="Text Tokenization: ", postfix=None)
    df['text'] = df['text'].progress_apply(lambda x: x.split())
    tqdm.pandas(desc="Removing Stopwords: ", postfix=None)
    df['text'] = df['text'].progress_apply(remove_stopwords)
    tqdm.pandas(desc="Tokens Marging: ", postfix=None)
    df['text'] = df['text'].progress_apply(lambda x: ' '.join(x))
    return df

In [4]:
# Loading Dataset
df = pd.read_csv("/kaggle/input/ukraine-russia-conflict-tweets-small-dataset/label-tweets-final-balanced.csv")
df['label'] = df['label'].map({"positive":1,"negative":0})
#df = df.loc[:500]
df

Unnamed: 0,text,replyCount,retweetCount,likeCount,quoteCount,label
0,Shout out to those brave enough in #Russia to ...,0.0,6.0,6.0,0.0,1
1,"In Moscow Russia, 600 people protesting agains...",1.0,0.0,9.0,0.0,0
2,We attempted to warn the world with this paint...,0.0,1.0,2.0,0.0,0
3,How should we stand up in defense of Ukraine a...,1.0,0.0,0.0,0.0,0
4,Who else is worried about Russia going to war ...,0.0,0.0,4.0,0.0,0
...,...,...,...,...,...,...
291375,October is almost over. What a good month to e...,0.0,0.0,2.0,0.0,1
291376,"idk who hears to this, but I want peace in Ukr...",0.0,0.0,3.0,0.0,1
291377,@ladyasabea Oh wonim template no? He's going t...,1.0,0.0,0.0,1.0,1
291378,Russia is ready to supply the poorest countrie...,4.0,3.0,11.0,0.0,1


In [5]:
# Cleaning Text
df = df_cleaner(df)
df

Basic Text Cleaning: 100%|██████████| 291380/291380 [03:39<00:00, 1327.02it/s]
Text Tokenization: 100%|██████████| 291380/291380 [00:02<00:00, 138020.00it/s]
Removing Stopwords: 100%|██████████| 291380/291380 [00:02<00:00, 99350.53it/s] 
Tokens Marging: 100%|██████████| 291380/291380 [00:00<00:00, 638282.27it/s]


Unnamed: 0,text,replyCount,retweetCount,likeCount,quoteCount,label
0,shout brave enough russia publicly oppose puti...,0.0,6.0,6.0,0.0,1
1,moscow russia people protesting war detained r...,1.0,0.0,9.0,0.0,0
2,attempted warn world painting depicting putin ...,0.0,1.0,2.0,0.0,0
3,stand defense ukraine rights sovereignty freed...,1.0,0.0,0.0,0.0,0
4,else worried russia going war ukraine ok,0.0,0.0,4.0,0.0,0
...,...,...,...,...,...,...
291375,october almost good month end russia ukraine w...,0.0,0.0,2.0,0.0,1
291376,idk hears want peace ukraine think peace mean ...,0.0,0.0,3.0,0.0,1
291377,user oh wonim template going say economy well ...,1.0,0.0,0.0,1.0,1
291378,russia ready supply poorest countries tons gra...,4.0,3.0,11.0,0.0,1


In [6]:
df['text'][11]

'war major european states continuation america cold war russia started ukraine us pawn put play obama engineered replacement neutral government pro west government put nato russian border'

In [7]:
# Creating Text List & Label List
texts = df['text'].to_list()
labels = df['label'].to_list()

In [8]:
# Tokenize Dataset
#tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased',do_lower_case=True)


Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [9]:
# Tokenize the text data
tokenized_texts = [tokenizer.tokenize(text) for text in tqdm(texts, desc="Tokenizing Dataset: ")]

max_length = 128
input_ids = []
for text in tqdm(tokenized_texts, desc="Genarating Input IDs: "):
    ids = tokenizer.convert_tokens_to_ids(text)[:max_length]
    input_ids.append(ids)

padded_input_ids = []
for i in tqdm(range(len(input_ids)), desc="Padding Input IDs: "):
    padded_input_ids.append(input_ids[i] + [0] * (max_length - len(input_ids[i])))

attention_masks = []
for i in tqdm(range(len(input_ids)), desc="Genarating Attention Masks: "):
    mask = [int(token_id > 0) for token_id in padded_input_ids[i]]
    attention_masks.append(mask)

Tokenizing Dataset: 100%|██████████| 291380/291380 [03:34<00:00, 1360.79it/s]
Genarating Input IDs: 100%|██████████| 291380/291380 [00:08<00:00, 33978.97it/s]
Padding Input IDs: 100%|██████████| 291380/291380 [00:01<00:00, 165293.20it/s]
Genarating Attention Masks: 100%|██████████| 291380/291380 [00:08<00:00, 34729.42it/s]


In [10]:
# Split Train, Test and Validation Datasets
input_ids = torch.tensor(padded_input_ids)
attention_masks = torch.tensor(attention_masks)
labels = torch.tensor(labels)

dataset = TensorDataset(input_ids, attention_masks, labels)

train_ratio = 0.8
val_ratio = 0.1
test_ratio = 0.1
print("Spliting Inputs and Outputs...")
train_inputs, test_inputs, train_labels, test_labels = train_test_split(input_ids, labels, test_size=test_ratio, random_state=42)
train_inputs, val_inputs, train_labels, val_labels = train_test_split(train_inputs, train_labels, test_size=val_ratio/(train_ratio), random_state=42)
print("Creating Masks...")
train_masks = torch.tensor([[int(token_id > 0) for token_id in sent] for sent in tqdm(train_inputs, desc="Training Mask: ")])
val_masks = torch.tensor([[int(token_id > 0) for token_id in sent] for sent in tqdm(val_inputs, desc="validation Mask: ")])
test_masks = torch.tensor([[int(token_id > 0) for token_id in sent] for sent in tqdm(test_inputs, desc="Testing Mask: ")])
print("Creating Datasets...")
train_data = TensorDataset(train_inputs, train_masks, train_labels)
val_data = TensorDataset(val_inputs, val_masks, val_labels)
test_data = TensorDataset(test_inputs, test_masks, test_labels)
print("Creating Dataloaders...")
batch_size = 32
train_dataloader = DataLoader(train_data, sampler=RandomSampler(train_data), batch_size=batch_size)
val_dataloader = DataLoader(val_data, sampler=SequentialSampler(val_data), batch_size=batch_size)
test_dataloader = DataLoader(test_data, sampler=SequentialSampler(test_data), batch_size=batch_size)

Spliting Inputs and Outputs...
Creating Masks...


Training Mask: 100%|██████████| 229461/229461 [02:09<00:00, 1771.63it/s]
validation Mask: 100%|██████████| 32781/32781 [00:18<00:00, 1751.00it/s]
Testing Mask: 100%|██████████| 29138/29138 [00:15<00:00, 1889.41it/s]


Creating Datasets...
Creating Dataloaders...


In [11]:
# Loading Model, Optimizer, Scheduler, Loss Function
num_classes = 2
num_epochs = 5
#model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_classes)
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=num_classes)
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_dataloader) * num_epochs)
loss_fn = nn.CrossEntropyLoss()

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'pre_classifier.weight', 'classi

In [12]:
# Activate GPU and Move Model and others to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
input_ids = input_ids.to(device)
attention_masks = attention_masks.to(device)
model.to(device)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
       

In [13]:
# Training Model
def accuracy(predictions, labels):
    _, preds = torch.max(predictions, dim=1)
    accuracy = torch.sum(preds == labels).item() / len(labels)
    return accuracy

model.train() 

for epoch in range(num_epochs):
    print(f"Epoch {epoch+1}/{num_epochs}:")
    total_loss, total_accuracy = 0, 0
    for step, batch in tqdm(enumerate(train_dataloader)):
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        model.zero_grad()
        outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
        loss, logits = outputs.loss, outputs.logits
        loss.backward()
        total_loss += loss.item()
        total_accuracy += accuracy(logits, b_labels)
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

    avg_train_loss = total_loss / len(train_dataloader)
    avg_train_accuracy = total_accuracy / len(train_dataloader)
    print(f"Finished Epoch {epoch+1}/{num_epochs}, Train Loss: {avg_train_loss:.3f}, Train Accuracy: {avg_train_accuracy:.3f}")

Epoch 1/5:


7171it [39:22,  3.04it/s]


Finished Epoch 1/5, Train Loss: 0.185, Train Accuracy: 0.925
Epoch 2/5:


7171it [39:28,  3.03it/s]


Finished Epoch 2/5, Train Loss: 0.101, Train Accuracy: 0.963
Epoch 3/5:


7171it [39:24,  3.03it/s]


Finished Epoch 3/5, Train Loss: 0.068, Train Accuracy: 0.977
Epoch 4/5:


7171it [39:24,  3.03it/s]


Finished Epoch 4/5, Train Loss: 0.045, Train Accuracy: 0.986
Epoch 5/5:


7171it [39:28,  3.03it/s]

Finished Epoch 5/5, Train Loss: 0.029, Train Accuracy: 0.992





In [14]:
# Exporting Model to Pickle
import pickle 

with open('/kaggle/working/DistilBERT_Model_Final_Balanced_Dataset.pkl', 'wb') as f:
    pickle.dump(model, f)

In [15]:
# Importing Model from Pickle
import pickle

with open('/kaggle/working/DistilBERT_Model_Final_Balanced_Dataset.pkl', 'rb') as f:
    model_test = pickle.load(f)

In [16]:
# Testing Model
def test_model(model, test_dataloader):
    model.eval()
    predictions, true_labels = [], []

    for batch in test_dataloader:
        batch = tuple(t.to(device) for t in batch)
        input_ids, attention_mask, labels = batch

        with torch.no_grad():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)

        logits = outputs[0]
        preds = torch.argmax(logits, dim=1)

        predictions.extend(preds.cpu().numpy())
        true_labels.extend(labels.cpu().numpy())

    report = classification_report(true_labels, predictions,output_dict=True)

    print(report)
    return true_labels, predictions

In [17]:
# Running Test
true_labels, predictions = test_model(model_test, test_dataloader)

{'0': {'precision': 0.9605299098606938, 'recall': 0.9590890495022502, 'f1-score': 0.9598089389286932, 'support': 14666}, '1': {'precision': 0.958603560093832, 'recall': 0.9600608070757325, 'f1-score': 0.959331630187116, 'support': 14472}, 'accuracy': 0.9595716933214359, 'macro avg': {'precision': 0.9595667349772629, 'recall': 0.9595749282889914, 'f1-score': 0.9595702845579046, 'support': 29138}, 'weighted avg': {'precision': 0.95957314776906, 'recall': 0.9595716933214359, 'f1-score': 0.9595718735121201, 'support': 29138}}


In [18]:
with open('/kaggle/working/y_test_DistilBERT.pkl', 'wb') as f:
    pickle.dump(true_labels, f)

In [19]:
with open('/kaggle/working/DistilBERT_pred.pkl', 'wb') as f:
    pickle.dump(predictions, f)