In [62]:
!pip install transformers
!pip install SentencePiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [63]:
import gc

import numpy as np
import pandas as pd

import torch
from torch.utils.data import DataLoader, Dataset, SequentialSampler, RandomSampler
from torch import nn

import transformers
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup, ElectraTokenizer, ElectraModel

from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.metrics import accuracy_score, f1_score , confusion_matrix

# Seed NumPy and PyTorch so that shuffling and weight initialisation are repeatable.
RANDOM_SEED = 30
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

# Run on the first CUDA device when one is present, otherwise on the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
if torch.cuda.is_available():
    # Collect Python garbage first: tensors that are only kept alive by
    # reference cycles must be freed before empty_cache() can hand their
    # blocks back to the driver (matters when the cell is re-run).
    gc.collect()
    torch.cuda.empty_cache()


In [64]:
# Number of samples per batch for every DataLoader built in this notebook.
BATCH_SIZE = 16
# Token length every tweet is padded / truncated to by the tokenizer.
MAX_LEN = 256
# Hugging Face hub id used for both the tokenizer and the encoder weights.
PRE_TRAINED_MODEL_NAME = 'google/electra-small-generator'

In [65]:
# Colab-specific: mount Google Drive at /content/drive so the CSV inputs and
# the prediction outputs referenced below are reachable from the runtime.
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [66]:
# Drive folder (relative to the working directory) that create_dataframes()
# reads the train/validation CSVs from and that predictions are written to.
path = 'drive/MyDrive/report_material/'

In [67]:
# Test tweets for inference. NOTE(review): this file lives in a different
# Drive folder (NLP_Project) than `path` (report_material) — confirm intended.
df_test = pd.read_csv('drive/MyDrive/NLP_Project/test_without_stopwords.csv')

In [68]:
def create_dataframes(stopwords=False, oversampling=0, augmented=False):
    """Load one train/validation CSV pair and oversample the ``label == 0`` rows.

    Files are read from the module-level ``path`` directory.

    Args:
        stopwords: Truthy selects the ``*_with_stopwords.csv`` files, falsy the
            ``*_without_stopwords.csv`` files (for both train and validation).
        oversampling: ``oversampling + 1`` extra copies of the training rows
            whose ``label`` is 0 are appended to the training frame. Note that
            the default of 0 therefore still appends one extra copy.
        augmented: Truthy selects ``train_augmented_*``, falsy ``train_new_*``.
            The validation file does not depend on this flag.

    Returns:
        ``(df_train, df_val)``: the (oversampled) training frame and the
        untouched validation frame.
    """
    suffix = 'with_stopwords' if stopwords else 'without_stopwords'
    train_prefix = 'train_augmented_' if augmented else 'train_new_'

    df_train = pd.read_csv(path + train_prefix + suffix + '.csv')
    df_val = pd.read_csv(path + 'val_' + suffix + '.csv')

    df_train_0 = df_train.loc[df_train['label'] == 0]
    n_copies = oversampling + 1
    if n_copies > 0:
        # DataFrame.append was removed in pandas 2.0 and re-copied the whole
        # frame on every iteration; one concat gives the same rows in the same
        # order with a fresh 0..n-1 index.
        df_train = pd.concat([df_train] + [df_train_0] * n_copies, ignore_index=True)

    return df_train, df_val


# Load the default (no stop words, non-augmented) split with oversampling=1,
# then report frame sizes, the column layout and the per-class row counts.
df_train, df_val = create_dataframes(oversampling=1)
print(df_train.shape, df_val.shape, sep='\n')
print(df_train.columns)
print(df_train['label'].value_counts(), df_val['label'].value_counts(), sep='\n')


(17630, 3)
(3966, 3)
Index(['label', 'tweet', 'index'], dtype='object')
1    12318
2     2657
0     2655
Name: label, dtype: int64
1    3080
2     634
0     252
Name: label, dtype: int64


In [69]:
# Tokenizer and encoder both come from the same pretrained checkpoint.
# NOTE(review): this is the ELECTRA *generator* checkpoint; confirm that it,
# rather than the discriminator, is the intended encoder for classification.
tokenizer = ElectraTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)
bert_model = ElectraModel.from_pretrained(
    PRE_TRAINED_MODEL_NAME,
    output_hidden_states=True,
)

Some weights of the model checkpoint at google/electra-small-generator were not used when initializing ElectraModel: ['generator_lm_head.bias', 'generator_predictions.dense.bias', 'generator_predictions.LayerNorm.weight', 'generator_lm_head.weight', 'generator_predictions.dense.weight', 'generator_predictions.LayerNorm.bias']
- This IS expected if you are initializing ElectraModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [70]:
class TweetDataset(Dataset):
    """Dataset of labelled tweets, tokenized on access.

    Each item is a dict with the keys ``text``, ``input_ids``,
    ``attention_mask`` and ``label`` (a ``torch.long`` tensor), optionally
    passed through ``transform`` before being returned.
    """

    def __init__(self, tweets, labels, tokenizer, max_len, transform = None):
        """Store the indexable tweets/labels, the tokenizer and the pad/truncate length."""
        self.tweets, self.labels = tweets, labels
        self.tokenizer, self.max_len = tokenizer, max_len
        self.transform = transform

    def __len__(self):
        """Number of tweets in the dataset."""
        return len(self.tweets)

    def __getitem__(self, item):
        """Tokenize tweet number ``item`` and return it together with its label."""
        text = str(self.tweets[item])
        target = self.labels[item]

        # Pad / truncate every tweet to exactly max_len tokens.
        encoded = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True,
        )

        # flatten() turns the tokenizer's tensors into 1-D tensors.
        sample = {
            'text': text,
            'input_ids': encoded['input_ids'].flatten(),
            'attention_mask': encoded['attention_mask'].flatten(),
            'label': torch.tensor(target, dtype=torch.long),
        }
        return self.transform(sample) if self.transform else sample
        
def create_data_loader(df, tokenizer, max_len, batch_size, shuffle=True, num_workers=8):
    """Wrap the ``tweet`` / ``label`` columns of ``df`` in a DataLoader.

    Args:
        df: DataFrame with a ``tweet`` column and a ``label`` column.
        tokenizer: Tokenizer handed to ``TweetDataset``.
        max_len: Token length each tweet is padded / truncated to.
        batch_size: Number of samples per batch.
        shuffle: When True (default, the previous behaviour) batches are drawn
            with a ``RandomSampler``; pass False to iterate in row order, e.g.
            for validation where shuffling serves no purpose.
        num_workers: Number of loader worker processes (default 8, the
            previously hard-coded value). Lower it on machines with few cores.

    Returns:
        A ``DataLoader`` over a ``TweetDataset`` built from ``df``.
    """
    ds = TweetDataset(
        tweets=df['tweet'].to_numpy(),
        labels=df['label'].to_numpy(),
        tokenizer=tokenizer,
        max_len=max_len,
    )
    sampler = RandomSampler(ds) if shuffle else SequentialSampler(ds)

    return DataLoader(
        ds,
        batch_size=batch_size,
        sampler=sampler,
        num_workers=num_workers,
    )
    

# NOTE(review): these two loaders are rebuilt with identical arguments further
# down in this cell, after create_test_loader is defined, so this first
# construction is redundant.
train_data_loader = create_data_loader(df_train, tokenizer, MAX_LEN, BATCH_SIZE)
val_data_loader = create_data_loader(df_val, tokenizer, MAX_LEN, BATCH_SIZE)


class TestDataset(Dataset):
    """Dataset of unlabelled tweets, tokenized on access.

    Each item is a dict with the keys ``text``, ``input_ids``,
    ``attention_mask`` and ``index`` (the caller-supplied identifier of the
    tweet), optionally passed through ``transform`` before being returned.
    """

    def __init__(self, tweets, index, tokenizer, max_len, transform = None):
        """Store the indexable tweets/identifiers, the tokenizer and the pad/truncate length."""
        self.tweets, self.index = tweets, index
        self.tokenizer, self.max_len = tokenizer, max_len
        self.transform = transform

    def __len__(self):
        """Number of tweets in the dataset."""
        return len(self.tweets)

    def __getitem__(self, item):
        """Tokenize tweet number ``item`` and return it together with its identifier."""
        text = str(self.tweets[item])
        row_id = self.index[item]

        # Pad / truncate every tweet to exactly max_len tokens.
        encoded = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True,
        )

        # flatten() turns the tokenizer's tensors into 1-D tensors.
        sample = {
            'text': text,
            'input_ids': encoded['input_ids'].flatten(),
            'attention_mask': encoded['attention_mask'].flatten(),
            'index': row_id,
        }
        return self.transform(sample) if self.transform else sample

def create_test_loader(df, tokenizer, max_len, batch_size):
    """Wrap the ``tweet`` / ``index`` columns of ``df`` in a DataLoader.

    No sampler is passed, so the DataLoader's default ordering is used.
    """
    dataset = TestDataset(
        tweets=df['tweet'].to_numpy(),
        index=df['index'].to_numpy(),
        tokenizer=tokenizer,
        max_len=max_len,
    )
    loader = DataLoader(dataset, batch_size=batch_size, num_workers=8)
    return loader
    
# Shuffled loaders for the train and validation frames (built in that order).
train_data_loader, val_data_loader = (
    create_data_loader(frame, tokenizer, MAX_LEN, BATCH_SIZE)
    for frame in (df_train, df_val)
)

# Loader over the unlabelled test tweets.
test_data_loader = create_test_loader(df_test, tokenizer, MAX_LEN, BATCH_SIZE)

# data=next(iter(train_data_loader))

In [71]:
class HateSpeechClassifier(nn.Module):
    """Encoder plus a small feed-forward head for tweet classification.

    The encoder's last hidden state is mean-pooled over the real (non-padding)
    tokens and passed through ``hidden_size -> 256 -> 64 -> n_classes`` with
    Tanh activations. The module returns raw logits (no softmax).

    Args:
        n_classes: Number of output classes.
        lm: Encoder module. It must expose ``config.hidden_size`` and, when
            called with ``input_ids`` / ``attention_mask``, return a sequence
            whose first element is the last hidden state (assumed to be
            ``(batch, seq_len, hidden_size)``).
    """

    def __init__(self, n_classes, lm):
        super().__init__()
        self.lm = lm
        # Defined but not applied in forward(); kept so the attribute still
        # exists for any code that refers to it (it has no parameters, so
        # saved state dicts are unaffected).
        self.drop = nn.Dropout(p=0.2)
        self.out = nn.Sequential(
            nn.Linear(self.lm.config.hidden_size, 256),
            nn.Tanh(),
            nn.Linear(256, 64),
            nn.Tanh(),
            nn.Linear(64, n_classes),
        )

    def forward(self, input_ids, attention_mask):
        """Return class logits of shape ``(batch, n_classes)``."""
        encoder_output = self.lm(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        last_hidden_state = encoder_output[0]

        if attention_mask is None:
            pooled = torch.mean(last_hidden_state, 1)
        else:
            # Inputs are padded to a fixed length, so a plain mean would be
            # dominated by padding positions for short tweets. Weight each
            # position by the attention mask so only real tokens contribute.
            mask = attention_mask.unsqueeze(-1).to(last_hidden_state.dtype)
            token_counts = mask.sum(dim=1).clamp(min=1.0)  # guard all-zero masks
            pooled = (last_hidden_state * mask).sum(dim=1) / token_counts

        return self.out(pooled)

In [72]:
# sample_dataset=TweetDataset(df_train['tweet'].to_numpy(), df_train['label'].to_numpy(), tokenizer, 3)
# sample=sample_dataset.__getitem__(0)
# print(sample)
# input_ids=sample['input_ids'].to(device)
# attention_mask=sample['attention_mask'].to(device)
# output=bert_model(input_ids=input_ids, attention_mask=attention_mask)
# print(output)

In [73]:
# Three-class classifier on top of the pretrained encoder, moved to the active device.
model = HateSpeechClassifier(n_classes=3, lm=bert_model).to(device)

In [74]:
# Sanity check: prints one line per encoder parameter tensor; True means the
# tensor will receive gradients, i.e. the encoder is fine-tuned, not frozen.
for param in bert_model.base_model.parameters():
    print(param.requires_grad)

True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True


In [75]:
# Upper bound on fine-tuning epochs; the best one is selected on validation macro-F1.
EPOCHS = 10

# AdamW over every parameter of the classifier (encoder and head).
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-6, eps=1e-8)

# The scheduler is stepped once per batch, so its horizon is batches * epochs.
total_steps = EPOCHS * len(train_data_loader)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

loss_func = nn.CrossEntropyLoss()
loss_func = loss_func.to(device)

In [76]:
def train_epoch(
    model,
    data_loader,
    loss_func,
    optimizer,
    device,
    scheduler,
    n_examples
):
    """Run one training pass over ``data_loader``.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            and returning per-class scores.
        data_loader: Iterable of batches, each a dict with ``input_ids``,
            ``attention_mask`` and ``label`` tensors.
        loss_func: Callable ``loss_func(output, labels)`` returning a scalar loss.
        optimizer: Optimizer stepped once per batch.
        device: Device the batch tensors are moved to.
        scheduler: Learning-rate scheduler stepped once per batch.
        n_examples: Denominator used for the accuracy.

    Returns:
        ``(accuracy, mean_loss)`` where ``accuracy`` is a 0-dim double tensor
        (correct predictions / ``n_examples``) and ``mean_loss`` is the mean of
        the per-batch losses (``nan`` if the loader yielded no batches).
    """
    model = model.train()
    losses = []
    # Start from a tensor rather than the int 0 so that `.double()` below also
    # works when the loader yields no batches.
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)
    for d in data_loader:
        input_ids = d['input_ids'].to(device)
        attention_mask = d['attention_mask'].to(device)
        labels = d['label'].to(device)

        output = model(
            input_ids = input_ids,
            attention_mask = attention_mask
        )

        # Predicted class = index of the largest score.
        _, prediction = torch.max(output, dim=1)
        loss = loss_func(output, labels)

        correct_predictions += torch.sum(prediction == labels)
        losses.append(loss.item())

        loss.backward()
        optimizer.step()
        scheduler.step()
        # Clear gradients so they do not accumulate into the next batch.
        optimizer.zero_grad()

    mean_loss = np.mean(losses) if losses else float('nan')
    return correct_predictions.double() / n_examples, mean_loss


def val_epoch(
    model,
    data_loader,
    device,
):
    """Evaluate ``model`` on every batch of ``data_loader`` without gradients.

    Returns:
        ``(accuracy, macro_f1, confusion_matrix)`` computed with scikit-learn
        from the true labels and the arg-max predictions.
    """
    predicted, expected = [], []
    model.eval()
    with torch.no_grad():
        for batch in data_loader:
            scores = model(
                input_ids=batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
            )
            expected.extend(batch['label'].tolist())
            # Second element of torch.max(..., dim=1) is the index of the largest score.
            predicted.extend(torch.max(scores, dim=1)[1].tolist())

    prediction = np.array(predicted)
    y_true = np.array(expected)

    return (
        accuracy_score(y_true, prediction),
        f1_score(y_true, prediction, average='macro'),
        confusion_matrix(y_true, prediction),
    )


In [None]:
# Fine-tune for up to EPOCHS epochs; after each epoch evaluate on the
# validation loader and checkpoint the weights whenever macro-F1 improves.
best_epoch = 0
best_score = 0
best_accuracy = 0
best_confusion_matrix = []
for i in range(EPOCHS):
    print("Epoch: ", i)
    train_accuracy, train_loss = train_epoch(
        model, train_data_loader, loss_func, optimizer, device, scheduler, len(df_train)
    )

    val_accuracy, val_f1_score, val_confusionmatrix = val_epoch(
        model=model, data_loader=val_data_loader, device=device
    )
    print("Val accuracy: "+str(val_accuracy)+"\tVal f1_score: "+str(val_f1_score))
    print("Confsuion Matrix" , val_confusionmatrix)

    if not (val_f1_score > best_score):
        continue

    # New best epoch: remember its metrics and persist the weights.
    best_epoch, best_score, best_accuracy = i, val_f1_score, val_accuracy
    best_confusion_matrix = val_confusionmatrix
    torch.save(model.state_dict(), './electra-oversampled.pt')

print("Ideal number of epochs: ", best_epoch+1)
print("BEST Confsuion Matrix" , best_confusion_matrix)

# Convert the zero-based epoch index into an epoch count for the cells below.
best_epoch += 1


Epoch:  0


In [None]:
import seaborn as sns

In [None]:
# Summarise the best validation epoch found by the training loop and draw its
# confusion matrix as an annotated heat map.
print("Ideal number of epochs: ", best_epoch)
print("Best Score " , best_score)
print("Best accuracy " , best_accuracy )
print("Best Confsuion Matrix : \n " , best_confusion_matrix)
sns.heatmap(best_confusion_matrix, annot=True, cmap='crest')


In [None]:
# Rebuild the classifier and restore the weights of the best validation epoch.
model = HateSpeechClassifier(n_classes=3, lm = bert_model)
# map_location makes the load work even when the checkpoint was written from
# GPU tensors and the current runtime only has a CPU (or a different device).
model.load_state_dict(torch.load('./electra-oversampled.pt', map_location=device))
model = model.to(device)

In [None]:
# Predict a class for every test tweet, keeping each tweet's id next to its prediction.
p=[]
index_list=[]
model.eval()
# Inference only: without no_grad() every forward pass builds an autograd
# graph, which wastes memory and time.
with torch.no_grad():
    for d in test_data_loader:

        output = model(
            input_ids=d['input_ids'].to(device),
            attention_mask=d['attention_mask'].to(device),
        )
        index_list+=(d['index'].tolist())
        # Predicted class = index of the largest score.
        _, prediction=torch.max(output, dim=1)

        p+=prediction.tolist()

In [None]:
# Collect the predictions and their tweet ids into one table.
output_df=pd.DataFrame(columns=['label', 'id'])
output_df['label']=p
output_df['id']=index_list

# sort_values returns a new frame instead of sorting in place; keep the result
# so that the id-ordered table is what gets written out afterwards.
output_df = output_df.sort_values(by=['id'])
output_df

In [None]:
# Write the predictions next to the input data on Drive; utf-8-sig prepends a
# byte-order mark to the file.
output_df.to_csv(path+"outputs-electra.csv", index=False, encoding='utf-8-sig')

In [None]:
# Free memory before building a fresh model. Collect Python garbage first so
# that tensors kept alive only by reference cycles are released, then return
# the now-unused cached blocks to the GPU.
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Number of epochs of the final training run. It must equal the number of
# iterations of the training loop in the next cell (range(4)): the linear
# schedule decays the learning rate to 0 after `total_steps` scheduler steps,
# so sizing it for fewer epochs leaves the remaining epochs training at lr 0.
N_RETRAIN_EPOCHS = 4

# Start again from the pretrained encoder weights with a freshly initialised head.
bert_model = ElectraModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
model = HateSpeechClassifier(n_classes=3, lm = bert_model)
model = model.to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-8)
# The scheduler is stepped once per batch.
total_steps = len(train_data_loader)*N_RETRAIN_EPOCHS

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

loss_func = nn.CrossEntropyLoss().to(device)

In [None]:
# Train for four epochs and keep one checkpoint per epoch.
for i in range(4):
    train_accuracy, train_loss = train_epoch(
        model, train_data_loader, loss_func, optimizer, device, scheduler, len(df_train)
    )
    # Checkpoint name carries the zero-based epoch number.
    filename = './bert-augmented-fnn-2-' + str(i) + '.pt'
    torch.save(model.state_dict(), filename)
    print(i)

In [None]:
# For each per-epoch checkpoint: restore it, predict the test set and write
# the predictions to a CSV named after the checkpoint.
for i in range(4):
    filename='./bert-augmented-fnn-2-'+str(i)
    temp = HateSpeechClassifier(n_classes=3, lm = bert_model)
    # map_location keeps the load working when the checkpoint was saved from
    # GPU tensors and the current runtime uses a different device.
    temp.load_state_dict(torch.load(filename+'.pt', map_location=device))
    temp = temp.to(device)

    p=[]
    index_list=[]
    temp.eval()
    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        for d in test_data_loader:

            output = temp(
                input_ids=d['input_ids'].to(device),
                attention_mask=d['attention_mask'].to(device),
            )
            index_list+=(d['index'].tolist())
            # Predicted class = index of the largest score.
            _, prediction=torch.max(output, dim=1)

            p+=prediction.tolist()
    output_df=pd.DataFrame(columns=['label', 'id'])
    output_df['label']=p
    output_df['id']=index_list

    # sort_values returns a new frame; assign it so the CSV is written in id order.
    output_df = output_df.sort_values(by=['id'])

    output_df.to_csv(filename+'.csv', index=False)
    print(i)

In [None]:
# Rebuild the table from the most recent predictions and display it ordered by id
# (the displayed frame is a sorted copy; output_df itself keeps its row order).
output_df = pd.DataFrame(columns=['label', 'id'])
output_df['label'], output_df['id'] = p, index_list

output_df.sort_values(by='id')