In [203]:
!pip install transformers==3.0.2



In [204]:
!pip install opacus



In [205]:
# Importing the libraries needed
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import torch
import seaborn as sns
import transformers
import json
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaModel, RobertaTokenizer
import logging
logging.basicConfig(level=logging.ERROR)

In [206]:
# Setting up GPU

from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

In [207]:
df = pd.read_csv('Harassment_Cleaned_tweets.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,Key Word,Username,User_ID,Datetime,Favorite_count,Geo,Coordinates,Label,Text,Unnamed: 9,Unnamed: 10,Unnamed: 11
0,704,ass,DeborahParr,1.33e+18,11-11-2020 06:56,0,,,1,He’d have my phone wedged up his ass sideways.,,,
1,1915,boobies,MaxZorin85,1.33e+18,11-11-2020 07:35,4,,,0,Yep 100% agree and the same with severine in s...,,,
2,2856,eat pussy,PRISJ1_,1.33e+18,11-11-2020 10:36,0,,,1,Stop having sex with men that won’t eat your p...,,,
3,2163,Breast Man,Teresamckenzy1,1.33e+18,10-11-2020 20:52,0,,,1,"When you see a sad man, just give him breast t...",,,
4,2852,eat pussy,sj__vazquez,1.33e+18,11-11-2020 10:42,0,,,1,We can't be together if you don't eat pussy,,,


In [208]:
df.describe()

Unnamed: 0.1,Unnamed: 0,User_ID,Favorite_count,Label
count,3604.0,3604.0,3604.0,3604.0
mean,1801.632908,1.329972e+18,1.429245,0.547447
std,1040.729184,3991290000000000.0,10.752237,0.497813
min,0.0,1.32e+18,0.0,0.0
25%,900.75,1.33e+18,0.0,0.0
50%,1801.5,1.33e+18,0.0,1.0
75%,2702.25,1.33e+18,1.0,1.0
max,3604.0,1.35e+18,396.0,1.0


In [209]:
#Preparing Dataset and Dataloader

# Defining some key variables that will be used later on in the training
MAX_LEN = 256
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 4
VIRTUAL_BATCH_SIZE = 32
assert VIRTUAL_BATCH_SIZE % TRAIN_BATCH_SIZE == 0 # VIRTUAL_BATCH_SIZE should be divisible by BATCH_SIZE
N_ACCUMULATION_STEPS = int(VIRTUAL_BATCH_SIZE / TRAIN_BATCH_SIZE)
# EPOCHS = 1
LEARNING_RATE = 1e-05
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', truncation=True, do_lower_case=True)

In [210]:
class TweetData(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.Text
        self.targets = self.data.Label
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = str(self.text[index])
        text = " ".join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]


        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }

In [211]:
train_size = 0.8
train_data=df.sample(frac=train_size,random_state=200)
test_data=df.drop(train_data.index).reset_index(drop=True)
train_data = train_data.reset_index(drop=True)


print("FULL Dataset: {}".format(df.shape))
print("TRAIN Dataset: {}".format(train_data.shape))
print("TEST Dataset: {}".format(test_data.shape))

training_set = TweetData(train_data, tokenizer, MAX_LEN)
testing_set = TweetData(test_data, tokenizer, MAX_LEN)

FULL Dataset: (3604, 13)
TRAIN Dataset: (2883, 13)
TEST Dataset: (721, 13)


In [212]:
LOGGING_INTERVAL = 100 # once every how many steps we run evaluation cycle and report metrics
EPSILON = 0.5
DELTA = 1 / len(training_set) # Parameter for privacy accounting. Probability of not achieving privacy guarantees

In [213]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from opacus.utils.uniform_sampler import UniformWithReplacementSampler

SAMPLE_RATE = TRAIN_BATCH_SIZE / len(training_set)

#train_params = {'batch_size': TRAIN_BATCH_SIZE,
#                'shuffle': True,
#                'num_workers': 0
#                }

#test_params = {'batch_size': VALID_BATCH_SIZE,
#                'shuffle': True,
#                'num_workers': 0
#                }

train_sampler=UniformWithReplacementSampler(
    num_samples=len(training_set),
    sample_rate=SAMPLE_RATE,
)

test_sampler = SequentialSampler(testing_set)

training_loader = DataLoader(training_set, batch_sampler=train_sampler)
testing_loader = DataLoader(testing_set, sampler=test_sampler, batch_size=VALID_BATCH_SIZE)

In [214]:
#Base Roberta model
class RobertaClass(torch.nn.Module):
    def __init__(self):
        super(RobertaClass, self).__init__()
        self.l1 = RobertaModel.from_pretrained("roberta-base")
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.3)
        self.classifier = torch.nn.Linear(768, 5)

    def forward(self, input_ids, attention_mask, token_type_ids):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0]
        pooler = self.pre_classifier(pooler)
        pooler = torch.nn.ReLU()(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return output

In [215]:
model = RobertaClass()
model.to(device)

RobertaClass(
  (l1): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-05, eleme

In [216]:
trainable_layers = [model.pre_classifier, model.classifier]
total_params = 0
trainable_params = 0

for p in model.parameters():
        p.requires_grad = False
        total_params += p.numel()

for layer in trainable_layers:
    for p in layer.parameters():
        p.requires_grad = True
        trainable_params += p.numel()

print(f"Total parameters count: {total_params}") # ~125M
print(f"Trainable parameters count: {trainable_params}") # ~0.5M

Total parameters count: 125240069
Trainable parameters count: 594437


In [217]:
#Finetuning Roberta model

# Creating the loss function and optimizer
loss_function = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(params =  model.parameters(), lr=LEARNING_RATE)

In [218]:
def calcuate_accuracy(preds, targets):
    n_correct = (preds==targets).sum().item()
    return n_correct

In [219]:
EPOCHS = 1

In [220]:
from opacus import PrivacyEngine

MAX_GRAD_NORM = 0.1

privacy_engine = PrivacyEngine(
    module=model,
    sample_rate=SAMPLE_RATE * N_ACCUMULATION_STEPS,
    target_delta = DELTA,
    target_epsilon = EPSILON, 
    epochs = EPOCHS,
    max_grad_norm=MAX_GRAD_NORM,
)
privacy_engine.attach(optimizer)

  "A ``sample_rate`` has been provided."
  "Secure RNG turned off. This is perfectly fine for experimentation as it allows "


In [221]:
#Testing the trained model

def valid(model, testing_loader):
    model.eval()
    n_correct = 0; n_wrong = 0; total = 0; tr_loss=0; nb_tr_steps=0; nb_tr_examples=0
    with torch.no_grad():
        for _, data in tqdm(enumerate(testing_loader, 0)):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
            targets = data['targets'].to(device, dtype = torch.long)
            outputs = model(ids, mask, token_type_ids)
            loss = loss_function(outputs, targets)
            tr_loss += loss.item()
            big_val, big_idx = torch.max(outputs.data, dim=1)
            n_correct += calcuate_accuracy(big_idx, targets)

            nb_tr_steps += 1
            nb_tr_examples+=targets.size(0)
            
            if _%5000==0:
                loss_step = tr_loss/nb_tr_steps
                accu_step = (n_correct*100)/nb_tr_examples
                print(f"Validation Loss per 100 steps: {loss_step}")
                print(f"Validation Accuracy per 100 steps: {accu_step}")

            
        
    epoch_loss = tr_loss/nb_tr_steps
    epoch_accu = (n_correct*100)/nb_tr_examples
    print(f"Validation Loss Epoch: {epoch_loss}")
    print(f"Validation Accuracy Epoch: {epoch_accu}")
    
    return epoch_accu, epoch_loss


In [222]:
# Defining the training function on the 80% of the dataset

def train(epoch, training_loader, testing_loader):
    tr_loss = 0
    n_correct = 0
    nb_tr_steps = 0
    nb_tr_examples = 0
    losses = []
    model.train()
    for _,data in tqdm(enumerate(training_loader, 0)):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.long)

        outputs = model(ids, mask, token_type_ids)
        loss = loss_function(outputs, targets)
        tr_loss += loss.item()
        big_val, big_idx = torch.max(outputs.data, dim=1)
        n_correct += calcuate_accuracy(big_idx, targets)

        nb_tr_steps += 1
        nb_tr_examples+=targets.size(0)
        
        if _%2000==0:
            loss_step = tr_loss/nb_tr_steps
            accu_step = (n_correct*100)/nb_tr_examples 
            print(f"Training Loss per 5000 steps: {loss_step}")
            print(f"Training Accuracy per 5000 steps: {accu_step}")

        optimizer.zero_grad()
        loss.backward()
        losses.append(loss.item())
        # # When using GPU
        if (_ + 1) % 2000 == 0 or _ == len(training_loader) - 1:
            optimizer.step()
        else:
            optimizer.virtual_step()

        if _ > 0 and _ % 2000 == 0:
              train_loss = np.mean(losses)
              eps, alpha = optimizer.privacy_engine.get_privacy_spent(DELTA)

              eval_accuracy,eval_loss = valid(model, testing_loader)

              print(
                  f"Epoch: {epoch} | "
                  f"Step: {_} | "
                  f"Train loss: {train_loss:.3f} | "
                  f"Eval loss: {eval_loss:.3f} | "
                  f"Eval accuracy: {eval_accuracy:.3f} | "
                  f"ɛ: {eps:.2f} (α: {alpha})"
              )

        
    print(f'The Total Accuracy for Epoch {epoch}: {(n_correct*100)/nb_tr_examples}')
    epoch_loss = tr_loss/nb_tr_steps
    epoch_accu = (n_correct*100)/nb_tr_examples
    print(f"Training Loss Epoch: {epoch_loss}")
    print(f"Training Accuracy Epoch: {epoch_accu}")

    return 

In [223]:
EPOCHS = 1
for epoch in range(EPOCHS):
    train(epoch, training_loader, testing_loader)





1it [00:00,  7.58it/s][A[A

Training Loss per 5000 steps: 1.5928491353988647
Training Accuracy per 5000 steps: 40.0




2it [00:00,  6.48it/s][A[A

3it [00:00,  6.63it/s][A[A

4it [00:00,  6.56it/s][A[A

6it [00:00,  7.76it/s][A[A

8it [00:00,  8.87it/s][A[A

9it [00:01,  8.47it/s][A[A

10it [00:01,  7.73it/s][A[A

11it [00:01,  7.92it/s][A[A

13it [00:01,  8.21it/s][A[A

14it [00:01,  7.61it/s][A[A

15it [00:01,  6.90it/s][A[A

16it [00:02,  7.31it/s][A[A

17it [00:02,  7.66it/s][A[A

18it [00:02,  7.25it/s][A[A

19it [00:02,  7.13it/s][A[A

20it [00:02,  6.90it/s][A[A

22it [00:02,  7.65it/s][A[A

23it [00:02,  7.25it/s][A[A

24it [00:03,  7.59it/s][A[A

26it [00:03,  8.87it/s][A[A

27it [00:03,  7.44it/s][A[A

29it [00:03,  7.96it/s][A[A

31it [00:03,  8.12it/s][A[A

33it [00:04,  8.16it/s][A[A

34it [00:04,  6.86it/s][A[A

35it [00:04,  7.08it/s][A[A

36it [00:04,  7.24it/s][A[A

38it [00:04,  7.46it/s][A[A

39it [00:04,  7.73it/s][A[A

41it [00:05,  8.45it/s][A[A

42it [00:05,  8.46it/s][A[A

43it [00:05,  8.25it/s][A[A

44it [00:05,

The Total Accuracy for Epoch 0: 33.57868909919383
Training Loss Epoch: 1.5678919629918204
Training Accuracy Epoch: 33.57868909919383





In [224]:
acc, loss = valid(model, testing_loader)
print("Accuracy on test data = %0.2f%%" % acc)



0it [00:00, ?it/s][A[A

2it [00:00, 17.56it/s][A[A

Validation Loss per 100 steps: 1.5637696981430054
Validation Accuracy per 100 steps: 50.0




4it [00:00, 17.01it/s][A[A

6it [00:00, 16.68it/s][A[A

8it [00:00, 16.59it/s][A[A

10it [00:00, 16.54it/s][A[A

12it [00:00, 16.47it/s][A[A

14it [00:00, 16.42it/s][A[A

16it [00:00, 16.47it/s][A[A

18it [00:01, 16.47it/s][A[A

20it [00:01, 16.49it/s][A[A

22it [00:01, 16.48it/s][A[A

24it [00:01, 16.41it/s][A[A

26it [00:01, 16.40it/s][A[A

28it [00:01, 16.43it/s][A[A

30it [00:01, 16.45it/s][A[A

32it [00:01, 16.43it/s][A[A

34it [00:02, 16.36it/s][A[A

36it [00:02, 16.36it/s][A[A

38it [00:02, 16.42it/s][A[A

40it [00:02, 16.41it/s][A[A

42it [00:02, 16.39it/s][A[A

44it [00:02, 16.27it/s][A[A

46it [00:02, 16.22it/s][A[A

48it [00:02, 16.37it/s][A[A

50it [00:03, 16.42it/s][A[A

52it [00:03, 16.46it/s][A[A

54it [00:03, 16.38it/s][A[A

56it [00:03, 16.38it/s][A[A

58it [00:03, 16.43it/s][A[A

60it [00:03, 16.44it/s][A[A

62it [00:03, 16.42it/s][A[A

64it [00:03, 16.37it/s][A[A

66it [00:04, 16.31it/s][A[A

68it [00:

Validation Loss Epoch: 1.5645007448301789
Validation Accuracy Epoch: 52.56588072122053
Accuracy on test data = 52.57%





In [225]:
output_model_file = 'pytorch_roberta_sentiment.bin'
output_vocab_file = './'

model_to_save = model
torch.save(model_to_save, output_model_file)
tokenizer.save_vocabulary(output_vocab_file)


('./vocab.json', './merges.txt')