In [1]:
#Install libraries
!pip3 install transformers
!pip3 install torch



In [2]:
!pip install opacus



In [3]:
# Importing libraries
import pandas as pd
import torch
import transformers
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertModel, DistilBertTokenizer

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import transformers
from transformers import AutoModel, BertTokenizerFast
from tqdm import tqdm


# Use GPU
device = torch.device("cuda")

In [4]:
# Setting up GPU
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

In [5]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [6]:
df = pd.read_csv('/content/drive/My Drive/dep1_cleaned.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,tweet,target
0,0,Today in Selfcare: beauty ; laughs Kung Fu Pan...,0
1,1,I get to spend New Year's home again alone and...,1
2,2,"Depressed and lonely /: Stuck in a deep, never...",1
3,3,If this is your response to someone saying the...,0
4,4,Apparently you get a free pass just by mention...,0


In [7]:
#Initialization
MAX_LEN = 512
VIRTUAL_BATCH_SIZE = 32
TRAIN_BATCH_SIZE = 8

assert VIRTUAL_BATCH_SIZE % TRAIN_BATCH_SIZE == 0 # VIRTUAL_BATCH_SIZE should be divisible by BATCH_SIZE
N_ACCUMULATION_STEPS = int(VIRTUAL_BATCH_SIZE / TRAIN_BATCH_SIZE)

EPOCHS = 10
LEARNING_RATE = 1e-05
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')

In [8]:
#The class is defined to accept the Dataframe as input and generate tokenized output that is used by the DistilBERT model for training.
#The tokenizer uses the encode_plus method to perform tokenization and generate the necessary outputs, namely: ids, attention_mask
class Triage(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len
        
    def __getitem__(self, index):
        title = str(self.data.tweet[index])
        title = " ".join(title.split())
        inputs = self.tokenizer.encode_plus(
            title,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'targets': torch.tensor(self.data.target[index], dtype=torch.long)
        } 
    
    def __len__(self):
        return self.len

In [9]:
# Creating the dataset and dataloader for the neural network

train_size = 0.8
train_dataset=df.sample(frac=train_size,random_state=200)
test_dataset=df.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)


print("FULL Dataset: {}".format(df.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

training_set = Triage(train_dataset, tokenizer, MAX_LEN)
testing_set = Triage(test_dataset, tokenizer, MAX_LEN)

FULL Dataset: (3096, 3)
TRAIN Dataset: (2477, 3)
TEST Dataset: (619, 3)


In [10]:
SAMPLE_RATE = TRAIN_BATCH_SIZE / len(training_set)
LOGGING_INTERVAL = 100 # once every how many steps we run evaluation cycle and report metrics
EPSILON = 3
DELTA = 1 / len(training_set) # Parameter for privacy accounting. Probability of not achieving privacy guarantees
DELTA

0.0004037141703673799

In [11]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from opacus.utils.uniform_sampler import UniformWithReplacementSampler

SAMPLE_RATE = TRAIN_BATCH_SIZE / len(training_set)

#train_params = {'batch_size': TRAIN_BATCH_SIZE,
#                'shuffle': True,
#                'num_workers': 0
#                }

#test_params = {'batch_size': VALID_BATCH_SIZE,
#                'shuffle': True,
#                'num_workers': 0
#                }

train_sampler=UniformWithReplacementSampler(
    num_samples=len(training_set),
    sample_rate=SAMPLE_RATE,
)

test_sampler = SequentialSampler(testing_set)

training_loader = DataLoader(training_set, batch_sampler=train_sampler)
testing_loader = DataLoader(testing_set, sampler=test_sampler, batch_size=TRAIN_BATCH_SIZE//2)

In [12]:
# Creating the customized model, by adding a drop out and a dense layer on top of distil bert to get the final output for the model. 

class DistillBERTClass(torch.nn.Module):
    def __init__(self):
        super(DistillBERTClass, self).__init__()
        self.l1 = DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.3)
        self.classifier = torch.nn.Linear(768, 4)

    def forward(self, input_ids, attention_mask):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0]
        pooler = self.pre_classifier(pooler)
        pooler = torch.nn.ReLU()(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return output

In [13]:
model = DistillBERTClass()
model.to(device)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


DistillBERTClass(
  (l1): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Linear(in_feat

In [14]:
# Creating the loss function and optimizer
loss_function = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

In [15]:
# Function to calcuate the accuracy of the model

def calcuate_accuracy(big_idx, targets):
    n_correct = (big_idx==targets).sum().item()
    return n_correct

In [16]:
trainable_layers = [model.pre_classifier, model.classifier]
total_params = 0
trainable_params = 0

for p in model.parameters():
        p.requires_grad = False
        total_params += p.numel()

for layer in trainable_layers:
    for p in layer.parameters():
        p.requires_grad = True
        trainable_params += p.numel()

print(f"Total parameters count: {total_params}") # ~66M
print(f"Trainable parameters count: {trainable_params}") # ~0.5M

Total parameters count: 66956548
Trainable parameters count: 593668


In [17]:
from opacus import PrivacyEngine

MAX_GRAD_NORM = 0.1

privacy_engine = PrivacyEngine(
    module=model,
    sample_rate=SAMPLE_RATE * N_ACCUMULATION_STEPS,
    target_delta = DELTA,
    target_epsilon = EPSILON, 
    epochs = EPOCHS,
    max_grad_norm=MAX_GRAD_NORM,
)
privacy_engine.attach(optimizer)

  "A ``sample_rate`` has been provided."
  "Secure RNG turned off. This is perfectly fine for experimentation as it allows "


In [18]:
#Testing the trained model

def valid(model, testing_loader):
    model.eval()
    n_correct = 0; n_wrong = 0; total = 0; tr_loss=0; nb_tr_steps=0; nb_tr_examples=0
    with torch.no_grad():
        for _, data in tqdm(enumerate(testing_loader, 0)):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            #token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
            targets = data['targets'].to(device, dtype = torch.long)
            outputs = model(ids, mask).squeeze()
            loss = loss_function(outputs, targets)
            tr_loss += loss.item()
            big_val, big_idx = torch.max(outputs.data, dim=1)
            n_correct += calcuate_accuracy(big_idx, targets)

            nb_tr_steps += 1
            nb_tr_examples+=targets.size(0)
            
            if _% 1000==0:
                loss_step = tr_loss/nb_tr_steps
                accu_step = (n_correct*100)/nb_tr_examples
                print(f"Validation Loss per 5000 steps: {loss_step}")
                print(f"Validation Accuracy per 5000 steps: {accu_step}")

            
        
    epoch_loss = tr_loss/nb_tr_steps
    epoch_accu = (n_correct*100)/nb_tr_examples
    print(f"Validation Loss Epoch: {epoch_loss}")
    print(f"Validation Accuracy Epoch: {epoch_accu}")
    
    return epoch_accu, epoch_loss


In [19]:
# Defining the training function on the 80% of the dataset for tuning the distilbert model

def train(epoch, training_loader, testing_loader):
    tr_loss = 0
    n_correct = 0
    nb_tr_steps = 0
    nb_tr_examples = 0
    model.train()
    losses=[]
    for _,data in enumerate(training_loader, 0):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.long)

        outputs = model(ids, mask)
        loss = loss_function(outputs, targets)
        tr_loss += loss.item()
        big_val, big_idx = torch.max(outputs.data, dim=1)
        n_correct += calcuate_accuracy(big_idx, targets)

        nb_tr_steps += 1
        nb_tr_examples+=targets.size(0)
        
        if _% 5000==0:
            loss_step = tr_loss/nb_tr_steps
            accu_step = (n_correct*100)/nb_tr_examples 
            print(f"Training Loss per 5000 steps: {loss_step}")
            print(f"Training Accuracy per 5000 steps: {accu_step}")

        optimizer.zero_grad()
        loss.backward()
        losses.append(loss.item())
        
        
        if (_ + 1) % 1000 == 0 or _ == len(training_loader) - 1:
            optimizer.step()
        else:
            optimizer.virtual_step()

        if _ > 0 and _ % 1000 == 0:
              train_loss = np.mean(losses)
              eps, alpha = optimizer.privacy_engine.get_privacy_spent(DELTA)

              eval_accuracy,eval_loss = valid(model, testing_loader)

              print(
                  f"Epoch: {epoch} | "
                  f"Step: {_} | "
                  f"Train loss: {train_loss:.3f} | "
                  f"Eval loss: {eval_loss:.3f} | "
                  f"Eval accuracy: {eval_accuracy:.3f} | "
                  f"ɛ: {eps:.2f} (α: {alpha})"
              )

        
    print(f'The Total Accuracy for Epoch {epoch}: {(n_correct*100)/nb_tr_examples}')
    epoch_loss = tr_loss/nb_tr_steps
    epoch_accu = (n_correct*100)/nb_tr_examples
    print(f"Training Loss Epoch: {epoch_loss}")
    print(f"Training Accuracy Epoch: {epoch_accu}")

    return 

In [20]:

for epoch in range(EPOCHS):
    train(epoch, training_loader, testing_loader)



Training Loss per 5000 steps: 1.4406931400299072
Training Accuracy per 5000 steps: 40.0
The Total Accuracy for Epoch 0: 24.25357873210634
Training Loss Epoch: 1.4761463943808595
Training Accuracy Epoch: 24.25357873210634
Training Loss per 5000 steps: 1.5060145854949951
Training Accuracy per 5000 steps: 20.0
The Total Accuracy for Epoch 1: 22.293762575452718
Training Loss Epoch: 1.4808969655854802
Training Accuracy Epoch: 22.293762575452718
Training Loss per 5000 steps: 1.5409574508666992
Training Accuracy per 5000 steps: 8.333333333333334
The Total Accuracy for Epoch 2: 25.097580015612802
Training Loss Epoch: 1.4746163512510775
Training Accuracy Epoch: 25.097580015612802
Training Loss per 5000 steps: 1.4436732530593872
Training Accuracy per 5000 steps: 33.333333333333336
The Total Accuracy for Epoch 3: 24.459567654123298
Training Loss Epoch: 1.4732252914928696
Training Accuracy Epoch: 24.459567654123298
Training Loss per 5000 steps: 1.4949116706848145
Training Accuracy per 5000 steps: 

In [21]:
acc, loss = valid(model, testing_loader)
print("Accuracy on test data = %0.2f%%" % acc)

2it [00:00, 13.96it/s]

Validation Loss per 5000 steps: 1.4742932319641113
Validation Accuracy per 5000 steps: 25.0


155it [00:12, 12.49it/s]

Validation Loss Epoch: 1.4740793528095368
Validation Accuracy Epoch: 28.594507269789982
Accuracy on test data = 28.59%



