In [6]:
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification,AdamW
import torch
from torch.utils.data import DataLoader
import pickle
from sklearn.model_selection import train_test_split
import numpy as np
import tqdm
import sys
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

In [7]:
def accuracy(out_logits,labels):
    predicted = out_logits.detach().cpu().numpy()
    labels = labels.detach().cpu().numpy()
    predicted = np.argmax(predicted,axis=1).reshape(labels.shape)
    return np.mean(labels == predicted)

def val_params(model,val_loader):
    temp = model.eval()
    num_batches = 0
    loss_sum = 0
    accuracy_sum =0
    for batch in val_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss_sum += outputs.loss.item()
        accuracy_sum += accuracy(outputs.logits,labels)
        num_batches +=1
    return loss_sum/num_batches,accuracy_sum/num_batches

class EmpathyDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [36]:
device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')

In [10]:
x = pickle.load(open("./data/x_train.p","rb"))
y = pickle.load(open("./data/y_train.p","rb"))

In [11]:
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.15, random_state=42)

In [12]:
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

In [13]:
tokenizer.save_pretrained("./model/BERT_empathy_fine_tuned/")

('./model/BERT_empathy_fine_tuned/tokenizer_config.json',
 './model/BERT_empathy_fine_tuned/special_tokens_map.json',
 './model/BERT_empathy_fine_tuned/vocab.txt',
 './model/BERT_empathy_fine_tuned/added_tokens.json')

In [7]:
train_encodings = tokenizer(x_train,truncation=True, padding='longest', return_tensors="pt")
val_encodings = tokenizer(x_val,truncation=True, padding='longest', return_tensors="pt")

In [8]:
train_dataset = EmpathyDataset(train_encodings,y_train)
val_dataset = EmpathyDataset(val_encodings,y_val)

In [19]:
BATCH_SIZE = 32
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=True)

NameError: name 'train_dataset' is not defined

In [20]:
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')
model.to(device)
x=model.train()

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classi

In [21]:
optim = AdamW(model.parameters(), lr=5e-5)


In [22]:
EPOCHS=5
prev_val_acc = -1
for epoch in range(EPOCHS):
    temp = model.train()
    loss_sum = 0
    accuracy_sum = 0
    num_batch = 0
    pbar = tqdm.tqdm(train_loader)
    for batch in pbar:
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs[0]
        loss.backward()
        optim.step()
        loss_sum += loss.item()
        accuracy_sum += accuracy(outputs.logits,labels)
        num_batch+=1
        pbar.set_description("Epoch: %s, Train loss: %f, Train accuracy: %f"%(epoch,loss_sum/num_batch,accuracy_sum/num_batch))
    
    val_metric = val_params(model,val_loader)
    sys.stdout.write("         Val loss: %f, Val accuracy: %f"%val_metric)
    sys.stdout.flush()
    
    #Breaking criteria
    if prev_val_acc > val_metric[1]:
        break
    
    prev_val_acc = val_metric[1]
    
    #saving model checkpoint
    model.save_pretrained("../model/BERT_empathy_fine_tuned/")
    


Epoch: 0, Train loss: 0.084816, Train accuracy: 0.970274: 100%|██████████| 1046/1046 [05:30<00:00,  3.16it/s]


         Val loss: 0.066248, Val accuracy: 0.978885

Epoch: 1, Train loss: 0.033268, Train accuracy: 0.989543: 100%|██████████| 1046/1046 [05:30<00:00,  3.17it/s]


         Val loss: 0.049287, Val accuracy: 0.983615

Epoch: 2, Train loss: 0.016980, Train accuracy: 0.994772: 100%|██████████| 1046/1046 [05:30<00:00,  3.17it/s]


         Val loss: 0.056510, Val accuracy: 0.980405

In [None]:
vectorizer = TfidfVectorizer()
x_train = vectorizer.fit_transform(x_train)
x_val = 

In [None]:
lr = LogisticRegression(max_iter=10000)
lr.fit(x_train,y_train)


In [37]:
model = DistilBertForSequenceClassification.from_pretrained("./model/BERT_empathy_fine_tuned/").to(device)

In [38]:
tokenizer = DistilBertTokenizerFast.from_pretrained("./model/BERT_empathy_fine_tuned/")

In [39]:
x = pickle.load(open("./data/x_test.p","rb"))
y = pickle.load(open("./data/y_test.p","rb"))

In [40]:
encodings = tokenizer(x,truncation=True, padding='longest', return_tensors="pt")
empathy_dataset_test = EmpathyDataset(encodings,y)
test_data_loader = DataLoader(empathy_dataset_test,batch_size=BATCH_SIZE, shuffle=True)
x = model.eval()

In [41]:
x = val_params(model,test_data_loader)

In [42]:
x

(0.05369445616336462, 0.9830069124423964)

In [43]:
print("Test accuracy is ",x[1])

Test accuracy is  0.9830069124423964
