In [1]:
from transformers import BertTokenizer, BertModel
import torch
from torch import nn
from torch.optim import Adam
from tqdm import tqdm
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report, roc_auc_score

import sys
from IPython.core import ultratb
sys.excepthook = ultratb.FormattedTB(color_scheme='Linux', call_pdb=False)

tokenizer = BertTokenizer.from_pretrained('prajjwal1/bert-tiny')

  torch.utils._pytree._register_pytree_node(


In [2]:
class Dataset(torch.utils.data.Dataset):

    def __init__(self, X, Y):

        self.labels = np.array(Y)
        self.texts = [tokenizer(text, 
                                padding='max_length', max_length = 512,
                                # truncation=True,
                                return_tensors="pt") for text in X]

    def classes(self):
        return self.labels

    def __len__(self):
        return len(self.labels)

    def get_batch_labels(self, idx):
        # Fetch a batch of labels
        return np.array(self.labels[idx])

    def get_batch_texts(self, idx):
        # Fetch a batch of inputs
        return self.texts[idx]

    def __getitem__(self, idx):

        batch_texts = self.get_batch_texts(idx)
        batch_y = self.get_batch_labels(idx)

        return batch_texts, batch_y

In [3]:
class BertClassifier(nn.Module):

    def __init__(self, dropout=0.5):

        super(BertClassifier, self).__init__()

        self.bert = BertModel.from_pretrained('prajjwal1/bert-tiny')
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(128, 2)
        self.relu = nn.ReLU()

    def forward(self, input_id, mask):

        _, pooled_output = self.bert(input_ids= input_id, attention_mask=mask,return_dict=False)
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        final_layer = self.relu(linear_output)

        return final_layer

In [4]:
def train(model, X,Y, learning_rate, epochs, batch_size):

    train = Dataset(X,Y)
    train_dataloader = torch.utils.data.DataLoader(train, batch_size, shuffle=True)

    device = torch.device("cpu")

    criterion = nn.CrossEntropyLoss()
    optimizer = Adam(model.parameters(), lr= learning_rate)

    for epoch_num in range(epochs):

        total_acc_train = 0
        total_loss_train = 0

    for train_input, train_label in tqdm(train_dataloader):

        train_label = train_label.to(device)
        mask = train_input['attention_mask'].to(device)
        input_id = train_input['input_ids'].squeeze(1).to(device)

        output = model(input_id, mask)
      
        batch_loss = criterion(output, train_label)
        total_loss_train += batch_loss.item()
      
        acc = (output.argmax(dim=1) == train_label).sum().item()
        total_acc_train += acc

        model.zero_grad()
        batch_loss.backward()
        optimizer.step()
    
    
      print(f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / len(X): .3f} | Train Accuracy: {total_acc_train / len(X): .3f}')


def evaluate(model, X,Y, batch_size):

    test = Dataset(X, Y)
    test_dataloader = torch.utils.data.DataLoader(test, batch_size,shuffle=False)

    device = torch.device("cpu")
    y_pred = []
    
    with torch.no_grad():

    for test_input, test_label in test_dataloader:

        test_label = test_label.to(device)
        mask = test_input['attention_mask'].to(device)
        input_id = test_input['input_ids'].squeeze(1).to(device)

        output = model(input_id, mask)
        y_pred.append(output.argmax(dim=1))

    return y_pred

In [5]:
df = pd.read_json("./Sarcasm_Headlines_Dataset_v2.json", lines=True)

X = df['headline']
Y = df['is_sarcastic']

data_split = int(df.shape[0] * 0.75)
X_train, X_test = X[:data_split], X[data_split:]
y_train, y_test = Y[:data_split], Y[data_split:]
df.head()

In [6]:
EPOCHS = 1 #5
batch_size = 32
model = BertClassifier()
LR = 1e-4
              
train(model, X_train, y_train, LR, EPOCHS, batch_size)

Downloading pytorch_model.bin:   0%|          | 0.00/17.8M [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
100%|████████████████████████████████████████████████████████████████████████████████| 671/671 [09:32<00:00,  1.17it/s]

Epochs: 1 | Train Loss:  0.015 | Train Accuracy:  0.762





In [7]:
y_pred = evaluate(model, X_test,y_test, batch_size)

y_pred_ = torch.cat(y_pred, dim=0)

y_pred_=y_pred_.cpu().detach().numpy()

print(classification_report(y_test.values, y_pred_))
print(roc_auc_score(y_test, y_pred_))

# ummm so this is from https://github.com/nguyenduchuyvn/Udacity-Data-Scientist-Nanodegree/tree/main/MyCapstoneProject
# :3 :3

              precision    recall  f1-score   support

           0       0.87      0.88      0.87      3745
           1       0.86      0.86      0.86      3410

    accuracy                           0.87      7155
   macro avg       0.87      0.87      0.87      7155
weighted avg       0.87      0.87      0.87      7155

0.8671640388553262


In [31]:
def mypredict(x):
    device = torch.device("cpu")
    
    with torch.no_grad():
        texts = tokenizer(x, padding='max_length', max_length = 512, return_tensors="pt")
        mask = texts['attention_mask'].to(device)
        input_id = texts['input_ids'].squeeze(1).to(device) # squueze remove all dimensions size 1
        output = model(input_id, mask)
        
        return output
mypredict('You are so pretty.')

tensor([[4.0045, 0.0000]])

In [65]:
def single_train(x):
    device = torch.device("cpu")
    
    criterion = nn.CrossEntropyLoss()
    optimizer = Adam(model.parameters(), lr=1e-4)
    
    texts = tokenizer(x, padding='max_length', max_length = 512, return_tensors="pt")
    mask = texts['attention_mask'].to(device)
    input_id = texts['input_ids'].squeeze(1).to(device) # squueze remove all dimensions size 1
    
    output = model(input_id, mask)
    print(type(output), output, output.grad_fn)

    batch_loss = criterion(output, torch.tensor([0]))
    print(type(batch_loss), batch_loss, batch_loss.grad_fn)
    
    model.zero_grad()
    batch_loss.backward()
    optimizer.step()

    return output
single_train('You are so pretty.')

<class 'torch.Tensor'> tensor([[3.8836, 0.0000]], grad_fn=<ReluBackward0>) <ReluBackward0 object at 0x0000023D4995FEE0>
<class 'torch.Tensor'> tensor(0.0204, grad_fn=<NllLossBackward0>) <NllLossBackward0 object at 0x0000023D4995FEE0>


tensor([[3.8836, 0.0000]], grad_fn=<ReluBackward0>)

In [66]:
torch.save(model.state_dict(), "sarcasm.model")

In [71]:
model = BertClassifier().to(torch.device("cpu"))
model.load_state_dict(torch.load("sarcasm.model"))

<All keys matched successfully>