In [34]:
import pandas as pd
import numpy as np
import torch
from transformers import BertTokenizer, BertModel
from torch import nn
from torch.optim import Adam
from tqdm import tqdm

In [35]:
df = pd.read_csv('/app/data/gutenberg_paragraphs.csv')
df.dropna(inplace=True)
df.reset_index(drop=True, inplace=True)

#count paragraphs per author
df.groupby('Author').count()

#select only X paragraphs per author
df = df.groupby('Author').head(20).reset_index(drop=True)



In [64]:
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

authors = df['Author'].unique()
author2idx = {author: idx for idx, author in enumerate(authors)}
idx2author = {idx: author for idx, author in enumerate(authors)}

class Dataset(torch.utils.data.Dataset):

    def __init__(self, df):
        
        self.labels = [author2idx[author] for author in df['Author']]
        print(self.labels)
        self.texts = [ tokenizer(paragraph, padding='max_length', max_length = 512, truncation=True,
                                return_tensors="pt") for paragraph in df['Paragraph']]

    def classes(self):
        return self.labels

    def __len__(self):
        return len(self.labels)

    def get_batch_labels(self, idx):
        # Fetch a batch of labels
        return np.array(self.labels[idx])

    def get_batch_texts(self, idx):
        # Fetch a batch of inputs
        return self.texts[idx]

    def __getitem__(self, idx):

        batch_texts = self.get_batch_texts(idx)
        batch_y = self.get_batch_labels(idx)

        return batch_texts, batch_y

In [65]:
#https://towardsdatascience.com/text-classification-with-bert-in-pytorch-887965e5820f
class BertClassifier(nn.Module):

    def __init__(self):

        super(BertClassifier, self).__init__()

        self.bert = BertModel.from_pretrained('bert-base-cased')
        self.linear = nn.Linear(768, len(authors))
        self.relu = nn.ReLU()

    def forward(self, input_id, mask):

        _, pooled_output = self.bert(input_ids= input_id, attention_mask=mask,return_dict=False)
        linear_output = self.linear(pooled_output)
        final_layer = self.relu(linear_output)
        return final_layer

In [38]:
def train(model, train_data, val_data, learning_rate, epochs):

    train, val = Dataset(train_data), Dataset(val_data)

    train_dataloader = torch.utils.data.DataLoader(train, batch_size=2, shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(val, batch_size=2)

    use_cuda = torch.cuda.is_available()
    #use_cuda = False
    device = torch.device("cuda" if use_cuda else "cpu")

    criterion = nn.CrossEntropyLoss()
    optimizer = Adam(model.parameters(), lr= learning_rate)

    if use_cuda:

            model = model.cuda()
            criterion = criterion.cuda()

    for epoch_num in range(epochs):

            total_acc_train = 0
            total_loss_train = 0

            for train_input, train_label in tqdm(train_dataloader):

                train_label = train_label.to(device)
                mask = train_input['attention_mask'].to(device)
                input_id = train_input['input_ids'].squeeze(1).to(device)

                output = model(input_id, mask)
                
                batch_loss = criterion(output, train_label.long())
                total_loss_train += batch_loss.item()
                
                acc = (output.argmax(dim=1) == train_label).sum().item()
                total_acc_train += acc

                model.zero_grad()
                batch_loss.backward()
                optimizer.step()
            
            total_acc_val = 0
            total_loss_val = 0

            with torch.no_grad():

                for val_input, val_label in val_dataloader:

                    val_label = val_label.to(device)
                    mask = val_input['attention_mask'].to(device)
                    input_id = val_input['input_ids'].squeeze(1).to(device)

                    output = model(input_id, mask)

                    batch_loss = criterion(output, val_label.long())
                    total_loss_val += batch_loss.item()
                    
                    acc = (output.argmax(dim=1) == val_label).sum().item()
                    total_acc_val += acc
            
            print(
                f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / len(train_data): .3f} | Train Accuracy: {total_acc_train / len(train_data): .3f} | Val Loss: {total_loss_val / len(val_data): .3f} | Val Accuracy: {total_acc_val / len(val_data): .3f}')
                  

In [39]:
def evaluate(model, test_data):

    test = Dataset(test_data)

    test_dataloader = torch.utils.data.DataLoader(test, batch_size=2)

    use_cuda = torch.cuda.is_available()
    #use_cuda = False
    device = torch.device("cuda" if use_cuda else "cpu")

    if use_cuda:

        model = model.cuda()

    total_acc_test = 0
    with torch.no_grad():

        for test_input, test_label in test_dataloader:

              test_label = test_label.to(device)
              mask = test_input['attention_mask'].to(device)
              input_id = test_input['input_ids'].squeeze(1).to(device)

              output = model(input_id, mask)
              acc = (output.argmax(dim=1) == test_label).sum().item()
              total_acc_test += acc
    
    print(f'Test Accuracy: {total_acc_test / len(test_data): .3f}')

In [40]:
np.random.seed(112)
df_train, df_val, df_test = np.split(df.sample(frac=1, random_state=42), 
                                     [int(.8*len(df)), int(.9*len(df))])

print(len(df_train),len(df_val), len(df_test))

480 60 60


In [41]:
EPOCHS = 5
model = BertClassifier()
LR = 1e-6
              
train(model, df_train, df_val, LR, EPOCHS)

You are using a model of type distilbert to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
Some weights of the model checkpoint at distilbert-base-cased were not used when initializing BertModel: ['distilbert.transformer.layer.4.ffn.lin2.weight', 'distilbert.transformer.layer.3.attention.q_lin.weight', 'distilbert.transformer.layer.3.sa_layer_norm.weight', 'distilbert.transformer.layer.0.ffn.lin2.bias', 'distilbert.transformer.layer.4.output_layer_norm.bias', 'distilbert.transformer.layer.1.sa_layer_norm.bias', 'distilbert.transformer.layer.1.attention.out_lin.weight', 'distilbert.transformer.layer.5.ffn.lin2.weight', 'distilbert.transformer.layer.4.sa_layer_norm.weight', 'distilbert.transformer.layer.0.ffn.lin1.bias', 'distilbert.embeddings.LayerNorm.bias', 'distilbert.transformer.layer.5.sa_layer_norm.weight', 'distilbert.transformer.layer.1.attention.q_lin.weight', 'distilbert.transformer.layer.5.sa_layer_norm.bias', 'd

[2, 0, 1, 3, 2, 0, 1, 4, 2, 0, 3, 2, 5, 1, 4, 0, 1, 0, 4, 0, 2, 1, 3, 1, 4, 3, 4, 0, 1, 2, 1, 2, 5, 3, 0, 4, 3, 1, 1, 4, 5, 5, 1, 1, 0, 0, 2, 5, 0, 1, 3, 0, 4, 1, 1, 0, 3, 0, 3, 4, 5, 2, 0, 3, 3, 2, 1, 1, 0, 0, 4, 1, 5, 2, 4, 1, 0, 5, 1, 4, 5, 5, 1, 2, 5, 0, 1, 5, 0, 1, 5, 3, 5, 5, 0, 0, 5, 4, 2, 5, 0, 2, 1, 5, 0, 2, 5, 0, 1, 1, 0, 2, 1, 1, 3, 4, 2, 3, 3, 4, 5, 0, 5, 3, 4, 5, 1, 4, 1, 2, 3, 1, 5, 5, 2, 5, 5, 2, 4, 5, 2, 0, 2, 0, 4, 4, 5, 5, 4, 0, 1, 3, 5, 5, 4, 3, 4, 2, 4, 5, 2, 0, 3, 0, 0, 1, 5, 0, 1, 5, 1, 4, 3, 5, 4, 3, 4, 4, 5, 2, 4, 2, 1, 3, 2, 5, 1, 2, 3, 0, 2, 3, 5, 1, 2, 2, 4, 5, 3, 0, 0, 5, 3, 3, 2, 2, 2, 4, 2, 2, 3, 3, 3, 4, 0, 1, 0, 3, 4, 4, 2, 2, 2, 4, 3, 3, 4, 5, 4, 0, 2, 3, 5, 4, 3, 1, 0, 4, 1, 5, 1, 5, 5, 3, 4, 3, 3, 4, 0, 2, 3, 2, 0, 1, 2, 0, 5, 0, 4, 2, 2, 5, 1, 2, 3, 3, 1, 2, 3, 2, 0, 5, 2, 5, 2, 1, 1, 0, 1, 1, 3, 0, 4, 3, 0, 3, 0, 1, 0, 2, 1, 4, 2, 4, 5, 3, 1, 5, 3, 1, 0, 3, 1, 3, 2, 2, 2, 0, 3, 2, 5, 3, 2, 1, 2, 5, 2, 2, 4, 1, 2, 2, 5, 2, 0, 4, 5, 0, 2, 4, 0, 4, 0, 

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 240/240 [00:19<00:00, 12.23it/s]


Epochs: 1 | Train Loss:  0.898 | Train Accuracy:  0.160 | Val Loss:  0.896 | Val Accuracy:  0.133


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 240/240 [00:19<00:00, 12.11it/s]


Epochs: 2 | Train Loss:  0.896 | Train Accuracy:  0.169 | Val Loss:  0.896 | Val Accuracy:  0.133


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 240/240 [00:19<00:00, 12.03it/s]


Epochs: 3 | Train Loss:  0.896 | Train Accuracy:  0.169 | Val Loss:  0.896 | Val Accuracy:  0.133


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 240/240 [00:20<00:00, 11.99it/s]


Epochs: 4 | Train Loss:  0.896 | Train Accuracy:  0.169 | Val Loss:  0.896 | Val Accuracy:  0.133


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 240/240 [00:20<00:00, 11.97it/s]


Epochs: 5 | Train Loss:  0.896 | Train Accuracy:  0.169 | Val Loss:  0.896 | Val Accuracy:  0.133


In [42]:
evaluate(model, df_test)

[1, 4, 2, 1, 1, 4, 5, 4, 3, 3, 2, 0, 4, 0, 0, 0, 1, 0, 0, 4, 0, 1, 3, 1, 1, 1, 1, 1, 4, 1, 0, 1, 1, 4, 2, 5, 3, 3, 5, 4, 0, 2, 3, 3, 5, 5, 5, 5, 2, 4, 0, 3, 3, 4, 0, 3, 5, 2, 4, 2]
Test Accuracy:  0.183


In [45]:
from matplotlib import pyplot as plt
#plot confusion matrix function
def plot_confusion_matrix(matrix, labels):

    fig = plt.figure(figsize=(10, 10))
    ax = fig.add_subplot(111)
    cax = ax.matshow(matrix)
    fig.colorbar(cax)

    ax.set_xticklabels([''] + labels, rotation=90)
    ax.set_yticklabels([''] + labels)

    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.show()

In [62]:
#plot confusion matrix
test = Dataset(df_test)
test_dataloader = torch.utils.data.DataLoader(test, batch_size=1)

device = torch.device("cuda")
#model = model.cuda()
y_pred = []
y_true = []

with torch.inference_mode():            
        for test_input, test_label in test_dataloader:
    
            test_label = test_label.to(device)
            mask = test_input['attention_mask'].to(device)
            input_id = test_input['input_ids'].squeeze(1).to(device)
    
            output = model(input_id, mask)
            print(output)
            y_pred.extend(output.argmax(dim=1).tolist())
            y_true.extend(test_label.tolist())

from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_true, y_pred)
plot_confusion_matrix(cm, authors)


[1, 4, 2, 1, 1, 4, 5, 4, 3, 3, 2, 0, 4, 0, 0, 0, 1, 0, 0, 4, 0, 1, 3, 1, 1, 1, 1, 1, 4, 1, 0, 1, 1, 4, 2, 5, 3, 3, 5, 4, 0, 2, 3, 3, 5, 5, 5, 5, 2, 4, 0, 3, 3, 4, 0, 3, 5, 2, 4, 2]


TypeError: 'collections.OrderedDict' object is not callable

In [43]:
#save model
torch.save(model.state_dict(), 'model.pt')

In [44]:
text = '''      It can be imagined that my close intimacy with Sherlock Holmes
      had interested me deeply in crime, and that after his
      disappearance I never failed to read with care the various
      problems which came before the public. And I even attempted, more
      than once, for my own private satisfaction, to employ his methods
      in their solution, though with indifferent success. There was
      none, however, which appealed to me like this tragedy of Ronald
      Adair. As I read the evidence at the inquest, which led up to a
      verdict of willful murder against some person or persons unknown,
      I realized more clearly than I had ever done the loss which the
      community had sustained by the death of Sherlock Holmes. There
      were points about this strange business which would, I was sure,
      have specially appealed to him, and the efforts of the police
      would have been supplemented, or more probably anticipated, by
      the trained observation and the alert mind of the first criminal
      agent in Europe. All day, as I drove upon my round, I turned over
      the case in my mind and found no explanation which appeared to me
      to be adequate. At the risk of telling a twice-told tale, I will
      recapitulate the facts as they were known to the public at the
      conclusion of the inquest.'''
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
input_id = tokenizer(text, padding='max_length', max_length = 512, truncation=True, return_tensors="pt")['input_ids']
mask = tokenizer(text, padding='max_length', max_length = 512, truncation=True, return_tensors="pt")['attention_mask']

#to cuda
input_id = input_id.cuda()
mask = mask.cuda()

output = model.forward(input_id, mask)

idx2author[output.argmax(dim=1).item()]


'Austen, Jane'