In [1]:
import logging
from pathlib import Path
from typing import List, Mapping, Tuple

import pandas as pd
import torch as nn
import numpy as np
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoConfig, AutoModel
from catalyst.utils import set_global_seed
from data import TextClassificationDataset
#sklearn -> train_test_split 
from sklearn.model_selection import train_test_split

In [2]:
MODEL_NAME = 'distilbert-base-uncased'

In [3]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [4]:
tokenizer

PreTrainedTokenizerFast(name_or_path='distilbert-base-uncased', vocab_size=30522, model_max_len=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [5]:
input_text = 'MOTION for Protective Order Re HIPAA by Defendants Marvin Powers'

In [6]:
# Encode one example into a fixed-length sequence of 16 token ids (PyTorch tensors).
output_dict = tokenizer.encode_plus(
    input_text,
    add_special_tokens=True, #wraps the sequence in [CLS] ... [SEP]; no [MASK] token is inserted
    padding="max_length", #right-pads with [PAD] until the sequence is max_length long
    max_length=16,
    return_tensors="pt",
    truncation=True,#longer inputs are cut to max_length; the unit is WordPiece tokens, not words
    return_attention_mask=True,
)

In [7]:
output_dict
# input_ids: the WordPiece token ids of the text; attention_mask: 1 marks a real token, 0 marks padding

{'input_ids': tensor([[  101,  4367,  2005,  9474,  2344,  2128,  5099, 11057,  2011, 16362,
         13748,  4204,   102,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]])}

In [8]:
voc = tokenizer.get_vocab()

In [9]:
voc.get('yo')

10930

In [10]:
len(voc)

30522

In [11]:
inv_vocab = {v:k for (k, v) in voc.items()}

In [12]:
# Map every id back to its token string; .tolist()[0] selects the single row of the (1, 16) batch.
[inv_vocab[i] for i in output_dict['input_ids'].tolist()[0]] 

['[CLS]',
 'motion',
 'for',
 'protective',
 'order',
 're',
 'hip',
 '##aa',
 'by',
 'defendants',
 'marvin',
 'powers',
 '[SEP]',
 '[PAD]',
 '[PAD]',
 '[PAD]']

## Dataset Statistics and Data Preparation
Reference Kaggle notebook (DistilBERT + Catalyst on Amazon product reviews):
https://www.kaggle.com/kashnitsky/distillbert-catalyst-amazon-product-reviews

In [2]:
# Load the complete docket-entries table. Despite its name, `train_df` holds every row;
# the train/valid/test partition is derived from it further down.
train_df = pd.read_csv('../data/motions_laws/nwu_docket_entries.csv')
train_df.head()

Unnamed: 0,legacyid,date,number,description
0,IL-CDCT1:15CV01001,03/10/2015,4.0,NOTICE of Appearance of Attorney by Joseph S T...
1,IL-CDCT1:15CV01001,04/23/2015,9.0,NOTICE of Appearance of Attorney by Christophe...
2,IL-CDCT1:15CV01001,01/04/2016,11.0,Joint MOTION for Extension of Time to Complete...
3,IL-CDCT1:15CV01001,01/11/2016,,TEXT ORDER granting <gil>11</gil> Motion for...
4,IL-CDCT1:15CV01001,04/22/2016,,ORAL MOTION by Attorney Busey for continuance ...


In [14]:
# NOTE: `df` is a DataFrameGroupBy, not a DataFrame. GroupBy.head() returns the first
# 5 rows of *each* legacyid group, so the displayed result spans the whole table.
df = train_df.groupby('legacyid')
df.head()

Unnamed: 0,legacyid,date,number,description
0,IL-CDCT1:15CV01001,03/10/2015,4.0,NOTICE of Appearance of Attorney by Joseph S T...
1,IL-CDCT1:15CV01001,04/23/2015,9.0,NOTICE of Appearance of Attorney by Christophe...
2,IL-CDCT1:15CV01001,01/04/2016,11.0,Joint MOTION for Extension of Time to Complete...
3,IL-CDCT1:15CV01001,01/11/2016,,TEXT ORDER granting <gil>11</gil> Motion for...
4,IL-CDCT1:15CV01001,04/22/2016,,ORAL MOTION by Attorney Busey for continuance ...
...,...,...,...,...
637599,WI-WDCT3:19CV01066,01/08/2020,3.0,Motion to Admit Bradley Bodiford Pro Hac Vice....
637600,WI-WDCT3:19CV01066,01/08/2020,4.0,** TEXT ONLY ORDER ** ORDER granting <gil>3</...
637601,WI-WDCT3:19CV01066,01/28/2020,7.0,Motion for Extension of Time by Plaintiff Step...
637602,WI-WDCT3:19CV01066,01/31/2020,8.0,ORDER denying as premature <gil>7</gil> Moti...


In [15]:
series = train_df['legacyid'].value_counts()
series['IL-CDCT1:15CV01001']

23

In [16]:
# Count the distinct non-missing values in the `number` column.
train_df['number'].nunique()

999

In [17]:
# Whitespace-delimited word count of every description, followed by summary statistics.
words_per_description = train_df['description'].map(lambda text: len(text.split()))
words_per_description.describe()

count    637626.000000
mean         50.278823
std          47.947418
min           3.000000
25%          23.000000
50%          35.000000
75%          59.000000
max        1553.000000
Name: description, dtype: float64

In [18]:
#another function we can use to split dataset
def train_validate_test_split(df, train_percent=.8, validate_percent=.1, seed=None):
    """Randomly partition the rows of ``df`` into train / validation / test frames.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split. Rows are selected by position, so any index works.
    train_percent : float, default 0.8
        Fraction of rows for the training split (the row count is truncated to an int).
    validate_percent : float, default 0.1
        Fraction of rows for the validation split (the row count is truncated to an int).
    seed : int or None, default None
        Seed of the shuffle. ``None`` yields a fresh, non-reproducible permutation.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, validate, test)``; the test split receives every remaining row.
    """
    m = len(df.index)
    # Use a private generator instead of np.random.seed(): reseeding the global RNG
    # (especially with seed=None) would silently undo any seeding done by the caller.
    rng = np.random.RandomState(seed)
    # Shuffle row *positions* 0..m-1 rather than index labels. The rows are picked
    # with .iloc, which is positional, so permuting labels fails (or selects the
    # wrong rows) whenever the index is not the default 0..m-1 range.
    perm = rng.permutation(m)
    train_end = int(train_percent * m)
    validate_end = int(validate_percent * m) + train_end
    train = df.iloc[perm[:train_end]]
    validate = df.iloc[perm[train_end:validate_end]]
    test = df.iloc[perm[validate_end:]]
    return train, validate, test

In [4]:
# Seed the split so that Restart & Run All reproduces the same partition. The CSV
# export cell only writes files that do not exist yet, so an unseeded split would
# no longer match the train/valid/test CSVs on disk after a kernel restart.
SPLIT_SEED = 17  # same value the training section passes to set_global_seed
# 80% train; the remaining 20% is halved into validation and test.
train, remaining = train_test_split(train_df, train_size=.8, random_state=SPLIT_SEED)
valid, test = train_test_split(remaining, test_size=0.5, random_state=SPLIT_SEED)

In [5]:
# Show the (rows, columns) of each split, then verify that together they cover
# exactly as many rows as the full table.
for split_frame in (train, valid, test):
    print(split_frame.shape)
assert sum(part.shape[0] for part in (train, valid, test)) == len(train_df)

(510100, 4)
(63763, 4)
(63763, 4)


In [7]:
# Write each split to disk once; a file that already exists is left untouched.
train_file = Path('../data/motions_laws/train.csv')
valid_file = Path('../data/motions_laws/valid.csv')
test_file = Path('../data/motions_laws/test.csv')
for split_frame, split_path in ((train, train_file), (valid, valid_file), (test, test_file)):
    if split_path.is_file():
        continue
    split_frame.to_csv(split_path, index=False)

In [22]:
SEQ_LENGTH = 45
# Constructor options that are identical for all three splits.
dataset_options = dict(max_seq_length=SEQ_LENGTH, model_name=MODEL_NAME)

# Each split uses the docket `description` as text and the `number` column as label.
train_dataset = TextClassificationDataset(
    texts=train['description'].values.tolist(),
    labels=train['number'].values.tolist(),
    **dataset_options,
)
print('length of TRAIN set:', len(train_dataset))

valid_dataset = TextClassificationDataset(
    texts=valid['description'].values.tolist(),
    labels=valid['number'].values.tolist(),
    **dataset_options,
)
print('length of VALID set:', len(valid_dataset))

test_dataset = TextClassificationDataset(
    texts=test['description'].values.tolist(),
    labels=test['number'].values.tolist(),
    **dataset_options,
)
print('length of TEST set:', len(test_dataset))

length of TRAIN set: 510100
length of VALID set: 63763
length of TEST set: 63763


In [23]:
train_dataset[17]

{'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]), 'features': tensor([  101,  5448,  1998,  2344, 15080,  1026, 13097,  1028,  2260,  1026,
         1013, 13097,  1028,  4367,  2000, 19776,  6406,  2011,  6465,  2103,
         8329,  2188,  1004, 24497,  1012,  2023,  2553,  2003,  7219,  1012,
         2772,  2011,  3648,  2520,  1039,  3389,  2006,  2184,  1013,  2570,
         1013,  2760,  1012,  1006,   102])}

In [24]:
# Batches of 16 samples; only the training loader reshuffles its data.
train_val_loaders = dict(
    train=DataLoader(train_dataset, batch_size=16, shuffle=True),
    valid=DataLoader(valid_dataset, batch_size=16, shuffle=False),
)

In [25]:
mini_batch = next(iter(train_val_loaders['train']))

In [26]:
mini_batch.keys()

dict_keys(['attention_mask', 'features'])

In [27]:
mini_batch['features'].size()

torch.Size([16, 45])

## Creating the Model

In [52]:
import torch.nn as nn
from transformers import AutoConfig, AutoModel
from model import BertForSequenceClassification

In [29]:
MODEL_NAME

'distilbert-base-uncased'

In [30]:
config = AutoConfig.from_pretrained(
    MODEL_NAME, num_labels=10
)

In [31]:
model = AutoModel.from_pretrained(MODEL_NAME, config=config)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [32]:
# Forward one mini-batch through the bare encoder (`model`, no classification head)
# to inspect the shape of what it returns.
bert_output = model(
                input_ids = mini_batch['features'],
                attention_mask=mini_batch['attention_mask']
)

In [33]:
type(bert_output), len(bert_output)

(transformers.modeling_outputs.BaseModelOutput, 1)

In [34]:
bert_output[0].size() #(batch size, sequence_length, embedding space)

torch.Size([16, 45, 768])

In [35]:
seq_output = bert_output[0]  # (bs, seq_len, dim)


In [36]:
# Mean-pool the token embeddings over the sequence axis: (bs, seq_len, dim) -> (bs, dim).
# NOTE(review): this plain mean also averages over padded positions, because the
# attention_mask is not applied here — confirm that is intended.
pooled_output = seq_output.mean(axis=1)
pooled_output.size()

torch.Size([16, 768])

In [37]:
# Linear head mapping the 768-dim pooled vector to 10 class scores; the dropout
# (p=0.3) is applied to the pooled vector before the head.
classifier = nn.Linear(768, 10)
dropout = nn.Dropout(0.3)

In [38]:
pooled_output = dropout(pooled_output)  # (bs, dim)
scores = classifier(pooled_output)  # (bs, num_classes)
scores.size()

torch.Size([16, 10])

In [53]:
clf_model = BertForSequenceClassification(pretrained_model_name=MODEL_NAME, num_classes=10)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [56]:
clf_model(features=mini_batch['features'], attention_mask=mini_batch['attention_mask'])

tensor([[ 1.6032e-01, -1.1718e-01, -1.4904e-01,  1.4852e-01,  1.8016e-01,
         -5.4257e-02,  2.8516e-01,  1.4599e-01,  7.4595e-02,  1.7470e-01],
        [ 1.0880e-01, -1.2553e-01, -3.3885e-01,  1.6453e-01,  2.6162e-01,
          7.0233e-03,  2.3627e-01,  1.8044e-01, -3.2204e-02,  1.4183e-01],
        [-7.3531e-02, -1.0780e-01, -2.9472e-01,  2.0896e-01,  1.6161e-01,
          8.3599e-02,  9.9036e-02,  1.3906e-01,  1.0407e-02,  2.6947e-01],
        [ 1.9877e-01,  1.8534e-01, -2.0612e-01,  1.9394e-01,  3.1526e-01,
         -1.6177e-02,  2.7526e-01, -3.2886e-02, -3.6064e-02, -1.8960e-01],
        [ 1.2037e-01, -1.3365e-02, -3.4518e-01,  3.5638e-01,  1.3495e-01,
          4.0916e-01, -5.6174e-02,  2.0182e-01, -1.9149e-01,  2.1941e-01],
        [-2.3680e-02, -1.0082e-01, -3.1066e-01,  9.5222e-02,  9.9221e-02,
         -2.2106e-02,  1.0827e-01,  8.0604e-02, -9.4855e-02, -1.0121e-02],
        [ 2.5828e-02, -9.2505e-02, -1.8566e-01,  1.5773e-01,  2.1482e-01,
         -1.1149e-01,  2.0896e-0

## Training

In [42]:
import torch
import yaml
import transformers
import catalyst
from catalyst.dl import SupervisedRunner
from catalyst.dl.callbacks import (
    AccuracyCallback,
    CheckpointCallback,
    InferCallback,
    OptimizerCallback,
)
from catalyst.utils import prepare_cudnn, set_global_seed

In [43]:
torch.__version__

'1.10.2+cpu'

In [44]:
transformers.__version__

'4.16.2'

In [45]:
catalyst.__version__

'20.08.2'

In [46]:
# specify criterion for the multi-class classification task, optimizer and scheduler
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(
    model.parameters(), lr=3e-5
)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer)

In [47]:
set_global_seed(17)
prepare_cudnn(deterministic=True)

In [58]:
SEQ_LENGTH = 45
# Target column — the same one the first dataset cell passes as `labels=`.
# Without labels the batches carry only 'features' and 'attention_mask', and the
# recorded training run stops with KeyError: 'targets'.
# NOTE(review): `number` has 999 distinct values plus missing entries, while the
# model and AccuracyCallback are configured for 10 classes. Map the targets onto
# 10 classes (or raise num_classes) before expecting training to succeed.
LABEL_COLUMN = 'number'


def build_text_dataset(frame, label_column=LABEL_COLUMN):
    """Wrap one split of the docket table in a TextClassificationDataset.

    Parameters
    ----------
    frame : pandas.DataFrame
        Split providing the 'description' text column and the label column.
    label_column : str
        Name of the column handed to the dataset as ``labels``.

    Returns
    -------
    TextClassificationDataset
        Dataset built with the notebook-wide SEQ_LENGTH and MODEL_NAME.
        Presumably it emits a 'targets' entry once labels are supplied —
        TODO confirm against data.py.
    """
    return TextClassificationDataset(
        texts=frame['description'].values.tolist(),
        labels=frame[label_column].values.tolist(),
        max_seq_length=SEQ_LENGTH,
        model_name=MODEL_NAME,
    )


train_dataset = build_text_dataset(train)
print('length of TRAIN set:', len(train_dataset))

valid_dataset = build_text_dataset(valid)
print('length of VALID set:', len(valid_dataset))

test_dataset = build_text_dataset(test)
print('length of TEST set:', len(test_dataset))

length of TRAIN set: 510100
length of VALID set: 63763
length of TEST set: 63763


In [59]:
# Recreate the loaders so they wrap the datasets built in the previous cell.
train_loader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)
valid_loader = DataLoader(dataset=valid_dataset, batch_size=16, shuffle=False)
train_val_loaders = {"train": train_loader, "valid": valid_loader}

In [60]:
# Evaluation loader: keep the dataset order (shuffle=False, like the "valid" loader)
# so that row i of any collected predictions corresponds to row i of `test`.
test_loaders = {
    "test": DataLoader(
        dataset=test_dataset,
        batch_size=16,
        shuffle=False,
    ),
}

In [61]:
# Batch entries handed to the model as inputs. The batches must also carry a
# 'targets' entry: the recorded run of this cell stops with KeyError: 'targets'.
runner = SupervisedRunner(input_key=("features", "attention_mask"))

training_callbacks = [
    AccuracyCallback(num_classes=10),
    OptimizerCallback(accumulation_steps=4),
]

# finally, training the model with Catalyst
runner.train(
    loaders=train_val_loaders,
    model=clf_model,
    criterion=criterion,
    optimizer=optimizer,
    scheduler=scheduler,
    callbacks=training_callbacks,
    num_epochs=1,
    logdir='../logdir/',
    verbose=True,
)

1/1 * Epoch (train):   0% 0/31882 [00:00<?, ?it/s]

KeyError: 'targets'