In [1]:
import logging
from pathlib import Path
from typing import List, Mapping, Tuple

import pandas as pd
import torch as nn
import numpy as np
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoConfig, AutoModel
from catalyst.utils import set_global_seed
from data import TextClassificationDataset
#sklearn -> train_test_split 
from sklearn.model_selection import train_test_split

In [3]:
# Hugging Face checkpoint identifier reused by the tokenizer, config and model cells below.
MODEL_NAME = 'distilbert-base-uncased'

In [4]:
# Load the tokenizer published under MODEL_NAME.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [5]:
# Show the tokenizer's repr (vocabulary size, maximum length, special tokens).
tokenizer

PreTrainedTokenizerFast(name_or_path='distilbert-base-uncased', vocab_size=30522, model_max_len=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [6]:
# Sample text used to demonstrate tokenization in the cells below.
input_text = 'MOTION for Protective Order Re HIPAA by Defendants Marvin Powers'

In [7]:
# Encode the sample as a fixed-length sequence of 16 wordpiece ids: shorter
# inputs are padded with [PAD], longer ones are truncated, and the special
# [CLS]/[SEP] tokens are added around the text.
output_dict = tokenizer.encode_plus(
    input_text,
    max_length=16,
    padding="max_length",
    truncation=True,
    add_special_tokens=True,
    return_attention_mask=True,
    return_tensors="pt",  # return PyTorch tensors
)

In [8]:
# input_ids: the wordpiece tokens encoded as vocabulary ids.
# attention_mask: 1 marks a real token, 0 marks padding.
output_dict

{'input_ids': tensor([[  101,  4367,  2005,  9474,  2344,  2128,  5099, 11057,  2011, 16362,
         13748,  4204,   102,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]])}

In [9]:
# Token -> id mapping of the tokenizer's vocabulary.
voc = tokenizer.get_vocab()

In [10]:
# Look up the id of a single token.
voc.get('yo')

10930

In [11]:
# Number of entries in the vocabulary.
len(voc)

30522

In [12]:
# Reverse lookup: vocabulary id -> token string.
inv_vocab = {token_id: token for token, token_id in voc.items()}

In [13]:
# Map the encoded ids of the (single) sample back to their wordpiece tokens.
[inv_vocab[token_id] for token_id in output_dict['input_ids'][0].tolist()]

['[CLS]',
 'motion',
 'for',
 'protective',
 'order',
 're',
 'hip',
 '##aa',
 'by',
 'defendants',
 'marvin',
 'powers',
 '[SEP]',
 '[PAD]',
 '[PAD]',
 '[PAD]']

## Dataset Statistics and Data Preparation
Kaggle notebook example:
https://www.kaggle.com/kashnitsky/distillbert-catalyst-amazon-product-reviews

In [2]:
# Load the labelled motions dataset and preview its first rows.
train_df = pd.read_csv('../data/motions_laws/balanced_set.csv')
train_df.head()

Unnamed: 0,legacyid,date,number,description,target
0,IL-CDCT1:15CV01009,05/26/2015,22.0,motion to dismiss for failure to state a claim...,Motion to Dismiss
1,IL-CDCT1:15CV01013,02/27/2015,12.0,motion to dismiss the counterclaim and rule 12...,Motion to Dismiss
2,IL-CDCT1:15CV01023,01/30/2015,14.0,motion to dismiss by defendants geronimo wind ...,Motion to Dismiss
3,IL-CDCT1:15CV01023,01/30/2015,23.0,motion to dismiss by defendant general service...,Motion to Dismiss
4,IL-CDCT1:15CV01027,03/19/2015,8.0,motion to dismiss by defendants city of fairbu...,Motion to Dismiss


In [None]:
# series = train_df['legacyid'].value_counts()
# series['IL-CDCT1:15CV01001']

In [None]:
# len(train_df['number'].value_counts())

In [3]:
# Summary statistics of the whitespace-separated word count per description.
train_df['description'].map(lambda text: len(text.split())).describe()

count    3030.000000
mean       32.359076
std        25.795596
min         8.000000
25%        20.000000
50%        26.000000
75%        36.000000
max       377.000000
Name: description, dtype: float64

In [4]:
# Number of distinct motion classes: value_counts() has one entry per class.
different_motions = train_df['target'].value_counts()
print(different_motions.size)

3


In [5]:
# Number of samples per target class.
different_motions

Motion to Dismiss               1151
Other                           1062
Motion for Summary Judgement     817
Name: target, dtype: int64

In [6]:
# Hold out 20% of the rows, then halve that hold-out into validation and test
# sets (80/10/10 overall). A fixed random_state makes the split reproducible
# across kernel restarts, so the frames in memory stay consistent with the
# CSV files that are written only once further below. The value 17 matches
# the seed passed to set_global_seed() later in the notebook.
train, remaining = train_test_split(train_df, train_size=.8, random_state=17)
valid, test = train_test_split(remaining, test_size=0.5, random_state=17)

In [7]:
# Report the shape of each split and verify that no row was lost or duplicated.
for split in (train, valid, test):
    print(split.shape)
assert sum(len(split) for split in (train, valid, test)) == len(train_df)

(2424, 5)
(303, 5)
(303, 5)


In [8]:
# Persist each split to CSV, but only when its file is not already on disk
# (existing files are never overwritten).
train_file = Path('../data/motions_laws/train.csv')
valid_file = Path('../data/motions_laws/valid.csv')
test_file = Path('../data/motions_laws/test.csv')
for split_frame, split_file in ((train, train_file), (valid, valid_file), (test, test_file)):
    if split_file.is_file():
        continue
    split_frame.to_csv(split_file, index=False)

In [9]:
# Maximum number of tokens kept per description.
SEQ_LENGTH = 32

# Wrap each split in the project's TextClassificationDataset.
# Requires MODEL_NAME from the configuration cell at the top of the notebook.
train_dataset = TextClassificationDataset(
    model_name=MODEL_NAME,
    max_seq_length=SEQ_LENGTH,
    texts=train['description'].tolist(),
    labels=train['target'].tolist(),
)
print('length of TRAIN set:', len(train_dataset))

valid_dataset = TextClassificationDataset(
    model_name=MODEL_NAME,
    max_seq_length=SEQ_LENGTH,
    texts=valid['description'].tolist(),
    labels=valid['target'].tolist(),
)
print('length of VALID set:', len(valid_dataset))

test_dataset = TextClassificationDataset(
    model_name=MODEL_NAME,
    max_seq_length=SEQ_LENGTH,
    texts=test['description'].tolist(),
    labels=test['target'].tolist(),
)
print('length of TEST set:', len(test_dataset))

NameError: name 'MODEL_NAME' is not defined

In [12]:
# Mapping from label string to integer class index exposed by the dataset.
# NOTE(review): the saved output lists 29 labels while the target column of
# the current data has 3 classes — the output looks stale; re-run to refresh.
train_dataset.label_dict

{'Motion Requesting Early Termination of Probation': 0,
 'Motion for Attorney Representation': 1,
 'Motion for Default Judgment': 2,
 'Motion for Extension': 3,
 'Motion for Judgment (as a Matter of Law)': 4,
 'Motion for Leave': 5,
 'Motion for Order': 6,
 'Motion for Removal in Custody': 7,
 'Motion for Summary Judgment': 8,
 'Motion for Voluntary Dismissal': 9,
 'Motion for a Protective Order': 10,
 'Motion in Limine': 11,
 'Motion to Amend': 12,
 'Motion to Compel': 13,
 'Motion to Continue': 14,
 'Motion to Dismiss': 15,
 'Motion to Exclude': 16,
 'Motion to File Under Seal': 17,
 'Motion to Modify Conditions for Release': 18,
 'Motion to Quash': 19,
 'Motion to Strike': 20,
 'Motion to Substitute': 21,
 'Motion to Suppress': 22,
 'Motion to Transfer a Case': 23,
 'Motion to Transfer a Prisoner': 24,
 'Motion to Unseal': 25,
 'Motion to Withdraw': 26,
 'No Motion Here': 27,
 'Other Motion Not Listed Here': 28}

In [23]:
# Raw row behind train_dataset[17]. The dataset is built from the `train`
# split, so index into `train` (not the unsplit `train_df`) to look at the
# same sample.
train.iloc[17].values

array(["MOTION by Plaintiff Victor Santana to amend/correct other 70 Plaintiff's Motion to Amend Answer to Defendants Thomas A. Jaconetty, Joseph Berrios and Cook County Board of Review's First Requests for Admission (Attachments: # 1 Exhibit Plaintiff's Answers to Defendant's First Request to Admit)(de Silva, R.) (Entered: 08/06/2010)",
       'Motion to File Under Seal'], dtype=object)

In [21]:
# Fetch one encoded sample: a dict with 'attention_mask', 'features' (token ids)
# and 'targets' (class index) tensors.
train_dataset[17]

{'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1]]), 'features': tensor([  101,  2344,  2004,  2000, 17183, 16843, 13109,  6633,  6562,  1024,
        13474,  3544,  1999,  3433,  2000,  6545,  2006,  2142,  2163,  1005,
         2353,  4367,  2005,  1037,  3627,  2000,  2265,  3426,  9800,  1012,
        13474,   102]), 'targets': tensor(27)}

In [24]:
# Batches of 16 samples; only the training loader shuffles its data.
train_val_loaders = dict(
    train=DataLoader(train_dataset, batch_size=16, shuffle=True),
    valid=DataLoader(valid_dataset, batch_size=16, shuffle=False),
)

In [25]:
# Pull a single batch from the training loader to inspect its structure.
mini_batch = next(iter(train_val_loaders['train']))

In [26]:
# Entries provided by each batch.
mini_batch.keys()

dict_keys(['attention_mask', 'features', 'targets'])

In [27]:
# Shape of the token-id tensor: (batch_size, SEQ_LENGTH).
mini_batch['features'].size()

torch.Size([16, 32])

## Creating the Model

In [28]:
import torch.nn as nn
from transformers import AutoConfig, AutoModel
from model import BertForSequenceClassification

In [None]:
# Display the checkpoint name currently in use.
MODEL_NAME

In [36]:
# Model configuration for the pretrained checkpoint. num_labels is taken from
# the number of distinct targets in the dataset instead of a hard-coded 10.
config = AutoConfig.from_pretrained(
    MODEL_NAME, num_labels=len(different_motions)
)

In [37]:
# Load the pretrained encoder (no classification head) with the config above;
# it is used in the following cells to inspect the raw model outputs.
model = AutoModel.from_pretrained(MODEL_NAME, config=config)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [38]:
# Run the sample batch through the encoder.
bert_output = model(
    input_ids=mini_batch['features'],
    attention_mask=mini_batch['attention_mask'],
)

In [None]:
# Inspect the container type and the number of elements returned by the model.
type(bert_output), len(bert_output)

In [None]:
bert_output[0].size()  # (batch_size, sequence_length, embedding_dim)

In [None]:
# First element of the model output: one embedding vector per token position.
seq_output = bert_output[0]  # (bs, seq_len, dim)


In [None]:
# Mean-pool over the sequence axis to get one vector per example: (bs, dim).
pooled_output = seq_output.mean(dim=1)
pooled_output.shape

In [None]:
# Classification head for the pooled encoder output. Both sizes are derived
# rather than hard-coded: the input width comes from pooled_output and the
# output width from the number of target classes in the dataset.
classifier = nn.Linear(pooled_output.size(-1), len(different_motions))
dropout = nn.Dropout(0.3)

In [None]:
# Apply dropout, then project to class scores. The dropout result is not
# written back to pooled_output, so re-running this cell does not apply
# dropout a second time.
scores = classifier(dropout(pooled_output))  # (bs, dim) -> (bs, num_classes)
scores.size()

In [29]:
# Project classifier built on the pretrained checkpoint, with one output per
# distinct target class.
clf_model = BertForSequenceClassification(
    pretrained_model_name=MODEL_NAME,
    num_classes=len(different_motions),
)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [30]:
# Forward the sample batch through the classifier to get class scores per example.
# NOTE(review): the saved output has 29 scores per row although clf_model is
# built with len(different_motions) classes (3 in the current data) — the
# output looks stale; re-run to refresh.
clf_model(features=mini_batch['features'], attention_mask=mini_batch['attention_mask'])

tensor([[-2.2639e-02,  1.2949e-01, -3.9254e-01, -1.6854e-02, -5.3484e-03,
         -8.1669e-02, -1.1446e-01,  2.6293e-01, -1.2415e-01,  2.4378e-02,
          1.0092e-01, -4.9385e-01,  1.4566e-01,  4.8445e-01, -1.6468e-01,
         -2.8870e-01,  3.9481e-01, -7.4054e-04, -4.0464e-01, -9.8088e-02,
          1.5933e-01,  1.6350e-01, -2.1154e-01,  2.2107e-01,  1.9878e-01,
          9.8776e-02,  5.6608e-02, -6.8970e-02, -5.9388e-01],
        [-1.2809e-01, -2.0702e-01,  2.8406e-02,  4.0913e-02,  2.1825e-02,
         -1.3228e-01, -3.6522e-02,  1.7669e-01, -1.0154e-01,  9.4971e-03,
          1.2146e-02, -8.9340e-02,  1.1837e-02,  2.7034e-01,  1.6256e-01,
         -2.6777e-01,  2.0420e-01,  9.1835e-02, -4.4958e-01,  1.1288e-01,
          3.9209e-02,  3.6933e-01, -3.6268e-01,  2.4374e-01,  4.9684e-01,
         -4.6326e-02, -3.7255e-02, -7.6579e-02, -3.4536e-01],
        [-4.2630e-02,  1.1766e-01,  2.9289e-02,  6.5125e-02, -1.0120e-02,
         -6.1665e-02, -1.9023e-02,  2.0471e-01, -2.9025e-01, -

## Training

In [31]:
import torch
import yaml
import transformers
import catalyst
from catalyst.dl import SupervisedRunner
from catalyst.dl.callbacks import (
    AccuracyCallback,
    CheckpointCallback,
    InferCallback,
    OptimizerCallback,
)
from catalyst.utils import prepare_cudnn, set_global_seed

In [32]:
# Record the installed PyTorch version.
torch.__version__

'1.10.2+cpu'

In [33]:
# Record the installed transformers version.
transformers.__version__

'4.16.2'

In [34]:
# Record the installed Catalyst version.
catalyst.__version__

'20.08.2'

In [39]:
# Loss, optimizer and learning-rate scheduler for the multi-class
# classification task.
criterion = torch.nn.CrossEntropyLoss()
# Optimise the parameters of clf_model — the network that is passed to
# runner.train() — rather than the bare `model` encoder, which is a separate
# instance that training never touches.
optimizer = torch.optim.Adam(
    clf_model.parameters(), lr=3e-5
)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer)

In [40]:
# Fix the global random seed and request deterministic cuDNN behaviour.
# NOTE(review): this runs after train_test_split and after the models are
# created earlier in the notebook, so it does not make those steps
# reproducible — consider moving it to the top.
set_global_seed(17)
prepare_cudnn(deterministic=True)

In [41]:
# Maximum number of tokens kept per description.
SEQ_LENGTH = 32

# Same construction as in the data-preparation section above. The test split
# receives its labels too, so all three datasets are built identically and the
# held-out set can be scored against its targets.
train_dataset = TextClassificationDataset(
    texts=train['description'].values.tolist(),
    labels=train['target'].values.tolist(),
    max_seq_length=SEQ_LENGTH,
    model_name=MODEL_NAME,
)
print('length of TRAIN set:', len(train_dataset))

valid_dataset = TextClassificationDataset(
    texts=valid['description'].values.tolist(),
    labels=valid['target'].values.tolist(),
    max_seq_length=SEQ_LENGTH,
    model_name=MODEL_NAME,
)
print('length of VALID set:', len(valid_dataset))

test_dataset = TextClassificationDataset(
    texts=test['description'].values.tolist(),
    labels=test['target'].values.tolist(),
    max_seq_length=SEQ_LENGTH,
    model_name=MODEL_NAME,
)
print('length of TEST set:', len(test_dataset))

length of TRAIN set: 2846
length of VALID set: 356
length of TEST set: 356


In [42]:
# One loader per split, 16 samples per batch; only "train" is shuffled.
train_val_loaders = {
    split_name: DataLoader(
        dataset=split_dataset,
        batch_size=16,
        shuffle=(split_name == "train"),
    )
    for split_name, split_dataset in (("train", train_dataset), ("valid", valid_dataset))
}

In [43]:
# Loader for the held-out test set. It is not shuffled, so the order of the
# predictions lines up with the rows of `test`.
test_loaders = {
    "test": DataLoader(
        dataset=test_dataset,
        batch_size=16,
        shuffle=False,
    ),
}

In [44]:
# input_key names the batch entries handed to the model; they match the
# keyword arguments clf_model is called with.
runner = SupervisedRunner(input_key=("features", "attention_mask"))

# Fine-tune the classifier with Catalyst.
# NOTE(review): the saved RuntimeError below ("view size is not compatible ...")
# comes from an earlier run; presumably it is raised by a `.view` call inside
# the pinned Catalyst release when used with this PyTorch version — TODO confirm
# after re-running with the corrected class count.
runner.train(
    model=clf_model,
    criterion=criterion,
    optimizer=optimizer,
    scheduler=scheduler,
    loaders=train_val_loaders,
    callbacks=[
        # The class count must agree with the classifier head, which is built
        # with len(different_motions) outputs (it was hard-coded to 10).
        AccuracyCallback(num_classes=len(different_motions)),
        # Gradient accumulation over 4 batches.
        OptimizerCallback(accumulation_steps=4),
    ],
    logdir='../logdir/',
    num_epochs=1,
    verbose=True,
)


1/1 * Epoch (train):   0% 0/178 [00:00<?, ?it/s]

RuntimeError: view size is not compatible with input tensor's size and stride (at least one dimension spans across two contiguous subspaces). Use .reshape(...) instead.