In [23]:
import logging
from pathlib import Path
from typing import List, Mapping, Tuple

import pandas as pd
import torch as nn
import numpy as np
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoConfig, AutoModel
from catalyst.utils import set_global_seed
from data import TextClassificationDataset

In [2]:
# Name of the pretrained checkpoint; passed to AutoTokenizer.from_pretrained below.
MODEL_NAME = 'distilbert-base-uncased'

In [3]:
# Instantiate the tokenizer that belongs to the MODEL_NAME checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [4]:
# Display the tokenizer's repr (vocabulary size, max length, special tokens).
tokenizer

PreTrainedTokenizerFast(name_or_path='distilbert-base-uncased', vocab_size=30522, model_max_len=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [5]:
# Example sentence used to demonstrate tokenization in the following cells.
input_text = 'MOTION for Protective Order Re HIPAA by Defendants Marvin Powers'

In [6]:
# Tokenize the sample text into a fixed-length, batched PyTorch encoding.
# The tokenizer is called directly: `encode_plus` is deprecated in favour of
# `__call__`, which accepts the same arguments.
output_dict = tokenizer(
    input_text,
    add_special_tokens=True,     # wrap the sequence in [CLS] ... [SEP]
    padding="max_length",        # pad with [PAD] up to max_length
    max_length=16,
    return_tensors="pt",         # return torch tensors instead of Python lists
    truncation=True,             # cut inputs longer than max_length WordPiece tokens
    return_attention_mask=True,  # also return the 1/0 real-token vs. padding mask
)

In [7]:
# input_ids: the text encoded as vocabulary ids, one id per WordPiece token.
# attention_mask: 1 marks a real token (including [CLS]/[SEP]), 0 marks padding.
output_dict

{'input_ids': tensor([[  101,  4367,  2005,  9474,  2344,  2128,  5099, 11057,  2011, 16362,
         13748,  4204,   102,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]])}

In [8]:
# Token -> id mapping of the tokenizer's vocabulary.
voc = tokenizer.get_vocab()

In [9]:
# Look up the id of the token 'yo'; .get returns None if it is not in the vocabulary.
voc.get('yo')

10930

In [10]:
# Number of entries in the vocabulary.
len(voc)

30522

In [11]:
# Reverse lookup table: vocabulary id -> token string.
inv_vocab = dict(zip(voc.values(), voc.keys()))

In [12]:
# Translate every id of the first (and only) encoded sequence back to its token.
list(map(inv_vocab.__getitem__, output_dict['input_ids'][0].tolist()))

['[CLS]',
 'motion',
 'for',
 'protective',
 'order',
 're',
 'hip',
 '##aa',
 'by',
 'defendants',
 'marvin',
 'powers',
 '[SEP]',
 '[PAD]',
 '[PAD]',
 '[PAD]']

## Dataset statistics
Kaggle notebook example:
https://www.kaggle.com/kashnitsky/distillbert-catalyst-amazon-product-reviews

In [13]:
# Load the docket-entries CSV and preview the first rows.
# NOTE: despite its name, `train_df` holds the full dataset; it is split into
# train/valid/test further down.
train_df = pd.read_csv('../data/motions_laws/nwu_docket_entries.csv')
train_df.head()

Unnamed: 0,legacyid,date,number,description
0,IL-CDCT1:15CV01001,03/10/2015,4.0,NOTICE of Appearance of Attorney by Joseph S T...
1,IL-CDCT1:15CV01001,04/23/2015,9.0,NOTICE of Appearance of Attorney by Christophe...
2,IL-CDCT1:15CV01001,01/04/2016,11.0,Joint MOTION for Extension of Time to Complete...
3,IL-CDCT1:15CV01001,01/11/2016,,TEXT ORDER granting <gil>11</gil> Motion for...
4,IL-CDCT1:15CV01001,04/22/2016,,ORAL MOTION by Attorney Busey for continuance ...


In [35]:
# NOTE: `df` is a DataFrameGroupBy (one group per `legacyid`), not a DataFrame.
# GroupBy.head() returns the first 5 rows of *every* group, so the result
# spans the whole dataset rather than being a 5-row preview.
df = train_df.groupby('legacyid')
df.head()

Unnamed: 0,legacyid,date,number,description
0,IL-CDCT1:15CV01001,03/10/2015,4.0,NOTICE of Appearance of Attorney by Joseph S T...
1,IL-CDCT1:15CV01001,04/23/2015,9.0,NOTICE of Appearance of Attorney by Christophe...
2,IL-CDCT1:15CV01001,01/04/2016,11.0,Joint MOTION for Extension of Time to Complete...
3,IL-CDCT1:15CV01001,01/11/2016,,TEXT ORDER granting <gil>11</gil> Motion for...
4,IL-CDCT1:15CV01001,04/22/2016,,ORAL MOTION by Attorney Busey for continuance ...
...,...,...,...,...
637599,WI-WDCT3:19CV01066,01/08/2020,3.0,Motion to Admit Bradley Bodiford Pro Hac Vice....
637600,WI-WDCT3:19CV01066,01/08/2020,4.0,** TEXT ONLY ORDER ** ORDER granting <gil>3</...
637601,WI-WDCT3:19CV01066,01/28/2020,7.0,Motion for Extension of Time by Plaintiff Step...
637602,WI-WDCT3:19CV01066,01/31/2020,8.0,ORDER denying as premature <gil>7</gil> Moti...


In [14]:
# Number of rows per `legacyid` value, then the count for one example id.
series = train_df['legacyid'].value_counts()
series['IL-CDCT1:15CV01001']

23

In [15]:
# Count of distinct non-null values in the `number` column.
train_df['number'].nunique()

999

In [16]:
# Summary statistics of the whitespace-separated word count per description.
# The vectorised `.str` accessor replaces the per-row lambda and yields NaN
# for a missing description instead of raising AttributeError.
train_df['description'].str.split().str.len().describe()

count    637626.000000
mean         50.278823
std          47.947418
min           3.000000
25%          23.000000
50%          35.000000
75%          59.000000
max        1553.000000
Name: description, dtype: float64

In [17]:
#sklearn -> train_test_split 
from sklearn.model_selection import train_test_split

In [21]:
#another function we can use to split dataset
def train_validate_test_split(df, train_percent=.8, validate_percent=.1, seed=None):
    """Randomly split ``df`` into train, validation and test frames.

    The rows are shuffled once; the first ``train_percent`` share goes to the
    train frame, the next ``validate_percent`` share to the validation frame,
    and whatever is left to the test frame.

    Args:
        df: DataFrame to split. Any index is supported (it need not be a
            default 0..n-1 range), because rows are selected by position.
        train_percent: Fraction of rows assigned to the train frame.
        validate_percent: Fraction of rows assigned to the validation frame.
        seed: Seed for the shuffle; ``None`` gives a different split per call.

    Returns:
        Tuple ``(train, validate, test)`` of DataFrames that keep the
        original index labels of their rows.
    """
    m = len(df.index)
    # A local generator keeps the shuffle reproducible without reseeding the
    # global NumPy RNG as a side effect.
    rng = np.random.RandomState(seed)
    # Permute row *positions* (not index labels) so that the positional
    # `iloc` lookups below are valid for every kind of index.
    perm = rng.permutation(m)
    train_end = int(train_percent * m)
    validate_end = int(validate_percent * m) + train_end
    train = df.iloc[perm[:train_end]]
    validate = df.iloc[perm[train_end:validate_end]]
    test = df.iloc[perm[validate_end:]]
    return train, validate, test

In [41]:
# Split 80/10/10 into train/valid/test. A fixed random_state makes the split
# reproducible across kernel restarts, so the in-memory frames stay consistent
# with split files that an earlier run already wrote to disk.
SPLIT_SEED = 42
train, remaining = train_test_split(train_df, train_size=.8, random_state=SPLIT_SEED)
valid, test = train_test_split(remaining, test_size=0.5, random_state=SPLIT_SEED)

In [48]:
# Print the shape of each split (one per line) and check that the split sizes
# add up to the size of the full dataset.
print(train.shape, valid.shape, test.shape, sep='\n')
assert len(train) + len(valid) + len(test) == len(train_df)

(510100, 4)
(63763, 4)
(63763, 4)


In [50]:
# Persist each split to CSV once; an existing file is left untouched so that
# re-running the cell never overwrites a previously saved split.
train_file = Path('../data/motions_laws/train.csv')
valid_file = Path('../data/motions_laws/valid.csv')
test_file = Path('../data/motions_laws/test.csv')
# Each split is guarded by its own target file, so a split is skipped only
# when that split's CSV is already present.
for split_file, split_frame in ((train_file, train), (valid_file, valid), (test_file, test)):
    if not split_file.is_file():
        split_frame.to_csv(split_file, index=False)