In [2]:
import logging
from pathlib import Path
from typing import List, Mapping, Tuple

import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer
from catalyst.utils import set_global_seed
from data import TextClassificationDataset

In [4]:
# Name of the pretrained checkpoint; passed to AutoTokenizer.from_pretrained.
MODEL_NAME = 'distilbert-base-uncased'

In [5]:
# Load the tokenizer that matches MODEL_NAME (its files are fetched on first use).
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

In [6]:
# Show the tokenizer's repr: vocab size, max length, padding/truncation side, special tokens.
tokenizer

PreTrainedTokenizerFast(name_or_path='distilbert-base-uncased', vocab_size=30522, model_max_len=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [34]:
# Sample text used to demonstrate tokenization in the following cells.
input_text = 'MOTION for Protective Order Re HIPAA by Defendants Marvin Powers'

In [35]:
# Encode a single text into fixed-length model inputs.
# The tokenizer is called directly: `encode_plus` is deprecated in favour of
# `tokenizer(...)`, which accepts the same arguments and returns the same fields.
output_dict = tokenizer(
    input_text,
    add_special_tokens=True,  # wrap the sequence in [CLS] ... [SEP]
    padding="max_length",  # pad with [PAD] up to max_length tokens
    max_length=16,
    return_tensors="pt",  # return PyTorch tensors
    truncation=True,  # cut longer inputs to max_length (counted in WordPiece tokens, not words)
    return_attention_mask=True,  # 1 = real token, 0 = padding
)

In [36]:
# input_ids: the text encoded as vocabulary ids, one id per token.
# attention_mask: 1 marks a real token, 0 marks padding.
output_dict

{'input_ids': tensor([[  101,  4367,  2005,  9474,  2344,  2128,  5099, 11057,  2011, 16362,
         13748,  4204,   102,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]])}

In [26]:
# Vocabulary as a token -> id mapping.
voc = tokenizer.get_vocab()

In [27]:
# Number of entries in the vocabulary mapping.
len(voc)

30522

In [29]:
# Reverse lookup table (id -> token), built by pairing each id with its token.
inv_vocab = dict(zip(voc.values(), voc.keys()))

In [37]:
# Translate each id of the first (and only) encoded sequence back into its token.
[inv_vocab[token_id] for token_id in output_dict['input_ids'][0].tolist()]

['[CLS]',
 'motion',
 'for',
 'protective',
 'order',
 're',
 'hip',
 '##aa',
 'by',
 'defendants',
 'marvin',
 'powers',
 '[SEP]',
 '[PAD]',
 '[PAD]',
 '[PAD]']

## Dataset statistics
Reference Kaggle notebook example:
https://www.kaggle.com/kashnitsky/distillbert-catalyst-amazon-product-reviews

In [44]:
# The file's first column has no header and holds 0, 1, 2, ... (it looks like a
# saved row index), so read it as the index rather than as a stray
# 'Unnamed: 0' data column.
train_df = pd.read_csv('../data/motions_laws/nwu_docket_entries.csv', index_col=0)
train_df.head()

Unnamed: 0,legacyid,date,number,description
0,IL-CDCT1:15CV01001,03/10/2015,4.0,NOTICE of Appearance of Attorney by Joseph S T...
1,IL-CDCT1:15CV01001,04/23/2015,9.0,NOTICE of Appearance of Attorney by Christophe...
2,IL-CDCT1:15CV01001,01/04/2016,11.0,Joint MOTION for Extension of Time to Complete...
3,IL-CDCT1:15CV01001,01/11/2016,,TEXT ORDER granting <gil>11</gil> Motion for...
4,IL-CDCT1:15CV01001,04/22/2016,,ORAL MOTION by Attorney Busey for continuance ...


In [60]:
# Number of rows for each distinct `legacyid`, then the count for one example id.
series = train_df.legacyid.value_counts()
series.loc['IL-CDCT1:15CV01001']

23

In [47]:
# How many distinct non-null values the `number` column takes.
train_df['number'].nunique()

999

In [45]:
# Summary statistics of description length in whitespace-separated words.
# The vectorised .str accessor yields NaN for missing descriptions (which
# describe() then ignores) instead of raising like a Python-level str.split would.
train_df['description'].str.split().str.len().describe()

count    637626.000000
mean         50.278823
std          47.947418
min           3.000000
25%          23.000000
50%          35.000000
75%          59.000000
max        1553.000000
Name: description, dtype: float64