In [1]:
import random
import re

import pandas as pd
import torch
from torchtext import data

In [2]:
# Build the two-column CSV (text, label) that the torchtext loader reads later.

# Load the raw complaints and keep only rows that have a category label.
# .copy() detaches the filtered frame so the column assignments below operate
# on an independent frame and cannot emit SettingWithCopyWarning.
df = pd.read_csv('training_with_text.csv')
df = df.loc[df.category.notna()].copy()

# Normalise the free text: lower-case it, turn newlines into spaces and drop
# apostrophes. The patterns are literal characters with regex=False so the
# result is the same on every pandas version (the default of `regex` changed
# from True to False in pandas 2.0, which would otherwise change what an
# escaped pattern such as '\\n' matches).
# NOTE(review): this assumes the text holds real newline / apostrophe
# characters rather than literal backslash escapes - confirm against the data.
df['text_content'] = (
    df.text_content
    .str.lower()
    .str.replace('\n', ' ', regex=False)
    .str.replace("'", '', regex=False)
)

# Lower-case the labels so categories differing only by case are merged.
df['category'] = df.category.str.lower()

# Column order matters: the loader maps the first column to the text field
# and the second to the label field.
df = df[['text_content', 'category']]
df.to_csv('plain_text.csv', index=False)

In [3]:
# Configuration for this stage.
SEED = 1234              # fixes the train/validation split and the random embedding init
MAX_VOCAB_SIZE = 25000   # cap on the number of tokens kept in the text vocabulary

# Seed both generators used below so a fresh "Restart & Run All" reproduces
# the same split and the same vocabulary vectors.
random.seed(SEED)
torch.manual_seed(SEED)

# Field definitions: text is tokenized with spaCy, labels are categorical.
TEXT = data.Field(tokenize = 'spacy')
LABEL = data.LabelField()

# Read the cleaned CSV; columns are mapped positionally to (text, labels) and
# the header row is skipped.
complaints = data.TabularDataset(
    path='plain_text.csv', format='csv',
    fields=[('text', TEXT),
            ('labels', LABEL)],
    skip_header=True)

# 70% train / 30% validation. Passing the seeded RNG state makes the shuffle
# that precedes the split deterministic.
train, val = complaints.split(split_ratio=0.7,
                              random_state=random.getstate())

# Vocabularies are built from the training portion only, so validation tokens
# and labels do not influence them. Tokens without a pretrained GloVe vector
# are initialised with torch.Tensor.normal_.
TEXT.build_vocab(train,
                 max_size = MAX_VOCAB_SIZE,
                 vectors = "glove.6B.100d",
                 unk_init = torch.Tensor.normal_)
LABEL.build_vocab(train)

In [4]:
# Sanity check: show the first 100 entries of the text vocabulary's
# index-to-token list.
print(TEXT.vocab.itos[:100])

['<unk>', '<pad>', '.', 'the', ',', 'of', ':', 'to', '?', ' ', 'and', '-', 'a', 'that', 'in', 'i', ')', 'or', 'no', 'on', '(', 'you', '1', 'was', '~', '#', 'not', 'police', 'he', 'be', 'at', 'for', 'officer', 'with', 'by', '/', 'this', 'date', '3', 'as', 'is', '2', '"', 'his', 'unit', ';', 'accused', 'report', 'from', 'time', '_', 'department', 'of?cer', 'an', 'if', '5', 'chicago', 'him', '..', 'name', 'did', 'investigation', 'subject', '0', 'any', 'it', 'your', 'have', 'star', 'will', 'her', 'complainant', 'stated', 'are', 'complaint', 'member', 'were', '4', 'incident', 'allegation', 'she', 'officers', 'statement', 'other', 'attachment', 'page', 'standards', 'my', 'against', 'professional', 'allegations', 'had', 'counsel', '!', 'sustained', 'duty', '[', 'office', 'which', 'charges']


In [6]:
# Sanity check: show the first 10 entries of the label vocabulary
# (the lower-cased complaint categories).
print(LABEL.vocab.itos[:10])

['use of force', 'operation/personnel violations', 'illegal search', 'false arrest', 'verbal abuse', 'domestic', 'criminal misconduct', 'lockup procedures', 'conduct unbecoming (off-duty)', 'traffic']


In [5]:
BATCH_SIZE = 64  # number of examples per batch for both iterators

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Batch iterators over the train / validation splits.
# BucketIterator groups examples of similar length, which needs a sort key.
# The dataset built from the CSV does not define one, so it is supplied here
# (length of the tokenized `text` field); without it the iterator has nothing
# to compare examples by when batches are produced.
train_iterator, valid_iterator = data.BucketIterator.splits(
    (train, val),
    batch_size = BATCH_SIZE,
    sort_key = lambda example: len(example.text),
    device = device)