In [1]:
import joblib
import os

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

from src.utils import encode_text

In [11]:
import torch
torch.__version__

'1.8.0'

## Load Data, Encode Text

In [2]:
full_df = pd.read_csv('data/raw_combined.csv')
full_df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,all,pct_punctuation
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,0,0.041667
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,0,0.107143
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,0,0.025751
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,0,0.040193
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,0,0.074627


In [3]:
vocab, encoded_text = encode_text(
    full_df, 
    use_cache=False,
    max_length=500,
    vocab_length=10000
)

In [4]:
# should be (n comments, max_length + 1)
encoded_text.shape

(223549, 501)

In [6]:
label_columns = ['toxic', 'severe_toxic', 'obscene', 'threat',
                 'insult', 'identity_hate']
labels = full_df[label_columns].values

full_data = np.concatenate([labels, encoded_text], axis=1)

## Train / Val / Test Split

In [14]:
data_dir = 'processed/'
# save vocab to each so it's available
for subdir in ['train', 'val', 'test']:
    directory = os.path.join(data_dir, subdir)
    os.makedirs(directory, exist_ok=True)
    with open(os.path.join(directory, 'vocab.joblib'), 'wb') as f:
        joblib.dump(vocab, f)

In [32]:
full_data.sum(axis=0)

array([    21384,      1962,     12140,       689,     11304,      2117,
         7744492, 213261217, 223964859, 216712792, 213177422, 205209873,
       201224153, 191998815, 185584467, 178825053, 170897287, 164096046,
       158443177, 150738627, 145606809, 138269683, 133714111, 127896100,
       122244228, 119127843, 113033385, 110180465, 105880091, 102308451,
        98500116,  95175741,  90900924,  88345281,  85230909,  82705616,
        79894987,  78157806,  74878281,  71368929,  70190487,  67232490,
        66367736,  64637938,  62539568,  59918456,  58483573,  56881980,
        56152450,  54191132,  52627351,  50914997,  50218546,  48968241,
        47225603,  46110606,  44647032,  43271547,  42622661,  42044759,
        41495751,  40631647,  39290787,  37936104,  37804002,  36650875,
        36425726,  35242451,  35147855,  33520883,  33115700,  31984577,
        31687274,  31255952,  31005558,  29999140,  29854503,  28476173,
        28412655,  28079409,  27442453,  26639072, 

array([0, 0, 0, ..., 0, 0, 0])

In [41]:
# split out test first
train, test = train_test_split(
    full_data, 
    test_size=0.1, 
    random_state=1,
    # make sure we have some of each class in 
    # each set. this is kind of a hack, looking
    # for the max class label for each, but it should
    # be good enough
    stratify=full_data[:, :6].argmax(axis=1)
)

with open(os.path.join(data_dir, 'test/data.joblib'), 'wb') as f:
    joblib.dump(test, f)

train, val = train_test_split(
    train,
    test_size=0.2,
    random_state=1,
    stratify=train[:, :6].argmax(axis=1)
)


with open(os.path.join(data_dir, 'train/data.joblib'), 'wb') as f:
    joblib.dump(train, f)
with open(os.path.join(data_dir, 'val/data.joblib'), 'wb') as f:
    joblib.dump(val, f)

print(train.shape, val.shape, test.shape)

(160955, 507) (40239, 507) (22355, 507)


In [45]:
train.sum(axis=0)[:6]

array([15326,  1405,  8738,   493,  8105,  1524])

In [30]:
has_positive = train[:, :6].max(axis=1)
has_positive.mean()

0.10025162312447579

In [29]:
 train[:, :6].max(axis=1).shape

(160955,)