In [1]:
import joblib
import os

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

from src.utils import encode_text

## Load Data, Encode Text

In [2]:
# Read the combined raw comments table and preview its first rows.
# NOTE(review): the preview shows an 'Unnamed: 0' column, which suggests the
# CSV was written together with its index -- confirm whether that column is
# needed or whether the file should be read with index_col=0.
full_df = pd.read_csv(filepath_or_buffer='data/raw_combined.csv')
full_df.head(n=5)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,all,pct_punctuation
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,0,0.041667
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,0,0.107143
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,0,0.025751
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,0,0.040193
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,0,0.074627


In [3]:
# Encode the comments in full_df with the project helper from src.utils.
# NOTE(review): encode_text is not defined in this notebook -- presumably
# use_cache=False forces a fresh encoding, max_length caps the tokens kept per
# comment and vocab_length caps the vocabulary size; confirm in src/utils.
vocab, encoded_text = encode_text(
    full_df, use_cache=False, max_length=500, vocab_length=10000,
)

In [4]:
# encode_text was called with max_length=500; the result should have one row
# per comment and max_length + 1 columns. Enforce that here so a mismatch
# fails at its source rather than later, when the labels are concatenated.
expected_shape = (len(full_df), 500 + 1)
assert encoded_text.shape == expected_shape, (
    'unexpected encoded_text shape: {} != {}'.format(
        encoded_text.shape, expected_shape)
)
encoded_text.shape

(223549, 501)

In [5]:
# Largest value found in each column of the encoded matrix.
# NOTE(review): in the output, column 0 peaks at 500 (matching max_length, so
# it looks like a length field) and the remaining columns reach 10001 even
# though vocab_length=10000 was requested -- presumably reserved ids; confirm
# before sizing anything from vocab_length.
np.max(encoded_text, axis=0)

array([  500, 10000,  9999, 10001,  9999, 10001, 10001, 10001, 10001,
       10000, 10001, 10001, 10001, 10000, 10001, 10000, 10001, 10001,
       10000, 10001, 10000, 10001, 10001, 10001, 10001, 10001, 10001,
       10001,  9998, 10000, 10000, 10001, 10001, 10001, 10001,  9999,
       10001, 10000,  9997, 10001,  9999,  9999,  9998, 10001, 10000,
       10000, 10001,  9999, 10001,  9999,  9997, 10001,  9995,  9994,
       10001,  9988, 10001, 10000, 10001,  9995, 10001,  9990,  9999,
        9999,  9983,  9987, 10000, 10000,  9995,  9994,  9998, 10000,
        9990,  9996,  9992,  9997,  9999, 10000,  9991,  9989,  9996,
        9998,  9994,  9997, 10000,  9998,  9995,  9998, 10001,  9990,
       10000,  9994,  9993,  9997,  9990,  9999,  9988, 10001,  9999,
        9990,  9994,  9999,  9997,  9998,  9999,  9996,  9974,  9990,
        9988,  9992,  9977, 10001, 10000,  9997,  9992,  9997,  9984,
        9988,  9987,  9993,  9992,  9998, 10000,  9989,  9986,  9994,
        9991,  9992,

In [6]:
# Target columns of full_df, in the order they will occupy the leading
# columns of the combined array.
label_columns = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]
labels = full_df[label_columns].to_numpy()

# Stack column-wise so each row reads [labels..., encoded_text...].
full_data = np.hstack((labels, encoded_text))

## Train / Test Split

In [7]:
# Hold out 15% of the rows as the test set; random_state is fixed so the
# split is the same on every run.
train, test = train_test_split(full_data, test_size=0.15, random_state=1)
print(*(split.shape for split in (train, test)))

(190016, 507) (33533, 507)


## Save to Files

In [8]:
# Write both splits to disk with joblib.
data_dir = 'processed/'
# open(..., 'wb') does not create missing parent directories, so make sure
# the output directory exists; otherwise a fresh checkout fails here with
# FileNotFoundError after all the expensive work above has already run.
os.makedirs(data_dir, exist_ok=True)
with open(os.path.join(data_dir, 'train.joblib'), 'wb') as f:
    joblib.dump(train, f)
with open(os.path.join(data_dir, 'test.joblib'), 'wb') as f:
    joblib.dump(test, f)

In [9]:
# The leading len(label_columns) columns of each row are the labels; what
# remains is the encoded text. Derive the offset from label_columns instead of
# hard-coding it so this check stays correct if the label list changes.
train[:, len(label_columns):].shape

(190016, 501)