In [1]:
# Regular imports
import pandas as pd
import numpy as np
import random
import pickle
import os

# Imports for cleaning
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English
import string

from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

In [2]:
# Directory holding the raw CSV (also reused later as the pickle output directory)
data_path = '../data/'
data = pd.read_csv(os.path.join(data_path, 'data.csv'))

In [3]:
# Preview the first ten rows of the raw frame
data.head(10)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
5,00025465d4725e87,"""\n\nCongratulations from me as well, use the ...",0,0,0,0,0,0
6,0002bcb3da6cb337,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,1,1,0,1,0
7,00031b1e95af7921,Your vandalism to the Matt Shirvington article...,0,0,0,0,0,0
8,00037261f536c51d,Sorry if the word 'nonsense' was offensive to ...,0,0,0,0,0,0
9,00040093b2687caa,alignment on this subject and which are contra...,0,0,0,0,0,0


In [4]:
# Label columns of the dataset; split_data reads this module-level list.
target_cols = [
    'toxic', 'severe_toxic', 'obscene',
    'threat', 'insult', 'identity_hate',
]
# Print the 0/1 class balance of every label column
for label in target_cols:
    counts = data[label].value_counts()
    print(counts)

0    144277
1     15294
Name: toxic, dtype: int64
0    157976
1      1595
Name: severe_toxic, dtype: int64
0    151122
1      8449
Name: obscene, dtype: int64
0    159093
1       478
Name: threat, dtype: int64
0    151694
1      7877
Name: insult, dtype: int64
0    158166
1      1405
Name: identity_hate, dtype: int64


In [5]:
def dump_data(data_sets, data_path='../data/', overwrite=False):
    """Pickle every entry of ``data_sets`` to ``<data_path>/<name>.pkl``.

    Parameters
    ----------
    data_sets : dict
        Maps a dataset name to the object that should be pickled.
    data_path : str
        Output directory; created (including parents) when it does not exist.
    overwrite : bool
        When False (default) a file that already exists is left untouched;
        pass True to regenerate it.
    """
    # Create the output directory once, up front. An empty path means the
    # current working directory, which needs no creation.
    if data_path:
        os.makedirs(data_path, exist_ok=True)

    for name, obj in data_sets.items():
        # Build the concrete file name *before* testing for existence, so the
        # check looks at the real target rather than the '{}.pkl' template.
        file_path = os.path.join(data_path, '{}.pkl'.format(name))
        if overwrite or not os.path.isfile(file_path):
            # Context manager guarantees the handle is flushed and closed.
            with open(file_path, 'wb') as handle:
                pickle.dump(obj, handle)

In [6]:
def split_data(data, is_clean=0):
    """Split ``data`` into train / validation / test feature and label frames.

    The label columns are taken from the module-level ``target_cols`` list;
    every other column is treated as a feature.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the feature columns and all ``target_cols``.
    is_clean : int or bool
        When truthy, ``'_clean'`` is appended to every key of the result.

    Returns
    -------
    dict
        Keys ``X_train``, ``X_val``, ``X_train_val``, ``X_test`` and the
        matching ``y_*`` entries (each optionally suffixed with ``'_clean'``).
    """
    features = data.drop(target_cols, axis=1)
    labels = data[target_cols]

    # 20% of the rows go to the test set; 25% of the remaining 80% (= 20% of
    # the total) become validation, which leaves 60% for training.
    X_train_val, X_test, y_train_val, y_test = train_test_split(
        features, labels, test_size=0.2, random_state=1337)
    X_train, X_val, y_train, y_val = train_test_split(
        X_train_val, y_train_val, test_size=0.25, random_state=1337)

    # Rebuild the combined set so that all train rows come first, followed by
    # all validation rows. pd.concat replaces DataFrame.append, which was
    # removed in pandas 2.0.
    X_train_val = pd.concat([X_train, X_val])
    y_train_val = pd.concat([y_train, y_val])

    data_sets = {
        'X_train': X_train,
        'X_val': X_val,
        'X_train_val': X_train_val,
        'X_test': X_test,
        'y_train': y_train,
        'y_val': y_val,
        'y_train_val': y_train_val,
        'y_test': y_test
    }

    suffix = '_clean' if is_clean else ''
    return {name + suffix: frame for name, frame in data_sets.items()}

In [7]:
def oversample_data(data_sets, oversampling_cols, oversampling_target_cols, is_clean=0):
    """Add oversampled ``*_os`` copies of the requested feature/label pairs.

    For every ``(X, y)`` name pair, rows whose label in one of
    ``oversampling_target_cols`` equals 1 are re-drawn at random (with
    replacement) until that label's positive count reaches 5% of the original
    row count. The drawn rows are appended to both the feature and the label
    frame, and the two frames are shuffled with one shared permutation.

    Parameters
    ----------
    data_sets : dict
        Maps dataset names to DataFrames; it is updated in place and returned.
    oversampling_cols : iterable of (str, str)
        Base names of the feature / label frames to oversample.
    oversampling_target_cols : iterable of str
        Label columns whose positive class should be boosted.
    is_clean : int or bool
        When truthy, ``'_clean'`` is appended to the base names before lookup.

    Returns
    -------
    dict
        ``data_sets`` with ``<X>_os`` and ``<y>_os`` entries added.

    Raises
    ------
    ValueError
        If a target column needs oversampling but has no positive rows.
    """
    suffix = '_clean' if is_clean else ''
    for (X, y) in oversampling_cols:
        X, y = X + suffix, y + suffix
        X_os, y_os = X + '_os', y + '_os'
        features, labels = data_sets[X], data_sets[y]
        total_rows = labels.shape[0]

        oversampling_indices = []
        for col in oversampling_target_cols:
            # Index labels of the positive rows; computed once per column
            # because the source frame does not change while sampling.
            positive_idx = labels[labels[col] == 1].index.values
            count = len(positive_idx)
            if total_rows == 0 or count / total_rows >= 0.05:
                continue
            if count == 0:
                raise ValueError(
                    "cannot oversample column {!r}: no positive rows".format(col))
            while count / total_rows < 0.05:
                oversampling_indices.append(np.random.choice(positive_idx))
                count += 1

        # Append the SAME extra rows to features and labels. The label rows
        # must come from the label frame; pulling them from the feature frame
        # would add feature columns and NaN labels.
        X_aug = pd.concat([features, features.loc[oversampling_indices]])
        y_aug = pd.concat([labels, labels.loc[oversampling_indices]])

        # One shared permutation keeps feature and label rows aligned.
        order = np.random.permutation(len(X_aug))
        data_sets[X_os] = X_aug.iloc[order]
        data_sets[y_os] = y_aug.iloc[order]

    return data_sets

In [8]:
# Flags for the raw (uncleaned) pass. `is_clean` is handed to split_data and
# oversample_data, where a falsy value means dataset names get no '_clean' suffix.
# NOTE(review): `clean` is not read by any of the visible cells — confirm before removing.
is_clean = 0
clean = ''

In [9]:
# Split the raw frame into train / val / train_val / test feature and label sets
data_sets = split_data(data, is_clean)

In [10]:
# Label columns whose positive class is boosted in the '*_os' copies
oversampling_target_cols = ['severe_toxic', 'threat', 'identity_hate']
data_sets = oversample_data(
    data_sets,
    [('X_train', 'y_train'), ('X_train_val', 'y_train_val')],
    oversampling_target_cols,
    is_clean,
)

In [11]:
# Persist every raw dataset under data_path, one '<name>.pkl' file per entry
dump_data(data_sets, data_path)

In [12]:
# All preprocessing that follows is for cleaned data: with `is_clean` truthy,
# split_data and oversample_data use dataset names carrying the '_clean' suffix.
# NOTE(review): `clean` is not read by any of the visible cells — confirm before removing.
is_clean = 1
clean = '_clean'

In [13]:
# NOTE(review): `spacy_en` is not referenced by any of the visible cells; the
# tokenizer below uses `parser` only. The 'en' shortcut name presumably requires
# an older spaCy release — confirm against the installed version.
spacy_en = spacy.load('en')
# Pipeline used by spacy_tokenizer to tokenize and lemmatize comments
parser = English()
# A single string of ASCII punctuation characters (not a list or set)
punctuations = string.punctuation

In [14]:
# Negation words (with and without apostrophes) that are kept out of the
# stop-word set built in the next cell, so they survive cleaning.
NEGATE = ["aint", "arent", "cannot", "cant", "couldnt", "darent", "didnt", "doesnt",
 "ain't", "aren't", "can't", "couldn't", "daren't", "didn't", "doesn't",
 "dont", "hadnt", "hasnt", "havent", "isnt", "mightnt", "mustnt", "neither",
 "don't", "hadn't", "hasn't", "haven't", "isn't", "mightn't", "mustn't",
 "neednt", "needn't", "never", "none", "nope", "nor", "not", "nothing", "nowhere","no",
 "oughtnt", "shant", "shouldnt", "uhuh", "wasnt", "werent",
 "oughtn't", "shan't", "shouldn't", "uh-uh", "wasn't", "weren't",
 "without", "wont", "wouldnt", "won't", "wouldn't", "rarely", "seldom", "despite"]

In [15]:
# Stop words to strip during cleaning: spaCy's list minus every negation word,
# so that terms such as "not" or "never" are preserved in the cleaned text.
stopwords = {word for word in STOP_WORDS if word not in NEGATE}

In [16]:
def spacy_tokenizer(sentence):
    """Return ``sentence`` as a space-joined string of cleaned lemmas.

    Each token produced by ``parser`` is reduced to its lemma, lower-cased and
    stripped of surrounding whitespace. Lemmas found in ``stopwords`` are
    dropped, as are lemmas that occur inside the ``punctuations`` string.
    Because ``punctuations`` is a plain string this is a substring test, so
    lemmas that are empty after stripping are dropped as well.
    """
    kept = []
    for token in parser(sentence):
        lemma = token.lemma_.lower().strip()
        if lemma in stopwords or lemma in punctuations:
            continue
        kept.append(lemma)
    return ' '.join(kept)

In [17]:
def clean_data(data, col):
    """Run ``spacy_tokenizer`` over every entry of ``data[col]``.

    Returns a list of cleaned strings in the same order as the input column.
    """
    return [spacy_tokenizer(text) for text in data[col]]

In [18]:
# NOTE: overwrites the raw comment_text column in place, so `data` holds cleaned
# text from here on; re-running this cell would clean already-cleaned text.
data['comment_text'] = clean_data(data, 'comment_text')

In [19]:
# Persist the full cleaned frame under the name 'data_clean'
dump_data({'data_clean': data}, data_path)

In [20]:
# Preview the first rows after cleaning
data.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,explanation edit username hardcore metallica f...,0,0,0,0,0,0
1,000103f0d9cfb60f,d'aww match background colour -pron- seemingly...,0,0,0,0,0,0
2,000113f07ec002fd,hey man -pron- not try edit war -pron- guy con...,0,0,0,0,0,0
3,0001b41b1c6bb37e,not real suggestion improvement wonder section...,0,0,0,0,0,0
4,0001d958c54c6e35,sir hero chance remember page,0,0,0,0,0,0


In [21]:
# Total number of null cells across the whole cleaned frame
data.isnull().sum().sum()

0

In [22]:
# Split the cleaned frame (same fixed random_state as before); with is_clean
# set, every key of the result carries the '_clean' suffix.
data_sets = split_data(data, is_clean)

In [23]:
# Label columns whose positive class is boosted in the '*_clean_os' copies
oversampling_target_cols = ['severe_toxic', 'threat', 'identity_hate']
data_sets = oversample_data(
    data_sets,
    [('X_train', 'y_train'), ('X_train_val', 'y_train_val')],
    oversampling_target_cols,
    is_clean,
)

In [24]:
# Persist every cleaned dataset under data_path, one '<name>.pkl' file per entry
dump_data(data_sets, data_path)