In [47]:
import warnings
import itertools
warnings.filterwarnings("ignore")
from joblib import Parallel, delayed

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from gensim.models import KeyedVectors

from textblob import TextBlob
from textblob.translate import NotTranslated

import pickle
import pandas as pd
import numpy as np
import re
import string
from nltk import word_tokenize

# Seed NumPy's global RNG so that later np.random calls in this notebook
# (e.g. the random split mask) are repeatable across Restart & Run All.
np.random.seed(29)

In [48]:
# Directory prefix (with trailing slash) prepended to every CSV file name
# that this notebook reads or writes.
data_path = 'data/'

In [49]:
# Noise patterns that clean_text removes before tokenization.
# Dotted-quad runs of digits (IPv4-looking addresses).
ip = r'[\d]+(?:\.[\d]+){3}'
# Links up to the next whitespace; the optional "s" makes https:// links
# match as well as http:// ones.
url = r'https?\:[^\s]+'
# Any run of digits.
digit = r'[\d]+'
# "User:name" / "user:name" mentions up to the next whitespace.
user = r'[Uu]ser\:[^\s]+'

def clean_text(text):
    """Strip noise from a raw comment and return its tokens joined by spaces.

    Matches of the module-level ``ip``, ``url``, ``digit`` and ``user``
    patterns are removed in that order, every ``string.punctuation``
    character is deleted, the remainder is split with ``word_tokenize`` and
    tokens of length 1 or less are dropped.
    """
    # The removal order matters: digits are stripped before user mentions.
    for pattern in (ip, url, digit, user):
        text = re.sub(pattern, '', text)
    punctuation_free = text.translate(str.maketrans('', '', string.punctuation))
    kept = [token for token in word_tokenize(punctuation_free) if len(token) > 1]
    return ' '.join(kept)

# translating text into target language and back to english (google API)
def translate(comment, language):
    """Round-trip ``comment`` through ``language`` and back to English.

    Objects exposing ``decode`` (e.g. bytes) are decoded as UTF-8 first.
    Returns the back-translated text as ``str``, or ``None`` when any step
    raises an exception (best-effort: failures are swallowed on purpose so a
    single bad comment does not abort a batch).
    """
    try:
        source = comment.decode("utf-8") if hasattr(comment, "decode") else comment
        round_trip = TextBlob(source).translate(to=language).translate(to="en")
        return str(round_trip)
    except Exception:
        return None 
    
# alternative way to cut sentences with length more than X:
# take the first X/2 words and the last X/2 words
def cut_to_x(text, x):
    """Shorten ``text`` to at most ``x`` whitespace-separated words.

    A text with more than ``x`` words keeps its first ``x // 2`` words and
    its last ``x - x // 2`` words (so exactly ``x`` words remain, also for
    odd ``x``); the middle is dropped and the kept words are re-joined with
    single spaces. A text with ``x`` words or fewer is returned unchanged.
    Negative ``x`` is treated as 0.
    """
    limit = max(x, 0)
    words = text.split()
    if len(words) <= limit:
        return text
    head = limit // 2
    tail = limit - head
    # Slice the tail from an explicit start index: words[-0:] would return
    # the whole list when tail == 0.
    return ' '.join(words[:head] + words[len(words) - tail:])

# Complete text preprocessing for the given parameters: max_features, maxlen and the text column to use.
# This function (1) adds a column holding the preprocessed (padded) text to the train and test dataframes
# and (2) pickles the tokenizer that was used for the preprocessing.
def preprocess_text(max_features, maxlen, train, test, on='tokenized_text', postfix=''):
    """Tokenize and pad one text column of ``train`` and ``test`` in place.

    Parameters
    ----------
    max_features : int
        Passed to ``Tokenizer`` as ``num_words``.
    maxlen : int
        Passed to ``pad_sequences`` as ``maxlen``.
    train, test : pandas.DataFrame
        Frames that both contain the column ``on``; each gains a new column.
    on : str
        Name of the text column to tokenize.
    postfix : str
        Suffix used in the new column name and in the pickle file name.

    Side effects
    ------------
    * Adds the column ``padded_<maxlen>_<max_features>_<postfix>`` to both
      frames; each cell holds one padded sequence.
    * Prints the new column name.
    * Writes the fitted tokenizer to
      ``pickles/tokenizer_<maxlen>_<max_features>_<postfix>.pickle``,
      creating the ``pickles`` directory if it does not exist yet.

    Returns ``None``.
    """
    # Local import: keeps this cell self-contained without editing the imports cell.
    import os

    tokenizer = Tokenizer(num_words=max_features)
    # The vocabulary is fitted on train and test texts together.
    tokenizer.fit_on_texts(pd.concat([train[on], test[on]]))

    tokenized_train = tokenizer.texts_to_sequences(train[on])
    tokenized_test = tokenizer.texts_to_sequences(test[on])

    padded_train = pad_sequences(tokenized_train, maxlen=maxlen)
    padded_test = pad_sequences(tokenized_test, maxlen=maxlen)

    cname = 'padded_{0}_{1}_{2}'.format(maxlen, max_features, postfix)

    print('New column name: {}'.format(cname))

    # list(...) turns the padded result into one entry per row.
    train[cname] = list(padded_train)
    test[cname] = list(padded_test)

    # Without this, a fresh checkout fails with FileNotFoundError only after
    # all of the tokenization work above has already been done.
    os.makedirs('pickles', exist_ok=True)
    with open('pickles/tokenizer_{0}_{1}_{2}.pickle'.format(maxlen, max_features, postfix), 'wb') as handle:
        pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [50]:
# Read both raw CSVs and replace missing values (NaN) with empty strings,
# returning new frames instead of mutating in place.
train = pd.read_csv(data_path + 'train.csv').replace(np.nan, '', regex=True)
test = pd.read_csv(data_path + 'test.csv').replace(np.nan, '', regex=True)

In [51]:
# Random boolean mask: each row is True with probability 0.85. It depends on
# the state of NumPy's global RNG, so re-running this cell without re-seeding
# yields a different split.
msk = np.random.rand(len(train)) < 0.85
# Store the flag on the frame; the augmentation cell below only translates
# rows whose split is True (presumably the training portion - confirm how the
# False rows are used downstream).
train['split'] = msk
# Every original comment is tagged as English; translated rows added later
# carry the code of the intermediate language instead.
train['lang'] = 'en'

**Augmentation through translation (with Google API)**

In [52]:
# Rows flagged split == True that carry at least one of the labels
# severe_toxic, threat or identity_hate. A single combined mask with .loc
# replaces the chained train[mask_a][mask_b] lookup, which made pandas
# reindex the second mask against the already-filtered frame.
rare_label_mask = (train['severe_toxic'] > 0) | (train['threat'] > 0) | (train['identity_hate'] > 0)
for_translation = train.loc[rare_label_mask & (train['split'] == True)]
print('Sentences for translation: {}'.format(len(for_translation)))

# Label and bookkeeping columns copied unchanged onto each translated row.
to_use = ['id','toxic','severe_toxic','obscene','threat','insult','identity_hate','split']

# Thread pool for the network-bound translation calls.
parallel = Parallel(200, backend='threading', verbose=0)

for lang in ['de', 'fr', 'es']:
    print('Translate comments using {0} language'.format(lang))
    translated_data = parallel(delayed(translate)(comment, lang) for comment in for_translation['comment_text'].values)
    # Explicit copy: the new columns below are set on an independent frame,
    # not on a slice of train.
    translated_df = for_translation[to_use].copy()
    translated_df['comment_text'] = translated_data
    translated_df['lang'] = lang
    # translate() returns None on failure; drop those rows.
    translated_df = translated_df[pd.notnull(translated_df['comment_text'])]
    frames = [train, translated_df]
    # ignore_index gives train a fresh unique index; otherwise every
    # translated row would reuse the index label of its source comment.
    train = pd.concat(frames, ignore_index=True)

Sentences for translation: 2542
Translate comments using de language
Translate comments using fr language
Translate comments using es language


**Cleaning and tokenization**

In [53]:
for frame in (train, test):
    # Case-preserving cleaned text.
    frame['tokenized_text_upper'] = frame['comment_text'].astype(str).apply(clean_text)
    # Lower-cased variant derived from the cleaned text.
    frame['tokenized_text_lower'] = frame['tokenized_text_upper'].astype(str).str.lower()

In [54]:
# Add a "_cut" companion (limited to 200 words by cut_to_x) for both the
# lower-cased and the case-preserving column of each frame.
for frame in (train, test):
    for casing in ('lower', 'upper'):
        source_col = 'tokenized_text_' + casing
        frame[source_col + '_cut'] = frame[source_col].astype(str).apply(lambda text: cut_to_x(text, 200))

**Experimenting with max_features/maxlen parameters on augmented dataset**

In [55]:
# Parameter grid to explore; the trailing comments list the wider candidate
# sets that are currently switched off.
names = ['maxlen','max_features', 'is_lower', 'is_cut']
params = {
    'maxlen': [200],            # [150,200,250,300]
    'max_features': [50000],    # [30000,40000,50000,60000]
    'is_lower': [True],         # [True, False]
    'is_cut': [False],          # [True, False]
}

# Cartesian product of all candidate values, one tuple per run, with the
# fields ordered as in `names`.
combinations = list(itertools.product(*[params[name] for name in names]))

In [56]:
# Maps (is_lower, is_cut) to the suffix shared by the source column
# ('tokenized_text_<suffix>') and the postfix given to preprocess_text.
variant_suffix = {
    (True, True): 'lower_cut',
    (True, False): 'lower',
    (False, True): 'upper_cut',
    (False, False): 'upper',
}

for i, comb in enumerate(combinations):
    print('Iteration: {}'.format(comb))
    maxlen, max_features, is_lower, is_cut = comb
    suffix = variant_suffix[(bool(is_lower), bool(is_cut))]
    preprocess_text(max_features, maxlen, train, test, on='tokenized_text_' + suffix, postfix=suffix)

Iteration: (200, 50000, True, False)
New column name: padded_200_50000_lower


In [57]:
# Persist both preprocessed frames next to the raw data, without the index.
# NOTE(review): the padded_* columns hold one array per cell, which to_csv
# writes as text - confirm that downstream readers parse that form.
for frame, file_name in ((train, 'train_pre.csv'), (test, 'test_pre.csv')):
    frame.to_csv(data_path + file_name, index=False)