### This file preprocesses the text for our Toxicity Classification kernel
##### Adapted from:
##### https://www.kaggle.com/fizzbuzz/toxic-data-preprocessing
##### and
##### https://www.kaggle.com/swarnim97/jigsaw-toxicity-using-cnn1d-and-cudnnlstm



In [13]:
import pandas as pd
import numpy as np
import copy
import re
from keras.preprocessing.text import text_to_word_sequence
# from nltk import WordNetLemmatizer

In [14]:
# """ Road Map
# 1 make all words lowercase
# 2 trim repeating words
# 3 remove stopwords?
# 4 trim repeating letters (where there are more than 2 repeating letters)
# 5 use patterns to correct spelling of common words of interest 
# 6 remove/substitute non-typical characters 
# 7 remove unnecessary white space
# 8 https://www.datacamp.com/community/tutorials/stemming-lemmatization-python

# """

In [15]:
# Load the raw train/test CSV files (paths are relative to the working directory).
train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")


In [16]:
class BaseTokenizer(object):
    """Minimal tokenizer interface.

    Subclasses override :meth:`process_text`; :meth:`process` then applies it
    lazily to every element of an iterable.
    """

    def process_text(self, text):
        """Process a single text. Must be overridden by subclasses.

        Raises:
            NotImplementedError: always, in this base class.
        """
        # NotImplementedError is the exception; ``NotImplemented`` is a
        # comparison sentinel and raising it fails with a TypeError instead.
        raise NotImplementedError

    def process(self, texts):
        """Yield ``self.process_text(text)`` for each item of *texts*, lazily."""
        for text in texts:
            yield self.process_text(text)

In [17]:

# Lookup table: contraction written WITH a straight apostrophe -> expanded form.
# Keys are matched exactly (case-sensitive) against space-separated pieces of
# the text by clean_contractions(), which is why both "I'd" and "i'd" appear.
contraction_mapping = {"ain't": "is not", "aren't": "are not",
                       "can't": "cannot", "'cause": "because", 
                       "could've": "could have", "couldn't": "could not", 
                       "didn't": "did not",  "doesn't": "does not", 
                       "don't": "do not", "hadn't": "had not", "hasn't": "has not", 
                       "haven't": "have not", "he'd": "he would","he'll": "he will", 
                       "he's": "he is", "how'd": "how did", "how'd'y": "how do you", 
                       "how'll": "how will", "how's": "how is",  "I'd": "I would",
                       "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have",
                       "I'm": "I am", "I've": "I have", "i'd": "i would", "i'd've": "i would have",
                       "i'll": "i will",  "i'll've": "i will have","i'm": "i am",
                       "i've": "i have", "isn't": "is not", "it'd": "it would", 
                       "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have",
                       "it's": "it is", "let's": "let us", "ma'am": "madam", "mayn't": "may not", 
                       "might've": "might have","mightn't": "might not",
                       "mightn't've": "might not have", "must've": "must have",
                       "mustn't": "must not", "mustn't've": "must not have", 
                       "needn't": "need not", "needn't've": "need not have",
                       "o'clock": "of the clock", "oughtn't": "ought not", 
                       "oughtn't've": "ought not have", "shan't": "shall not",
                       "sha'n't": "shall not", "shan't've": "shall not have", 
                       "she'd": "she would", "she'd've": "she would have", 
                       "she'll": "she will", "she'll've": "she will have", 
                       "she's": "she is", "should've": "should have", "shouldn't": "should not",
                       "shouldn't've": "should not have", "so've": "so have","so's": "so as",
                       "this's": "this is","that'd": "that would", "that'd've": "that would have",
                       "that's": "that is", "there'd": "there would", 
                       "there'd've": "there would have", "there's": "there is", 
                       "here's": "here is","they'd": "they would", "they'd've": "they would have", 
                       "they'll": "they will", "they'll've": "they will have", "they're": "they are",
                       "they've": "they have", "to've": "to have", "wasn't": "was not", "we'd": "we would",
                       "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have",
                       "we're": "we are", "we've": "we have", "weren't": "were not", "what'll": "what will",
                       "what'll've": "what will have", "what're": "what are",  "what's": "what is", 
                       "what've": "what have", "when's": "when is", "when've": "when have", 
                       "where'd": "where did", "where's": "where is", "where've": "where have", 
                       "who'll": "who will", "who'll've": "who will have", "who's": "who is", 
                       "who've": "who have", "why's": "why is", "why've": "why have", 
                       "will've": "will have", "won't": "will not", "won't've": "will not have",
                       "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have",
                       "y'all": "you all", "y'all'd": "you all would","y'all'd've": "you all would have",
                       "y'all're": "you all are","y'all've": "you all have","you'd": "you would", 
                       "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have", 
                       "you're": "you are", "you've": "you have" }

# Lookup table: contraction written WITHOUT its apostrophe -> expanded form.
# Keys are matched exactly (case-sensitive) against space-separated pieces of
# the text by more_clean_contractions().
#
# Spellings that are also ordinary English words are deliberately NOT listed
# ("cause", "hell", "howdy", "id", "ill", "its", "shed", "shell", "wed",
# "well", "were"): expanding them would rewrite genuine words, e.g.
# "they were" -> "they we are" or "go to hell" -> "go to he will", which
# destroys signal a toxicity model needs. The capitalised "Id" / "Ill" are
# kept because a capital "I" strongly suggests the pronoun.
more_contraction_mapping = {
    "aint": "is not", "arent": "are not",
    "cant": "cannot",
    "couldve": "could have", "couldnt": "could not",
    "didnt": "did not", "doesnt": "does not",
    "dont": "do not", "hadnt": "had not", "hasnt": "has not",
    "havent": "have not", "hed": "he would",
    "hes": "he is", "howd": "how did",
    "howll": "how will", "hows": "how is", "Id": "I would",
    "Idve": "I would have", "Ill": "I will", "Illve": "I will have",
    "Im": "I am", "Ive": "I have", "idve": "i would have",
    "illve": "i will have", "im": "i am",
    "ive": "i have", "isnt": "is not", "itd": "it would",
    "itdve": "it would have", "itll": "it will", "itllve": "it will have",
    "lets": "let us", "maam": "madam", "maynt": "may not",
    "mightve": "might have", "mightnt": "might not",
    "mightntve": "might not have", "mustve": "must have",
    "mustnt": "must not", "mustntve": "must not have",
    "neednt": "need not", "needntve": "need not have",
    "oclock": "of the clock", "oughtnt": "ought not",
    "oughtntve": "ought not have", "shant": "shall not",
    "shantve": "shall not have",
    "shedve": "she would have",
    "shellve": "she will have",
    "shes": "she is", "shouldve": "should have", "shouldnt": "should not",
    "shouldntve": "should not have", "sove": "so have", "sos": "so as",
    "thiss": "this is", "thatd": "that would", "thatdve": "that would have",
    "thats": "that is", "thered": "there would",
    "theredve": "there would have", "theres": "there is",
    "heres": "here is", "theyd": "they would", "theydve": "they would have",
    "theyll": "they will", "theyllve": "they will have", "theyre": "they are",
    "theyve": "they have", "tove": "to have", "wasnt": "was not",
    "wedve": "we would have", "wellve": "we will have",
    "weve": "we have", "werent": "were not", "whatll": "what will",
    "whatllve": "what will have", "whatre": "what are", "whats": "what is",
    "whatve": "what have", "whens": "when is", "whenve": "when have",
    "whered": "where did", "wheres": "where is", "whereve": "where have",
    "wholl": "who will", "whollve": "who will have", "whos": "who is",
    "whove": "who have", "whys": "why is", "whyve": "why have",
    "willve": "will have", "wont": "will not", "wontve": "will not have",
    "wouldve": "would have", "wouldnt": "would not", "wouldntve": "would not have",
    "yall": "you all", "yalld": "you all would", "yalldve": "you all would have",
    "yallre": "you all are", "yallve": "you all have", "youd": "you would",
    "youdve": "you would have", "youll": "you will", "youllve": "you will have",
    "youre": "you are", "youve": "you have",
}


In [18]:
# Maps a canonical replacement string to the list of regular expressions
# (obfuscated / misspelled variants) that should be rewritten to it.
# The tokenizer applies entries in dict order and, within an entry, in list
# order, so both orders are significant.
# All patterns are raw strings: sequences such as \$ or \* are regex escapes,
# not Python string escapes.
RE_PATTERNS = {

    ' fuck':
        [
            r'(f)(u|[^a-z0-9 ])(c|[^a-z0-9 ])(k|[^a-z0-9 ])([^ ])*',
            r'(f)([^a-z]*)(u)([^a-z]*)(c)([^a-z]*)(k)',
            r' f[!@#\$%\^\&\*]*u[!@#\$%\^&\*]*k', r'f u u c',
            r'(f)(c|[^a-z ])(u|[^a-z ])(k)', r'f\*',
            r'feck ', r' fux ', r'f\*\*',
            r'f\-ing', r'f\.u\.', r'f###', r' fu ', r'f@ck', r'f u c k', r'f uck', r'f ck'
        ],

    ' ass ':
        [
            # '@\$\$' and '[^a-z]anus' are two separate patterns; a missing
            # comma previously fused them into one via string concatenation.
            r'[^a-z]ass ', r'[^a-z]azz ', r'arrse', r' arse ', r'@\$\$',
            r'[^a-z]anus', r' a\*s\*s', r'[^a-z]ass[^a-z ]',
            r'a[@#\$%\^&\*][@#\$%\^&\*]', r'[^a-z]anal ', r'a s s'
        ],

    ' ass hole ':
        [
            r' a[s|z]*wipe', r'a[s|z]*[w]*h[o|0]+[l]*e', r'@\$\$hole'
        ],

    ' bitch ':
        [
            r'bitches', r'b[w]*i[t]*ch', r'b!tch',
            r'bi\+ch', r'b!\+ch', r'(b)([^a-z]*)(i)([^a-z]*)(t)([^a-z]*)(c)([^a-z]*)(h)',
            r'biatch', r'bi\*\*h', r'bytch', r'b i t c h'
        ],

    ' bastard ':
        [
            r'ba[s|z]+t[e|a]+rd'
        ],

    ' transgender':
        [
            r'transgender'
        ],

    ' gay ':
        [
            r'gay', r'homo'
        ],

    ' cock ':
        [
            r'[^a-z]cock', r'c0ck', r'[^a-z]cok ', r'c0k', r'[^a-z]cok[^aeiou]', r' cawk',
            r'(c)([^a-z ])(o)([^a-z ]*)(c)([^a-z ]*)(k)', r'c o c k'
        ],

    ' dick ':
        [
            r' dick[^aeiou]', r'd i c k'
        ],

    ' suck ':
        [
            r'sucker', r'(s)([^a-z ]*)(u)([^a-z ]*)(c)([^a-z ]*)(k)', r'sucks', r'5uck', r's u c k'
        ],

    ' cunt ':
        [
            r'cunt', r'c u n t'
        ],

    ' bull shit ':
        [
            r'bullsh\*t', r'bull\$hit'
        ],


    ' jerk ':
        [
            r'jerk'
        ],

    ' idiot ':
        [
            # 'idiots' and 'i d i o t' are two separate patterns; a missing
            # comma previously fused them into the useless 'idiotsi d i o t'.
            # NOTE(review): 'idiots' is listed after 'i[d]+io[t]+', which has
            # already rewritten the 'idiot' prefix by then; order kept as-is.
            r'i[d]+io[t]+', r'(i)([^a-z ]*)(d)([^a-z ]*)(i)([^a-z ]*)(o)([^a-z ]*)(t)', r'idiots',
            r'i d i o t'
        ],

    ' dumb ':
        [
            r'(d)([^a-z ]*)(u)([^a-z ]*)(m)([^a-z ]*)(b)'
        ],

    ' shit ':
        [
            r'shitty', r'(s)([^a-z ]*)(h)([^a-z ]*)(i)([^a-z ]*)(t)', r'shite', r'\$hit', r's h i t', r'sh\*tty',
            r'sh\*ty'
        ],

    ' shit hole ':
        [
            r'shythole', r'sh\*thole'
        ],

    ' retard ':
        [
            r'returd', r'retad', r'retard', r'wiktard', r'wikitud'
        ],

    ' rape ':
        [
            r'raped'
        ],

    ' dumb ass':
        [
            r'dumbass', r'dubass'
        ],

    ' ass head':
        [
            r'butthead'
        ],

    ' sex ':
        [
            r'sexy', r's3x', r'sexuality'
        ],


    ' nigger ':
        [
            r'nigger', r'ni[g]+a', r' nigr ', r'negrito', r'niguh', r'n3gr', r'n i g g e r'
        ],

    ' shut the fuck up':
        [
            r'stfu'
        ],

    ' pussy ':
        [
            r'pussy[^c]', r'pusy', r'pussi[^l]', r'pusses'
        ],

    ' faggot ':
        [
            r'faggot', r' fa[g]+[s]*[^a-z ]', r'fagot', r'f a g g o t', r'faggit',
            r'(f)([^a-z ]*)(a)([^a-z ]*)([g]+)([^a-z ]*)(o)([^a-z ]*)(t)', r'fau[g]+ot', r'fae[g]+ot',
        ],

    ' mother fucker':
        [
            r' motha f', r' mother f', r'motherucker',
        ],

    ' whore ':
        [
            r'wh\*\*\*', r'w h o r e'
        ],

    ' haha ':
        [
            r'ha\*\*\*ha',
        ],
}


In [19]:
def clean_contractions(text):
    """Expand apostrophe contractions in *text*.

    Apostrophe look-alikes are first normalised to a plain ``'``. The text is
    then split on single spaces, and every piece that is an exact
    (case-sensitive) key of ``contraction_mapping`` is swapped for its
    expansion. Pieces are re-joined with single spaces.
    """
    to_plain_apostrophe = str.maketrans({mark: "'" for mark in "’‘´`"})
    normalized = text.translate(to_plain_apostrophe)
    return ' '.join(
        contraction_mapping.get(piece, piece) for piece in normalized.split(" ")
    )
    
def more_clean_contractions(text):
    """Expand contractions that were written without their apostrophe.

    Apostrophe look-alikes are normalised to a plain ``'``, the text is split
    on single spaces, and each piece found (exact, case-sensitive match) in
    ``more_contraction_mapping`` is replaced by its expansion before the
    pieces are joined back with single spaces.
    """
    quote_table = str.maketrans({ch: "'" for ch in "’‘´`"})
    words = text.translate(quote_table).split(" ")
    expanded = (more_contraction_mapping.get(word, word) for word in words)
    return ' '.join(expanded)
    
# One or more consecutive characters that are either not whitespace/word
# characters (punctuation, symbols) or an underscore. Raw string so that \s
# and \w reach the regex engine instead of being invalid Python escapes;
# compiled once rather than on every call.
_SPECIAL_SYM_RE = re.compile(r'([^\s\w]|_)+')


def rem_special_sym(text):
    """Return *text* with punctuation, symbols and underscores deleted.

    Matched characters are removed outright (not replaced by a space), so the
    text on either side is glued together: ``"It's"`` becomes ``"Its"``.
    Whitespace, letters and digits are left untouched.
    """
    return _SPECIAL_SYM_RE.sub('', text)

In [20]:
class PatternTokenizer(BaseTokenizer):
    """Regex-driven normaliser/tokenizer for noisy, obfuscated text.

    The same steps are applied to a single string (:meth:`process_text`) and
    to a pandas Series of strings (:meth:`process_ds`):

    1. optional lower-casing;
    2. every character matching ``initial_filters`` becomes a space;
    3. optionally, runs of three or more identical characters collapse to one;
    4. for each ``target -> [regex, ...]`` entry of ``patterns`` (in dict
       order, then list order) every match is replaced by ``target``;
    5. everything except ``a-z``, apostrophe and space becomes a space and
       the result is split on whitespace.

    Args:
        lower: lower-case the text first.
        initial_filters: regex of characters to blank out, or ``None`` to
            skip that step.
        patterns: mapping ``replacement -> list of regexes``. The object is
            stored as given (not copied).
        remove_repetitions: collapse 3+ repeated characters to a single one.
    """

    # Three or more repeats of the same character; DOTALL so that "." also
    # matches newlines.
    _REPEATED_CHARS = re.compile(r"(.)\1{2,}", re.DOTALL)
    # Characters allowed to survive the final clean-up are a-z, ' and space.
    _NON_TOKEN_CHARS = r"[^a-z' ]"

    def __init__(self, lower=True, initial_filters=r"[^a-z0-9!@#\$%\^\&\*_\-,\.' ]", patterns=RE_PATTERNS,
                 remove_repetitions=True):
        self.lower = lower
        self.patterns = patterns
        self.initial_filters = initial_filters
        self.remove_repetitions = remove_repetitions

    def process_text(self, text):
        """Normalise one string and return its list of tokens."""
        x = self._preprocess(text)
        for target, patterns in self.patterns.items():
            for pat in patterns:
                x = re.sub(pat, target, x)
        x = re.sub(self._NON_TOKEN_CHARS, ' ', x)
        return x.split()

    def process_ds(self, ds):
        """Vectorised variant of :meth:`process_text` for a pandas Series.

        Args:
            ds: Series of strings.

        Returns:
            A new Series whose elements are token lists. Every ``.str``
            operation below returns a new Series, so the caller's data is
            left untouched without an explicit copy.
        """
        # regex=True is passed explicitly on every replace: pandas >= 2.0
        # defaults to literal matching, which would silently disable all of
        # the patterns below (and rejects a compiled pattern outright).
        if self.lower:
            ds = ds.str.lower()

        # remove special chars
        if self.initial_filters is not None:
            ds = ds.str.replace(self.initial_filters, ' ', regex=True)

        # looooooooooser => loser
        if self.remove_repetitions:
            ds = ds.str.replace(self._REPEATED_CHARS, r"\1", regex=True)

        for target, patterns in self.patterns.items():
            for pat in patterns:
                ds = ds.str.replace(pat, target, regex=True)

        ds = ds.str.replace(self._NON_TOKEN_CHARS, ' ', regex=True)

        return ds.str.split()

    def _preprocess(self, text):
        """Apply steps 1-3 (lower-case, filter, de-repeat) to one string."""
        if self.lower:
            text = text.lower()

        # remove special chars
        if self.initial_filters is not None:
            text = re.sub(self.initial_filters, ' ', text)

        # neeeeeeeeeerd => nerd
        if self.remove_repetitions:
            text = self._REPEATED_CHARS.sub(r"\1", text)
        return text


In [21]:
# Shared tokenizer built with the constructor defaults (lower-casing on,
# default character filter, RE_PATTERNS, repetition collapsing on).
tokenizer = PatternTokenizer()


In [22]:
# Smoke test of the full pipeline on one hand-written sentence:
# strip symbols -> expand apostrophe-less contractions -> pattern-tokenize.
demo_raw = 'Experi\"m\'ent fuc\'kwith youre sentences here!'
demo_expanded = more_clean_contractions(rem_special_sym(demo_raw))
demo_tokens = tokenizer.process_text(demo_expanded)
sentence = " ".join(demo_tokens)

print(sentence)

experiment fuck you are sentences here


In [23]:
# perform preprocessing
train.head(20)

Unnamed: 0,id,target,comment_text,severe_toxicity,obscene,identity_attack,insult,threat,asian,atheist,...,article_id,rating,funny,wow,sad,likes,disagree,sexual_explicit,identity_annotator_count,toxicity_annotator_count
0,59848,0.0,"This is so cool. It's like, 'would you want yo...",0.0,0.0,0.0,0.0,0.0,,,...,2006,rejected,0,0,0,0,0,0.0,0,4
1,59849,0.0,Thank you!! This would make my life a lot less...,0.0,0.0,0.0,0.0,0.0,,,...,2006,rejected,0,0,0,0,0,0.0,0,4
2,59852,0.0,This is such an urgent design problem; kudos t...,0.0,0.0,0.0,0.0,0.0,,,...,2006,rejected,0,0,0,0,0,0.0,0,4
3,59855,0.0,Is this something I'll be able to install on m...,0.0,0.0,0.0,0.0,0.0,,,...,2006,rejected,0,0,0,0,0,0.0,0,4
4,59856,0.893617,haha you guys are a bunch of losers.,0.021277,0.0,0.021277,0.87234,0.0,0.0,0.0,...,2006,rejected,0,0,0,1,0,0.0,4,47
5,59859,0.666667,ur a sh*tty comment.,0.047619,0.638095,0.0,0.333333,0.0,,,...,2006,rejected,0,0,0,0,0,0.009524,0,105
6,59861,0.457627,hahahahahahahahhha suck it.,0.050847,0.305085,0.0,0.254237,0.0,,,...,2006,rejected,0,0,0,0,0,0.220339,0,59
7,59863,0.0,FFFFUUUUUUUUUUUUUUU,0.0,0.0,0.0,0.0,0.0,,,...,2006,rejected,0,0,0,0,0,0.0,0,4
8,239575,0.0,The ranchers seem motivated by mostly by greed...,0.0,0.0,0.0,0.0,0.0,,,...,26662,approved,0,0,0,0,0,0.0,0,4
9,239576,0.0,It was a great show. Not a combo I'd of expect...,0.0,0.0,0.0,0.0,0.0,,,...,26650,approved,0,0,0,1,0,0.0,0,4


In [24]:
# Step 1 (train): delete punctuation/symbols/underscores from every comment,
# then preview the result.
train["comment_text"] = train["comment_text"].apply(rem_special_sym)
train.head(20)

Unnamed: 0,id,target,comment_text,severe_toxicity,obscene,identity_attack,insult,threat,asian,atheist,...,article_id,rating,funny,wow,sad,likes,disagree,sexual_explicit,identity_annotator_count,toxicity_annotator_count
0,59848,0.0,This is so cool Its like would you want your m...,0.0,0.0,0.0,0.0,0.0,,,...,2006,rejected,0,0,0,0,0,0.0,0,4
1,59849,0.0,Thank you This would make my life a lot less a...,0.0,0.0,0.0,0.0,0.0,,,...,2006,rejected,0,0,0,0,0,0.0,0,4
2,59852,0.0,This is such an urgent design problem kudos to...,0.0,0.0,0.0,0.0,0.0,,,...,2006,rejected,0,0,0,0,0,0.0,0,4
3,59855,0.0,Is this something Ill be able to install on my...,0.0,0.0,0.0,0.0,0.0,,,...,2006,rejected,0,0,0,0,0,0.0,0,4
4,59856,0.893617,haha you guys are a bunch of losers,0.021277,0.0,0.021277,0.87234,0.0,0.0,0.0,...,2006,rejected,0,0,0,1,0,0.0,4,47
5,59859,0.666667,ur a shtty comment,0.047619,0.638095,0.0,0.333333,0.0,,,...,2006,rejected,0,0,0,0,0,0.009524,0,105
6,59861,0.457627,hahahahahahahahhha suck it,0.050847,0.305085,0.0,0.254237,0.0,,,...,2006,rejected,0,0,0,0,0,0.220339,0,59
7,59863,0.0,FFFFUUUUUUUUUUUUUUU,0.0,0.0,0.0,0.0,0.0,,,...,2006,rejected,0,0,0,0,0,0.0,0,4
8,239575,0.0,The ranchers seem motivated by mostly by greed...,0.0,0.0,0.0,0.0,0.0,,,...,26662,approved,0,0,0,0,0,0.0,0,4
9,239576,0.0,It was a great show Not a combo Id of expected...,0.0,0.0,0.0,0.0,0.0,,,...,26650,approved,0,0,0,1,0,0.0,0,4


In [25]:
# Steps 2-3 (train): expand apostrophe-less contractions, run the pattern
# tokenizer, and store each comment as one space-joined string of tokens.
train["comment_text"] = train["comment_text"].apply(more_clean_contractions)
train_tokens = tokenizer.process_ds(train["comment_text"])
train["comment_text"] = train_tokens.str.join(sep=" ")

In [26]:
# Same three steps for the test set: strip symbols, expand apostrophe-less
# contractions, pattern-tokenize and re-join the tokens with single spaces.
test["comment_text"] = (
    test["comment_text"]
    .apply(rem_special_sym)
    .apply(more_clean_contractions)
)
test_tokens = tokenizer.process_ds(test["comment_text"])
test["comment_text"] = test_tokens.str.join(sep=" ")

In [27]:
def cleaning(text):
    """Lower-case ``str(text)`` and collapse each run of non-word characters to one space.

    Any input is accepted because it is converted with ``str()`` first
    (``None`` therefore becomes ``"none"``).
    """
    lowered = str(text).lower()
    return re.sub(r'\W+', ' ', lowered)

# Extra normalised copies of the comment columns.
# NOTE(review): these Series are not written back to train/test in the code
# shown here, so the CSV export below does not include them - confirm intent.
train_text = train["comment_text"].apply(cleaning)
test_text = test["comment_text"].apply(cleaning)

In [28]:
# Persist the preprocessed frames (no index column, UTF-8 encoded).
for frame, out_path in ((train, "data/train_preprocessed.csv"),
                        (test, "data/test_preprocessed.csv")):
    frame.to_csv(out_path, index=False, encoding="utf8")

In [132]:
train.head(100)

Unnamed: 0,id,target,comment_text,severe_toxicity,obscene,identity_attack,insult,threat,asian,atheist,...,article_id,rating,funny,wow,sad,likes,disagree,sexual_explicit,identity_annotator_count,toxicity_annotator_count
0,59848,0.000000,this is so cool its like would you want your m...,0.000000,0.000000,0.000000,0.000000,0.0,,,...,2006,rejected,0,0,0,0,0,0.000000,0,4
1,59849,0.000000,thank you this would make my life a lot less a...,0.000000,0.000000,0.000000,0.000000,0.0,,,...,2006,rejected,0,0,0,0,0,0.000000,0,4
2,59852,0.000000,this is such an urgent design problem kudos to...,0.000000,0.000000,0.000000,0.000000,0.0,,,...,2006,rejected,0,0,0,0,0,0.000000,0,4
3,59855,0.000000,is this something i will be able to install on...,0.000000,0.000000,0.000000,0.000000,0.0,,,...,2006,rejected,0,0,0,0,0,0.000000,0,4
4,59856,0.893617,haha you guys are a bunch of losers,0.021277,0.000000,0.021277,0.872340,0.0,0.0,0.0,...,2006,rejected,0,0,0,1,0,0.000000,4,47
5,59859,0.666667,ur a shtty comment,0.047619,0.638095,0.000000,0.333333,0.0,,,...,2006,rejected,0,0,0,0,0,0.009524,0,105
6,59861,0.457627,hahahahahahahaha suck it,0.050847,0.305085,0.000000,0.254237,0.0,,,...,2006,rejected,0,0,0,0,0,0.220339,0,59
7,59863,0.000000,fu,0.000000,0.000000,0.000000,0.000000,0.0,,,...,2006,rejected,0,0,0,0,0,0.000000,0,4
8,239575,0.000000,the ranchers seem motivated by mostly by greed...,0.000000,0.000000,0.000000,0.000000,0.0,,,...,26662,approved,0,0,0,0,0,0.000000,0,4
9,239576,0.000000,it was a great show not a combo i would of exp...,0.000000,0.000000,0.000000,0.000000,0.0,,,...,26650,approved,0,0,0,1,0,0.000000,0,4
