In [1]:
import pandas as pd

from collections import Counter
import re
 
import squarify
import matplotlib.pyplot as plt
import seaborn as sns

import spacy
from spacy.tokenizer import Tokenizer

import en_core_web_lg
nlp = en_core_web_lg.load()

In [2]:
# Standalone spaCy tokenizer that shares the loaded model's vocabulary.
# NOTE(review): nothing in the visible cells uses `tokenizer`; get_lemmas
# calls the full `nlp` pipeline instead -- confirm whether this is still needed.
tokenizer = Tokenizer(nlp.vocab)

In [3]:
def count(docs):
    """Build a word-frequency table for a collection of tokenized documents.

    Parameters
    ----------
    docs : sized iterable of token iterables
        One inner iterable of hashable tokens per document (for example a
        Series whose values are lists of lemmas).

    Returns
    -------
    pandas.DataFrame
        One row per distinct token, sorted by ``rank``, with the columns
        ``word``, ``appears_in`` (number of documents containing the token),
        ``count`` (total occurrences), ``rank`` (1 = most frequent, ties
        broken by first appearance), ``pct_total`` (share of all
        occurrences), ``cul_pct_total`` (running sum of ``pct_total`` in rank
        order) and ``appears_in_pct`` (``appears_in`` / number of documents).
    """
    n_docs = len(docs)

    term_freq = Counter()   # token -> occurrences across all documents
    doc_freq = Counter()    # token -> number of documents it occurs in
    for tokens in docs:
        term_freq.update(tokens)
        doc_freq.update(set(tokens))

    freq = pd.DataFrame(list(term_freq.items()), columns=['word', 'count'])
    freq['rank'] = freq['count'].rank(method='first', ascending=False)
    freq['pct_total'] = freq['count'] / freq['count'].sum()

    # The cumulative share only makes sense once rows are in rank order.
    freq = freq.sort_values(by='rank')
    freq['cul_pct_total'] = freq['pct_total'].cumsum()

    spread = pd.DataFrame(list(doc_freq.items()), columns=['word', 'appears_in'])
    table = spread.merge(freq, on='word')
    table['appears_in_pct'] = table['appears_in'] / n_docs

    return table.sort_values(by='rank')

def get_lemmas(text):
    """Return the lemmas of the content-bearing tokens in ``text``.

    The text is parsed with the module-level spaCy pipeline ``nlp``. Tokens
    flagged as stop words or punctuation, and tokens whose part-of-speech tag
    is ``PRON``, are left out; the ``lemma_`` of every other token is kept in
    document order.
    """
    return [
        token.lemma_
        for token in nlp(text)
        if not (token.is_stop or token.is_punct) and token.pos_ != 'PRON'
    ]

In [4]:
# Load the training data. Later cells read the `comment_text` column and the
# label columns named in `pars` from this frame.
df = pd.read_csv("toxic-train.csv")

In [5]:
import string

# Only characters in string.printable survive cleanup(); everything else is dropped.
printable = set(string.printable)

def cleanup(x):
    """Normalise a raw comment string before tokenisation.

    In order: literal backslash escapes (``\\n``, ``\\t``, ``\\r`` written as
    two characters), real newline/tab/carriage-return characters and the
    punctuation marks ``, . ! ?`` are each replaced by a space; the text is
    lower-cased; characters outside ``string.printable`` are removed; runs of
    spaces are collapsed and leading/trailing spaces dropped.
    """
    # Same replacement order as the substitutions have always run in.
    for separator in ("\\n", "\\t", "\\r", "\n", "\t", "\r", ",", ".", "!", "?"):
        x = x.replace(separator, " ")

    lowered = x.lower()
    kept = "".join(ch for ch in lowered if ch in printable)

    # Splitting on a single space yields empty pieces for repeated spaces;
    # discarding them collapses the runs and trims both ends.
    return " ".join(piece for piece in kept.split(" ") if piece)

In [6]:
# Normalise every comment with cleanup(). The column is overwritten in place,
# so the raw text is no longer available from `df` after this cell.
%time df['comment_text'] = df['comment_text'].apply(cleanup)

Wall time: 12.6 s


In [124]:
# Lemmatise each cleaned comment with get_lemmas(), one nlp() call per row.
# The recorded run of this cell took roughly an hour and a half.
%time df['lemmas'] = df['comment_text'].apply(get_lemmas)

Wall time: 1h 28min 2s


In [125]:
# Save the frame together with the new `lemmas` column. CSV stores each lemma
# list as text, so the column has to be parsed back into lists when reloaded.
df.to_csv("toxic-train-w-lems.csv")

In [27]:
from symspellpy import SymSpell

# Flatten every row's lemma list into one corpus, skipping empty strings.
corpus = [token for line in df['lemmas'].values for token in line if len(token) > 0]

# One lemma per line; this file is what the SymSpell dictionary is built from.
with open('toxicCorpus.txt', 'w') as filehandle:
    filehandle.writelines('%s\n' % listitem for listitem in corpus)

symspell = SymSpell()
symspell.create_dictionary(corpus="toxicCorpus.txt")

True

In [28]:
from symspellpy import Verbosity

def correctSpelling(x):
    """Return SymSpell's best suggestion for ``x``, or ``x`` if there is none.

    Looks ``x`` up in the module-level ``symspell`` dictionary.

    ``lookup`` takes a ``Verbosity`` member, not an arbitrary integer.
    ``Verbosity.TOP`` requests the single best suggestion (smallest edit
    distance, then highest frequency), which is the only entry used here.
    """
    corr = symspell.lookup(x, Verbosity.TOP)
    if corr:
        return corr[0].term
    return x

# Replace every lemma with its spelling-corrected form, row by row.
df['lemmas'] = df['lemmas'].apply(
    lambda line: [correctSpelling(lemma) for lemma in line]
)

In [29]:
def getTop(df, par, n=100, min_occurances=10):
    """Rank words by how much larger their share is in rows where ``par == 1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must hold a ``lemmas`` column of token lists and the label column ``par``.
    par : str
        Name of the label column; rows are split on the values 1 and 0.
    n : int
        Maximum number of rows to return.
    min_occurances : int
        Words occurring fewer times than this within a split are dropped from
        that split (after ``count`` has already computed the shares).

    Returns
    -------
    pandas.DataFrame
        Columns ``word`` and ``more_true`` (``pct_total`` in the ``par == 1``
        rows minus ``pct_total`` in the ``par == 0`` rows), sorted by
        ``more_true`` descending. Only words that survive the filter in both
        splits appear, because the two tables are inner-joined.
    """
    def frequent_words(label):
        # Word table for the rows carrying this label value, minus rare words.
        table = count(df[df[par] == label]['lemmas'])
        return table[table['count'] >= min_occurances]

    merged = pd.merge(
        frequent_words(1),
        frequent_words(0),
        how='inner',
        on='word',
        suffixes=('_true', '_false'),
    )
    merged['more_true'] = merged['pct_total_true'] - merged['pct_total_false']

    ranked = merged.sort_values(by='more_true', ascending=False)
    return ranked[['word', 'more_true']].head(n)

In [30]:
# Label columns; each one is analysed separately.
pars = [
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate",
]

In [31]:
# For every label, write its top words (one per line, passed through
# cleanup()) to '<label>WordFile.txt'.
for p in pars:
    tp = getTop(df, p)['word'].values

    with open(p+'WordFile.txt', 'w') as filehandle:
        filehandle.writelines('%s\n' % cleanup(listitem) for listitem in tp)

In [8]:
import ast

# Reload the saved lemma lists from the CSV written earlier in the notebook.
df2 = pd.read_csv("toxic-train-w-lems.csv")

# Each list comes back from the CSV as its text form. ast.literal_eval parses
# that text into a list but, unlike eval, accepts only plain Python literals,
# so a tampered file cannot execute code.
df['lemmas'] = df2['lemmas'].apply(ast.literal_eval)