## Toxic Comment Classification
#### Classifying online comments into types of toxicity

Import packages + train and test sets

In [17]:
import nltk, string, re
from nltk.corpus import stopwords
import numpy as np
import pandas as pd
import bs4 as bs
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tag import pos_tag
from nltk.corpus import wordnet

In [18]:
train = pd.read_csv('./train.csv').sample(1000)
test = pd.read_csv('./test.csv').sample(1000)
train.name = 'train'
test.name = 'test'

### Pre-processing
Remove unwanted characters:
\n characters, all punctuation, double spaces, leading/trailing spaces, html tags (via BeautifulSoup). 
Then split on spaces, i.e. tokenize.

In [19]:
translator_punct = str.maketrans('', '', string.punctuation)
for df in [train,test]:
    df['comment_text'] = df['comment_text'].str.replace(r'\n',r' ') # remove newline characters
    df['comment_text'] = [re.sub('[^a-zA-Z]',' ',x) for x in df['comment_text']] # remove all punctuation, keep letters
    df['comment_text'] = df['comment_text'].str.replace(r' +',r' ') # remove double spaces
    df['comment_text'] = df['comment_text'].str.strip() # remove leading and trailing spaces
    df['comment_text'] = [bs.BeautifulSoup(x,features='lxml').text for x in df['comment_text']] # remove html tags 
    df['comment_text'] = df['comment_text'].str.split() # split by spaces, ie tokenize

Delete stopwords using NLTK (Natural Language Toolkit) - this is separated due to runtime

In [20]:
for df in [train,test]:
    for index,row in df.iterrows():
        df.set_value(index,'comment_text',[w for w in [x.lower() for x in row['comment_text']] if w not in stopwords.words('english')])
        #if (index+1)%500 == 0:
            #print('Done with %d reviews' %(index+1),df.name)

  This is separate from the ipykernel package so we can avoid doing imports until


### Stemming
This will take the stem of common words and treat them the same, e.g. gives, given, and giving become give

In [7]:
train_stemmed = train.copy()
test_stemmed = test.copy()
train_stemmed.name = 'train_stemmed'
test_stemmed.name = 'test_stemmed'

In [8]:
import sys
sys.setrecursionlimit(200000)

In [9]:
ps = PorterStemmer()

for df in [train_stemmed,test_stemmed]:
    for index,row in df.iterrows():
        #if( (index+1)%1000 == 0 ):
            #print("Done with %d reviews" %(index+1), df.name);
        df.set_value(index,'comment_text',[ps.stem(x) for x in row['comment_text']])

  import sys


### Lemmatizing
This will group together common words based on their meaning

In [10]:
train_lemmatized = train.copy()
test_lemmatized = test.copy()
train_lemmatized.name = 'train_lemmatized'
test_lemmatized.name = 'test_lemmatized'

In [11]:
# returns part of speech for a word
def get_wordnet_pos(treebank_tag):

    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return 'n'

In [12]:
wnl = WordNetLemmatizer()

for df in [train_lemmatized,test_lemmatized]:
    for index,row in df.iterrows():
        #if((index+1)%1000 == 0):
            #print("Done with %d reviews" %(index+1),df.name);
        wnl_stems = []
        for pair in pos_tag(row['comment_text']):
            wnl_stems.append(wnl.lemmatize(pair[0],pos=get_wordnet_pos(pair[1])))
        df.set_value(index,'comment_text',' '.join(wnl_stems))
    df['comment_text'] = df['comment_text'].str.split()

  # Remove the CWD from sys.path while we load stuff.


### Bag of Words
Via sklearn's countvectorizer. This counts the number of times each word appears and uses that as the feature set with the frequency per comment being the row values.

In [21]:
from sklearn.feature_extraction.text import CountVectorizer

def bag_of_words(text_list):
    vect = CountVectorizer(analyzer='word')
    comments = [' '.join(x) for x in text_list]
    vect.fit(comments)
    bag = vect.transform(comments)
    return pd.DataFrame(bag.toarray(), columns=vect.get_feature_names(), index=text_list.index)

In [22]:
train_bow = train.copy()
bag_of_words(train_bow['comment_text'])

Unnamed: 0,ab,abacha,abandonment,abbasid,abbassid,abbey,abercrombie,ability,able,abolishing,...,zimbeck,zimbeckchess,zimeckchess,zob,zoe,zordanlighter,zorro,zuck,zuckerberg,zuckerbergs
155855,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
25683,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
129009,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
55648,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
41707,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
79201,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
54837,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
143806,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
111329,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
107208,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Bi-gram
Using the bigram approach. For example: 'The quick brown fox' becomes 'The quick','quick brown','brown fox'

In [14]:
from nltk.util import ngrams
train_bigram = train.copy()
train_bigram['comment_text'] = [list(ngrams(x,2)) for x in train['comment_text']]

In [16]:
train.head(10)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
39235,68b26335a615c0cb,"[(thanks, experimenting), (experimenting, wiki...",0,0,0,0,0,0
113666,5fd20306a7e18a43,"[(weaseling, pov), (pov, pushing), (pushing, n...",0,0,0,0,0,0
46227,7b7c111983ab5731,"[(open, letter), (letter, tibet), (tibet, open...",0,0,0,0,0,0
158095,e8a92c1dd34d70ca,"[(sorry, bother), (bother, third), (third, tim...",0,0,0,0,0,0
81613,da4a0309ba33e165,"[(cluttering, article), (article, card), (card...",0,0,0,0,0,0
30314,50800b2b7c72302b,"[(people, please), (please, quit), (quit, addi...",0,0,0,0,0,0
127645,aabb67f3349f80ca,"[(february, please), (please, stop), (stop, co...",0,0,0,0,0,0
8567,16bbf205daf2ecfb,"[(sigh, know), (know, people), (people, want),...",0,0,0,0,0,0
78161,d1381e3a39040c4b,"[(suddenly, multiculturalist), (multiculturali...",0,0,0,0,0,0
120133,8279457e74a5dc50,"[(remove, edit), (edit, user), (user, page), (...",0,0,0,0,0,0


## Prediction

### Random Forest
Run the bag of words model for the original, stemmed, and lemmatized datasets using a random forest to classify toxicity

In [12]:
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [13]:
for df in [train,train_stemmed,train_lemmatized]:
    X_train, X_test, Y_train, Y_test = train_test_split(bag_of_words(df['comment_text']).as_matrix(), df.drop(['id','comment_text'],axis=1), random_state=0, test_size=.2)
    rf = RandomForestClassifier()
    rf.fit(X_train,Y_train)
    print('+'*20)
    print('Model:', df.name)
    print('Train accuracy:', rf.score(X_train,Y_train))
    print('Test accuracy:', rf.score(X_test,Y_test))

++++++++++++++++++++
Model: train
Train accuracy: 0.97875
Test accuracy: 0.9
++++++++++++++++++++
Model: train_stemmed
Train accuracy: 0.97875
Test accuracy: 0.9
++++++++++++++++++++
Model: train_lemmatized
Train accuracy: 0.97875
Test accuracy: 0.9


In [35]:
import ast
# get prediction for original values
train_og = pd.read_csv('./Train_Original.csv',index_col=0)#.sample(1000)
test_og = pd.read_csv('./Test_Original.csv',index_col=0)#.sample(1000)

In [36]:
train_og['comment_text'] = [ast.literal_eval(x) for x in train_og['comment_text']]
test_og['comment_text'] = [ast.literal_eval(x) for x in test_og['comment_text']]

In [None]:
X_train, X_test, Y_train, Y_test = train_test_split(bag_of_words(train_og['comment_text']).as_matrix(), train_og.drop(['id','comment_text'],axis=1), random_state=0, test_size=.2)
rf = RandomForestClassifier()

In [9]:
rf.fit(X_train,Y_train)
print('+'*20)
print('Model:', df.name)
print('Train accuracy:', rf.score(X_train,Y_train))
print('Test accuracy:', rf.score(X_test,Y_test))

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,"['explanation', 'edits', 'made', 'username', '...",0,0,0,0,0,0
1,000103f0d9cfb60f,"['aww', 'matches', 'background', 'colour', 'se...",0,0,0,0,0,0
2,000113f07ec002fd,"['hey', 'man', 'really', 'trying', 'edit', 'wa...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"['make', 'real', 'suggestions', 'improvement',...",0,0,0,0,0,0
4,0001d958c54c6e35,"['sir', 'hero', 'chance', 'remember', 'page']",0,0,0,0,0,0
5,00025465d4725e87,"['congratulations', 'well', 'use', 'tools', 'w...",0,0,0,0,0,0
6,0002bcb3da6cb337,"['cocksucker', 'piss', 'around', 'work']",1,1,1,0,1,0
7,00031b1e95af7921,"['vandalism', 'matt', 'shirvington', 'article'...",0,0,0,0,0,0
8,00037261f536c51d,"['sorry', 'word', 'nonsense', 'offensive', 'an...",0,0,0,0,0,0
9,00040093b2687caa,"['alignment', 'subject', 'contrary', 'dulithgow']",0,0,0,0,0,0


In [21]:
train_og['comment_text'][0]

"['explanation', 'edits', 'made', 'username', 'hardcore', 'metallica', 'fan', 'reverted', 'vandalisms', 'closure', 'gas', 'voted', 'new', 'york', 'dolls', 'fac', 'please', 'remove', 'template', 'talk', 'page', 'since', 'retired']"

In [26]:
# send original to csv
train.to_csv('Train_Original.csv')
test.to_csv('Test_Original.csv')

In [1]:
# send stemmed to csv
train_stemmed.to_csv('Train_Stemmed.csv')
test_stemmed.to_csv('Test_Stemmed.csv')

NameError: name 'train_stemmed' is not defined

In [None]:
# send lemmatized to csv
train_lemmatized.to_csv('Train_Lemmatized.csv')
test_lemmatized.to_csv('Test_Lemmatized.csv')