# Toxic Comment Classification 
### Identify and classify toxic online comments with N-Gram-based text classification  
This part of the notebook is for data preparation, mainly tokenizing and stemming words, removing stop words.   

### Read in data

In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv('train.csv').fillna(' ')

In [3]:
df.head(10)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
5,00025465d4725e87,"""\n\nCongratulations from me as well, use the ...",0,0,0,0,0,0
6,0002bcb3da6cb337,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,1,1,0,1,0
7,00031b1e95af7921,Your vandalism to the Matt Shirvington article...,0,0,0,0,0,0
8,00037261f536c51d,Sorry if the word 'nonsense' was offensive to ...,0,0,0,0,0,0
9,00040093b2687caa,alignment on this subject and which are contra...,0,0,0,0,0,0


In [4]:
print(df.shape)

(159571, 8)


In [5]:
unique, counts = np.unique(df['toxic'], return_counts=True) 
print("The frequency of instances per class toxic is: " , dict(zip(unique, counts)))
unique, counts = np.unique(df['severe_toxic'], return_counts=True) 
print("The frequency of instances per class severe_toxic is: " , dict(zip(unique, counts)))
unique, counts = np.unique(df['obscene'], return_counts=True) 
print("The frequency of instances per class obscene is: " , dict(zip(unique, counts)))
unique, counts = np.unique(df['threat'], return_counts=True) 
print("The frequency of instances per class threat is: " , dict(zip(unique, counts)))
unique, counts = np.unique(df['insult'], return_counts=True) 
print("The frequency of instances per class insult is: " , dict(zip(unique, counts)))
unique, counts = np.unique(df['identity_hate'], return_counts=True) 
print("The frequency of instances per class identity_hate is: " , dict(zip(unique, counts)))

The frequency of instances per class toxic is:  {0: 144277, 1: 15294}
The frequency of instances per class severe_toxic is:  {0: 157976, 1: 1595}
The frequency of instances per class obscene is:  {0: 151122, 1: 8449}
The frequency of instances per class threat is:  {0: 159093, 1: 478}
The frequency of instances per class insult is:  {0: 151694, 1: 7877}
The frequency of instances per class identity_hate is:  {0: 158166, 1: 1405}


### Clean data

In [6]:
df = df.replace('\n','',regex=True)

In [7]:
df = df.replace(',','',regex=True)
df.head(10)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,ExplanationWhy the edits made under my usernam...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,Hey man I'm really not trying to edit war. It'...,0,0,0,0,0,0
3,0001b41b1c6bb37e,"""MoreI can't make any real suggestions on impr...",0,0,0,0,0,0
4,0001d958c54c6e35,You sir are my hero. Any chance you remember w...,0,0,0,0,0,0
5,00025465d4725e87,"""Congratulations from me as well use the tools...",0,0,0,0,0,0
6,0002bcb3da6cb337,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,1,1,0,1,0
7,00031b1e95af7921,Your vandalism to the Matt Shirvington article...,0,0,0,0,0,0
8,00037261f536c51d,Sorry if the word 'nonsense' was offensive to ...,0,0,0,0,0,0
9,00040093b2687caa,alignment on this subject and which are contra...,0,0,0,0,0,0


### Tokenize data

In [8]:
import nltk
# nltk.download()
# data like stopwords need to be downloaded

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [9]:
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+')

In [10]:
df["Tokens"] = df["comment_text"].str.lower().apply(tokenizer.tokenize)

In [11]:
df.head(10)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,Tokens
0,0000997932d777bf,ExplanationWhy the edits made under my usernam...,0,0,0,0,0,0,"[explanationwhy, the, edits, made, under, my, ..."
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,"[d, aww, he, matches, this, background, colour..."
2,000113f07ec002fd,Hey man I'm really not trying to edit war. It'...,0,0,0,0,0,0,"[hey, man, i, m, really, not, trying, to, edit..."
3,0001b41b1c6bb37e,"""MoreI can't make any real suggestions on impr...",0,0,0,0,0,0,"[morei, can, t, make, any, real, suggestions, ..."
4,0001d958c54c6e35,You sir are my hero. Any chance you remember w...,0,0,0,0,0,0,"[you, sir, are, my, hero, any, chance, you, re..."
5,00025465d4725e87,"""Congratulations from me as well use the tools...",0,0,0,0,0,0,"[congratulations, from, me, as, well, use, the..."
6,0002bcb3da6cb337,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,1,1,0,1,0,"[cocksucker, before, you, piss, around, on, my..."
7,00031b1e95af7921,Your vandalism to the Matt Shirvington article...,0,0,0,0,0,0,"[your, vandalism, to, the, matt, shirvington, ..."
8,00037261f536c51d,Sorry if the word 'nonsense' was offensive to ...,0,0,0,0,0,0,"[sorry, if, the, word, nonsense, was, offensiv..."
9,00040093b2687caa,alignment on this subject and which are contra...,0,0,0,0,0,0,"[alignment, on, this, subject, and, which, are..."


### Get rid of stop words

In [12]:
from nltk.corpus import stopwords
stopset = set(stopwords.words("english"))
df["Tokens"] = df["Tokens"].apply(lambda x: [item for item in x if item not in stopset])
# https://stackoverflow.com/questions/29523254/python-remove-stop-words-from-pandas-dataframe

In [13]:
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,Tokens
0,0000997932d777bf,ExplanationWhy the edits made under my usernam...,0,0,0,0,0,0,"[explanationwhy, edits, made, username, hardco..."
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,"[aww, matches, background, colour, seemingly, ..."
2,000113f07ec002fd,Hey man I'm really not trying to edit war. It'...,0,0,0,0,0,0,"[hey, man, really, trying, edit, war, guy, con..."
3,0001b41b1c6bb37e,"""MoreI can't make any real suggestions on impr...",0,0,0,0,0,0,"[morei, make, real, suggestions, improvement, ..."
4,0001d958c54c6e35,You sir are my hero. Any chance you remember w...,0,0,0,0,0,0,"[sir, hero, chance, remember, page]"


### Stem and lemmatize Tokens

In [14]:
from nltk.stem import SnowballStemmer
stemmer = SnowballStemmer("english")
df['Stemmed'] = df["Tokens"].apply(lambda x: [stemmer.stem(item) for item in x])

In [15]:
df = df.drop(['comment_text', 'Tokens'], axis=1)

In [16]:
df.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate,Stemmed
0,0000997932d777bf,0,0,0,0,0,0,"[explanationwhi, edit, made, usernam, hardcor,..."
1,000103f0d9cfb60f,0,0,0,0,0,0,"[aww, match, background, colour, seem, stuck, ..."
2,000113f07ec002fd,0,0,0,0,0,0,"[hey, man, realli, tri, edit, war, guy, consta..."
3,0001b41b1c6bb37e,0,0,0,0,0,0,"[morei, make, real, suggest, improv, wonder, s..."
4,0001d958c54c6e35,0,0,0,0,0,0,"[sir, hero, chanc, rememb, page]"


In [17]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
df['Stemmed'] = df["Stemmed"].apply(lambda x: [lemmatizer.lemmatize(item) for item in x])

In [18]:
df.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate,Stemmed
0,0000997932d777bf,0,0,0,0,0,0,"[explanationwhi, edit, made, usernam, hardcor,..."
1,000103f0d9cfb60f,0,0,0,0,0,0,"[aww, match, background, colour, seem, stuck, ..."
2,000113f07ec002fd,0,0,0,0,0,0,"[hey, man, realli, tri, edit, war, guy, consta..."
3,0001b41b1c6bb37e,0,0,0,0,0,0,"[morei, make, real, suggest, improv, wonder, s..."
4,0001d958c54c6e35,0,0,0,0,0,0,"[sir, hero, chanc, rememb, page]"


In [19]:
# df.to_csv("df.csv")

From here we nevigate to **Spark** and finish the modeling with SparkML

## Read the data, and re-tokenize them

In [None]:
pyspark
# in AWS pyspark-------------------------------------------------------------
df_path = "~/df.csv"
df = spark.read.load(df_path, format="com.databricks.spark.csv", 
	header = True, inferschema = True, multiLine = True)
# df.printSchema()

## Vectorize----------------------------------------------------------------
from pyspark.sql.functions import col, lower, regexp_replace, split

def clean_text(c):
  c = lower(c)
  c = regexp_replace(c, "^rt ", "")
  c = regexp_replace(c, "(https?\://)\S+", "")
  c = regexp_replace(c, "[^a-zA-Z0-9\\s]", "")
  return c

clean_text_df = df.select("id",clean_text(col("Stemmed")).alias("text"),
    "toxic","severe_toxic","obscene","threat","insult","identity_hate")

# Tokenize
from pyspark.ml.feature import Tokenizer
tokenizer = Tokenizer(inputCol="text", outputCol="vector")
vector_df = tokenizer.transform(clean_text_df)


## Tf-Idf with N-gram

In [None]:
## N-gram & TF----------------------------------------------------------------
from pyspark.ml.feature import NGram, VectorAssembler, HashingTF # CountVectorizer requires more ram
from pyspark.ml import Pipeline

# use this function to assemble 1-gram, 2-gram, and 3-gram together
## https://stackoverflow.com/questions/38839924/
def build_ngrams(inputCol="vector", n=3):
    ngrams = [
        NGram(n=i, inputCol="vector", outputCol="{0}_grams".format(i))
        for i in range(1, n + 1)
    ]
    vectorizers = [
        HashingTF(inputCol="{0}_grams".format(i),
            outputCol="{0}_counts".format(i), numFeatures=20000) #10000,30000,20000
        for i in range(1, n + 1)
    ]
    assembler = [VectorAssembler(
        inputCols=["{0}_counts".format(i) for i in range(1, n + 1)],
        outputCol="features"
    )]
    return Pipeline(stages=ngrams + vectorizers + assembler)

ngram_df = build_ngrams().fit(vector_df).transform(vector_df) 

# Train test Split
combined = ngram_df.select("features","toxic",
    "severe_toxic","obscene","threat","insult","identity_hate")
train, test = combined.randomSplit([0.8,0.2])
# train.cache()  # 127812
test.cache() # 31759

# classifier "toxic"----------------------------------------------------------------
## Sampling-----------------------------------------------------------
train.select("toxic").groupBy("toxic").count().show()
# sample
Train_toxic = train.sampleBy("toxic", fractions={0: 0.2, 1: 1.0})
Train_toxic.select("toxic").groupBy("toxic").count().show()

# IDF----------------------------------------------------------------
from pyspark.ml.feature import IDF
# Get rid of words appear only once in the whole data set
idfIgnore = IDF(minDocFreq=2, inputCol ='features',outputCol = 'idf_features').fit(train)

# transform train and test here in order to save time in validation
idf_train = idfIgnore.transform(Train_toxic).cache()
idf_test = idfIgnore.transform(test).cache()

## Try and pick models

In [None]:
# Build a evaluator to use in model-picking-----------------------------
from pyspark.ml.evaluation import BinaryClassificationEvaluator
Evaluator = BinaryClassificationEvaluator(labelCol = "toxic")

# NaiveBayes------------------------------------------------------------
from pyspark.ml.classification import NaiveBayes
nb = NaiveBayes(labelCol="toxic",featuresCol ="idf_features", modelType="multinomial")

# use train-validation split to select parameters, cv is too time-consuming
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit
paramGrid_nb = ParamGridBuilder().addGrid(nb.smoothing, [0.0, 0.2, 0.4, 0.6, 0.8, 1.0]).build()
tvs_nb= TrainValidationSplit(estimator=nb, estimatorParamMaps=paramGrid_nb, evaluator=Evaluator, trainRatio=0.7)
tvsModel_nb = tvs_nb.fit(idf_train)
# use test data to see performance
tvsPredictions_nb = tvsModel_nb.transform(idf_test)
Evaluator.evaluate(tvsPredictions_nb) 
# see predictions
tvsPredictions_nb.show(5)

# DecisionTree----------------------------------------------------------
from pyspark.ml.classification import DecisionTreeClassifier
dt = DecisionTreeClassifier(labelCol="toxic",featuresCol ="idf_features")
# use train-validation split to select parameters
paramGrid_dt = ParamGridBuilder().addGrid(dt.maxDepth, [5,10,15,20,25,30]).addGrid(dt.impurity, ['entropy', 'gini']).build()
tvs_dt = TrainValidationSplit(estimator=dt, estimatorParamMaps=paramGrid_dt, evaluator=Evaluator, trainRatio=0.7)
tvsModel_dt = tvs_dt.fit(idf_train)
# use test data to see performance
tvsPredictions_dt = tvsModel_dt.transform(idf_test) 
Evaluator.evaluate(tvsPredictions_nb)
# see model detial
tvsModel_dt.bestModel

# LogisticRegression---------------------------------------------
from pyspark.ml.classification import LogisticRegression
lr = LogisticRegression(labelCol="toxic",featuresCol ="idf_features")
paramGrid_lr = (ParamGridBuilder()
    .addGrid(lr.regParam, [0.000001, 0.00001,0.0001,0.001,0.01, 0.5])
    .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])
    .build())
# use train-validation split to select parameters
tvs_lr = TrainValidationSplit(estimator=lr, estimatorParamMaps=paramGrid_lr, evaluator=Evaluator, trainRatio=0.7)
tvsModel_lr = tvs_lr.fit(idf_train)
# use test data to see performance
tvsPredictions_lr = tvsModel_lr.transform(idf_test)
Evaluator.evaluate(tvsPredictions_lr) 
# see model detial
tvsModel_lr.bestModel._java_obj.getRegParam() 
tvsModel_lr.bestModel._java_obj.getElasticNetParam() 

# RandomForest---------------------------------------------------
from pyspark.ml.classification import RandomForestClassifier
rf = RandomForestClassifier(labelCol="toxic",featuresCol ="idf_features", 
    maxBins=32, minInstancesPerNode=20, impurity ='gini', featureSubsetStrategy="auto", subsamplingRate=0.7)
paramGrid_rf = (ParamGridBuilder()
    .addGrid(rf.impurity, ['entropy', 'gini'])
    .addGrid(rf.numTrees, [10, 20, 30])
    .build())
# use train-validation split to select parameters
tvs_rf = TrainValidationSplit(estimator=rf, estimatorParamMaps=paramGrid_rf, evaluator=Evaluator, trainRatio=0.7)
tvsModel_rf = tvs_rf.fit(idf_train)
# use test data to see performance
tvsPredictions_rf = tvsModel_rf.transform(idf_test)
# see model detial
Evaluator.evaluate(tvsPredictions_rf) # 
tvsModel_rf.bestModel
tvsModel_rf.bestModel._java_obj.getImpurity() 
