In [2]:
# output all code in a chunk
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [4]:
# importing required libraries and functions

# data exploration
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# visualizations
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec 
import seaborn as sns
import string

# text mining
import re # regular expression
from nltk import word_tokenize, PorterStemmer # natural language toolkit
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
## from sklearn.decomposition import TruncatedSVD

# modeling building
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn import metrics

# download nltk packages
# nltk.download()

# working directory
import os


In [5]:
# reading data
train = pd.read_csv("train.csv", nrows = 10000)
test = pd.read_csv("test.csv", nrows = 10000)

In [6]:
# verifying data
train.comment_text.head()
test.comment_text.head()
len(train)
len(test)

0    Explanation\nWhy the edits made under my usern...
1    D'aww! He matches this background colour I'm s...
2    Hey man, I'm really not trying to edit war. It...
3    "\nMore\nI can't make any real suggestions on ...
4    You, sir, are my hero. Any chance you remember...
Name: comment_text, dtype: object

0    Yo bitch Ja Rule is more succesful then you'll...
1    == From RfC == \n\n The title is fine as it is...
2    " \n\n == Sources == \n\n * Zawe Ashton on Lap...
3    :If you have a look back at the source, the in...
4            I don't anonymously edit articles at all.
Name: comment_text, dtype: object

10000

10000

In [7]:
# creating train-validation split
X_train, X_val, y_train, y_val = train_test_split(train.comment_text, train.iloc[:,2:8], test_size=0.3, random_state=19)
X_test = test.comment_text

In [8]:
# creating function to normalize text
def normalize(text):
    # recognizing new line characters and tab spaces and substituting it with space
    norm_text = re.sub(r'\n|\t', ' ', text)
    # recognizing time values
    norm_text = re.sub(r'[0-9]{1,2}:[0-9][0-9]', 'time_value', norm_text) # example 5:13pm and 05:13pm
    # recognizing date values
    norm_text = re.sub(r'\d{1,4}[-/]\d{1,2}[-/]\d{1,4}', 'date_value', norm_text) # example 2018-03/05 and 04/03-2018
    norm_text = re.sub(r'[0-9]{1,4}[ ,][A-Za-z]{3,10}[ ,][0-9]{1,4}', 'date_value', norm_text) # example 9 june 2009 and 9 June 2009
    # substitute characters not required by nothing, removing unrequired characters
    norm_text = re.sub(r'[^A-Za-z_ ]', ' ', norm_text)
    # removing multiple space values
    norm_text = re.sub(r' +', ' ', norm_text)
    # removing trailing spaces from front and back and converting all text to lowercase
    norm_text = norm_text.strip().lower()
    return norm_text

In [9]:
# creating stemmer object of PorterStemmer function
stemmer = PorterStemmer()

# writing stem_tokens function to perform stemming on tokens
def stem_tokens(tokens, stemmer): # tokens example: ['today', 'is', 'a', 'good', 'day']
    stemmed = [stemmer.stem(word) for word in tokens]
    return stemmed

In [10]:
# processing text as follows
# tokenize words in each comment
# remove stopwords or words upto lenght of 3 characters
# stem words using the stem_tokens function we created above
def text_process(text): # text is a single sentence; for example: 'today is a good day'
    temp_tokens = word_tokenize(text)

    # using alternative to removing stopwords of english
    ## tokens = [word for word in temp_tokens if len(word) > 3]
    
    # removing english stopwords, code was commented to save computation time
    nostop_tokens = [word for word in temp_tokens if word not in stopwords.words('english')]
    
    stems = stem_tokens(nostop_tokens, stemmer)
    return ' '.join(stems)

In [11]:
# lenght of stopword of english
len(stopwords.words('english'))
stopwords.words('english')[:10]

179

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [12]:
# preparing training text to pass in count vectorizer
corpus = []
for text in X_train:
    text = normalize(text)
    text = text_process(text)
    corpus.append(text)

In [13]:
# build Count Vectorizer, to convert a collection of text documents to a matrix of token counts
count_vect = CountVectorizer(ngram_range=(1,2))
X_train_counts = count_vect.fit_transform(corpus)

In [14]:
# build TFIDF Transformer, to transform a count matrix to a normalized tf or tf-idf representation
# tfidf - term frequency inverse document frequency
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

In [15]:
# verifing data
# print(X_train_counts.toarray())

In [16]:
# verifing data
# print(X_train_tfidf.toarray())

In [17]:
# checking how much text is transformed
temp = pd.DataFrame({'Before': X_train, 'After': corpus})
print(temp.sample(10))

                                                  After  \
8098  edit anthoni hungerford dear phillip comment r...   
3382  welcom wikipedia pleas stop insert unsourc lib...   
5286                ye red link meet mo dabrl see chang   
3693  hello submiss utrecht te deum jubil know nomin...   
9616  song adapt worth song iron maiden movi heart d...   
1818  lol lol serious bryanfrompalatin ip resolv col...   
3687  thank welcom thank tom warm welcom see path cr...   
3791  thank moonriddengirl wikidea sure excel lawyer...   
6176  also tag page vkurka cur prev time_valu date_v...   
925   wikipedia full fool take money make peopl work...   

                                                 Before  
8098  "\n\n Editing of Anthony Hungerford \n\nDear P...  
3382  Welcome to Wikipedia. Please stop inserting un...  
5286  Yes, it was a red link which didn't meet MOS:D...  
3693  Hello! Your submission of Utrecht Te Deum and ...  
9616  "\n\nsong adaptation\nfor what it worthes ther...  
18

In [18]:
# preparing validation text to pass in count vectorizer
X_val_set = []
for text in X_val:
    text = normalize(text)
    text = text_process(text)
    X_val_set.append(text)

# tranforming validation data using count vectorizer followed by tfidf transformer
X_val_counts = count_vect.transform(X_val_set)
X_val_tfidf = tfidf_transformer.transform(X_val_counts)

In [19]:
# preparing test text to pass in count vectorizer
X_test_set = []
for text in X_test:
    text = normalize(text)
    text = text_process(text)
    X_test_set.append(text)

# tranforming validation data using count vectorizer followed by tfidf transformer
X_test_counts = count_vect.transform(X_test_set)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

In [20]:
# Singular Value Decomposition
# Commented as using SVD decreased the estimation score
# build Truncated SVD to reduce the dimensionality

## svd=TruncatedSVD(n_components=100)
## X_train_svd = svd.fit_transform(X_train_tfidf)
## X_val_svd = svd.transform(X_val_tfidf)
## X_test_svd = svd.transform(X_test_tfidf)

In [21]:
# creating dictionary to store prediction results
result_test = dict()
result_val = dict()

In [22]:
# Multinomial Naive Bayes Model
MNB_classifier = OneVsRestClassifier(MultinomialNB())
grid_values = {'estimator__alpha': [0.001, 0.01, 0.1, 1.0, 10, 100]}
MNB_model = GridSearchCV(MNB_classifier, param_grid = grid_values, scoring = 'roc_auc')
MNB_model.fit(X_train_tfidf, y_train)
print('Accurary of Multinomial Naive Bayes Classifier on Training Data: {:.3f}' .format(MNB_model.score(X_train_tfidf, y_train)))
print('Accurary of Multinomial Naive Bayes Classifier on Validation Data: {:.3f}' .format(MNB_model.score(X_val_tfidf, y_val)))
print('Grid best parameter (max. accuracy): ', MNB_model.best_params_)
print('Grid best score (accuracy): ', MNB_model.best_score_)
result_test['Multinomial_NB'] = MNB_model.predict(X_test_tfidf)
result_val['Multinomial_NB'] = MNB_model.predict(X_val_tfidf)

GridSearchCV(cv=None, error_score='raise',
       estimator=OneVsRestClassifier(estimator=MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True),
          n_jobs=1),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'estimator__alpha': [0.001, 0.01, 0.1, 1.0, 10, 100]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=0)

Accurary of Multinomial Naive Bayes Classifier on Training Data: 1.000
Accurary of Multinomial Naive Bayes Classifier on Validation Data: 0.825
Grid best parameter (max. accuracy):  {'estimator__alpha': 0.1}
Grid best score (accuracy):  0.848985648234


In [23]:
# Multinomial_NB result summary
result_test['Multinomial_NB'].sum(axis=0)

array([376,   0, 133,   0,  53,   0])

In [24]:
# Bernoulli Naive Bayes Model
BNB_classifier = OneVsRestClassifier(BernoulliNB())
grid_values = {'estimator__alpha': [0.001, 0.01, 0.1, 1.0, 10, 100]}
BNB_model = GridSearchCV(BNB_classifier, param_grid = grid_values, scoring = 'roc_auc')
BNB_model.fit(X_train_tfidf, y_train)
print('Accurary of Bernoulli Naive Bayes Classifier on Training Data: {:.3f}' .format(BNB_model.score(X_train_tfidf, y_train)))
print('Accurary of Bernoulli Naive Bayes Classifier on Validation Data: {:.3f}' .format(BNB_model.score(X_val_tfidf, y_val)))
print('Grid best parameter (max. accuracy): ', BNB_model.best_params_)
print('Grid best score (accuracy): ', BNB_model.best_score_)
result_test['Bernoulli_NB'] = BNB_model.predict(X_test_tfidf)
result_val['Bernoulli_NB'] = BNB_model.predict(X_val_tfidf)

GridSearchCV(cv=None, error_score='raise',
       estimator=OneVsRestClassifier(estimator=BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True),
          n_jobs=1),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'estimator__alpha': [0.001, 0.01, 0.1, 1.0, 10, 100]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=0)

Accurary of Bernoulli Naive Bayes Classifier on Training Data: 1.000
Accurary of Bernoulli Naive Bayes Classifier on Validation Data: 0.814
Grid best parameter (max. accuracy):  {'estimator__alpha': 0.001}
Grid best score (accuracy):  0.815762586966


In [25]:
# Bernoulli_NB result summary
result_test['Bernoulli_NB'].sum(axis=0)

array([3770, 2457, 3265, 2089, 3453, 2447])

In [26]:
# Logistic Regression Model
log_model = OneVsRestClassifier(LogisticRegression())
#log_model.get_params().keys()
grid_values = {'estimator__C': [0.3, 1.0, 30.0]}
log_grid = GridSearchCV(log_model, param_grid = grid_values, scoring = 'roc_auc')
log_grid.fit(X_train_tfidf, y_train)
print('Accurary of Logistic Regression Classifier on Training Data: {:.3f}' .format(log_grid.score(X_train_tfidf, y_train)))
print('Accurary of Logistic Regression Classifier on Validation Data: {:.3f}' .format(log_grid.score(X_val_tfidf, y_val)))
print('Grid best parameter (max. accuracy): ', log_grid.best_params_)
print('Grid best score (accuracy): ', log_grid.best_score_)
result_test['Logistic_Regression'] = log_grid.predict(X_test_tfidf)
result_val['Logistic_Regression'] = log_grid.predict(X_val_tfidf)

GridSearchCV(cv=None, error_score='raise',
       estimator=OneVsRestClassifier(estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
          n_jobs=1),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'estimator__C': [0.3, 1.0, 30.0]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=0)

Accurary of Logistic Regression Classifier on Training Data: 1.000
Accurary of Logistic Regression Classifier on Validation Data: 0.971
Grid best parameter (max. accuracy):  {'estimator__C': 30.0}
Grid best score (accuracy):  0.953124843436


In [27]:
# Logistic_Regression result summary
result_test['Logistic_Regression'].sum(axis=0)

array([1269,   83,  712,    1,  663,   59])

In [28]:
# SVM Classifier Model
grid_values = {'estimator__C': [0.3, 1.0, 30.0]}
svm_model = OneVsRestClassifier(SVC(kernel = 'linear'))
svm_grid = GridSearchCV(svm_model, param_grid = grid_values, scoring = 'roc_auc')
svm_grid.fit(X_train_tfidf, y_train)
print('Accurary of SVM Classifier on Training Data: {:.3f}' .format(svm_grid.score(X_train_tfidf, y_train)))
print('Accurary of SVM Classifier on Validation Data: {:.3f}' .format(svm_grid.score(X_val_tfidf, y_val)))
print('Grid best parameter (max. accuracy): ', svm_grid.best_params_)
print('Grid best score (accuracy): ', svm_grid.best_score_)
result_test['SVM_Classifier'] = svm_grid.predict(X_test_tfidf)
result_val['SVM_Classifier'] = svm_grid.predict(X_val_tfidf)

GridSearchCV(cv=None, error_score='raise',
       estimator=OneVsRestClassifier(estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
          n_jobs=1),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'estimator__C': [0.3, 1.0, 30.0]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=0)

Accurary of SVM Classifier on Training Data: 1.000
Accurary of SVM Classifier on Validation Data: 0.953
Grid best parameter (max. accuracy):  {'estimator__C': 0.3}
Grid best score (accuracy):  0.939056090018


In [29]:
# SVM_Classifier result summary
result_test['SVM_Classifier'].sum(axis=0)

array([592,   0, 455,   0, 304,   0])

In [30]:
# how many positive cases, i.e toxic cases we recognized for each model?
print('Number of Toxic Cases using Multinomial Naive Bayes Model: {:.2f}' .format(result_test['Multinomial_NB'].sum()))
print('Number of Toxic Cases using Bernoulli Naive Bayes Model: {:.2f}' .format(result_test['Bernoulli_NB'].sum()))
print('Number of Toxic Cases using Logistic Regression Classifier Model: {:.2f}' .format(result_test['Logistic_Regression'].sum()))
print('Number of Toxic Cases using SVM Classifier Model: {:.2f}' .format(result_test['SVM_Classifier'].sum()))

# predicted for how many comments?
print('\nTotal Number of Comments for which we made Predictions: {:.2f}' .format(len(X_test)))

# number of positive cases in training data and length of training data, includes validation data
print('\nTotal Number of Positive Cases in Training Data (Training + Validation): {:.2f}' .format(train.iloc[:,2:8].sum(axis=0).sum()))
print('Total Number of Comments in Training Data (Training + Validation): {:.2f}' .format(len(X_train)+len(X_val)))

# number of predicted positive cases in training data using SVM model
print('Number of Toxic Cases using SVM Classifier Model: {:.2f}' .format(svm_grid.predict(X_train_tfidf).sum()))

Number of Toxic Cases using Multinomial Naive Bayes Model: 562.00
Number of Toxic Cases using Bernoulli Naive Bayes Model: 17481.00
Number of Toxic Cases using Logistic Regression Classifier Model: 2787.00
Number of Toxic Cases using SVM Classifier Model: 1351.00

Total Number of Comments for which we made Predictions: 10000.00

Total Number of Positive Cases in Training Data (Training + Validation): 2210.00
Total Number of Comments in Training Data (Training + Validation): 10000.00
Number of Toxic Cases using SVM Classifier Model: 406.00


In [31]:
# storing results of SVM Classifier as our result
y_test = result_test['SVM_Classifier']
type(y_test)

numpy.ndarray

In [32]:
# combining final results with the original test data set
output = pd.DataFrame(y_test, columns = train.columns[2:8], index = test.index)
output = pd.concat([test, output], axis=1)

output.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...,1,0,0,0,0,0
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...,0,0,0,0,0,0
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap...",0,0,0,0,0,0
3,00017563c3f7919a,":If you have a look back at the source, the in...",0,0,0,0,0,0
4,00017695ad8997eb,I don't anonymously edit articles at all.,0,0,0,0,0,0


In [33]:
# verifing data
X_train_counts.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ..., 
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [34]:
# verifing data
X_train_tfidf.toarray()

MemoryError: 

In [None]:
# verifing data
output.sample(20)

In [None]:
# verifing select random case, as per index from above code chunk
output.iloc[5902,:]
output.comment_text[5902]

In [None]:
# verifing data
y_train.head()

In [None]:
# quick summary for training, validation and test set respectively
# this shows the balance in results using SVM model in comparison to training data
y_train.sum(axis=0)
y_val.sum(axis=0)
output.iloc[:,2:8].sum(axis=0)

In [None]:
# precision recall f1-score report
print(metrics.classification_report(y_val.toxic, result_val['Multinomial_NB'][:,1], target_names = ["positive", "negative"]))
print(metrics.classification_report(y_val.toxic, result_val['Bernoulli_NB'][:,1], target_names = ["positive", "negative"]))
print(metrics.classification_report(y_val.toxic, result_val['Logistic_Regression'][:,1], target_names = ["positive", "negative"]))
print(metrics.classification_report(y_val.toxic, result_val['SVM_Classifier'][:,1], target_names = ["positive", "negative"]))

In [None]:
# creating a object with final output of predictions on the test data set
final_output = output.drop(['comment_text'], axis = 1, inplace = False)

In [None]:
# writing the output object to a csv file
final_output.to_csv('submission_project.csv', index=False)