# Reading data ( Stage 1 )

In [1]:
# Reading data
import numpy as np
import pandas as pd
import scipy.io
import classifiers as clf
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import os

In [2]:
current_dir = os.getcwd()
df = pd.read_csv(current_dir + '/data/toxic/train.csv')
print(df.info())

# Spliting data to obtain comments and labels

comments = df['comment_text']
toxic_label = df['toxic']
toxic_labels = df[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']]
print('\n\nComments size: ', comments.shape, "\t", "Labels size: ", toxic_label.shape)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159571 entries, 0 to 159570
Data columns (total 8 columns):
id               159571 non-null object
comment_text     159571 non-null object
toxic            159571 non-null int64
severe_toxic     159571 non-null int64
obscene          159571 non-null int64
threat           159571 non-null int64
insult           159571 non-null int64
identity_hate    159571 non-null int64
dtypes: int64(6), object(2)
memory usage: 9.7+ MB
None


Comments size:  (159571,) 	 Labels size:  (159571,)


# Preprocessing data ( Stage 2 )

### Preprocessing functions

In [3]:
import nltk
# Downloading componnents of nltk (execute just one time nltk.download())
# nltk.download()

from nltk.corpus import stopwords 
from nltk.stem import WordNetLemmatizer 
import string 
import pandas as pd 
from nltk import pos_tag 
from nltk.stem import PorterStemmer


def remove_punctuation(text):
    # Removing standar punctuation (replacing with blank "" spaces)
    return " ".join("".join([" " if ch in string.punctuation else ch for ch in text]).split())

def lower_case_tokens(tokens):
    return [word.lower() for word in tokens]

def remove_stopwords(tokens):
    stopwds = stopwords.words('english')
    return [token for token in tokens if token not in stopwds]


def remove_short_length_words(tokens):
    # Removing words which length is lower than 3 (do not apport much of a meaning)
    return [word for word in tokens if len(word) >= 3]


def stem(tokens):
    # Using PorterStemmer to stem suffixes in words
    stemmer = PorterStemmer()
    return [stemmer.stem(word) for word in tokens]


def lemmatize(tokens):
    # Tagging the words
    # “NN (noun, common, singular), NNP (noun, proper, singular), 
    # NNPS (noun, proper, plural), NNS (noun, common, plural), 
    # VB (verb, base form), VBD (verb, past tense), 
    # VBG (verb, present participle), VBN (verb, past participle), 
    # VBP (verb, present tense, not third person singular), 
    # VBZ (verb, present tense, third person singular)”
    tagged_corpus = pos_tag(tokens)
    Noun_tags = ['NN','NNP','NNPS','NNS']
    Verb_tags = ['VB','VBD','VBG','VBN','VBP','VBZ']
    
    # Lemmatizing model
    lemmatizer = WordNetLemmatizer()
    
    # Validating tags and lemmatizing accordingly
    def prat_lemmatize(token,tag):
        # Nouns
        if tag in Noun_tags:
            return lemmatizer.lemmatize(token,'n')
        # Verbs
        elif tag in Verb_tags:
            return lemmatizer.lemmatize(token,'v')
        # Any other
        else:
            return lemmatizer.lemmatize(token,'n')
    
    return [prat_lemmatize(token,tag) for token,tag in tagged_corpus] 


def get_raw_tokens(text):
    # Tokenizing the text into words (based on white spaces to build the list)
    return [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]


def get_processed_tokens(text):
    text = remove_punctuation(text)
    tokens = get_raw_tokens(text)
    tokens = lower_case_tokens(tokens)
    tokens = remove_stopwords(tokens)
    tokens = remove_short_length_words(tokens)
    tokens = stem(tokens)
    return tokens


def preprocessing(text):
    tokens = get_processed_tokens(text)
    
    # Reconstructing text
    return " ".join(lemmatize(tokens))

### Convert text to raw/processed tokens (used to for word2vec later)

In [4]:
import gensim
from tqdm import tqdm
import pickle
import sys

# Increasing depth in recursion limit
sys.setrecursionlimit(5000)

# directories
cwd = os.getcwd()
tokens_dir = cwd + "/tokens/toxic/"
raw_tokens_dir = tokens_dir + "raw.pickle"
processed_tokens_dir = tokens_dir + "processed.pickle"

In [5]:
# Check if token picke files exist
if((not os.path.exists(raw_tokens_dir)) or (not os.path.exists(processed_tokens_dir))):
    
    # Initialising array to storage tokens
    processed_tokens = []
    raw_tokens = []

    i = 0
    # Pre-processing
    for line in tqdm(iter(comments), total=len(comments), unit="comments"):
        i+=1;
        raw_tokens.append(get_raw_tokens(line))
        processed_tokens.append(get_processed_tokens(line))

    # Save tokens
    raw_tokens_pickle = open(raw_tokens_dir, "wb")
    processed_tokens_pickle = open(processed_tokens_dir, "wb")

    pickle.dump(raw_tokens, raw_tokens_pickle)
    pickle.dump(processed_tokens, processed_tokens_pickle)

    raw_tokens_pickle.close()
    processed_tokens_pickle.close()

### Set up word2vec & GloVe

In [6]:
import urllib
import os
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models import KeyedVectors
import zipfile
import gensim
from tqdm import tqdm
import pickle
import sys

# directories
cwd = os.getcwd()

glove_folder_dir = cwd + "/glove";
glove_input_file = glove_folder_dir + '/glove.6B.100d.txt'
word2vec_output_file = glove_folder_dir + '/glove.6B.100d.txt.word2vec'


### source: https://machinelearningmastery.com/develop-word-embeddings-python-gensim/


### download glove if not exists
if(not os.path.exists(cwd+"/glove.zip")):
    urllib.request.urlretrieve("http://nlp.stanford.edu/data/glove.6B.zip", cwd+"/glove.zip")
    zip_ref = zipfile.ZipFile(cwd + "/glove.zip", 'r')
    zip_ref.extractall(glove_folder_dir)
    zip_ref.close()
    
# Convert glove embedding to word2vec
if(not os.path.exists(word2vec_output_file)):
    glove2word2vec(glove_input_file, word2vec_output_file)

In [7]:
word2vec_folder_dir = cwd + "/word2vec-gensim/toxic/";

if not os.path.exists(word2vec_folder_dir):
    os.makedirs(word2vec_folder_dir)
    
w2c_glove_file = word2vec_folder_dir + "glove.pickle"
w2c_processed_file = word2vec_folder_dir + "processed.pickle"
w2c_raw_file = word2vec_folder_dir + "raw.pickle"

In [8]:
# check glove's word2vec file exists 

if(not os.path.exists(w2c_glove_file)):
    
    glove_model = KeyedVectors.load_word2vec_format(word2vec_output_file, binary=False)
    w2v_glove = dict(zip(glove_model.wv.index2word, glove_model.wv.syn0))

    # Saving the glove model
    toxic_w2v_glove_pickle = open(w2c_glove_file,"wb")
    pickle.dump(w2v_glove, toxic_w2v_glove_pickle)
    toxic_w2v_glove_pickle.close()

In [9]:
# check word2vec (processed) files exist

if(not os.path.exists(w2c_processed_file)):
    
    # get processed tokens
    toxic_processed_tokens_pickle = open(processed_tokens_dir, "rb")
    
    processed_tokens = pickle.load(toxic_processed_tokens_pickle)
    
    # Gen the word2vec model of processed tokens
    processed_model = gensim.models.Word2Vec(processed_tokens, size=15)
    w2v_processed = dict(zip(processed_model.wv.index2word, processed_model.wv.syn0))
    
    # Saving the word2vec model of processed tokens
    toxic_processed_w2v_pickle = open(w2c_processed_file, "wb")
    pickle.dump(w2v_processed, toxic_processed_w2v_pickle)

    # close pickle connections
    toxic_processed_w2v_pickle.close()
    toxic_processed_tokens_pickle.close()

In [10]:
# check word2vec (raw) files exist

if(not os.path.exists(w2c_raw_file)):
    
    # get raw tokens
    toxic_raw_tokens_pickle = open(raw_tokens_dir, "rb")
    raw_tokens = pickle.load(toxic_raw_tokens_pickle)
    
    
    # Gen the word2vec model of raw tokens
    raw_model = gensim.models.Word2Vec(raw_tokens, size=15)
    w2v_raw = dict(zip(raw_model.wv.index2word, raw_model.wv.syn0))
    
    
    # Saving the word2vec model of raw tokens
    toxic_raw_w2v_pickle = open(w2c_raw_file,"wb")
    pickle.dump(w2v_raw, toxic_raw_w2v_pickle)
    
    # close pickle connections
    toxic_raw_w2v_pickle.close()
    toxic_raw_tokens_pickle.close()

#### Preprocess text for tfid

In [11]:
from tqdm import tqdm
import pickle
import sys

if(not os.path.exists(cwd + "/toxic_preprocessed.pickle")):
    # Increasing depth in recursion limit
    sys.setrecursionlimit(5000)

    # Initialising array to storage preprocessed data
    preprocessed_data = []

    i = 0
    # Pre-processing
    for line in tqdm(comments):
        i = i+1
        preprocessed_data.append(preprocessing(line))

    # Saving the preprocessed data
    pickle_out = open(cwd + "/toxic_preprocessed.pickle","wb")
    pickle.dump(preprocessed_data, pickle_out)
    pickle_out.close()

# Splitting in training and test set ( Stage 3 )

In [12]:
import pickle

# Importing preprocessed text data
pickle_in = open("toxic_preprocessed.pickle","rb")
# preprocessed_data = pickle.load(pickle_in)
preprocessed_data = comments

# Splitting into train and test set (75% train - 25% set)
train_size = int( round( len(preprocessed_data) * 0.8 ) )

# Filling the training set
x_train = np.array([''.join(rec) for rec in preprocessed_data[0 : train_size]])
y_train = np.array([rec for rec in toxic_label[0 : train_size]])

# Filling the test set
x_test = np.array([''.join(rec) for rec in preprocessed_data[train_size + 1 : len(preprocessed_data)]])
y_test = np.array([rec for rec in toxic_label[train_size + 1 : len(toxic_label)]])

print( "Training set size:\t", len(x_train), "\nTest set size:\t\t", len(x_test) )

Training set size:	 127657 
Test set size:		 31913


In [None]:
ys_train = toxic_labels.as_matrix()[0:train_size];
ys_test = toxic_labels.as_matrix()[train_size + 1:len(toxic_label)];

# Computing TF-IDF features ( Stage 4 option 1 )

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorizer model 
# Ignoring terms with lower frequency than 2, range of sequences of words from 1 to 2,
# most frequent 4000 words and normalising with l2
vectorizer = TfidfVectorizer(min_df=2, 
                             ngram_range=(1, 2),  
                             stop_words='english', 
                             max_features= 500,
                             strip_accents='unicode', 
                             norm='l2')

features_train = vectorizer.fit_transform(x_train).todense()
features_test = vectorizer.transform(x_test).todense()

# Computing word2vec features ( Stage 4 option 2 )

### Choose simple word2vec or glove's word2vec

In [13]:
import pickle

word2vec_folder_dir = cwd + "/word2vec-gensim/toxic/";    
w2c_glove_file = word2vec_folder_dir + "glove.pickle"
w2c_processed_file = word2vec_folder_dir + "processed.pickle"
w2c_raw_file = word2vec_folder_dir + "raw.pickle"

# w2v_pickle = open(w2c_processed_file,"rb")
# w2v_pickle = open(w2c_raw_file,"rb")
w2v_pickle = open(w2c_glove_file, "rb")

w2v = pickle.load(w2v_pickle)
w2v_pickle.close()

w2v

{'leipold': array([ 0.54676998, -0.050584  ,  0.0966    , -0.43537   , -0.035721  ,
        -0.16199   ,  0.069204  , -0.024482  , -0.42443001, -0.38205999,
        -0.21495999,  1.17990005,  0.080393  , -1.30990005, -0.21847001,
         0.86391997,  0.52907997,  0.19146   ,  0.15487   , -0.61631   ,
        -0.91667002,  0.22857   , -0.34782001,  0.59846002, -0.33680001,
         0.85822999, -0.12687001,  0.16765   ,  0.083637  , -0.13166   ,
        -0.05179   , -0.11312   , -0.57691997,  0.35067001,  0.49746001,
         0.26041999,  0.19074   ,  0.81259   , -0.89073002,  0.23796999,
         0.40524   , -0.20348001,  0.10384   , -0.094583  , -0.29473001,
         0.57571   ,  0.025989  , -0.39048001,  0.22509   ,  0.58487999,
         0.79215002, -0.26638001, -0.21482   , -0.11555   , -0.058796  ,
         0.20652001, -0.36743999, -0.25904   , -0.47652   ,  0.71350002,
         0.80146998, -0.69103998,  0.26596999,  0.35962999,  0.22183999,
         0.30724999,  0.56237   ,  0.840

In [14]:
class MeanEmbeddingVectorizer(object):
    def __init__(self, word2vec):
        self.word2vec = word2vec
        # if a text is empty we should return a vector of zeros
        # with the same dimensionality as all the other vectors
        self.dim = len(list(word2vec.values())[0])

    def fit(self, X, y):
        return self

    def transform(self, X):
        return np.array([
            np.mean([self.word2vec[w] for w in words if w in self.word2vec]
                    or [np.zeros(self.dim)], axis=0, dtype=np.float32)
            for words in X
        ], dtype=np.float32)

In [15]:
import pickle, os
import numpy as np

# directories
cwd = os.getcwd()
tokens_dir = cwd + "/tokens/toxic/"
raw_tokens_file_dir = tokens_dir + "raw.pickle"
processed_tokens_file_dir = tokens_dir + "processed.pickle"

## Importing preprocessed tokens

# toxic_processed_tokens_pickle = open(processed_tokens_file_dir, "rb")
# tokens = pickle.load(toxic_processed_tokens_pickle)
# toxic_processed_tokens_pickle.close()

## Importing raw tokens

toxic_raw_tokens_pickle = open(raw_tokens_file_dir, "rb")
tokens = pickle.load(toxic_raw_tokens_pickle)
toxic_raw_tokens_pickle.close()

# Splitting into train and test set (75% train - 25% set)
train_size = int( round( len(tokens) * 0.8 ) )

# Filling the training set
x_train = np.array(tokens[0 : train_size])
y_train = np.array([rec for rec in toxic_label[0 : train_size]])

# Filling the test set
x_test = np.array(tokens[train_size + 1 : len(tokens)])
y_test = np.array([rec for rec in toxic_label[train_size + 1 : len(toxic_label)]])

print( "Training set size:\t", len(x_train), "\nTest set size:\t\t", len(x_test) )

w2vec = MeanEmbeddingVectorizer(w2v)
features_train = w2vec.transform(x_train)
features_test = w2vec.transform(x_test)

Training set size:	 127657 
Test set size:		 31913


# Resolving unbalanced data (stage 4.5)

### Under-sample (WARNING: takes forever to run)

In [None]:
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.under_sampling import ClusterCentroids

# http://contrib.scikit-learn.org/imbalanced-learn/stable/under_sampling.html

print(sorted(Counter(y_train).items()))

cc = ClusterCentroids(random_state=0)
features_train, y_train = cc.fit_sample(features_train, y_train)

print(sorted(Counter(y_train).items()))

### Over-sample

In [None]:
from imblearn.over_sampling import SMOTE
from collections import Counter

# http://contrib.scikit-learn.org/imbalanced-learn/stable/over_sampling.html

print(sorted(Counter(y_train).items()))
features_train, y_train = SMOTE().fit_sample(features_train, y_train)
print(sorted(Counter(y_train).items()))

# Classifying ( Stage 5 )

In [None]:
from DeepNeuralModel import DeepNeural
# DeepNeural(x_train, y_train, x_test, y_test, "CNN-1", 2)

In [17]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue May  8 17:40:24 2018

@author: juliocesar
"""
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from sklearn.metrics import confusion_matrix


def DeepNeural_w2v_tmp(feat_train, y_train, feat_test, y_test, idmodel, epoch = 1):
    
    print("Padding sequences in vectors...")
    # fix random seed for reproducibility
    np.random.seed(7)
    # load the dataset but only keep the top n words, zero the rest
    top_words = 10000
    # truncate and pad input sequences
    max_review_length = 100
    
    X_train = sequence.pad_sequences(feat_train, maxlen=max_review_length)
    X_test = sequence.pad_sequences(feat_test, maxlen=max_review_length)
    
    print("Building DNN model...")
    # create the model
    embedding_vecor_length = 32
    model = Sequential()
    model.add(Embedding(top_words, embedding_vecor_length, input_length=max_review_length))
    model.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
    model.add(MaxPooling1D(pool_size=2))
    model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    print(model.summary())
    
    print("Training model...")
    model.fit(X_train, y_train, epochs=epoch, batch_size=64)
    
    print("Saving model files...")
    
    # serialize model to JSON
    model_json = model.to_json()
    with open("modelCNN" + idmodel + ".json", "w") as json_file:
        json_file.write(model_json)
    # serialize weights to HDF5
    model.save_weights("modelCNN" + idmodel + ".h5")
    print("Saved")
    
    print("---------------------------- Training ------------------------------------")
    prediction = model.predict_classes(X_train)
    cm = confusion_matrix(prediction, y_train)
    scores = model.evaluate(X_train, y_train, verbose=0)
    print(cm)
    print("Accuracy: %.2f%%" % (scores[1]*100))
    
    print("------------------------------ Test --------------------------------------")
    prediction = model.predict_classes(X_test)
    cm = confusion_matrix(prediction, y_test)
    scores = model.evaluate(X_test, y_test, verbose=0)
    print(cm)
    print("Accuracy: %.2f%%" % (scores[1]*100))

In [19]:
features_train[0:20]

array([[-0.08671388,  0.07895979,  0.37730509, ..., -0.19615115,
         0.42280129,  0.16498424],
       [-0.1004766 ,  0.299784  ,  0.46914539, ..., -0.33448634,
         0.41999394, -0.09445299],
       [-0.18380076,  0.18820518,  0.44019672, ..., -0.40190932,
         0.46436751,  0.27738383],
       ..., 
       [-0.54558003,  1.09650004,  1.51059997, ..., -0.90373999,
         0.48135999,  0.030378  ],
       [-0.17459983,  0.22705872,  0.40618542, ..., -0.63571936,
         0.59623224,  0.23325092],
       [-0.13837551,  0.32241839,  0.44636592, ..., -0.27834785,
         0.4289971 ,  0.28971279]], dtype=float32)

In [None]:
from DeepNeuralModel_w2v import DeepNeural_w2v
from sklearn import preprocessing

scaler = preprocessing.MinMaxScaler(feature_range=(0, 10))
scaled_features_train = scaler.fit_transform(features_train)
scaled_features_test = scaler.fit_transform(features_test)
DeepNeural_w2v_tmp(scaled_features_train, y_train, scaled_features_test, y_test, "CNN-1", 2)

Padding sequences in vectors...
Building DNN model...
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 100, 32)           320000    
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 100, 32)           3104      
_________________________________________________________________
max_pooling1d_3 (MaxPooling1 (None, 50, 32)            0         
_________________________________________________________________
lstm_3 (LSTM)                (None, 100)               53200     
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 101       
Total params: 376,405
Trainable params: 376,405
Non-trainable params: 0
_________________________________________________________________
None
Training model...
Epoch 1/2
 16960/127657 [==>...........................] - 

### RandomForest

In [37]:
from sklearn.ensemble import RandomForestClassifier

# c = penalty parameter
clf = RandomForestClassifier(n_estimators=100).fit(features_train, y_train)

predicted_train = clf.predict(features_train)
predicted_test = clf.predict(features_test)

name = 'RandomForest'

getAccuracy(y_train, y_test, predicted_train, predicted_test, name)


RandomForest - Train Confusion Matrix

 Predicted       0      1
Actual                  
0          114936    464
1             233  12024

RandomForest - Train accuracy 0.995

RandomForest - Train Classification Report
              precision    recall  f1-score   support

          0       1.00      1.00      1.00    115400
          1       0.96      0.98      0.97     12257

avg / total       0.99      0.99      0.99    127657

------------------------------------------------------------

RandomForest - Test Confusion Matrix

 Predicted      0     1
Actual                
0          28587   289
1           1961  1076

RandomForest - Test accuracy 0.929

RandomForest - Test Classification Report
              precision    recall  f1-score   support

          0       0.94      0.99      0.96     28876
          1       0.79      0.35      0.49      3037

avg / total       0.92      0.93      0.92     31913

------------------------------------------------------------


#### SVM Classifier

In [38]:
from sklearn.svm import LinearSVC #,SVC

# c = penalty parameter
c = 1.0
clf = LinearSVC(C = c).fit(features_train, y_train)

predicted_train = clf.predict(features_train)
predicted_test = clf.predict(features_test)

name = 'SVM'

getAccuracy(y_train, y_test, predicted_train, predicted_test, name)


SVM - Train Confusion Matrix

 Predicted       0     1
Actual                 
0          114134  1266
1            8156  4101

SVM - Train accuracy 0.926

SVM - Train Classification Report
              precision    recall  f1-score   support

          0       0.93      0.99      0.96    115400
          1       0.76      0.33      0.47     12257

avg / total       0.92      0.93      0.91    127657

------------------------------------------------------------

SVM - Test Confusion Matrix

 Predicted      0     1
Actual                
0          28565   311
1           2027  1010

SVM - Test accuracy 0.927

SVM - Test Classification Report
              precision    recall  f1-score   support

          0       0.93      0.99      0.96     28876
          1       0.76      0.33      0.46      3037

avg / total       0.92      0.93      0.91     31913

------------------------------------------------------------


### ExtraTreesClassifier

In [None]:
from sklearn.ensemble import ExtraTreesClassifier

# c = penalty parameter
clf = ExtraTreesClassifier(n_estimators=80).fit(features_train, y_train)

predicted_train = clf.predict(features_train)
predicted_test = clf.predict(features_test)

name = 'ExtraTrees'

getAccuracy(y_train, y_test, predicted_train, predicted_test, name)

#### Multinomial Naive Bayes Classifier

In [39]:
from sklearn.naive_bayes import MultinomialNB
import pandas as pd
from sklearn import preprocessing

scaler = preprocessing.MinMaxScaler()
scaled_features_train = scaler.fit_transform(features_train)
scaled_features_test = scaler.fit_transform(features_test)


clf = MultinomialNB().fit(scaled_features_train, y_train)

predicted_train = clf.predict(scaled_features_train)
predicted_test = clf.predict(scaled_features_test)

name = 'Naive Bayes'

getAccuracy(y_train, y_test, predicted_train, predicted_test, name)


Naive Bayes - Train Confusion Matrix

 Predicted       0
Actual           
0          115400
1           12257

Naive Bayes - Train accuracy 0.904

Naive Bayes - Train Classification Report
              precision    recall  f1-score   support

          0       0.90      1.00      0.95    115400
          1       0.00      0.00      0.00     12257

avg / total       0.82      0.90      0.86    127657

------------------------------------------------------------

Naive Bayes - Test Confusion Matrix

 Predicted      0
Actual          
0          28876
1           3037

Naive Bayes - Test accuracy 0.905

Naive Bayes - Test Classification Report
              precision    recall  f1-score   support

          0       0.90      1.00      0.95     28876
          1       0.00      0.00      0.00      3037

avg / total       0.82      0.90      0.86     31913

------------------------------------------------------------


  'precision', 'predicted', average, warn_for)


#### Multinomial Naive Bayes Classifier (multi-label)

In [None]:
from sklearn.multioutput import MultiOutputClassifier
from sklearn.naive_bayes import MultinomialNB

clf = MultiOutputClassifier(MultinomialNB())
clf.fit(features_train, ys_train)

predicted_train = clf.predict(features_train)
predicted_test = clf.predict(features_test)

name = 'Naive Bayes'

#### SVM Classifier (multi-label)

In [None]:
from sklearn.svm import LinearSVC #,SVC
from sklearn.multioutput import MultiOutputClassifier

c = 1.0
clf = MultiOutputClassifier(LinearSVC(C = c))
clf.fit(features_train, ys_train)

predicted_train = clf.predict(features_train)
predicted_test = clf.predict(features_test)

name = 'SVM'

#### XGBoost

In [None]:
from xgboost.sklearn import XGBClassifier

# Extreme Gradient Boost
# md = Max depth
# ss = Subsample ratio of training instance
# cs = Subsample ratio of columns when constructing each tree

md = 1
ss = 0.8
cs = 0.8
clf = XGBClassifier( max_depth = md, subsample = ss,
                        colsample_bytree = cs).fit(features_train, y_train)

predicted_train = clf.predict(features_train)
predicted_test = clf.predict(features_test)

name = 'XGBoost'

#### XGBoost (multi-label)

In [None]:
from sklearn.multioutput import MultiOutputClassifier
from xgboost.sklearn import XGBClassifier

md = 1
ss = 0.8
cs = 0.8
clf = MultiOutputClassifier(XGBClassifier(max_depth = md, subsample = ss, colsample_bytree = cs))
clf.fit(features_train, ys_train)

predicted_train = clf.predict(features_train)
predicted_test = clf.predict(features_test)

name = 'XGBoost'

#### Logistic Regression

In [40]:
from sklearn.linear_model import LogisticRegression

# C = Inverse of regularization strength
c = 1.0
clf = LogisticRegression(C = c).fit(features_train, y_train)

predicted_train = clf.predict(features_train)
predicted_test = clf.predict(features_test)

name = 'Logistic Regression'

getAccuracy(y_train, y_test, predicted_train, predicted_test, name)


Logistic Regression - Train Confusion Matrix

 Predicted       0     1
Actual                 
0          113623  1777
1            7588  4669

Logistic Regression - Train accuracy 0.927

Logistic Regression - Train Classification Report
              precision    recall  f1-score   support

          0       0.94      0.98      0.96    115400
          1       0.72      0.38      0.50     12257

avg / total       0.92      0.93      0.92    127657

------------------------------------------------------------

Logistic Regression - Test Confusion Matrix

 Predicted      0     1
Actual                
0          28442   434
1           1871  1166

Logistic Regression - Test accuracy 0.928

Logistic Regression - Test Classification Report
              precision    recall  f1-score   support

          0       0.94      0.98      0.96     28876
          1       0.73      0.38      0.50      3037

avg / total       0.92      0.93      0.92     31913

------------------------------------

#### Logictic Regression (multi-label)

In [None]:
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression

c = 1.0
clf = MultiOutputClassifier(LogisticRegression(C = c))
clf.fit(features_train, ys_train)

predicted_train = clf.predict(features_train)
predicted_test = clf.predict(features_test)

name = 'Logistic Regression'

### Ensemble

In [None]:
# Voting Ensemble for Classification
import pandas
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn import ensemble

# create the sub models
estimators = []
estimators.append(('logistic', LogisticRegression()))
estimators.append(('cart', DecisionTreeClassifier()))
estimators.append(('svm', SVC()))

## http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.VotingClassifier.html
clf = ensemble.VotingClassifier(estimators)

## http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html
# clf = ensemble.GradientBoostingClassifier(n_estimators=20, random_state=7, verbose=3)
# clf.fit(features_train, y_train)

## http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html
kfold = model_selection.KFold(n_splits=10, random_state=7)
model = ensemble.AdaBoostClassifier(n_estimators=10, random_state=7)
results = model_selection.cross_val_score(model, features_train, y_train, cv=kfold)
print(results.mean())

# Metrics ( Stage 6 ) - Single Label

In [28]:
def getAccuracy(y_train, y_test, predicted_train, predicted_test, name):

    from sklearn.metrics import classification_report, accuracy_score

    # Training confusion matrix
    print ("\n"+name+" - Train Confusion Matrix\n\n",
           pd.crosstab(y_train, predicted_train, rownames = ["Actual"], colnames = ["Predicted"]))
    # Training accuracy
    print ("\n"+name+" - Train accuracy",
           round(accuracy_score(y_train, predicted_train),3))
    # Training report
    print ("\n"+name+" - Train Classification Report\n",
           classification_report(y_train, predicted_train))

    print("------------------------------------------------------------")

    # Test confusion matrix
    print ("\n"+name+" - Test Confusion Matrix\n\n",
           pd.crosstab(y_test,predicted_test,rownames = ["Actual"], colnames = ["Predicted"]))  
    # Test accuracy
    print ("\n"+name+" - Test accuracy",
           round(accuracy_score(y_test,predicted_test),3))
    # Test report
    print ("\n"+name+" - Test Classification Report\n",
           classification_report(y_test,predicted_test))

    print("------------------------------------------------------------")


    # Getting feature names from vectorizer
    if('vectorizer' in locals() or 'vectorizer' in globals()):
        feature_names = vectorizer.get_feature_names()

        # Getting weights assigned to the features (it works only with linear kernels)
        # Empirical log probability of features given a class, P(x_i|y).
        coefs = clf.coef_

        # Smoothed empirical log probability for each class.
        intercept = clf.intercept_

        # Sorted coefs
        coefs_with_fns = sorted(zip(clf.coef_[0], feature_names))

        print ("\n\nTop 10 features - First ten & Last ten\n")
        n = 10
        top_n_coefs = zip(coefs_with_fns[:n], coefs_with_fns[:-(n + 1):-1])
        for (coef_1, fn_1), (coef_2, fn_2) in top_n_coefs:
            # %-15s is for padding left
            print('|\t%.4f\t%-16s\t\t|\t%.4f\t%-16s|' % (coef_1, fn_1, coef_2, fn_2))

### # Metrics ( Stage 6 ) - Multi-Label

In [None]:
from pylab import *
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report, accuracy_score

targets = ['Toxic', 'Severe Toxic', 'Obscene', 'Threat', 'Insult', 'Identity Hate']

for idx in range(0, len(targets)):
    conf_mat = confusion_matrix(ys_train[:,idx], predicted_train[:,idx])
    fig, ax = plt.subplots(figsize=(10,10))
    sns.heatmap(conf_mat, 
                annot=True, 
                fmt='d')

    plt.title(targets[idx])
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    plt.show()

    # Training confusion matrix
    print ("\n"+name+" - "+targets[idx]+" - Train Confusion Matrix\n\n",
           pd.crosstab(ys_train[:,idx], predicted_train[:,idx], rownames = ["Actual"], colnames = ["Predicted"]))
    # Training accuracy
    print ("\n"+name+" - "+targets[idx]+" - Train accuracy",
           round(accuracy_score(ys_train[:,idx], predicted_train[:,idx]),3))
    # Training report
    print ("\n"+name+" - "+targets[idx]+" - Train Classification Report\n",
           classification_report(ys_train[:,idx], predicted_train[:,idx]))

    print("------------------------------------------------------------")

    # Test confusion matrix
    print ("\n"+name+" - "+targets[idx]+" - Test Confusion Matrix\n\n",
           pd.crosstab(ys_test[:,idx], predicted_test[:,idx],rownames = ["Actual"], colnames = ["Predicted"]))  
    # Test accuracy
    print ("\n"+name+" - "+targets[idx]+" - Test accuracy",
           round(accuracy_score(ys_test[:,idx], predicted_test[:,idx]),3))
    # Test report
    print ("\n"+name+" - "+targets[idx]+" - Test Classification Report\n",
           classification_report(ys_test[:,idx], predicted_test[:,idx]))

    print("------------------------------------------------------------")