In [1]:
import numpy as np
import sys
import platform
import pandas as pd
from time import time
from sklearn.model_selection import StratifiedShuffleSplit, KFold
from sklearn.feature_extraction.text import HashingVectorizer, TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import random
import warnings
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn import preprocessing
from sklearn.utils import shuffle
from sklearn.metrics import roc_auc_score,roc_curve,auc, accuracy_score, f1_score,precision_score,hamming_loss
import json
import os
#import SupportFunctions as sf
from scipy.sparse import hstack
from scipy.sparse import coo_matrix, csr_matrix
import time as tm
import gc
import psutil
from collections import OrderedDict
import xgboost as xgb
from sklearn.decomposition import TruncatedSVD
from skmultilearn.problem_transform import BinaryRelevance
from skmultilearn.model_selection import IterativeStratification
import warnings
warnings.filterwarnings("ignore")

In [2]:
train_data = pd.read_csv('train.csv',sep=',', encoding='ISO-8859-1')
test_data  = pd.read_csv('multiLabelTest.csv',sep=',', encoding='ISO-8859-1')
test_data.head()

Unnamed: 0,ï»¿Text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,Thank you for understanding. I think very high...,0,0,0,0,0,0
1,:Dear god this site is horrible.,0,0,0,0,0,0
2,"""::: Somebody will invariably try to add Relig...",0,0,0,0,0,0
3,""" \n\n It says it right there that it IS a typ...",0,0,0,0,0,0
4,""" \n\n == Before adding a new product to the l...",0,0,0,0,0,0


In [3]:
def clean_data(temp_ds,str_idx):
    x=temp_ds.iloc[:,idx:].sum()
    rowsums=temp_ds.iloc[:,idx:].sum(axis=1)
    temp_ds['clean']=(rowsums==0)
    
    df_majority = temp_ds[temp_ds.clean==True]
    df_minority = temp_ds[temp_ds.clean==False]

    df_majority = df_majority.sample(frac = 1)
    df_majority_downsampled = df_majority.head(40000)

    temp_ds = pd.concat([df_majority_downsampled,df_minority])
    temp_ds = temp_ds.drop('clean', axis=1)
    return temp_ds
    

In [4]:
#print(np.shape(train_data),np.shape(test_data))

#train_data = clean_data(train_data,2)
#test_data = clean_data(test_data,1)

#print(np.shape(train_data),np.shape(test_data))

In [5]:
train_y = train_data.iloc[:,2:]
train_ds = pd.DataFrame()
train_ds['comment_text'] = train_data.iloc[:,1]


test_y = test_data.iloc[:,1:]
test_ds = pd.DataFrame()
test_ds['comment_text'] = test_data.iloc[:,0]

In [6]:
for idx in train_y.columns:
    print("{} ratio is {}".format(idx,np.round(len(train_y[train_y[idx]==1])*100/len(train_y)),4))
print("\n")
for idx in test_y.columns:
    print("{} ratio is {}".format(idx,np.round(len(test_y[test_y[idx]==1])*100/len(test_y)),4))

toxic ratio is 10.0
severe_toxic ratio is 1.0
obscene ratio is 5.0
threat ratio is 0.0
insult ratio is 5.0
identity_hate ratio is 1.0


toxic ratio is 10.0
severe_toxic ratio is 1.0
obscene ratio is 6.0
threat ratio is 0.0
insult ratio is 5.0
identity_hate ratio is 1.0


In [7]:

#clean data

retainNumbers = True
useStemming = True
useLemma = False
use_hashing = True

train_ds = train_ds.fillna("-")
test_ds = test_ds.fillna("-")


In [8]:
def preprocessor(text, lemmatizer, stemmer, retainNumbers):
    import re
    from nltk import WordNetLemmatizer
    from nltk.stem.snowball import SnowballStemmer
    import sys
    import pandas as pd
    # good overall paper on what stemmers and lemmatizer do
    # http://www.kenbenoit.net/courses/tcd2014qta/readings/Jivani_ijcta2011020632.pdf

    if pd.isnull(text) == True:
        return ""

    if retainNumbers == True:
        #remove any character not in the listed range
        text = re.sub("[^0-9a-zA-Z\.]+", " ", text)
    else:
        text = re.sub("[^a-zA-Z]+", " ", text)

    #remove extra whitespace
    text = re.sub("\s+", " ", text)
    text = text.lower()

    #split text

    # http://www.nltk.org/api/nltk.stem.html#nltk.stem.wordnet.WordNetLemmatizer
    if(lemmatizer == True):

        try:
            wordnet_lemmatizer = WordNetLemmatizer()
            newText = ""
            for word in text.split():
                newText = " ".join((newText, wordnet_lemmatizer.lemmatize(word)))
            text = newText.strip(" ")
            #text = wordnet_lemmatizer.lemmatize(text)
        except Exception as e:
            print(e)
            print("--- downloading nltk wordnet corpora")
            import nltk
            nltk.download('wordnet')
            newText = ""

            for word in text.split():
                newText = " ".join((newText, wordnet_lemmatizer.lemmatize(word)))
            text = newText.strip(" ")

    #http://www.nltk.org/howto/stem.html
    if (stemmer == True):
        try:
            stm = SnowballStemmer("english")
            newText = ""
            for word in text.split():
                newText = " ".join((newText, stm.stem(word)))
            text = newText.strip(" ")
        except Exception as e:
            print(e)
            print("--- downloading nltk snowball data")
            import nltk
            nltk.download('snowball_data')
            newText = ""
            for word in text.split():
                newText = " ".join((newText, stm.stem(word)))
            text = newText.strip(" ")

    return text

In [9]:
train_ds['comment_text'] = train_ds['comment_text'].apply(preprocessor, args=(useLemma, useStemming, retainNumbers))
test_ds['comment_text'] = test_ds['comment_text'].apply(preprocessor, args=(useLemma, useStemming, retainNumbers))


In [10]:
train_data = train_ds['comment_text'].values.ravel()
test_data = test_ds['comment_text'].values.ravel()

In [11]:
print("using TF-IDF")

minDocFreq = 1

vectorizer = TfidfVectorizer(use_idf=True,sublinear_tf=True, analyzer=u'word',stop_words='english',min_df=minDocFreq,ngram_range=(1, 1), max_features=5000)
train_data = vectorizer.fit_transform(train_data)
vocab = vectorizer.vocabulary_
print("vocab length :- " + str(len(vocab)))

test_data = vectorizer.transform(test_data)

print(np.shape(train_data))
print(np.shape(test_data))

using TF-IDF
vocab length :- 5000
(159571, 5000)
(63978, 5000)


In [12]:
svd = TruncatedSVD(n_components=300)
svd.fit(train_data)
train_data = svd.transform(train_data)
test_data = svd.transform(test_data)

In [13]:
def Nfold_Cross_Valid(X, y, clf):
    scores=[]
    scores_f1= []
    scores_hamming = []

    #X =np.array(X)
    y =np.array(y)
    #ss = StratifiedShuffleSplit(n_splits=2, test_size=0.3, random_state=21)
    #ss = KFold(len(y), n_folds=3,shuffle=True,indices=None)
    #ss = KFold(n_splits=3, random_state=21, shuffle=True)
    
    train_size=0.5
    #ss = IterativeStratification(n_splits=2, order=1)  
    ss = IterativeStratification(
        n_splits=3, order=1)#, sample_distribution_per_fold=[1.0-train_size, train_size, ])
    i = 1

    for trainCV, testCV in ss.split(X, y):
        X_train, X_test= X[trainCV], X[testCV]
        y_train, y_test= y[trainCV], y[testCV]

        clf.fit(X_train, y_train)
        #y_pred=clf.predict_proba(X_test)[:,1]
        y_pred = clf.predict(X_test)

        scores.append(accuracy_score(y_test,y_pred))
        print(" %d-iteration...Accuracy %s " % (i,scores))
        
        #scores_f1.append(f1_score(y_true=y_test,y_pred=y_pred,average='weighted'))
        #print(" %d-iteration...F1 %s " % (i,scores_f1))
        
        scores_hamming.append(hamming_loss(y_true=y_test,y_pred=y_pred))
        print(" %d-iteration...Hamming %s " % (i,scores_hamming))
        
        hamming_loss
        
        i = i + 1

    #Average ROC from cross validation
    scores=np.array(scores)
    print ("Acc CV Score:",np.mean(scores))
    #print ("F1 CV Score:",np.mean(np.array(scores_f1)))
    print ("Hamming CV Score:",np.mean(np.array(scores_hamming)))

    print("***************Ending Kfold Cross validation***************")

    return scores

In [15]:
#RF training
clf = BinaryRelevance(RandomForestClassifier(n_jobs=-1, n_estimators=30,max_depth = 10))
Nfold_score = Nfold_Cross_Valid(train_data, train_y, clf)
clf.fit(train_data, train_y)

 1-iteration...Accuracy [0.9085069716434279] 
 1-iteration...Hamming [0.025834247219175936] 
 2-iteration...Accuracy [0.9085069716434279, 0.9107316308945637] 
 2-iteration...Hamming [0.025834247219175936, 0.02602224659251136] 
 3-iteration...Accuracy [0.9085069716434279, 0.9107316308945637, 0.904558984803384] 
 3-iteration...Hamming [0.025834247219175936, 0.02602224659251136, 0.026570578098073006] 
 4-iteration...Accuracy [0.9085069716434279, 0.9107316308945637, 0.904558984803384, 0.9103757325060324] 
 4-iteration...Hamming [0.025834247219175936, 0.02602224659251136, 0.026570578098073006, 0.02534131386251345] 
 5-iteration...Accuracy [0.9085069716434279, 0.9107316308945637, 0.904558984803384, 0.9103757325060324, 0.9063136456211812] 
 5-iteration...Hamming [0.025834247219175936, 0.02602224659251136, 0.026570578098073006, 0.02534131386251345, 0.02698313227844796] 
Acc CV Score: 0.9080973930937178
Hamming CV Score: 0.026150303610144345
***************Ending Kfold Cross validation*********

BinaryRelevance(classifier=RandomForestClassifier(max_depth=10, n_estimators=30,
                                                  n_jobs=-1),
                require_dense=[True, True])

In [16]:
#RF prediction
pred_y = clf.predict(test_data)
print(accuracy_score(test_y,pred_y))
print(hamming_loss(test_y,pred_y))

0.9072649973428366
0.028376942073837882
