In [1]:
import pandas as pd
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer

## Import train and test dataset

In [2]:

train_file = "train.csv";
test_file = "train.csv";

train_read = pd.read_csv(train_file)
train_read.head()

test_read = pd.read_csv(test_file)
test_read.head()
print(test_read.shape)

(159571, 8)


In [3]:
#reindexing
train_read = train_read.reindex(np.random.permutation(train_read.index))

In [4]:
# seprating comments and labels
comment = train_read['comment_text']
comment.head()
comment = comment.as_matrix()

  after removing the cwd from sys.path.


In [5]:
label = train_read[['toxic', 'severe_toxic' , 'obscene' , 'threat' , 'insult' , 'identity_hate']]
print(label.head())
label = label.as_matrix()

        toxic  severe_toxic  obscene  threat  insult  identity_hate
104669      0             0        0       0       0              0
38200       0             0        0       0       0              0
69954       0             0        0       0       0              0
151782      1             0        1       0       0              0
102767      0             0        0       0       0              0


  This is separate from the ipykernel package so we can avoid doing imports until


## Data preprocession and visualizing

In [6]:
# Number of comments labelled as toxic(Any type) 

total_comment = len(comment)
total_toxic_comment = 0

for i in range(label.shape[0]):
    count = np.count_nonzero(label[i])
    if count >= 1 :
        total_toxic_comment = total_toxic_comment + 1

print("Total number of Toxic comment")
print(total_toxic_comment)


Total number of Toxic comment
16225


In [7]:
# find average length of comments
lengths_of_all_comments = []

for i in range(total_comment):
    lengths_of_all_comments.append(len(comment[i]))
    
average = sum(lengths_of_all_comments)/total_comment
print('The Average length of Comment: {:.3f}'.format(average))

The Average length of Comment: 394.073


In [8]:
# remove excess length comments
comments = []
labels = []
for i in range(total_comment):
    if(len(comment[i]) <= average):
        comments.append(comment[i])
        labels.append(label[i])
        
print("After filtering comments according to average length ")
print(len(comments))

After filtering comments according to average length 
115232


# Preprocessing

1. Removing stop words
2. removing punctuations
3. stemming and lemmitization
4. Word counts using CountVectorizer
5. Splitting dataset into Training and Testing

In [9]:
#1) removing stop words

#import stop words
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
stopwords = stopwords.words('english')

#also the single letter words are of no use and can be removed from the comments. So appending letters from 'b' to 'z'

for x in range(ord('b'), ord('z')+1):
    stopwords.append(chr(x))
    

print(stopwords)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/akshaykokane/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
#2) Preparing punctuations that need to be removed

import string
print(string.punctuation)
punctuation_edit = string.punctuation.replace('\'','') +"0123456789"
print (punctuation_edit)
outtab = "                                         "
trantab = str.maketrans(punctuation_edit, outtab)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
!"#$%&()*+,-./:;<=>?@[\]^_`{|}~0123456789


In [11]:
#3) initalize stemming and lemminization

lemmatiser = WordNetLemmatizer()
stemmer = PorterStemmer()
#download words from wordnet library
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/akshaykokane/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [12]:
for i in range(len(comments)):
    comments[i] = comments[i].lower().translate(trantab) #remove ounctutations
    outPut = []
    
    for word in comments[i].split():#apply stemming and leminization
        outPut.append(stemmer.stem(lemmatiser.lemmatize(word,pos="v")))
    comments[i] = " ".join(outPut)
               
        

In [13]:
#Word counts using CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

#create object supplying our custom stop words
count_vector = CountVectorizer(stop_words=stopwords)
#fitting it to converts comments into bag of words format
tf = count_vector.fit_transform(comments).toarray()

In [14]:
# print(count_vector.get_feature_names())
print(tf.shape)

(115232, 71731)


In [16]:
# Divide into test and train
def shuffle(matrix, target, test_proportion):
    
    ratio = int(matrix.shape[0]/test_proportion)
    matrix = np.array(matrix)
    target = np.array(target)
    X_train = matrix[ratio:,:]
    X_test =  matrix[:ratio,:]
    Y_train = target[ratio:,:]
    Y_test =  target[:ratio,:]
    return X_train, X_test, Y_train, Y_test

X_train, X_test, Y_train, Y_test = shuffle(tf, labels,3)

print(X_test.shape)
print(X_train.shape)

(38410, 71731)
(76822, 71731)


##  Model

In [18]:
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.svm import SVC
classifier = BinaryRelevance(classifier = SVC(), require_dense = [False, True])
classifier.fit(X_train, Y_train)



BinaryRelevance(classifier=SVC(C=1.0, cache_size=200, class_weight=None,
                               coef0=0.0, decision_function_shape='ovr',
                               degree=3, gamma='auto_deprecated', kernel='rbf',
                               max_iter=-1, probability=False,
                               random_state=None, shrinking=True, tol=0.001,
                               verbose=False),
                require_dense=[False, True])

In [19]:
#predictions
predictions = classifier.predict(X_test)

#calculate scores
evaluate_score(Y_test,predictions)

NameError: name 'evaluate_score' is not defined