In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import re 
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem import WordNetLemmatizer
from wordcloud import WordCloud
from gensim.models import Word2Vec

In [2]:
from sklearn.model_selection import train_test_split

In [3]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding,Input, Dense, Dropout, Flatten,MaxPooling1D
from tensorflow.keras.layers import Conv1D,Concatenate
from tensorflow.keras.initializers import Constant
from tensorflow.keras.models import Sequential,Model,load_model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import load_model

In [4]:
import pickle


# Loading the data

In [5]:
train_data=pd.read_csv('C:/train.csv')
train_data.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [6]:
test_data=pd.read_csv('C:/test.csv',engine='python')
test_data.head()

Unnamed: 0,id,comment_text
0,0001ea8717f6de06,Thank you for understanding. I think very high...
1,000247e83dcc1211,:Dear god this site is horrible.
2,0002f87b16116a7f,"""::: Somebody will invariably try to add Relig..."
3,0003e1cccfd5a40a,""" \n\n It says it right there that it IS a typ..."
4,00059ace3e3e9a53,""" \n\n == Before adding a new product to the l..."


# Data preparation

In [7]:
# Hand-maintained stop-word set used by the text-cleaning step.
# The negations 'no', 'nor' and 'not' are not in it, so they survive cleaning;
# 'utc' is in it, which removes the timezone tag of comment signature timestamps.
STOP_WORDS = {
    # pronouns and possessives
    'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves',
    'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves',
    'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself',
    'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves',
    # interrogatives and demonstratives
    'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those',
    # auxiliary verbs
    'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
    'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing',
    # articles, conjunctions and prepositions
    'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of',
    'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during',
    'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off',
    'over', 'under', 'again', 'further',
    # adverbs and quantifiers
    'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how',
    'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such',
    'only', 'own', 'same', 'so', 'than', 'too', 'very',
    # modal words and contraction fragments
    's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now',
    'd', 'll', 'm', 'o', 're', 've', 'y',
    'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't",
    'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma',
    'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't",
    'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't",
    'won', "won't", 'wouldn', "wouldn't",
    # corpus-specific addition
    'utc',
}


# initializing the WordNet lemmatizer
wnl = WordNetLemmatizer()

# function for text cleaning
def preprocess(x):
    """Normalize one raw comment into a space-separated string of lemmas.

    Steps, in order:
      1. lowercase the text;
      2. apply a fixed list of literal replacements (typographic apostrophes,
         English contractions, currency/percent symbols, ",000" suffixes);
      3. rewrite trailing-zero numbers as "<n>k" / "<n>m", then drop every
         whitespace-delimited token that still contains a digit;
      4. turn every run of non-alphanumeric characters into a single space;
      5. drop tokens found in ``STOP_WORDS`` and lemmatize the rest with ``wnl``.

    Args:
        x: the raw comment. Non-string values are converted with ``str``;
            ``None`` and float NaN are treated as an empty comment.

    Returns:
        str: the cleaned comment, tokens joined by single spaces (may be "").
    """
    # A missing value would otherwise be stringified and survive as the
    # token "none" / "nan".
    if x is None or (isinstance(x, float) and x != x):
        return ""

    x = str(x).lower()

    # Order matters: specific contractions ("won't", "can't") must be expanded
    # before the generic "n't" rule, and "he's"/"she's" before the generic "'s".
    replacements = (
        (",000,000", "m"), (",000", "k"), ("′", "'"), ("’", "'"),
        ("won't", "will not"), ("cannot", "can not"), ("can't", "can not"),
        ("n't", " not"), ("what's", "what is"), ("it's", "it is"),
        ("'ve", " have"), ("i'm", "i am"), ("'re", " are"),
        ("he's", "he is"), ("she's", "she is"), ("'s", " own"),
        ("%", " percent "), ("₹", " rupee "), ("$", " dollar "),
        ("€", " euro "), ("'ll", " will"),
    )
    for old, new in replacements:
        x = x.replace(old, new)

    x = re.sub(r"([0-9]+)000000", r"\1m", x)
    x = re.sub(r"([0-9]+)000", r"\1k", x)
    # NOTE(review): this removes every token containing a digit, which also
    # discards the "<n>k"/"<n>m" forms produced just above.
    x = re.sub(r"\S*\d\S*", "", x).strip()
    x = re.sub(r"[^A-Za-z0-9]+", " ", x)

    # Stop word removal and applying WordNetLemmatizer
    return ' '.join(wnl.lemmatize(w) for w in x.split() if w not in STOP_WORDS)


In [8]:
# Applying the function for text cleaning
train_data['cleaned_comment']=train_data['comment_text'].apply(preprocess)

# Before cleaning

In [9]:
train_data['comment_text'].values[1]

"D'aww! He matches this background colour I'm seemingly stuck with. Thanks.  (talk) 21:51, January 11, 2016 (UTC)"

# After cleaning

In [10]:
train_data['cleaned_comment'].values[1]

'aww match background colour seemingly stuck thanks talk january'

In [11]:
# Tokenizing 
token_comment=[nltk.word_tokenize(comment) for comment in train_data['cleaned_comment']]

In [12]:
token_comment[:2]

[['explanation',
  'edits',
  'made',
  'username',
  'hardcore',
  'metallica',
  'fan',
  'reverted',
  'not',
  'vandalism',
  'closure',
  'gas',
  'voted',
  'new',
  'york',
  'doll',
  'fac',
  'please',
  'not',
  'remove',
  'template',
  'talk',
  'page',
  'since',
  'retired'],
 ['aww',
  'match',
  'background',
  'colour',
  'seemingly',
  'stuck',
  'thanks',
  'talk',
  'january']]

# Word embeddings using Word2vec

In [12]:
# embedding dim
embedding_dim=200

In [14]:
# Train Word2Vec on the tokenized comments, producing `embedding_dim`-sized vectors
# with 4 worker threads; min_count=1 keeps tokens that occur only once.
# NOTE(review): `size=` here and `wv.vocab` in the next cell look like the
# pre-4.0 gensim spellings — confirm the pinned gensim version before upgrading.
word_vec=Word2Vec(sentences=token_comment,size=embedding_dim,workers=4,min_count=1)

In [15]:
words=list(word_vec.wv.vocab)

In [16]:
print('Vocabulary size:',len(words))

Vocabulary size: 153095


# Checking trained word embeddings

In [17]:
# Similar words
word_vec.wv.most_similar('horrible')

[('terrible', 0.7425998449325562),
 ('ruined', 0.7412578463554382),
 ('crazy', 0.737591564655304),
 ('idiotic', 0.7091394662857056),
 ('ugly', 0.6941457986831665),
 ('pathetic', 0.6875572204589844),
 ('scary', 0.6873337030410767),
 ('lonely', 0.6808360815048218),
 ('ashamed', 0.6794235706329346),
 ('jackass', 0.6750772595405579)]

In [18]:
# King man woman test
vec = word_vec.wv['king']-word_vec.wv['man']+word_vec.wv['woman']

In [19]:
# Nearest neighbours of the analogy vector. Query through `.wv`, like the
# other similarity lookups in this notebook, instead of the model-level wrapper.
word_vec.wv.most_similar(positive=[vec])

  """Entry point for launching an IPython kernel.


[('emperor', 0.7440541982650757),
 ('king', 0.7197107076644897),
 ('descendant', 0.715717613697052),
 ('bishop', 0.7148284912109375),
 ('father', 0.7076239585876465),
 ('prince', 0.704015851020813),
 ('gaunt', 0.6994044184684753),
 ('dynasty', 0.698919415473938),
 ('ruler', 0.6958448886871338),
 ('norman', 0.6899572610855103)]

In [20]:
# Save the built word2vec model
word_vec.wv.save_word2vec_format('cnn_word2vec.txt',binary=False)

# Preparing data in the required format

In [13]:
# Extract word embeddings from the saved word2vec model
# Maps each word to its embedding as a float32 vector.
embeddings_index = {}
# `with` closes the file even if a line fails to parse.
with open('cnn_word2vec.txt', encoding='utf-8') as f:
    for line_no, line in enumerate(f):
        values = line.split()
        # The word2vec text format opens with a "<vocab_size> <vector_size>"
        # header; skip it so it is not stored as a one-dimensional "word".
        if line_no == 0 and len(values) == 2:
            continue
        if len(values) < 2:
            continue  # blank or truncated line
        # Parse to floats here rather than keeping an array of strings.
        embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')

In [14]:
# fitting the Keras tokenizer on the tokenized comments
tokenizer_obj=Tokenizer()  
tokenizer_obj.fit_on_texts(token_comment)


In [18]:
with open('tokenizer_toxic.pickle', 'wb') as handle:
    pickle.dump(tokenizer_obj, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [14]:
with open('tokenizer_toxic.pickle', 'rb') as handle:
    tokenizer_obj = pickle.load(handle)

In [15]:
sequences = tokenizer_obj.texts_to_sequences(token_comment)


In [16]:
# Max length of sequence
def FindMaxLength(lst):
    """Return the length of the longest element of ``lst``.

    Args:
        lst: an iterable of sized items (e.g. a list of token-id lists).

    Returns:
        int: the largest ``len(item)``, or 0 when ``lst`` is empty.
    """
    # `default=0` avoids the ValueError that max() raises on an empty input.
    return max(map(len, lst), default=0)
# Driver Code 
lst = sequences
print('Max length:',FindMaxLength(lst))

Max length: 1250


In [17]:
# Derive the padding length from the data instead of copying the printed
# value, so it stays correct if the corpus or the cleaning step changes.
max_length = FindMaxLength(sequences)

In [18]:
#pad sequences to the max length
word_index = tokenizer_obj.word_index
print('Found %s unique tokens'% len(word_index))
comment_pad=pad_sequences(sequences,maxlen=max_length)

Found 153095 unique tokens


In [19]:
# Creating an embedding matrix

In [20]:
# Row i of the matrix holds the Word2Vec vector of the token whose tokenizer
# index is i. Rows without a matching vector (including row 0, which no token
# in word_index maps to) stay all-zero.
num_words = len(word_index) + 1
embedding_matrix = np.zeros((num_words, embedding_dim))
for word, i in word_index.items():
    # Valid rows are 0..num_words-1, so index num_words itself is out of range.
    if i >= num_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # The vectors may have been loaded as strings; convert explicitly.
        embedding_matrix[i] = np.asarray(embedding_vector, dtype='float32')

# Splitting the data for train and test

In [21]:
# Hold out 20% of the padded comments (fixed seed) together with their six
# binary label columns.
X_train, X_test, Y_train, Y_test = train_test_split(
    comment_pad,
    train_data[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']],
    test_size=0.20,
    random_state=42,
)

In [22]:
X_train.shape

(127656, 1250)

In [23]:
Y_train.shape

(127656, 6)

In [24]:
Y_train=np.array(Y_train)

In [25]:
Y_test=np.array(Y_test)

In [26]:
Y_train.shape

(127656, 6)

# Defining the multichannel CNN model for 3-, 4- and 5-grams

In [27]:
def define_model(length, vocab_size):
    """Build, compile and summarize the three-channel 1-D CNN classifier.

    One shared input feeds three parallel channels. Each channel is a frozen
    embedding initialized from the module-level ``embedding_matrix``, followed
    by Conv1D (kernel size 3, 4 or 5), Dropout, MaxPooling1D and Flatten. The
    flattened channels are concatenated and passed through a 10-unit ReLU
    layer and a 6-unit sigmoid output layer.

    Args:
        length: number of token ids in each padded input sequence.
        vocab_size: number of rows of the embedding table; it has to match
            the first dimension of ``embedding_matrix``.

    Returns:
        The compiled Keras ``Model`` (binary cross-entropy loss, Adam
        optimizer, accuracy metric). The model summary is printed as a
        side effect.
    """
    # Take the vector width from the pretrained weights instead of hard-coding it.
    vector_size = embedding_matrix.shape[1]

    inputs = Input(shape=(length,))

    # One channel per n-gram width; use the function arguments rather than
    # the notebook globals so the parameters actually drive the architecture.
    channels = []
    for kernel_size in (3, 4, 5):
        # NOTE(review): mask_zero=True is kept from the original; Conv1D is not
        # a mask-consuming layer, so confirm the mask has any effect here.
        embedded = Embedding(vocab_size, vector_size, input_length=length,
                             weights=[embedding_matrix],
                             trainable=False, mask_zero=True)(inputs)
        conv = Conv1D(filters=64, kernel_size=kernel_size, activation='relu')(embedded)
        drop = Dropout(0.5)(conv)
        pool = MaxPooling1D(pool_size=2)(drop)
        channels.append(Flatten()(pool))

    # merge
    merged = Concatenate(axis=1)(channels)
    # interpretation
    dense1 = Dense(10, activation='relu')(merged)
    outputs = Dense(6, activation='sigmoid')(dense1)
    model = Model(inputs=inputs, outputs=outputs)
    # compile
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    # summary() prints by itself and returns None, so it must not be wrapped in print()
    model.summary()
    return model


In [28]:
model = define_model(max_length, num_words)

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 1250)]       0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 1250, 200)    30619200    input_1[0][0]                    
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 1250, 200)    30619200    input_1[0][0]                    
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 1250, 200)    30619200    input_1[0][0]                    
______________________________________________________________________________________________

In [29]:
# Training the model

In [30]:
# Train for 5 epochs with mini-batches of 32; verbose=2 logs one line per epoch.
# NOTE(review): the 20% split from train_test_split is passed as validation
# data, so the cells shown here keep no untouched hold-out set for a final
# evaluation — confirm that is intended.
model.fit(X_train, Y_train, epochs=5, batch_size=32,validation_data=(X_test,Y_test),verbose=2)

Train on 127656 samples, validate on 31915 samples
Epoch 1/5
127656/127656 - 207s - loss: 0.0774 - accuracy: 0.9740 - val_loss: 0.0652 - val_accuracy: 0.9781
Epoch 2/5
127656/127656 - 202s - loss: 0.0627 - accuracy: 0.9777 - val_loss: 0.0626 - val_accuracy: 0.9791
Epoch 3/5
127656/127656 - 202s - loss: 0.0587 - accuracy: 0.9789 - val_loss: 0.0633 - val_accuracy: 0.9789
Epoch 4/5
127656/127656 - 202s - loss: 0.0565 - accuracy: 0.9793 - val_loss: 0.0617 - val_accuracy: 0.9793
Epoch 5/5
127656/127656 - 202s - loss: 0.0546 - accuracy: 0.9798 - val_loss: 0.0612 - val_accuracy: 0.9790


<tensorflow.python.keras.callbacks.History at 0x1512821bb08>

In [31]:
model.save('cnn_toxic_new.h5')