<a href="https://colab.research.google.com/github/abhranil-datascience/LSTM_POC/blob/master/ToxicCommentsCNN_POC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
############################## Mount Drive ########################################
# Colab-only step: mount Google Drive at /content/gdrive so the absolute paths
# used below (working directory, GloVe file) resolve.
from google.colab import drive
drive.mount('/content/gdrive')
############################## Change Directory ###################################
# One-time setup, intentionally left commented out: change into the Downloads
# folder, fetch the GloVe 6B archive and unzip it.
#import os
#os.chdir('/content/gdrive/My Drive/MLandDLFullCourse/DL/AdvancedNLP/1.ToxicComments/Downloads')
#!wget http://nlp.stanford.edu/data/glove.6B.zip
#!ls
#!unzip toxic-comments-classification-master.zip glove.6B.zip
#!unzip glove.6B.zip
######## Change to Appropriate Directory ############
# Relative file names used later ('train.csv', 'test.csv', the checkpoint file)
# are resolved against this working directory.
import os
os.chdir('/content/gdrive/My Drive/MLandDLFullCourse/DL/AdvancedNLP/1.ToxicComments')
################## Declaring Variables #########################
MaxVocabSize=20000  # passed to the Keras Tokenizer as num_words; also caps the embedding matrix rows
Dimension=100  # embedding width; also selects which glove.6B.<Dimension>d.txt file is loaded
BestNLPModel="BestToxicCommentsCNN.hdf5"  # checkpoint written by ModelCheckpoint and reloaded for prediction
BatchSize=150  # batch_size passed to fit()
MaxSequenceLengthTrain=0  # placeholder; overwritten with the longest training sequence length before padding
################## Step 1: Read Dataset ########################

import pandas as pd
### Training ###
TrainDataset=pd.read_csv('train.csv')
# Column 1 is kept as a one-column 2-D array (later indexed as [:,0]); missing
# texts are replaced with a placeholder string so every row is a string.
XTrain=TrainDataset.iloc[:,1:2].fillna("DUMMY_VALUE").values
# Columns 2..7 (six columns) form the target matrix — presumably the six
# toxicity labels; confirm against the header of train.csv.
YTrain=TrainDataset.iloc[:,2:8].values
### Testing ###
TestDataset=pd.read_csv('test.csv')
# Same extraction as for the training text column; the test file is read without labels.
XTest=TestDataset.iloc[:,1:2].fillna("DUMMY_VALUE").values

########## Step 2: Load pretrained wordvectors #################

def LoadPretrainedVectors():
  """Load the pretrained GloVe vectors into a word -> vector dictionary.

  Reads glove.6B.<Dimension>d.txt from the Drive download folder. Every
  non-empty line is split on whitespace: the first token is the word and the
  remaining tokens are parsed as floats.

  Returns:
    dict mapping each word (str) to a 1-D numpy float array.

  Raises:
    OSError: if the embedding file cannot be opened.
    ValueError: if a vector component cannot be parsed as a float.
  """
  import numpy as np
  path_to_embedding_matrix="/content/gdrive/My Drive/MLandDLFullCourse/DL/AdvancedNLP/1.ToxicComments/Downloads/Glove6BUnzipped/glove.6B."+str(Dimension)+"d.txt"
  WordToVec={}
  # The GloVe text files are UTF-8; stating it explicitly keeps the load
  # independent of the platform's default encoding.
  with open(path_to_embedding_matrix,encoding="utf-8") as f:
    for line in f:
      values=line.split()
      if not values:
        # Skip blank lines (e.g. a trailing newline) instead of failing on values[0].
        continue
      WordToVec[values[0]]=np.array(values[1:],dtype=float)
  return WordToVec
    
############### Step 3: Tokenize Each Sentence ###################

def TokenizeData(Dataset,FittedTokenizer=None):
  """Convert the text column of Dataset into lists of integer word indices.

  Args:
    Dataset: 2-D array whose column 0 holds the texts.
    FittedTokenizer: optional, already fitted Keras Tokenizer. When given it
      is reused as is (not re-fitted), so the texts are encoded with that
      tokenizer's vocabulary — this is what held-out/test data needs in order
      to share word indices with the training data. When None (the default,
      original behaviour) a new Tokenizer limited to MaxVocabSize is fitted
      on Dataset itself.

  Returns:
    (tokenizer, datasequences): the tokenizer that was used and one list of
    word indices per input text.
  """
  Texts=Dataset[:,0]
  if FittedTokenizer is None:
    from keras.preprocessing.text import Tokenizer
    tokenizer=Tokenizer(num_words=MaxVocabSize)
    tokenizer.fit_on_texts(Texts)
  else:
    tokenizer=FittedTokenizer
  datasequences=tokenizer.texts_to_sequences(Texts)
  return tokenizer,datasequences

########### Step 4: Create Integer To Word Mapping ##################

def CreateIntToWordMapping(DataTokenizer):
  """Invert the tokenizer's word -> index table, limited to MaxVocabSize.

  Args:
    DataTokenizer: object exposing a word_index dict (word -> integer index).

  Returns:
    dict mapping index -> word for every entry whose index is not greater
    than MaxVocabSize.
  """
  IndexToWord={}
  for Word,Index in DataTokenizer.word_index.items():
    if Index>MaxVocabSize:
      # Outside the vocabulary cap
      continue
    IndexToWord[Index]=Word
  return IndexToWord

#################### Step 5: Pad Sequences ##########################

def PadData(InputSequence,MaxSequenceLength):
  """Bring every sequence to MaxSequenceLength, padding at the end.

  Args:
    InputSequence: list of integer sequences.
    MaxSequenceLength: target length passed to pad_sequences as maxlen.

  Returns:
    The padded array produced by keras' pad_sequences (padding='post';
    all other options are left at the library defaults).
  """
  from keras.preprocessing.sequence import pad_sequences
  return pad_sequences(InputSequence,maxlen=MaxSequenceLength,padding='post')

############### Step 6: Create Embedding Matrix #####################

def CreateEmbeddingMatrix(Mapping):
  """Build the embedding weight matrix for the words in Mapping.

  Row i holds the pretrained vector of the word whose tokenizer index is i,
  because the Embedding layer looks a token up by using its index as the row
  number. Row 0 stays all-zero: index 0 is the padding value and is never
  assigned to a word. Words that are missing from WordToVec also keep an
  all-zero row.

  Args:
    Mapping: dict of tokenizer index (int, starting at 1) -> word.

  Returns:
    (Embedded, Max_Num_Of_Words): the (Max_Num_Of_Words, Dimension) matrix and
    its row count, which is the value to use as the Embedding input_dim.
  """
  import numpy as np
  # Indices 1..k need k+1 rows (row 0 is padding); never exceed the vocabulary cap.
  Max_Num_Of_Words=min(MaxVocabSize,max(Mapping,default=0)+1)
  Embedded=np.zeros((Max_Num_Of_Words,Dimension))
  for WordIndex,CurrentWord in Mapping.items():
    if not 0<WordIndex<Max_Num_Of_Words:
      # Outside the rows the Embedding layer can address (or the padding row).
      continue
    EmbeddedValue=WordToVec.get(CurrentWord)
    if EmbeddedValue is not None:
      Embedded[WordIndex]=EmbeddedValue
  return Embedded, Max_Num_Of_Words
    
############### Step 7: Create Embedding Layer #######################

def CreateModel(num_of_words,EmbeddedMatrix,PaddedDataset,OutputLabel):
  """Assemble and compile the 1-D CNN classifier.

  Args:
    num_of_words: number of rows of the embedding (Embedding input_dim).
    EmbeddedMatrix: pretrained weights loaded into the frozen Embedding layer.
    PaddedDataset: padded input array; only shape[1] (sequence length) is read.
    OutputLabel: label array; only shape[1] (number of outputs) is read.

  Returns:
    The compiled keras Model (adam optimizer, binary cross-entropy loss,
    accuracy metric).
  """
  from keras.layers import Embedding
  from keras.layers import Dense, Input, Conv1D, MaxPool1D, GlobalMaxPool1D, Dropout,BatchNormalization, Flatten
  from keras.models import Model
  SequenceLength=PaddedDataset.shape[1]
  NumOfOutputs=OutputLabel.shape[1]
  ############ Frozen embedding initialised from the pretrained matrix ############
  EmbeddedLayer=Embedding(input_dim=num_of_words,output_dim=Dimension,weights=[EmbeddedMatrix],input_length=SequenceLength,trainable=False)
  InputData=Input(shape=(SequenceLength,))
  x=EmbeddedLayer(InputData)
  ############ Layers 1-3: convolution + max-pooling ############
  ConvFilters=(128,256,512)
  LastConvPosition=len(ConvFilters)-1
  for Position,Filters in enumerate(ConvFilters):
    x=Conv1D(filters=Filters,kernel_size=3,strides=1,activation='relu')(x)
    x=MaxPool1D(pool_size=3,strides=1)(x)
    if Position!=LastConvPosition:
      # Only the first two convolution stages are normalised and regularised
      x=BatchNormalization()(x)
      x=Dropout(0.2)(x)
  # The last convolution stage feeds the dense head directly
  x=Flatten()(x)
  ############ Layers 4-6: fully connected ############
  for Units in (256,128,64):
    x=Dense(units=Units,activation='relu',kernel_initializer='random_uniform')(x)
    x=BatchNormalization()(x)
    x=Dropout(0.2)(x)
  ############ Layer 7: one sigmoid unit per output column ############
  output=Dense(units=NumOfOutputs,activation='sigmoid',kernel_initializer='random_uniform')(x)
  ############ Compile Model ############
  model_CNN=Model(InputData,output)
  model_CNN.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])
  return model_CNN

################## Step 9: Train Model ######################
from keras.callbacks import ModelCheckpoint
#------------------>>>>>1 Load the pretrained word vectors
WordToVec=LoadPretrainedVectors()
#------------------>>>>>2 Fit the tokenizer on the training text and encode it
TrainTokenizer,TrainDataSequence=TokenizeData(XTrain)
#------------------>>>>>3 Index -> word lookup for the training vocabulary
TrainIntToWordMapping=CreateIntToWordMapping(TrainTokenizer)
#------------------>>>>>4 Pad every training sequence to the longest one
MaxSequenceLengthTrain=max(len(s) for s in TrainDataSequence)
PaddedTrainSequence=PadData(TrainDataSequence,MaxSequenceLengthTrain)
#------------------>>>>>5 Pretrained weights for the embedding layer
EmbeddedMatrixTrain, MaxNumOfWordsTrain=CreateEmbeddingMatrix(TrainIntToWordMapping)
#------------------>>>>>6 Build and compile the network
CNNModel=CreateModel(MaxNumOfWordsTrain,EmbeddedMatrixTrain,PaddedTrainSequence,YTrain)
#------------------>>>>>7 Train, keeping only the checkpoint with the best validation accuracy
# NOTE(review): newer Keras releases report this metric as 'val_accuracy';
# confirm the name against the installed version, otherwise nothing is saved.
SaveBestModel=ModelCheckpoint(filepath=BestNLPModel,monitor='val_acc', verbose=1, save_best_only=True, mode='max')
CNNModel.fit(x=PaddedTrainSequence,y=YTrain,epochs=10,batch_size=BatchSize,validation_split=0.2,callbacks=[SaveBestModel])
##### Validation Accuracy: 0.97751 (recorded from an earlier run)
################ Step 10: Predict #######################
from keras.models import load_model
TestModel=load_model(BestNLPModel)
# The test text must be encoded with the tokenizer fitted on the training
# data: a tokenizer fitted on the test set assigns different integers to the
# same words, so the indices would no longer match the embedding rows and
# weights the model was trained with.
TestTokenizer=TrainTokenizer
TestDataSequence=TestTokenizer.texts_to_sequences(XTest[:,0])
# Pad to the training length, i.e. the input length the model was built for
PaddedTestSequence=PadData(TestDataSequence,MaxSequenceLengthTrain)
Prediction=TestModel.predict(PaddedTestSequence)