# 31009 - Final Project - CNN Model
### Ada, Rohit, Dylan

In [2]:
import numpy as np  
import pandas as pd 
import re   
import nltk  
from nltk.corpus import stopwords           
from nltk.stem.porter import PorterStemmer
from collections import Counter  
import seaborn as sns 
import matplotlib.pyplot as plt     
from IPython.core.display import display, HTML  
import string
import tensorflow as tf
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from tensorflow.keras.preprocessing.text import Tokenizer, text_to_word_sequence
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences   
from tensorflow.keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, GRU, Convolution1D, Flatten, LeakyReLU
from tensorflow.keras.layers import Bidirectional, GlobalMaxPool1D, GlobalAveragePooling1D, MaxPooling1D, GlobalMaxPooling1D
from tensorflow.keras.layers import Concatenate
from tqdm import tqdm  
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras import initializers, regularizers, constraints, optimizers, layers
from tensorflow.keras import callbacks
from tensorflow.keras.models import Sequential
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.model_selection import train_test_split
from keras.models import load_model

from tensorflow.keras.datasets import imdb

from tensorflow.keras.utils import model_to_dot

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
Using TensorFlow backend.


In [3]:
# Load the pre-cleaned training set and pull the label column out on its own.
train = pd.read_csv("Cleaned_Train.csv")
train_y = train["target"]


In [65]:
# Sanity check: display the (rows, columns) shape of the loaded training frame.
train.shape

(7613, 5)

In [66]:
# Fit a Keras tokenizer on the text column to build the word -> integer-index vocabulary.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train.text)
word_index = tokenizer.word_index

# One slot more than the vocabulary size, so the embedding matrix built from
# this count has a row for every word index plus the unused index 0.
num_words = len(word_index) + 1

print('Number of unique words:', len(word_index))

Number of unique words: 17440


In [67]:
# Turn each text into its list of vocabulary indices.
training_sequences = tokenizer.texts_to_sequences(train.text)

# Force every sequence to exactly 50 tokens: padding is added at the front,
# and over-long sequences are truncated from the front as well.
pad_options = dict(maxlen=50, padding='pre', truncating='pre')
training_padded = pad_sequences(training_sequences, **pad_options)

# Hold out 25% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    training_padded,
    train_y,
    test_size=.25,
    random_state=0,
)

In [68]:
# Match the tokenizer vocabulary against the GloVe 6B 300-d vectors.

# Parse the GloVe text file into a word -> vector lookup.
# The `with` block closes the file on exit, so no explicit close() is needed.
embedding_dict = {}
with open('glove.6B.300d.txt', 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines (e.g. a trailing newline) instead of failing on values[0].
            continue
        # First token is the word; the remaining tokens are its vector components.
        word = values[0]
        vectors = np.asarray(values[1:], 'float32')
        embedding_dict[word] = vectors

# Row i of the matrix holds the GloVe vector for the word with tokenizer index i.
# Rows for words that GloVe does not contain are left as zeros.
embedding_dim = 300
embedding_matrix = np.zeros((num_words, embedding_dim))

# num_words is len(word_index) + 1, so every index in word_index is a valid row;
# no per-word bounds check is required.
for word, i in tqdm(word_index.items()):
    embedding_vector = embedding_dict.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

embedding_matrix.shape

100%|██████████| 17440/17440 [00:00<00:00, 242764.75it/s]


(17441, 300)

## 1D CNN with GloVe embeddings

In [69]:

# ---- Model hyperparameters ----

# Input representation: tokens per padded sequence and embedding vector width.
sequence_length = 50
embedding_dim = 300

# Convolution layer: number of filters and the kernel width.
num_filters, filter_size = 10, 3

# Width of the hidden dense layer.
hidden_dims = 5

# Output units: 1 for binary classification, otherwise the number of distinct labels.
num_classes = 1

# Training schedule: mini-batch size and number of passes over the data.
batch_size = 10
num_epochs = 10


In [70]:
# Build the 1D-CNN classifier on top of the pre-trained embedding matrix.
# Layer sizes are read from the hyperparameter variables (num_filters,
# filter_size, hidden_dims, num_classes) rather than repeated as literals,
# so changing a hyperparameter actually changes the model.
model = Sequential()

# Embedding lookup initialised from embedding_matrix; trainable=False keeps
# those weights fixed during training.
model.add(Embedding(num_words,
                    embedding_dim,
                    weights=[embedding_matrix],
                    input_length=sequence_length,
                    trainable=False))

# Convolution over the token axis; "valid" padding means no implicit padding,
# so the output is shorter than the input by filter_size - 1 steps.
model.add(Convolution1D(filters=num_filters,
                        kernel_size=filter_size,
                        padding="valid",
                        activation="sigmoid",
                        strides=1))

# Collapse the sequence axis by keeping each filter's maximum response.
model.add(GlobalMaxPooling1D())

# Hidden dense layer.
# NOTE(review): Dropout sits between the Dense layer and its sigmoid, so a
# dropped unit becomes sigmoid(0) = 0.5 rather than 0. Order is kept as-is so
# the recorded results stay reproducible; confirm whether this is intended.
model.add(Dense(hidden_dims))
model.add(Dropout(0.2))
model.add(Activation('sigmoid'))

# Output layer: num_classes sigmoid unit(s), paired with binary cross-entropy below.
model.add(Dense(num_classes))
model.add(Activation('sigmoid'))

model.compile(optimizer="adam", loss="binary_crossentropy", metrics=['accuracy'])
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 50, 300)           5232300   
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 48, 10)            9010      
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 10)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 5)                 55        
_________________________________________________________________
dropout_1 (Dropout)          (None, 5)                 0         
_________________________________________________________________
activation_1 (Activation)    (None, 5)                 0         
_________________________________________________________________
dense_3 (Dense)              (None, 1)                

In [71]:
# Fit the model, then evaluate it on the held-out test split.
# Epoch count and batch size come from the hyperparameter variables
# (num_epochs, batch_size) instead of repeated literals, so the two stay in sync.
# validation_split=.25 sets aside a quarter of X_train for per-epoch validation.
model_1_fit = model.fit(X_train, Y_train, validation_split=.25, epochs=num_epochs, batch_size=batch_size)
# Last expression of the cell: displays [loss, accuracy] on the test split.
model.evaluate(X_test, Y_test, batch_size=batch_size)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


[0.45893147587776184, 0.7941176295280457]

In [72]:
# Persist the trained model in two parts: architecture as JSON, weights as HDF5.
model_json = model.to_json()
with open("cnnmodel.json", "w") as architecture_file:
    architecture_file.write(model_json)

# The weights are stored separately from the architecture file.
model.save_weights("cnnmodel.h5")
print("Saved model to disk")

Saved model to disk
