# CNN

In [2]:
import pandas as pd
import tensorflow as tf
from tensorflow import keras
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import tensorflow_transform as tft

from tensorflow.python.keras.preprocessing import sequence
from tensorflow.python.keras.preprocessing import text
from tensorflow.python.keras import models
from tensorflow.python.keras import initializers
from tensorflow.python.keras import regularizers
from tensorflow.python.keras import utils

from tensorflow.python.keras.layers import Dense
from tensorflow.python.keras.layers import Dropout
from tensorflow.python.keras.layers import Embedding
from tensorflow.python.keras.layers import SeparableConv1D
from tensorflow.python.keras.layers import Conv1D
from tensorflow.python.keras.layers import MaxPooling1D
from tensorflow.python.keras.layers import GlobalMaxPooling1D
from tensorflow.python.keras.layers import GlobalAveragePooling1D

In [10]:
#load cleaned datasets
train = pd.read_csv('train_x.csv', header=None)
train_y = pd.read_csv('train_class.csv', header=None)
valid = pd.read_csv('test_x.csv', header=None)
valid_y = pd.read_csv('test_class.csv', header=None)

In [11]:
train_list = train[0].tolist()
valid_list = valid[0].tolist()

In [12]:
print(len(valid_list),
      len(train_list))

8103 24308


# Create vocabulary representation


In [13]:
from tensorflow.python.keras.preprocessing import sequence
from tensorflow.python.keras.preprocessing import text

# Vectorization parameters
# Limit on the number of features. We use the top 20K features.
TOP_K = 20000

# Limit on the length of text sequences. Sequences longer than this
# will be truncated.
MAX_SEQUENCE_LENGTH = 100
max_seq=100
def sequence_vectorize(train_texts, val_texts):
     
    # Create vocabulary with training texts.
    tokenizer = text.Tokenizer(num_words=TOP_K)
    tokenizer.fit_on_texts(train_texts)
    tokenizer.num_words = TOP_K
    # Vectorize training and validation texts.
    x_train = tokenizer.texts_to_sequences(train_texts)
    x_val = tokenizer.texts_to_sequences(val_texts)

    average = sum(len(word) for word in x_train) / len(x_train)
    # Get max sequence length.
    max_length = len(max(x_train, key=len))
    if max_length > MAX_SEQUENCE_LENGTH:
        max_length = MAX_SEQUENCE_LENGTH

    # Fix sequence length to max value. Sequences shorter than the length are
    # padded in the end and sequences longer are truncated
    # at the end.
    x_train = sequence.pad_sequences(x_train, maxlen=max_length, padding= 'post')
    x_val = sequence.pad_sequences(x_val, maxlen=max_length, padding= 'post')
    return x_train, x_val, tokenizer.word_index, max_length, average




In [14]:

x_train, x_val, word_index, max_length, average = sequence_vectorize(train_list, valid_list)

In [15]:
print(average)

28.56726180681257


In [16]:
#make usable label vectors

y_train = np.asarray(train_y[0])
y_valid =np.asarray(valid_y[0])

In [17]:
print('Shape of data tensor:', x_train.shape)
print('Shape of label tensor:', y_train.shape)

print('Shape of data tensor:', x_val.shape)
print('Shape of label tensor:', y_valid.shape)

Shape of data tensor: (24308, 100)
Shape of label tensor: (24308,)
Shape of data tensor: (8103, 100)
Shape of label tensor: (8103,)


In [18]:
vocab_size=(len(word_index))
print(vocab_size)

27290


# Create Embedding layers
* GloVe
* word2vec
* random

In [19]:
#create embedding Matrix with gloVE


# load the whole embedding into memory
embeddings_index = dict()
f = open('glove.6B.100d.txt', encoding="utf8") #100 or 50 have to choose
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
print('Loaded %s word vectors.' % len(embeddings_index))



print('Found %s word vectors.' % len(embeddings_index))

# create a weight matrix for words in training docs
embedding_matrix = np.zeros((vocab_size+1, 100)) #100 = embedding_dimension or 50

for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
    
    
      
# non trainable        
GoVeEmbedding = Embedding(vocab_size+1, 100, weights=[embedding_matrix], input_length=100, trainable=False )##100 or50

#trainable
GoVeEmbeddingTrain = Embedding(vocab_size+1, 100, weights=[embedding_matrix], input_length=100, trainable=True)##100 or50

Loaded 400000 word vectors.
Found 400000 word vectors.


In [39]:
#create embedding Matrix with word2vec

import gensim
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from gensim.models import KeyedVectors

#load vectors
#word_vectors = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)


#load vectors
word_vectors = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True, limit= 10 ** 5)

In [40]:

EMBEDDING_DIM=300
embedding_matrix = np.zeros((vocab_size, EMBEDDING_DIM))
for word, i in word_index.items():
    if i>=vocab_size:
        continue
    try:
        embedding_vector = word_vectors[word]
        embedding_matrix[i] = embedding_vector
    except KeyError:
        embedding_matrix[i]=np.random.normal(0,np.sqrt(0.25),EMBEDDING_DIM)

#del(word_vectors)

In [41]:
#embedding Layer
#non trainable
word2vecEmbedding = Embedding(vocab_size,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=max_seq,
                            trainable=False)
#trainable
word2vecEmbeddingTrain = Embedding(vocab_size,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=max_seq,
                            trainable=True)

In [42]:
#trainable embedding layer

embedding_layer_trainable= Embedding(input_dim=vocab_size,
                                               output_dim=300,
                                               input_length=max_seq)

# Create CNN functions

In [26]:

def define_model(filters,
                 kernel_size,
                 embedding_dim,
                 dropout_rate,
                 pool_size,
                 embedding_layer
                ):


    model = models.Sequential()

    # Add embedding layer. Defined as parameter


    model.add(embedding_layer)

    #dropout layer
    model.add(Dropout(rate=dropout_rate))
    #convolutional layers
    model.add(Conv1D(filters=filters,
                     kernel_size=kernel_size,
                     activation='relu',
                     input_shape= (max_seq, embedding_dim),
                     #bias_initializer='random_uniform',
                     #kernel_initializer='random_uniform',
                     padding='valid'))
    
    model.add(Conv1D(filters=filters,
                                  kernel_size=kernel_size,
                                  activation='relu',
                                  #bias_initializer='random_uniform',
                                  #kernel_initializer='random_uniform',
                                  padding='valid'))
    #pooling layer
    model.add(MaxPooling1D())

    model.add(Dropout(rate=dropout_rate))
    model.add(Conv1D(filters=filters ,
                              kernel_size=kernel_size+1,
                              activation='relu',
                              #bias_initializer='random_uniform',
                              #kernel_initializer='random_uniform',
                              padding='valid'))
    model.add(Conv1D(filters=filters ,
                              kernel_size=kernel_size+2,
                              activation='relu',
                              #bias_initializer='random_uniform',
                              #kernel_initializer='random_uniform',
                              padding='valid'))
    model.add(GlobalMaxPooling1D())
    model.add(Dropout(rate=dropout_rate))
    
    #dense, final prediction layer, binary problem
    model.add(Dense(1, 'sigmoid')) 
    return model



In [31]:
#training hyperparameters
def train_model(learning_rate,
                epochs,
                batch_size,
                dropout_rate,
                model):

# Compile model with learning parameters.

    loss = 'binary_crossentropy'

    keras.optimizers.SGD(lr=learning_rate)

    model.compile(optimizer='sgd', loss=loss, metrics=['acc'])

    # Create callback for early stopping on validation loss. If the loss does
    # not decrease in two consecutive tries, stop training.

    callbacks = [tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=2)]

#     Train and validate model.
    history = model.fit(
        x_train,
        y_train,
        epochs=epochs,
        callbacks=callbacks,
        #validation_data=(x_vall, y_vall),
        validation_split=0.2,
        verbose=1,  # Logs once per epoch.
        batch_size=batch_size)

    return history

    # Print results.
    history = history.history
    print('Validation accuracy: {acc}, loss: {loss}'.format(
            acc=history['val_acc'][-1], loss=history['val_loss'][-1]))
    
    return history['val_acc'][-1], history['val_loss'][-1]


# Train and test different model variations

In [43]:
#GloVe embedding with fixed embedding

model_GloVe_fix =define_model(filters=120,   
                              kernel_size=3,
                              embedding_dim=100,
                              dropout_rate=0.2,
                              pool_size=2,
                              embedding_layer= GoVeEmbedding)

model_GloVe_fix.summary()

train_model(learning_rate= 0.001,
                epochs=100,
                batch_size=64,
                dropout_rate=0.2,
                model=model_GloVe_fix
                )

#evaluate model

model_GloVe_fix.evaluate(x_val, y_valid)



_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 100, 100)          2729100   
_________________________________________________________________
dropout_18 (Dropout)         (None, 100, 100)          0         
_________________________________________________________________
conv1d_24 (Conv1D)           (None, 98, 120)           36120     
_________________________________________________________________
conv1d_25 (Conv1D)           (None, 96, 120)           43320     
_________________________________________________________________
max_pooling1d_6 (MaxPooling1 (None, 48, 120)           0         
_________________________________________________________________
dropout_19 (Dropout)         (None, 48, 120)           0         
_________________________________________________________________
conv1d_26 (Conv1D)           (None, 45, 120)           57720     
__________

[0.29670873278639104, 0.8707886]

In [45]:
y_pred = np.around(model_GloVe_fix.predict(x_val)).astype(int)

print('F1 score: ')
print(sklearn.metrics.f1_score(y_valid, y_pred))
print('Precision: ')
print(sklearn.metrics.precision_score(y_valid, y_pred, pos_label=1))
print('Recall:')
print(sklearn.metrics.recall_score(y_valid, y_pred, pos_label=1))


# Save model.
model_GloVe_fix.save('GloVe_fix_CNN_model.h5')

F1 score: 
0.8739617190321415
Precision: 
0.8638743455497382
Recall:
0.8842874543239951
Recall:


In [46]:
#GloVe embedding with trainable embedding vectors

model_GloVe_train =define_model(filters=120,   
                              kernel_size=3,
                              embedding_dim=100,
                              dropout_rate=0.2,
                              pool_size=2,
                              embedding_layer= GoVeEmbeddingTrain)

model_GloVe_train.summary()

train_model(learning_rate= 0.001,
                epochs=100,
                batch_size=64,
                dropout_rate=0.2,
                model=model_GloVe_train
                )

#evaluate model

model_GloVe_train.evaluate(x_val, y_valid)



_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 100, 100)          2729100   
_________________________________________________________________
dropout_21 (Dropout)         (None, 100, 100)          0         
_________________________________________________________________
conv1d_28 (Conv1D)           (None, 98, 120)           36120     
_________________________________________________________________
conv1d_29 (Conv1D)           (None, 96, 120)           43320     
_________________________________________________________________
max_pooling1d_7 (MaxPooling1 (None, 48, 120)           0         
_________________________________________________________________
dropout_22 (Dropout)         (None, 48, 120)           0         
_________________________________________________________________
conv1d_30 (Conv1D)           (None, 45, 120)           57720     
__________

[0.28979905418234225, 0.87473774]

In [48]:
y_pred = np.around(model_GloVe_train.predict(x_val)).astype(int)

print('F1 score: ')
print(sklearn.metrics.f1_score(y_valid, y_pred))
print('Precision: ')
print(sklearn.metrics.precision_score(y_valid, y_pred, pos_label=1))
print('Recall:')
print(sklearn.metrics.recall_score(y_valid, y_pred, pos_label=1))
print('Recall:')

# Save model.
model_GloVe_train.save('GloVe_train_CNN_model.h5')

F1 score: 
0.87790208107783
Precision: 
0.8671577946768061
Recall:
0.8889159561510354
Recall:


In [49]:
#Word2Vec embedding with fixed embedding vectors

model_w2v_fix =define_model(filters=120,   
                              kernel_size=3,
                              embedding_dim=100,
                              dropout_rate=0.2,
                              pool_size=2,
                              embedding_layer= word2vecEmbedding)

model_w2v_fix.summary()

train_model(learning_rate= 0.001,
                epochs=100,
                batch_size=64,
                dropout_rate=0.2,
                model=model_w2v_fix
                )

#evaluate model

model_w2v_fix.evaluate(x_val, y_valid)



_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 100, 300)          8187000   
_________________________________________________________________
dropout_24 (Dropout)         (None, 100, 300)          0         
_________________________________________________________________
conv1d_32 (Conv1D)           (None, 98, 120)           108120    
_________________________________________________________________
conv1d_33 (Conv1D)           (None, 96, 120)           43320     
_________________________________________________________________
max_pooling1d_8 (MaxPooling1 (None, 48, 120)           0         
_________________________________________________________________
dropout_25 (Dropout)         (None, 48, 120)           0         
_________________________________________________________________
conv1d_34 (Conv1D)           (None, 45, 120)           57720     
__________

[0.2666704379032236, 0.8893003]

In [50]:
y_pred = np.around(model_w2v_fix.predict(x_val)).astype(int)

print('F1 score: ')
print(sklearn.metrics.f1_score(y_valid, y_pred))
print('Precision: ')
print(sklearn.metrics.precision_score(y_valid, y_pred, pos_label=1))
print('Recall:')
print(sklearn.metrics.recall_score(y_valid, y_pred, pos_label=1))


# Save model.
model_w2v_fix.save('W2V_fix_CNN_model.h5')

F1 score: 
0.888998886276451
Precision: 
0.903420523138833
Recall:
0.8750304506699147
Recall:


In [51]:
#Word2Vec embedding with trainable embedding vectors

model_w2v_train =define_model(filters=120,   
                              kernel_size=3,
                              embedding_dim=100,
                              dropout_rate=0.2,
                              pool_size=2,
                              embedding_layer= word2vecEmbeddingTrain)

model_w2v_train.summary()

train_model(learning_rate= 0.001,
                epochs=100,
                batch_size=64,
                dropout_rate=0.2,
                model=model_w2v_train
                )

#evaluate model

model_w2v_train.evaluate(x_val, y_valid)



_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 100, 300)          8187000   
_________________________________________________________________
dropout_27 (Dropout)         (None, 100, 300)          0         
_________________________________________________________________
conv1d_36 (Conv1D)           (None, 98, 120)           108120    
_________________________________________________________________
conv1d_37 (Conv1D)           (None, 96, 120)           43320     
_________________________________________________________________
max_pooling1d_9 (MaxPooling1 (None, 48, 120)           0         
_________________________________________________________________
dropout_28 (Dropout)         (None, 48, 120)           0         
_________________________________________________________________
conv1d_38 (Conv1D)           (None, 45, 120)           57720     
__________

[0.2799303570835146, 0.88473403]

In [53]:
y_pred = np.around(model_w2v_train.predict(x_val)).astype(int)

print('F1 score: ')
print(sklearn.metrics.f1_score(y_valid, y_pred))
print('Precision: ')
print(sklearn.metrics.precision_score(y_valid, y_pred, pos_label=1))
print('Recall:')
print(sklearn.metrics.recall_score(y_valid, y_pred, pos_label=1))
print('Recall:')

# Save model.
#model_w2v_train.save('W2V_train_CNN_model.h5')

F1 score: 
0.8864023351982486
Precision: 
0.8851105173670148
Recall:
0.8876979293544458
Recall:


In [54]:
#random embedding with trainable embedding vectors

model_random =define_model(filters=120,   
                              kernel_size=3,
                              embedding_dim=100,
                              dropout_rate=0.2,
                              pool_size=2,
                              embedding_layer= embedding_layer_trainable)

model_random.summary()

train_model(learning_rate= 0.001,
                epochs=100,
                batch_size=64,
                dropout_rate=0.2,
                model=model_random
                )

#evaluate model

model_random.evaluate(x_val, y_valid)



_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, 100, 300)          8187000   
_________________________________________________________________
dropout_30 (Dropout)         (None, 100, 300)          0         
_________________________________________________________________
conv1d_40 (Conv1D)           (None, 98, 120)           108120    
_________________________________________________________________
conv1d_41 (Conv1D)           (None, 96, 120)           43320     
_________________________________________________________________
max_pooling1d_10 (MaxPooling (None, 48, 120)           0         
_________________________________________________________________
dropout_31 (Dropout)         (None, 48, 120)           0         
_________________________________________________________________
conv1d_42 (Conv1D)           (None, 45, 120)           57720     
__________

[0.34440775403752233, 0.8564729]

In [55]:
y_pred = np.around(model_random.predict(x_val)).astype(int)

print('F1 score: ')
print(sklearn.metrics.f1_score(y_valid, y_pred))
print('Precision: ')
print(sklearn.metrics.precision_score(y_valid, y_pred, pos_label=1))
print('Recall:')
print(sklearn.metrics.recall_score(y_valid, y_pred, pos_label=1))
print('Recall:')

# Save model.
#model_random.save('random_CNN_model.h5')

F1 score: 
0.8444147157190636
Precision: 
0.9364985163204748
Recall:
0.7688185140073082
Recall:


# Graphics

In [None]:
##plot model
history_dict = history
history_dict.keys()

model= model_random
import matplotlib.pyplot as plt

acc = history_dict['acc']
val_acc = history_dict['val_acc']
loss = history_dict['loss']
val_loss = history_dict['val_loss']

epochs = range(1, len(acc) + 1)

# "bo" is for "blue dot"
plt.plot(epochs, loss, 'bo', label='Training loss')
# b is for "solid blue line"
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

plt.show()

plt.clf()   # clear figure

plt.plot(epochs, acc, 'bo', label='Training acc')
plt.plot(epochs, val_acc, 'b', label='Validation acc')
plt.title('Training and validation accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()

plt.show()


In [None]:
from keras.utils import plot_model

#plot_model(model_random, to_file='model_plot.png', show_shapes=True, show_layer_names=True)

#plot_model(model_random, to_file='model.png')
tf.keras.utils.plot_model(
    model_random, 
    to_file='model.png',
    show_shapes=True,
    show_layer_names=True,
    rankdir='TB'
)
