In [2]:
'''
    CNN implementation
    author: @abdulsmapara
'''

# import statements
import spacy
import pandas as pd
from gensim import models
from keras.callbacks import ModelCheckpoint
from keras.layers import Dense, Dropout, Reshape, Flatten, concatenate, Input, Conv1D, GlobalMaxPooling1D, Embedding
from keras.layers.recurrent import LSTM
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import os
import collections
import re
import string
import gensim

# Load the spaCy 'en_core_web_sm' pipeline once at module level; preprocessing()
# and get_tokens() both call it through this global.
nlp = spacy.load('en_core_web_sm')

Using TensorFlow backend.


In [3]:
'''
DEFINE CNN
'''

def ConvNet(embeddings, max_sequence_length, num_words, embedding_dim, labels_index,
            filter_sizes=(2, 3, 4, 5, 6), num_filters=200):
    """Build and compile a multi-width 1D-convolutional text classifier.

    The network embeds the integer token sequence with frozen (non-trainable)
    weights, runs one Conv1D + global max-pool branch per filter width, merges
    the pooled branches and feeds them through a small dense head.

    Parameters
    ----------
    embeddings : array-like
        Initial weights for the embedding layer; passed as ``weights=[embeddings]``.
        Presumably shaped (num_words, embedding_dim) -- confirm against the caller.
    max_sequence_length : int
        Length of every (padded) input sequence.
    num_words : int
        Input dimension of the embedding layer (vocabulary size).
    embedding_dim : int
        Output dimension of the embedding layer.
    labels_index : int
        Number of units in the output layer, i.e. the number of labels.
    filter_sizes : iterable of int, optional
        Kernel sizes, one convolution branch per entry. Defaults to the
        previously hard-coded ``(2, 3, 4, 5, 6)``.
    num_filters : int, optional
        Number of filters in each convolution branch. Defaults to 200.

    Returns
    -------
    The compiled Keras ``Model``. Its summary is printed as a side effect.

    Raises
    ------
    ValueError
        If ``filter_sizes`` is empty.
    """
    filter_sizes = list(filter_sizes)
    if not filter_sizes:
        raise ValueError("filter_sizes must contain at least one kernel size")

    # Embedding weights are supplied by the caller and kept frozen during training.
    embedding_layer = Embedding(num_words,
                                embedding_dim,
                                weights=[embeddings],
                                input_length=max_sequence_length,
                                trainable=False)

    sequence_input = Input(shape=(max_sequence_length,), dtype='int32')
    embedded_sequences = embedding_layer(sequence_input)

    # One convolution branch per kernel size, each reduced to a single vector
    # by global max pooling.
    pooled_branches = []
    for filter_size in filter_sizes:
        conv = Conv1D(filters=num_filters, kernel_size=filter_size, activation='relu')(embedded_sequences)
        pooled_branches.append(GlobalMaxPooling1D()(conv))

    # Keras refuses to concatenate fewer than two tensors, so a single branch
    # is used directly.
    if len(pooled_branches) == 1:
        merged = pooled_branches[0]
    else:
        merged = concatenate(pooled_branches, axis=1)

    x = Dropout(0.1)(merged)
    x = Dense(128, activation='relu')(x)
    x = Dropout(0.2)(x)
    # NOTE(review): sigmoid + binary_crossentropy scores every label
    # independently. If the labels are mutually exclusive, softmax with
    # categorical_crossentropy is the usual choice -- left unchanged here so
    # existing training results stay comparable.
    preds = Dense(labels_index, activation='sigmoid')(x)

    model = Model(sequence_input, preds)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['acc'])
    model.summary()
    return model

In [21]:
def preprocessing(text):
    """Normalise a sentence: strip punctuation, lower-case it, drop stop words.

    Parameters
    ----------
    text : str
        Raw sentence.

    Returns
    -------
    str
        The remaining token texts joined by single spaces, with leading and
        trailing whitespace removed.
    """
    # Delete every character listed in string.punctuation. The earlier regex
    # built the character class by plain concatenation, so the "\]" inside
    # string.punctuation was read as an escaped "]" and backslashes survived.
    text_nopunct = text.translate(str.maketrans('', '', string.punctuation))
    doc = nlp(text_nopunct.lower())
    # Keep only tokens the spaCy pipeline does not flag as stop words.
    return " ".join(token.text for token in doc if not token.is_stop).strip()

def get_tokens(text):
    """Return the text of each token the spaCy pipeline yields for `text`, in order."""
    return [token.text for token in nlp(text)]


# Word vectors used further down to fill the embedding matrix (looked up by word string).
# NOTE(review): EMBEDDING_DIM is later set to 300 -- presumably these vectors are 300-d; confirm.
word2vec = gensim.models.KeyedVectors.load('custom_word2vec.model')
# Labelled sentences; the code below reads the SENTENCE and LABEL columns.
data = pd.read_csv('labelled_news.csv')

# One-hot encode LABEL into three indicator columns. Any value that is
# neither "POS" nor "NEG" is counted as neutral.
pos = [1 if label == "POS" else 0 for label in data.LABEL]
neg = [1 if label == "NEG" else 0 for label in data.LABEL]
neu = [0 if (label == "POS" or label == "NEG") else 1 for label in data.LABEL]

data['POS'] = pos
data['NEG'] = neg
data['NEU'] = neu


# Clean every sentence, then tokenise the cleaned text; show the first four rows.
data['Text_final'] = data['SENTENCE'].apply(preprocessing)
data['tokens'] = data['Text_final'].apply(get_tokens)
print(data.head(4))

# Hold out 10% of the rows for testing; the fixed seed keeps the split reproducible.
data_train, data_test = train_test_split(data, test_size=0.10, random_state=42)

# Corpus statistics for the training split.
all_training_words = [word for sentence in data_train["tokens"] for word in sentence]
training_sentence_lengths = list(map(len, data_train["tokens"]))
TRAINING_VOCAB = sorted(set(all_training_words))
print("%s words total, with a vocabulary size of %s" % (len(all_training_words), len(TRAINING_VOCAB)))
print("Max sentence length is %s" % max(training_sentence_lengths))

# The same statistics for the held-out split.
all_test_words = [word for sentence in data_test["tokens"] for word in sentence]
test_sentence_lengths = list(map(len, data_test["tokens"]))
TEST_VOCAB = sorted(set(all_test_words))
print("%s words total, with a vocabulary size of %s" % (len(all_test_words), len(TEST_VOCAB)))
print("Max sentence length is %s" % max(test_sentence_lengths))

MAX_SEQUENCE_LENGTH = 40
EMBEDDING_DIM = 300

# No num_words cap: the Keras tokenizer only emits indices below num_words, and
# the previous cap (len(TRAINING_VOCAB)) came from spaCy's tokenisation, which
# need not match the Keras vocabulary. That could silently drop the rarest
# words from the sequences even though the embedding matrix below has a row
# for every entry of word_index.
tokenizer = Tokenizer(lower=True, char_level=False)
train_texts = data_train["Text_final"].tolist()
tokenizer.fit_on_texts(train_texts)
training_sequences = tokenizer.texts_to_sequences(train_texts)

train_word_index = tokenizer.word_index

# Sequences are padded/truncated to MAX_SEQUENCE_LENGTH.
train_cnn_data = pad_sequences(training_sequences, maxlen=MAX_SEQUENCE_LENGTH)

# Row i holds the vector of the word with tokenizer index i. Row 0 (padding)
# and words missing from word2vec stay all-zero.
train_embedding_weights = np.zeros((len(train_word_index)+1, EMBEDDING_DIM))
for word, index in train_word_index.items():
    if word in word2vec:
        train_embedding_weights[index, :] = word2vec[word]

# The test split is encoded with the tokenizer fitted on the training split.
test_sequences = tokenizer.texts_to_sequences(data_test["Text_final"].tolist())
test_cnn_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)

# Training inputs and one-hot targets.
label_names = ['POS', 'NEG', 'NEU']
x_train = train_cnn_data
y_train = data_train[label_names].values
y_tr = y_train

num_epochs = 15
batch_size = 64

# Build the network, fit it on the whole training split (no validation
# hold-out), then score the test split and persist the model.
model = ConvNet(
    train_embedding_weights,
    MAX_SEQUENCE_LENGTH,
    len(train_word_index)+1,
    EMBEDDING_DIM,
    len(label_names),
)
hist = model.fit(x_train, y_tr, epochs=num_epochs, validation_split=0, shuffle=True, batch_size=batch_size)
predictions = model.predict(test_cnn_data, batch_size=64, verbose=1)
model.save("cnn_model.model")

# Map each prediction row to the label with the highest score. The order of
# `labels` matches the column order of `label_names`.
labels = ['POS','NEG','NEU']
prediction_labels = [labels[np.argmax(p)] for p in predictions]

# Accuracy is the fraction of test rows whose raw LABEL string equals the predicted label.
print("ACCURACY: ", sum(data_test.LABEL==prediction_labels)/len(prediction_labels))
print(data_test.LABEL.value_counts())


   Unnamed: 0                                           SENTENCE LABEL  POS  \
0           0  BBC News: Labour MPs pass Corbyn no-confidence...   POS    1   
1           1  2,500 Scientists To Australia: If You Want To ...   POS    1   
2           2  Today The United Kingdom decides whether to re...   NEU    0   
3           3  Canadian Rescue Plane successfully reaches Sou...   POS    1   

   NEG  NEU                                         Text_final  \
0    0    0  bbc news labour mps pass corbyn noconfidence m...   
1    0    0  2500 scientists australia want save great barr...   
2    0    1  today united kingdom decides remain european u...   
3    0    0  canadian rescue plane successfully reaches sou...   

                                              tokens  
0  [bbc, news, labour, mps, pass, corbyn, noconfi...  
1  [2500, scientists, australia, want, save, grea...  
2  [today, united, kingdom, decides, remain, euro...  
3  [canadian, rescue, plane, successfully, reache... 



Model: "model_17"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_17 (InputLayer)           (None, 40)           0                                            
__________________________________________________________________________________________________
embedding_17 (Embedding)        (None, 40, 300)      2088900     input_17[0][0]                   
__________________________________________________________________________________________________
conv1d_81 (Conv1D)              (None, 39, 200)      120200      embedding_17[0][0]               
__________________________________________________________________________________________________
conv1d_82 (Conv1D)              (None, 38, 200)      180200      embedding_17[0][0]               
___________________________________________________________________________________________