In [1]:
# Packages
import nltk
import numpy as np
from scipy import stats
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from tqdm import tqdm
warnings.filterwarnings('ignore')

In [2]:
# Plot settings
sns.set_context('notebook') 
sns.set_style('ticks') 
colours = ['#1F77B4', '#FF7F0E', '#2CA02C', '#DB2728', '#9467BD', '#8C564B', '#E377C2','#7F7F7F', '#BCBD22', '#17BECF']
crayon = ['#4E79A7','#F28E2C','#E15759','#76B7B2','#59A14F', '#EDC949','#AF7AA1','#FF9DA7','#9C755F','#BAB0AB']
sns.set_palette(colours)
%matplotlib inline
plt.rcParams['figure.figsize'] = (9, 6)

# Import dataset

In [3]:
from sklearn.model_selection import train_test_split, cross_val_score

In [4]:
train = pd.read_csv('train.csv', header=None)
train['Text'] = train[1]+' '+train[2]
train = train.drop(columns=[1,2])

In [5]:
train[0] = train[0].map(lambda x: x-1)
train.rename(columns={0:'Sentiment'},inplace=True)

In [6]:
# Generate a random and balanced small sample for coding 
train_pos = train[train['Sentiment']==1].sample(frac=1)[:2500]
train_neg = train[train['Sentiment']==0].sample(frac=1)[:2500]

train_sp = pd.concat([train_pos, train_neg], axis=0, ignore_index=True)
train_sp = train_sp.sample(frac=1).reset_index(drop=True)

In [7]:
train_sp.tail()

Unnamed: 0,Sentiment,Text
4995,1,What version of Gobe Productive BeOS operating...
4996,0,AWFUL BELLY DANCING! Amira Mor is a disgrace t...
4997,1,Now this is a classic screw all the family-ori...
4998,0,Hunk-O-Junk I agree with those who've said the...
4999,1,Excellent!!! Tiffen 67mm Hollywood FX Glamour ...


In [8]:
train_sp['Sentiment'].value_counts()

1    2500
0    2500
Name: Sentiment, dtype: int64

# Text Processing

In [9]:
# 1. Tokenisation (casual module)
from nltk.tokenize import TweetTokenizer
Tokenizer = TweetTokenizer()
# 2. Remove punctuation
import string
# 3. Remove stopwords
from nltk.corpus import stopwords
# 4. Stemming
from nltk.stem.porter import PorterStemmer

In [10]:
def process_text(text):
    text = str(text)
    tokenized = Tokenizer.tokenize(text)
    tokenized_no_punctuation = [word.lower() for word in tokenized if word not in string.punctuation]
    tokenized_no_stopwords = [word for word in tokenized_no_punctuation if word not in stopwords.words('english')]
    token = [PorterStemmer().stem(word) for word in tokenized_no_stopwords if word != '️']
    token = ' '.join(token)
    return token

In [103]:
def process_text_ez(text):
    text = str(text)
    tokenized = Tokenizer.tokenize(text)
    tokenized_lower = [word.lower() for word in tokenized]
    tokenized_stem = [PorterStemmer().stem(word) for word in tokenized_lower if word != '']
    token = ' '.join(tokenized_stem)
    return token

In [11]:
train_sp['Token'] = train_sp['Text'].apply(process_text)

In [104]:
train_sp['Text'] = train_sp['Text'].apply(process_text_ez)

In [105]:
train_sp.tail()

Unnamed: 0,Sentiment,Text,Token
4995,1,what version of gobe product beo oper system i...,version gobe product beo oper system great fas...
4996,0,aw belli danc ! amira mor is a disgrac to the ...,aw belli danc amira mor disgrac art belli danc...
4997,1,now thi is a classic screw all the family-orie...,classic screw family-orient garbag taint movi ...
4998,0,hunk-o-junk i agre with those who'v said the b...,hunk-o-junk agre who'v said batteri life phone...
4999,1,excel ! ! ! tiffen 67mm hollywood fx glamour f...,excel tiffen 67mm hollywood fx glamour filter ...


In [106]:
# Randomly split indexes 80/20
train_x, valid_x, train_y, valid_y = train_test_split(train_sp['Token'], train_sp['Sentiment'], train_size=0.8, random_state=1, stratify=train_sp.Sentiment)

## Count Vectors (BoW) as features

In [107]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# 1-gram
count_vect_1n = CountVectorizer(analyzer='word', ngram_range=(1,1), token_pattern=r'\w{1,}', max_features=5000)
count_vect_1n.fit(train_sp['Token'])

# 2-gram
count_vect_2n = CountVectorizer(analyzer='word', ngram_range=(2,2), token_pattern=r'\w{1,}', max_features=5000)
count_vect_2n.fit(train_sp['Token'])

# 3-gram
count_vect_3n = CountVectorizer(analyzer='word', ngram_range=(3,3), token_pattern=r'\w{1,}', max_features=5000)
count_vect_3n.fit(train_sp['Token'])

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=5000, min_df=1,
        ngram_range=(3, 3), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='\\w{1,}', tokenizer=None,
        vocabulary=None)

In [108]:
# Transform the training and validation data using count vectorizer object
xtrain_count_1n =  count_vect_1n.transform(train_x)
xvalid_count_1n =  count_vect_1n.transform(valid_x)

xtrain_count_2n =  count_vect_2n.transform(train_x)
xvalid_count_2n =  count_vect_2n.transform(valid_x)

xtrain_count_3n =  count_vect_3n.transform(train_x)
xvalid_count_3n =  count_vect_3n.transform(valid_x)

## TF-IDF Vectors + N-gram as features

In [109]:
# 1-gram
tfidf_vect_1n = TfidfVectorizer(analyzer='word', ngram_range=(1,1), token_pattern=r'\w{1,}', max_features=5000)
tfidf_vect_1n.fit(train_sp['Token'])

# 2-gram
tfidf_vect_2n = TfidfVectorizer(analyzer='word', ngram_range=(2,2), token_pattern=r'\w{1,}', max_features=5000)
tfidf_vect_2n.fit(train_sp['Token'])

# 3-gram
tfidf_vect_3n = TfidfVectorizer(analyzer='word', ngram_range=(3,3), token_pattern=r'\w{1,}', max_features=5000)
tfidf_vect_3n.fit(train_sp['Token'])

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=5000, min_df=1,
        ngram_range=(3, 3), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='\\w{1,}', tokenizer=None, use_idf=True,
        vocabulary=None)

In [110]:
xtrain_tfidf_1n =  tfidf_vect_1n.transform(train_x)
xvalid_tfidf_1n =  tfidf_vect_1n.transform(valid_x)

xtrain_tfidf_2n =  tfidf_vect_2n.transform(train_x)
xvalid_tfidf_2n =  tfidf_vect_2n.transform(valid_x)

xtrain_tfidf_3n =  tfidf_vect_3n.transform(train_x)
xvalid_tfidf_3n =  tfidf_vect_3n.transform(valid_x)

## Word2vec as features

In [111]:
from gensim.models import word2vec

In [112]:
sentence = [(i.split()) for i in train_sp.Text]
w2vmodel = word2vec.Word2Vec(sentence, min_count=1, size=300, workers=4)

In [113]:
w2vmodel.most_similar(['book'])

[('review', 0.9352136850357056),
 ('movi', 0.9123331308364868),
 ('one', 0.911530613899231),
 ('cd', 0.9013593792915344),
 ('dvd', 0.8915457725524902),
 ('album', 0.8906834721565247),
 ('teleplay', 0.8861536979675293),
 ('film', 0.8826782703399658),
 ('game', 0.8803355693817139),
 ('caliber.th', 0.8781176805496216)]

In [114]:
train_w2v_x, valid_w2v_x, train_w2v_y, valid_w2v_y = train_test_split(train_sp['Text'], train_sp['Sentiment'], train_size=0.8, random_state=1, stratify=train_sp.Sentiment)

In [115]:
def get_sent_vec(size, sent, model):
    vec = np.zeros(size).reshape(1,size)
    count = 0
    for word in sent:
        try:
            vec += model[word].reshape(1,size)
            count += 1
        except:
            continue
    if count != 0:
        vec /= count
    return vec

def get_train_vec(train_x, valid_x, model):
    train_vec = np.concatenate([get_sent_vec(300, sent, model) for sent in train_x])
    test_vec = np.concatenate([get_sent_vec(300, sent, model) for sent in valid_x])
    return train_vec, test_vec

In [125]:
# Lower + No punctuation + No stopwords+ Stemming
train_vec, valid_vec = get_train_vec(train_x, valid_x, w2vmodel)
# Lower words + Stemming
train_w2v_vec, valid_w2v_vec = get_train_vec(train_w2v_x, valid_w2v_x, w2vmodel)

# Model Training (Traditional ML)

In [120]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, roc_auc_score, f1_score, confusion_matrix, roc_curve
from statlearning import plot_confusion_matrix

In [121]:
from sklearn import naive_bayes, svm, linear_model

def train_model(classifier, feature_vector_train, label, feature_vector_valid, is_neural_net=False):
    # fit the training dataset on the classifier
    trm = classifier.fit(feature_vector_train, label)
    
    # predict the labels on validation dataset
    predictions = trm.predict(feature_vector_valid)
    
    if is_neural_net:
        predictions = predictions.argmax(axis=-1)

    print('Accuracy:', accuracy_score(predictions, valid_y).round(4))
    print('Precision:', precision_score(predictions, valid_y).round(4))
    print('Recall:', recall_score(predictions, valid_y).round(4))
    print('F1 Score:', f1_score(predictions, valid_y).round(4))
    '''
    confusion = confusion_matrix(predictions, valid_y)
    fig, ax = plt.subplots(figsize=(8,6))
    plot_confusion_matrix(confusion, classes=['negative','positive'], normalize=True)
    plt.show()
    '''
    #print('AUC:', roc_auc_score(predictions, proba_y[:,1]).round(3),'\n')

##  Naive Bayes

In [122]:
print("# Naive Bayes + Count Vectors + 1-gram")
train_model(naive_bayes.BernoulliNB(), xtrain_count_1n, train_y, xvalid_count_1n)

print("\n# Naive Bayes + Count Vectors + 2-gram")
train_model(naive_bayes.BernoulliNB(), xtrain_count_2n, train_y, xvalid_count_2n)

print("\n# Naive Bayes + Count Vectors + 3-gram")
train_model(naive_bayes.BernoulliNB(), xtrain_count_3n, train_y, xvalid_count_3n)

# Naive Bayes + Count Vectors + 1-gram
Accuracy: 0.836
Precision: 0.87
Recall: 0.8146
F1 Score: 0.8414

# Naive Bayes + Count Vectors + 2-gram
Accuracy: 0.753
Precision: 0.82
Recall: 0.7231
F1 Score: 0.7685

# Naive Bayes + Count Vectors + 3-gram
Accuracy: 0.609
Precision: 0.85
Recall: 0.5735
F1 Score: 0.6849


In [123]:
print("# Naive Bayes + TF-IDF + 1-gram")
train_model(naive_bayes.BernoulliNB(), xtrain_tfidf_1n, train_y, xvalid_tfidf_1n)

print("\n# Naive Bayes + TF-IDF + 2-gram")
train_model(naive_bayes.BernoulliNB(), xtrain_tfidf_2n, train_y, xvalid_tfidf_2n)

print("\n# Naive Bayes + TF-IDF + 3-gram")
train_model(naive_bayes.BernoulliNB(), xtrain_tfidf_3n, train_y, xvalid_tfidf_3n)

# Naive Bayes + TF-IDF + 1-gram
Accuracy: 0.836
Precision: 0.87
Recall: 0.8146
F1 Score: 0.8414

# Naive Bayes + TF-IDF + 2-gram
Accuracy: 0.753
Precision: 0.82
Recall: 0.7231
F1 Score: 0.7685

# Naive Bayes + TF-IDF + 3-gram
Accuracy: 0.609
Precision: 0.85
Recall: 0.5735
F1 Score: 0.6849


In [128]:
print("# Naive Bayes + 1234 + Word2vec")
train_model(naive_bayes.BernoulliNB(), train_vec, train_y, valid_vec)

print("\n# Naive Bayes + 14 + Word2vec")
train_model(naive_bayes.BernoulliNB(), train_w2v_vec, train_y, valid_w2v_vec)

# Naive Bayes + 1234 + Word2vec
Accuracy: 0.549
Precision: 0.562
Recall: 0.5478
F1 Score: 0.5548

# Naive Bayes + 14 + Word2vec
Accuracy: 0.545
Precision: 0.428
Recall: 0.5587
F1 Score: 0.4847


## SVM 

In [36]:
print("SVM + Count Vectors + 1-gram")
train_model(svm.SVC(verbose=True), xtrain_count_1n, train_y, xvalid_count_1n)

print("\nSVM + Count Vectors + 2-gram")
train_model(svm.SVC(verbose=True), xtrain_count_2n, train_y, xvalid_count_2n)

print("\nSVM + Count Vectors + 3-gram")
train_model(svm.SVC(verbose=True), xtrain_count_3n, train_y, xvalid_count_3n)

SVM + Count Vectors + 1-gram
[LibSVM]Accuracy: 0.772
Precision: 0.798
Recall: 0.7586
F1 Score: 0.7778

SVM + Count Vectors + 2-gram
[LibSVM]Accuracy: 0.538
Precision: 0.972
Recall: 0.5203
F1 Score: 0.6778

SVM + Count Vectors + 3-gram
[LibSVM]Accuracy: 0.535
Precision: 0.97
Recall: 0.5187
F1 Score: 0.676


In [28]:
print("SVM + TF-IDF + 1-gram")
train_model(svm.SVC(), xtrain_tfidf_1n, train_y, xvalid_tfidf_1n)

print("\nSVM + TF-IDF + 2-gram")
train_model(svm.SVC(), xtrain_tfidf_2n, train_y, xvalid_tfidf_2n)

print("\nSVM + TF-IDF + 3-gram")
train_model(svm.SVC(), xtrain_tfidf_3n, train_y, xvalid_tfidf_3n)

SVM + TF-IDF + 1-gram
Accuracy: 0.798
Precision: 0.914
Recall: 0.7419
F1 Score: 0.819

SVM + TF-IDF + 2-gram
Accuracy: 0.556
Precision: 0.986
Recall: 0.5301
F1 Score: 0.6895

SVM + TF-IDF + 3-gram
Accuracy: 0.513
Precision: 0.996
Recall: 0.5066
F1 Score: 0.6716


In [129]:
print("SVM + 1234 + Word2vec")
train_model(svm.SVC(), train_vec, train_y, valid_vec)

print("\nSVM + 14 + Word2vec")
train_model(svm.SVC(), train_w2v_vec, train_y, valid_w2v_vec)

SVM + 1234 + Word2vec
Accuracy: 0.553
Precision: 0.33
Recall: 0.5957
F1 Score: 0.4247

SVM + 14 + Word2vec
Accuracy: 0.514
Precision: 0.094
Recall: 0.5875
F1 Score: 0.1621


## Logistic Regression 

In [29]:
print("Logistic Regression + Count Vectors + 1-gram")
train_model(linear_model.LogisticRegression(), xtrain_count_1n, train_y, xvalid_count_1n)

print("\nLogistic Regression + Count Vectors + 2-gram")
train_model(linear_model.LogisticRegression(), xtrain_count_2n, train_y, xvalid_count_2n)

print("\nLogistic Regression + Count Vectors + 3-gram")
train_model(linear_model.LogisticRegression(), xtrain_count_3n, train_y, xvalid_count_3n)

Logistic Regression + Count Vectors + 1-gram
Accuracy: 0.833
Precision: 0.846
Recall: 0.8246
F1 Score: 0.8351

Logistic Regression + Count Vectors + 2-gram
Accuracy: 0.74
Precision: 0.76
Recall: 0.7308
F1 Score: 0.7451

Logistic Regression + Count Vectors + 3-gram
Accuracy: 0.619
Precision: 0.838
Recall: 0.5828
F1 Score: 0.6874


In [30]:
print("Logistic Regression + TF-IDF + 1-gram")
train_model(linear_model.LogisticRegression(), xtrain_tfidf_1n, train_y, xvalid_tfidf_1n)

print("\nLogistic Regression + TF-IDF + 2-gram")
train_model(linear_model.LogisticRegression(), xtrain_tfidf_2n, train_y, xvalid_tfidf_3n)

print("\nLogistic Regression + TF-IDF + 3-gram")
train_model(linear_model.LogisticRegression(), xtrain_tfidf_3n, train_y, xvalid_tfidf_3n)

Logistic Regression + TF-IDF + 1-gram
Accuracy: 0.856
Precision: 0.862
Recall: 0.8518
F1 Score: 0.8569

Logistic Regression + TF-IDF + 2-gram
Accuracy: 0.536
Precision: 0.696
Recall: 0.5273
F1 Score: 0.6

Logistic Regression + TF-IDF + 3-gram
Accuracy: 0.617
Precision: 0.834
Recall: 0.5816
F1 Score: 0.6853


In [130]:
print("Logistic Regression + 1234 + Word2vec")
train_model(linear_model.LogisticRegression(), train_vec, train_y, valid_vec)

print("\nLogistic Regression + 14 + Word2vec")
train_model(linear_model.LogisticRegression(), train_w2v_vec, train_y, valid_w2v_vec)

Logistic Regression + 1234 + Word2vec
Accuracy: 0.551
Precision: 0.59
Recall: 0.5473
F1 Score: 0.5679

Logistic Regression + 14 + Word2vec
Accuracy: 0.584
Precision: 0.586
Recall: 0.5837
F1 Score: 0.5848


# Model Training (DNN)

In [205]:
from keras.preprocessing import text, sequence
from keras import layers, models, optimizers

## CNN

In [233]:
MAX_SEQUENCE_LENGTH = 128 
EMBEDDING_DIM = 300 
VALIDATION_SPLIT = 0.15
TEST_SPLIT = 0.2

In [207]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
import numpy as np

# One-hot
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_sp['Token'])
sequences = tokenizer.texts_to_sequences(train_sp['Token'])
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
labels = to_categorical(np.asarray(train_sp['Sentiment']))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

Found 18340 unique tokens.
Shape of data tensor: (5000, 128)
Shape of label tensor: (5000, 2)


In [208]:
p1 = int(len(data)*(1-VALIDATION_SPLIT-TEST_SPLIT))
p2 = int(len(data)*(1-TEST_SPLIT))
x_train = data[:p1]
y_train = labels[:p1]
x_val = data[p1:p2]
y_val = labels[p1:p2]
x_test = data[p2:]
y_test = labels[p2:]
print('train rows: '+str(len(x_train)))
print('valid rows: '+str(len(x_val)))
print('test rows: '+str(len(x_test)))

train rows: 3249
valid rows: 751
test rows: 1000


In [209]:
from keras.layers import Dense, Input, Flatten, Dropout
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Sequential

cnn = Sequential()
cnn.add(Embedding(len(word_index) + 1, EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH))
cnn.add(Dropout(0.2))
cnn.add(Conv1D(250, 3, padding='valid', activation='relu', strides=1))
cnn.add(MaxPooling1D(3))
cnn.add(Flatten())
cnn.add(Dense(EMBEDDING_DIM, activation='relu'))
cnn.add(Dense(labels.shape[1], activation='softmax'))
cnn.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_37 (Embedding)     (None, 128, 300)          5502300   
_________________________________________________________________
dropout_28 (Dropout)         (None, 128, 300)          0         
_________________________________________________________________
conv1d_19 (Conv1D)           (None, 126, 250)          225250    
_________________________________________________________________
max_pooling1d_18 (MaxPooling (None, 42, 250)           0         
_________________________________________________________________
flatten_16 (Flatten)         (None, 10500)             0         
_________________________________________________________________
dense_57 (Dense)             (None, 300)               3150300   
_________________________________________________________________
dense_58 (Dense)             (None, 2)                 602       
Total para

In [210]:
# CNN + One-hot
cnn.compile(loss='binary_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])
cnn.fit(x_train, y_train, validation_data=(x_val, y_val), epochs=5, batch_size=128)
cnn.evaluate(x_test, y_test)

Train on 3249 samples, validate on 751 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


[0.655867104113102, 0.8]

In [211]:
# Word2vec
from keras.utils import plot_model
from keras.layers import Embedding

embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items(): 
    if str(word) in w2vmodel:
        embedding_matrix[i] = np.asarray(w2vmodel[str(word)],dtype='float32')
        
embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)


In [212]:
# CNN + Word2vec
from keras.layers import Dense, Input, Flatten, Dropout
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Sequential

cnn_w2v = Sequential()
cnn_w2v.add(embedding_layer)
cnn_w2v.add(Dropout(0.2))
cnn_w2v.add(Conv1D(250, 3, padding='valid', activation='relu', strides=1))
cnn_w2v.add(MaxPooling1D(3))
cnn_w2v.add(Flatten())
cnn_w2v.add(Dense(EMBEDDING_DIM, activation='relu'))
cnn_w2v.add(Dense(labels.shape[1], activation='softmax'))
cnn_w2v.summary()

cnn_w2v.compile(loss='binary_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])
cnn_w2v.fit(x_train, y_train, validation_data=(x_val, y_val), epochs=5, batch_size=128)
cnn_w2v.evaluate(x_test, y_test)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_38 (Embedding)     (None, 128, 300)          5502300   
_________________________________________________________________
dropout_29 (Dropout)         (None, 128, 300)          0         
_________________________________________________________________
conv1d_20 (Conv1D)           (None, 126, 250)          225250    
_________________________________________________________________
max_pooling1d_19 (MaxPooling (None, 42, 250)           0         
_________________________________________________________________
flatten_17 (Flatten)         (None, 10500)             0         
_________________________________________________________________
dense_59 (Dense)             (None, 300)               3150300   
_________________________________________________________________
dense_60 (Dense)             (None, 2)                 602       
Total para

[0.6826513829231262, 0.604]

## LSTM 

In [213]:
from keras.layers import Dense, Input, Flatten, Dropout
from keras.layers import LSTM, MaxPooling1D, Embedding
from keras.models import Sequential

lstm = Sequential()
lstm.add(Embedding(len(word_index) + 1, EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH))
lstm.add(Dropout(0.2))
lstm.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
lstm.add(Dense(1, activation='sigmoid'))
lstm.add(Dense(EMBEDDING_DIM, activation='relu'))
lstm.add(Dense(labels.shape[1], activation='softmax'))
lstm.summary()

lstm.compile(loss='binary_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])
lstm.fit(x_train, y_train, validation_data=(x_val, y_val), epochs=5, batch_size=128)
lstm.evaluate(x_test, y_test)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_39 (Embedding)     (None, 128, 300)          5502300   
_________________________________________________________________
dropout_30 (Dropout)         (None, 128, 300)          0         
_________________________________________________________________
lstm_20 (LSTM)               (None, 128)               219648    
_________________________________________________________________
dense_61 (Dense)             (None, 1)                 129       
_________________________________________________________________
dense_62 (Dense)             (None, 300)               600       
_________________________________________________________________
dense_63 (Dense)             (None, 2)                 602       
Total params: 5,723,279
Trainable params: 5,723,279
Non-trainable params: 0
_________________________________________________________________


[0.49554140675067904, 0.807]

In [234]:
# Word2vec
from keras.utils import plot_model
from keras.layers import Embedding

embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items(): 
    if str(word) in w2vmodel:
        embedding_matrix[i] = np.asarray(w2vmodel[str(word)],dtype='float32')
        
embedding_layer = Embedding(len(word_index) + 1, EMBEDDING_DIM, weights=[embedding_matrix], input_length=MAX_SEQUENCE_LENGTH, trainable=False)


In [236]:
# CNN + Word2vec
from keras.layers import Dense, Input, Flatten, Dropout
from keras.layers import LSTM, MaxPooling1D, Embedding
from keras.models import Sequential

lstm_w2v = Sequential()
lstm_w2v.add(embedding_layer)
lstm_w2v.add(Dropout(0.2))
lstm_w2v.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
lstm_w2v.add(Dense(1, activation='sigmoid'))
lstm_w2v.add(Dense(EMBEDDING_DIM, activation='relu'))
lstm_w2v.add(Dense(labels.shape[1], activation='softmax'))
lstm_w2v.summary()

lstm_w2v.compile(loss='binary_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])
lstm_w2v.fit(x_train, y_train, validation_data=(x_val, y_val), epochs=5, batch_size=128)
lstm_w2v.evaluate(x_test, y_test)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_43 (Embedding)     (None, 128, 300)          5502300   
_________________________________________________________________
dropout_42 (Dropout)         (None, 128, 300)          0         
_________________________________________________________________
lstm_33 (LSTM)               (None, 128)               219648    
_________________________________________________________________
dense_64 (Dense)             (None, 1)                 129       
_________________________________________________________________
dense_65 (Dense)             (None, 300)               600       
_________________________________________________________________
dense_66 (Dense)             (None, 2)                 602       
Total params: 5,723,279
Trainable params: 220,979
Non-trainable params: 5,502,300
____________________________________________________________

[0.6435657901763916, 0.654]

## Hybrid Model

In [271]:
# CNN + LSTM
from keras.layers import Dense, Input, Flatten, Dropout
from keras.layers import LSTM, Conv1D, MaxPooling1D, Embedding, TimeDistributed
from keras.models import Sequential

hybrid = Sequential()
hybrid.add(Embedding(len(word_index) + 1, EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH))
hybrid.add(Dropout(0.2))
hybrid.add(Conv1D(250, 3, activation='relu'))
hybrid.add(MaxPooling1D(pool_size=3))
hybrid.add(LSTM(128))
hybrid.add(Dense(2, activation='sigmoid'))
hybrid.add(Dense(EMBEDDING_DIM, activation='relu'))
hybrid.add(Dense(labels.shape[1], activation='softmax'))
hybrid.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
hybrid.summary()

hybrid.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])
hybrid.fit(x_train, y_train, validation_data=(x_val, y_val), epochs=5, batch_size=128)
hybrid.evaluate(x_test, y_test)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_78 (Embedding)     (None, 128, 300)          5502300   
_________________________________________________________________
dropout_72 (Dropout)         (None, 128, 300)          0         
_________________________________________________________________
conv1d_55 (Conv1D)           (None, 126, 250)          225250    
_________________________________________________________________
max_pooling1d_42 (MaxPooling (None, 42, 250)           0         
_________________________________________________________________
lstm_58 (LSTM)               (None, 128)               194048    
_________________________________________________________________
dense_73 (Dense)             (None, 2)                 258       
_________________________________________________________________
dense_74 (Dense)             (None, 300)               900       
__________

[0.5607659213542938, 0.818]

# Optimisation (CV: Parameters & Features Selection)

# Reflection