In [60]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
from keras.preprocessing import sequence, text
from keras import metrics
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional, BatchNormalization, Activation, Conv1D, MaxPooling1D, Flatten, GlobalMaxPooling1D
from keras.models import load_model
from keras.callbacks import ModelCheckpoint
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from keras.utils.np_utils import to_categorical
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from keras.utils import plot_model
np.random.seed(1)

In [61]:
df = pd.read_excel('1-100.xlsx',encoding='utf-8')
df_review_for_vocab = pd.read_csv('Tagged_Data_Values.csv',encoding='utf-8')
df_review_for_vocab = df_review_for_vocab[:5000]
df_review_for_vocab = df_review_for_vocab.dropna()
maxlen = 50
batch_size = 128

In [62]:
tok = text.Tokenizer(num_words=200000)
tok.fit_on_texts(df_review_for_vocab['review_body'].tolist())
x = tok.texts_to_sequences(df['review_body'])
x = sequence.pad_sequences(x, maxlen=maxlen)
y = df['tag']
encoder = LabelEncoder()
encoder.fit(y)
y = encoder.transform(y)
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2)
word_index = tok.word_index

In [63]:
embeddings_index = {}
f = open('glove.840B.300d.txt',encoding="utf8")
for line in f:
    values = line.split()
    word = values[0]
    try:
        coefs = np.asarray(values[1:], dtype='float32')
    except:
        pass
    embeddings_index[word] = coefs
f.close()

In [64]:
embedding_matrix = np.zeros((len(word_index) + 1, 300))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [70]:
model1 = Sequential()
model1.add(Embedding(len(word_index) + 1,300,weights=[embedding_matrix],input_length=maxlen,trainable=True))
model1.add(Dropout(0.6))
model1.add(Bidirectional(LSTM(150,recurrent_dropout=0.6)))
model1.add(Dropout(0.6))
model1.add(Dense(1, activation='sigmoid'))
model1.compile('adam', 'binary_crossentropy', metrics=['accuracy'])
model1_history = model1.fit(x_train, y_train, batch_size=batch_size, epochs=20,
                            validation_split=0.1)
score1, acc1 = model1.evaluate(x_test, y_test,
                               batch_size=batch_size)
print('Test accuracy for BiLSTM+Glove Model is:', acc1)
y_pred1 = model1.predict(x_test)
y_pred1 = (y_pred1 > 0.5)
print(classification_report(y_test, y_pred1))

Train on 72 samples, validate on 8 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Test accuracy for BiLSTM+Glove Model is: 0.8500000238418579
              precision    recall  f1-score   support

           0       1.00      0.73      0.84        11
           1       0.75      1.00      0.86         9

    accuracy                           0.85        20
   macro avg       0.88      0.86      0.85        20
weighted avg       0.89      0.85      0.85        20



In [66]:
model2 = Sequential()
model2.add(Embedding(len(word_index) + 1,100,input_length=maxlen))
model2.add(Dropout(0.2))
model2.add(Conv1D(100,5,padding='valid',activation='relu',strides=2))
model2.add(GlobalMaxPooling1D())
model2.add(Dense(100, activation='relu'))
model2.add(Dropout(0.5))
model2.add(Dense(1, activation='sigmoid'))
model2.compile('adam', 'binary_crossentropy', metrics=['accuracy'])
model2_history = model2.fit(x_train, y_train, batch_size=batch_size, epochs=19,
                            validation_split=0.1)
score2, acc2 = model2.evaluate(x_test, y_test, batch_size=batch_size)
print('Test accuracy for CNN+Dense Model is:', acc2)
y_pred2 = model2.predict(x_test)
y_pred2 = (y_pred2 > 0.5)
print(classification_report(y_test, y_pred2))

Train on 72 samples, validate on 8 samples
Epoch 1/19
Epoch 2/19
Epoch 3/19
Epoch 4/19
Epoch 5/19
Epoch 6/19
Epoch 7/19
Epoch 8/19
Epoch 9/19
Epoch 10/19
Epoch 11/19
Epoch 12/19
Epoch 13/19
Epoch 14/19
Epoch 15/19
Epoch 16/19
Epoch 17/19
Epoch 18/19
Epoch 19/19
Test accuracy for CNN+Dense Model is: 0.8500000238418579
              precision    recall  f1-score   support

           0       1.00      0.73      0.84        11
           1       0.75      1.00      0.86         9

    accuracy                           0.85        20
   macro avg       0.88      0.86      0.85        20
weighted avg       0.89      0.85      0.85        20



In [67]:
file_in = "Tagged_Data_Values.csv"
df_unlabeled = pd.read_csv(file_in)
df_unlabeled = df_unlabeled['review_body']
df_unlabeled = df_unlabeled.dropna()
df_unlabeled = df_unlabeled[1000:5000]
df_unlabeled = df_unlabeled.reset_index()
df_unlabeled = df_unlabeled.drop(columns = ['index'])

print(file_in)
print(df_unlabeled.dtypes)
print("Sample size:", len(df))
print(df_unlabeled.head(5))

Tagged_Data_Values.csv
review_body    object
dtype: object
Sample size: 100
                                         review_body
0                Love it works awesome on my Sony A7
1  I would not to recommend to anybody to but the...
2  This bag has good cushioning and the insert in...
3  While this camera seems pretty good, please be...
4  This let me add 3 additional lens from an old ...


In [71]:
x_unlabeled = tok.texts_to_sequences(df_unlabeled['review_body'])
x_unlabeled = sequence.pad_sequences(x_unlabeled, maxlen=maxlen)

result1 = model1.predict(x_unlabeled)
result2 = model2.predict(x_unlabeled)

result1 = (result1 > 0.5)
result2 = (result2 > 0.5)

In [72]:
difference = []
for i in range(len(result1)):
    if result1[i] != result2[i]:
        difference.append(i)
len(difference)


159

In [75]:
df_unlabeled = df_unlabeled.iloc[difference,]

In [76]:
df_unlabeled.to_excel('combat_unlabeled.xlsx',index=False)

In [87]:
df_unlabeled.head().review_body.to_list()

['Like many other reviewers, I must say you get what you pay for.  The battery works, and keeps my laptop charged.  Even after only having it for a few months the charge rarely lasts more than an hour and a half (in power save mode).',
 'This was probably the best camera that I have ever purchased.  I am totally pleased with the pictures that were taken on it while on vacation.  This camera is very easy to use even for my tech challenged wife lol.  The pictures were clear and sharp',
 'Like the case and it fits my camera body with 55-200 attached lense. However, manufacture should reconsider placement of shoulder strap.  It does not hang flat against your body and is a little awkward.  But, I am keeping it because it is light and stylish.',
 'Wonderful fit, for what I require in a product and I think this product is worth the price and when it comes to saving, Amazon is GREAT ON A SCALE 1-10 the product rate as a 10, highly recommend, overall this is a great product.',
 'This suction m

In [47]:
x_unlabeled = tok.texts_to_sequences(df_unlabeled['review_body'])
len(x_unlabeled)

1000