In [3]:
import pandas as pd
import numpy as np
import nltk
import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences


In [4]:
# read csv dataset then edit

In [5]:

df = pd.read_csv("googleplaystore_user_reviews.csv", na_values="nan")
df = df.dropna(subset=['App','Translated_Review','Sentiment'], how ='any')
df['Sentiment'] = df['Sentiment'].replace(['Positive'],'1')
df['Sentiment'] = df['Sentiment'].replace(['Negative'],'0')
df['Sentiment'] = df['Sentiment'].replace(['Neutral'],'1')


In [6]:
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# reviews are categorized as lines

In [7]:
review_lines = list()
lines = df['Translated_Review'].values.tolist()
print (len(lines))

37427


# tokenization and removing punctuation and stop words

In [8]:
for line in lines :
    tokens = word_tokenize(line)
    tokens = [w.lower() for w in tokens]
    table =str.maketrans('','',string.punctuation)
    stripped = [w.translate(table) for w in tokens]
    words = [word for word in stripped if word.isalpha()]
    stop_words = set(stopwords.words('english'))
    words = [w for w in words if not w in stop_words]
    review_lines.append(words)
len(review_lines)
#print(review_lines)
    

37427

# word2vec model

In [9]:
import gensim

model = gensim.models.Word2Vec(sentences=review_lines,size=100,window = 5,workers =4,min_count=1)
words = list(model.wv.vocab)
print('total word: %d' %len(words))



total word: 21481


# saving the model

In [10]:
filename = 'r.txt'
model.wv.save_word2vec_format(filename,binary=False)



  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


# word embedding as a directory of words to vectors

In [11]:
import os
embeddings_index = {}
f = open(os.path.join('','r.txt'),encoding = "utf-8")
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:])
    embeddings_index[word]=coefs
f.close()


# converting the word embedding into tokenized vector

In [12]:
tk = Tokenizer()
tk.fit_on_texts(review_lines)
sequences = tk.texts_to_sequences(review_lines)
word_index = tk.word_index
print("found %s unique tokens " % len(word_index))
review_pad = pad_sequences(sequences,maxlen=100)
sentiment = df['Sentiment'].values
print('Shape of review ', review_pad.shape)
print('shape of senti' , sentiment.shape)

found 21481 unique tokens 
Shape of review  (37427, 100)
shape of senti (37427,)


# map embeddings from the loaded word2vec model for each word 

In [13]:
 num_words = len(word_index) + 1
embedd = np.zeros((num_words,100))

for word , i in word_index.items():
    if i > num_words:
        continue
    embedd_vec = embeddings_index.get(word)
    if embedd_vec is not None:
        embedd[i] = embedd_vec     
print(num_words)

21482


In [14]:
#training params
batch_size = 64 
num_epochs = 10

#model parameters
num_filters = 64 
embed_dim = 100

# embedding matrix as input to the Embedding layer

In [54]:
from keras.models import Sequential
from keras.layers.embeddings import Embedding
from keras.layers.core import Dense, Activation, Dropout, Flatten
from keras.layers.convolutional import Conv1D, MaxPooling1D
print("training CNN ...")
model = Sequential()
model.add(Embedding(num_words, embed_dim,
          weights=[embedd], input_length=100, trainable=False))
model.add(Conv1D(num_filters, 7, activation='sigmoid', padding='same'))
model.add(MaxPooling1D(2))
model.add(Conv1D(num_filters, 7, activation='sigmoid', padding='same'))
model.add(MaxPooling1D(2))
model.add(Conv1D(num_filters, 7, activation='sigmoid', padding='same'))
model.add(Conv1D(num_filters, 7, activation='sigmoid', padding='same'))
model.add(Conv1D(num_filters, 7, activation='sigmoid', padding='same'))
model.add(MaxPooling1D(2))
model.add(Flatten())
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))  
model.compile(loss='mean_squared_error', optimizer='adam', metrics=['accuracy'])
model.summary()

training CNN ...
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_9 (Embedding)      (None, 100, 100)          2148200   
_________________________________________________________________
conv1d_39 (Conv1D)           (None, 100, 64)           44864     
_________________________________________________________________
max_pooling1d_21 (MaxPooling (None, 50, 64)            0         
_________________________________________________________________
conv1d_40 (Conv1D)           (None, 50, 64)            28736     
_________________________________________________________________
max_pooling1d_22 (MaxPooling (None, 25, 64)            0         
_________________________________________________________________
conv1d_41 (Conv1D)           (None, 25, 64)            28736     
_________________________________________________________________
conv1d_42 (Conv1D)           (None, 25, 64)            2873

# training the sentiment classification model

In [55]:

VALIDATION_SPLIT = 0.2

indices = np.arange(review_pad.shape[0])
np.random.shuffle(indices)
review_pad = review_pad[indices]
sentiment = sentiment[indices]
num_validation = int (VALIDATION_SPLIT * review_pad.shape[0])

X_train_pad = review_pad[:-num_validation]
y_train = sentiment[:-num_validation]
X_test_pad = review_pad[-num_validation:]
y_test = sentiment[-num_validation:]



print('shape of X_train_pad ', X_train_pad.shape)
print('shape of y_train ', y_train.shape)

print('shape of X_test_pad ', X_test_pad.shape)
print('shape of y_train ', y_test.shape)


shape of X_train_pad  (29942, 100)
shape of y_train  (29942,)
shape of X_test_pad  (7485, 100)
shape of y_train  (7485,)


# training the classification model on train and validation test set

In [56]:
model.fit(X_train_pad,y_train,batch_size=64,epochs=10,validation_data= (X_test_pad,y_test),verbose=2)
scores = model.evaluate(X_test_pad, y_test, verbose=0)


Train on 29942 samples, validate on 7485 samples
Epoch 1/10
 - 36s - loss: 0.1657 - acc: 0.7771 - val_loss: 0.1454 - val_acc: 0.7848
Epoch 2/10
 - 34s - loss: 0.1384 - acc: 0.7952 - val_loss: 0.1251 - val_acc: 0.8138
Epoch 3/10
 - 34s - loss: 0.1260 - acc: 0.8206 - val_loss: 0.1171 - val_acc: 0.8322
Epoch 4/10
 - 35s - loss: 0.1194 - acc: 0.8306 - val_loss: 0.1146 - val_acc: 0.8366
Epoch 5/10
 - 35s - loss: 0.1147 - acc: 0.8384 - val_loss: 0.1135 - val_acc: 0.8362
Epoch 6/10
 - 34s - loss: 0.1108 - acc: 0.8446 - val_loss: 0.1082 - val_acc: 0.8481
Epoch 7/10
 - 34s - loss: 0.1084 - acc: 0.8476 - val_loss: 0.1084 - val_acc: 0.8470
Epoch 8/10
 - 35s - loss: 0.1034 - acc: 0.8584 - val_loss: 0.1179 - val_acc: 0.8305
Epoch 9/10
 - 34s - loss: 0.1017 - acc: 0.8590 - val_loss: 0.1030 - val_acc: 0.8564
Epoch 10/10
 - 34s - loss: 0.0992 - acc: 0.8641 - val_loss: 0.1025 - val_acc: 0.8569


# printing accuracy

In [57]:
print("Accuracy: %.2f%%" % (scores[1]*100))

Accuracy: 85.69%


# Testing sample dataset

In [62]:
test_sample1="just loving it"
test_sample2="no comments"
test_sample3="totally bad"


test_samples = [test_sample1,test_sample2,test_sample3]
test_samples_tokens = tk.texts_to_sequences(test_samples)

pad =pad_sequences(test_samples_tokens,maxlen=100)

model.predict(x =pad)


array([[0.9669348],
       [0.6705882],
       [0.1296682]], dtype=float32)