### Importing all required libraries

In [1]:
import json
import os
import numpy as np
from tensorflow.keras import models,layers
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
class sentiment:
    """String constants naming the three sentiment classes."""
    negative = 'negative'
    neutral = 'neutral'
    positive = 'positive'


class Review:
    """A single review: its text, rating, category and derived sentiment."""

    def __init__(self, text, rating, category):
        self.text = text
        self.rating = rating
        self.category = category
        # Derived once, at construction time, from the rating.
        self.sentiment = self.get_sentiment()

    def get_sentiment(self):
        """Map the rating to a sentiment label.

        Ratings of 2 or less are negative, a rating of exactly 3 is
        neutral, and every other rating is positive.
        """
        if self.rating <= 2:
            return sentiment.negative
        if self.rating == 3:
            return sentiment.neutral
        return sentiment.positive

### Loading categories

In [3]:
file_path = './data/category'  # directory whose entries are the per-category data files
# os.listdir returns entries in arbitrary order, so sort them for a
# reproducible category list; keep regular files only so that stray
# directories (e.g. notebook checkpoints) are not treated as categories.
categories = sorted(
    name for name in os.listdir(file_path)
    if os.path.isfile(os.path.join(file_path, name))
)

### Loading the data into the `reviews` list through the `Review` class

In [4]:
reviews = []
# Sort the directory listing so reviews are loaded in a reproducible order
# (os.listdir makes no ordering guarantee).
for file in sorted(os.listdir(file_path)):
    path = os.path.join(file_path, file)
    if not os.path.isfile(path):
        continue  # skip stray directories such as notebook checkpoints
    # The category is the part of the file name before the first underscore.
    category = file.split('_')[0]
    # Each line of the file is parsed as one JSON object; read it as UTF-8
    # rather than relying on the platform default encoding.
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank lines instead of failing in json.loads
            review = json.loads(line)
            reviews.append(Review(review['reviewText'], review['overall'], category))

In [5]:
# Sanity check: total number of reviews loaded across all files.
len(reviews)

5000

In [6]:
from sklearn.model_selection import train_test_split # for splitting train and test data
# Hold out 30% of the reviews for testing. The seed is fixed so the split
# (and every result computed from it) is the same on each run.
train_data, test_data = train_test_split(reviews, test_size=0.3, random_state=42)

In [7]:
def _texts_and_categories(samples):
    """Return (texts, categories) lists taken from the given Review objects."""
    texts = [sample.text for sample in samples]
    cats = [sample.category for sample in samples]
    return texts, cats


# Model inputs are the review texts; targets are the category names.
train_texts, train_labels = _texts_and_categories(train_data)
test_texts, test_labels = _texts_and_categories(test_data)

In [8]:
# Text-preprocessing and embedding settings shared by the cells below.
vocab_size = 20000   # passed to Tokenizer(num_words=...) and Embedding(input_dim=...)
oov_token = '##'     # passed to Tokenizer(oov_token=...)
max_length = 140     # padded / truncated sequence length
embed_dim = 32       # passed to Embedding(output_dim=...)

# Tokenizer used to turn the review texts into integer sequences.
# It is configured with the vocabulary cap (vocab_size) and the
# out-of-vocabulary placeholder (oov_token) set earlier in this cell.
tokenizer = Tokenizer(num_words=vocab_size,oov_token=oov_token)

In [9]:
# Build the vocabulary from the training texts only.
tokenizer.fit_on_texts(train_texts)
indexes = tokenizer.word_index

# Encode the training texts as integer sequences, then pad / truncate each
# one at the end so that all have length max_length.
seq = tokenizer.texts_to_sequences(train_texts)
pad = pad_sequences(
    seq,
    maxlen=max_length,
    padding='post',
    truncating='post',
)

# Show the first training sample at each stage: raw text, token ids, padded ids.
for stage in (train_texts[0], seq[0], pad[0]):
    print(stage)

I definitely recommend it to anyone who like to watch videos on the big screen like July Plus, Netflix, YouTube. Is easy to set up and fast the installing.
[3, 238, 129, 7, 5, 386, 117, 27, 5, 371, 1684, 19, 2, 194, 411, 27, 4124, 492, 2439, 2138, 9, 84, 5, 214, 45, 4, 395, 2, 2270]
[   3  238  129    7    5  386  117   27    5  371 1684   19    2  194
  411   27 4124  492 2439 2138    9   84    5  214   45    4  395    2
 2270    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0]


In [10]:
from sklearn.preprocessing import LabelEncoder # label encoder which transforms output text labels into integers
# Learn the category -> integer mapping from the training labels and encode
# them in a single step.
encoder = LabelEncoder()
encoded_Y = encoder.fit_transform(train_labels)

In [11]:
# Encode the test labels with the mapping already learned from the training
# labels. Refitting on the test labels could assign different integers to the
# same category (e.g. when a category is missing from the test split).
Y = encoder.transform(test_labels)

In [12]:
# Encode the test texts with the tokenizer fitted on the training texts and
# pad / truncate them the same way as the training sequences.
test_seq = tokenizer.texts_to_sequences(test_texts)
test_pad = pad_sequences(
    test_seq,
    maxlen=max_length,
    padding='post',
    truncating='post',
)

In [13]:
from tensorflow.keras.utils import to_categorical # one hot encoding
# One-hot encode the integer labels of both splits into 5 columns.
tr_labels, te_labels = (
    to_categorical(int_labels, num_classes=5) for int_labels in (encoded_Y, Y)
)

In [14]:
# Create the model: embedding -> average over the sequence -> dense
# classifier with a 5-way softmax output.
model = models.Sequential([
    layers.Embedding(input_dim=vocab_size, output_dim=embed_dim, input_length=max_length),
    layers.GlobalAveragePooling1D(),
    layers.Dense(32, activation='relu'),
    layers.Dense(5, activation='softmax'),
])

model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['acc'],
)

In [23]:
# Train for 15 epochs on the padded training sequences, using the test split
# as validation data. The returned History object is the cell's output.
model.fit(pad,tr_labels,epochs=15,validation_data=(test_pad,te_labels))

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<tensorflow.python.keras.callbacks.History at 0x216dbbb7070>

In [37]:
# Pass the padded array directly; wrapping it in a list is the calling
# convention for models with several inputs, which this model is not.
predictions = model.predict(test_pad) # predictions

In [38]:
# Predicted class index (position of the highest score) for a few test samples.
for sample_idx in (0, 1, 2, 3, 4, 7):
    print(np.argmax(predictions[sample_idx]))

2
0
3
2
2
2


In [39]:
# True category of the same test samples, for comparison with the predictions.
for sample_idx in (0, 1, 2, 3, 4, 7):
    print(test_data[sample_idx].category)

Electronics
Books
Grocery
Electronics
Electronics
Electronics
