In [2]:
import pandas as pd
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import tensorflow as tf
from tensorflow.keras import backend as K
import numpy as np
import gensim
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from collections import defaultdict
import math
from sklearn.metrics import precision_recall_fscore_support,accuracy_score

# Ask TensorFlow to grow GPU memory on demand instead of reserving it all up
# front. The loop (rather than indexing [0]) keeps the notebook runnable on
# CPU-only machines, where the device list is empty, and applies the same
# setting to every visible GPU.
physical_devices = tf.config.experimental.list_physical_devices('GPU')
for gpu_device in physical_devices:
    tf.config.experimental.set_memory_growth(gpu_device, True)

In [3]:
# Seed both frameworks so repeated runs start from the same random state.
np.random.seed(1)
tf.random.set_seed(1)

# Show NumPy arrays in full when printed (no "..." summarisation).
np.set_printoptions(threshold=np.inf)

# Data / model dimensions.
max_len = 50                    # width of each encoded review (see encode_data)
NUMBER_CLASSES = 5              # number of rating classes used for one-hot labels
EMBDIM = 300                    # dimensionality of the word vectors
hidden_layer_numunits = [256]   # units of each hidden Dense layer
hidden_layer_count = 1          # how many entries of hidden_layer_numunits are used
activation_name = 'relu'        # activation applied after each hidden Dense layer

# Pre-trained embedding source.
embedding_file = '../cc.en.300.vec'
# embedding_file='../GoogleNews-vectors-negative300.bin'
fastText = 1                    # number of leading lines skipped when reading embedding_file
binary = True                   # NOTE(review): not referenced by any visible cell

In [4]:
# Load the vocabulary: every line of vocab.csv is "<word>,<index>".
# defaultdict(int) is kept so that direct indexing of an unknown word yields 0.
word_to_index = defaultdict(int)
with open('vocab.csv', 'r') as f:
    for line in f:
        words = line.split(',')
        if len(words) < 2:
            # Blank or malformed line (e.g. a trailing newline at end of file);
            # skipping it avoids an IndexError on words[1].
            continue
        word_to_index[words[0]] = int(words[1].rstrip())

# embeddings = KeyedVectors.load_word2vec_format(embedding_file, binary=False)


In [5]:
# Load pre-trained word vectors from a text-format embedding file where each
# line is "<word> <v1> <v2> ...". The first `fastText` lines are skipped
# (presumably the "<count> <dim>" header of a fastText .vec file -- confirm
# for other formats).
#
# The file is streamed line by line instead of being read with readlines(),
# so the raw text of the whole embedding file is never held in memory next to
# the parsed vectors. UTF-8 is requested explicitly so decoding does not
# depend on the platform's default locale.
#
# NOTE(review): the default factory returns shape (1, EMBDIM) while loaded
# vectors have shape (len(values) - 1,); kept unchanged for compatibility.
embeddings = defaultdict(lambda: np.zeros((1, EMBDIM)))
with open(embedding_file, 'r', encoding='utf-8') as f:
    for line_number, line in enumerate(f):
        if line_number < fastText:
            continue
        values = line.split()
        if not values:
            # Blank line: nothing to parse.
            continue
        embeddings[values[0]] = np.asarray(values[1:], "float32")

In [25]:
def convert_to_lower(text):
    """Return a list holding the lower-cased form of every string in ``text``."""
    lowered = []
    for entry in text:
        lowered.append(entry.lower())
    return lowered

def remove_punctuation(text):
    """Replace every ASCII punctuation character with a single space.

    Parameters
    ----------
    text : iterable of str
        Strings to clean.

    Returns
    -------
    list of str
        One cleaned string per input string. String lengths are unchanged
        because each punctuation character maps to exactly one space.
    """
    # Build the translation table once; it does not depend on the element.
    punctuation_to_space = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
    return [entry.translate(punctuation_to_space) for entry in text]

In [7]:
# def encode_data(text):
    
#     m=len(text)
#     text_encoded = np.zeros((m,max_len))
#     for i in range(m):
#         sentence_words = text[i]
#         j = 0
#         for w in sentence_words:
#             text_encoded[i, j] = word_to_index.get(w)
#             j = j + 1

            
#     return text_encoded


In [8]:
def encode_data(text, vocab=None, seq_len=None):
    """Encode tokenised sentences as a fixed-width matrix of vocabulary indices.

    Parameters
    ----------
    text : sequence of iterables of str
        One token sequence per sentence.
    vocab : mapping of str -> int, optional
        Word-to-index lookup. Defaults to the module-level ``word_to_index``.
    seq_len : int, optional
        Number of columns of the result. Defaults to the module-level
        ``max_len``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(text), seq_len)`` (NumPy's default float dtype).
        Row ``i`` holds the indices of the first ``seq_len`` tokens of
        sentence ``i``; unused trailing positions stay 0.

    Notes
    -----
    * Tokens missing from ``vocab`` are encoded as 0. The previous
      ``vocab.get(w)`` returned ``None`` for them, which ended up as NaN in
      the matrix.
    * Sentences longer than ``seq_len`` are truncated instead of raising
      ``IndexError``.
    * ``.get`` is used rather than indexing so that a ``defaultdict``
      vocabulary is not grown by lookups of unknown words.
    """
    if vocab is None:
        vocab = word_to_index
    if seq_len is None:
        seq_len = max_len

    text_encoded = np.zeros((len(text), seq_len))
    for row, sentence_words in enumerate(text):
        # zip() against range(seq_len) stops at whichever runs out first,
        # which truncates over-long sentences.
        for col, w in zip(range(seq_len), sentence_words):
            text_encoded[row, col] = vocab.get(w, 0)
    return text_encoded
        
        
    

In [9]:
def remove_stopwords(text, stopword_path='stopwords.txt'):
    """Drop stop words from every token list.

    Parameters
    ----------
    text : iterable of iterables of str
        One token sequence per sentence.
    stopword_path : str, optional
        File with one stop word per line; surrounding whitespace on each line
        is stripped. Defaults to ``'stopwords.txt'``.

    Returns
    -------
    list of list of str
        The token lists with stop words removed; token order is preserved.
    """
    # The context manager closes the file even if reading fails. A set gives
    # constant-time membership tests instead of scanning a list per token.
    with open(stopword_path, 'r') as f:
        stopword_set = {st.strip() for st in f}

    return [[w for w in tokens if w not in stopword_set] for tokens in text]

In [10]:
# Number of entries currently in the vocabulary mapping loaded from vocab.csv.
len(word_to_index)

16599

In [11]:
def perform_tokenization(text):
    """Tokenise each string in ``text`` with NLTK's ``word_tokenize``.

    Returns a list with one token list per input string.
    """
    tokenised = []
    for entry in text:
        tokenised.append(word_tokenize(entry))
    return tokenised

def perform_padding(data):
    """Placeholder that performs no padding and always returns ``None``.

    ``data`` is accepted but ignored.
    """
    # Earlier implementation, kept for reference only:
    #     return [list(np.pad(sent, (0, MAX_SENTENCE_LENGTH - len(sent)), 'constant', constant_values='0')) for sent in data]
    return None

In [12]:
def preprocess_data(data, isTrain=True):
    """Turn the ``"reviews"`` column of ``data`` into an encoded index matrix.

    Pipeline: lower-case -> strip punctuation -> tokenise -> remove stop
    words -> encode with ``encode_data``.

    Side effect: sets the module-level ``vocab_len`` to
    ``len(word_to_index) + 1``.

    ``isTrain`` is accepted but not used by this function.
    """
    global vocab_len

    cleaned = remove_punctuation(convert_to_lower(data["reviews"]))
    tokens = remove_stopwords(perform_tokenization(cleaned))
    encoded = encode_data(tokens)

    vocab_len = len(word_to_index) + 1
    return encoded

In [13]:
# Initial value only: preprocess_data() overwrites this global with
# len(word_to_index) + 1, and embedding_layer() reads it.
vocab_len=0

In [14]:
def embedding_layer():
    """Build a trainable Keras ``Embedding`` layer seeded with pre-trained vectors.

    Reads the module-level ``EMBDIM``, ``max_len``, ``vocab_len``,
    ``word_to_index`` and ``embeddings``.

    Row ``idx`` of the weight matrix receives the pre-trained vector of the
    word mapped to ``idx``; rows for words without a usable vector stay zero.

    Returns
    -------
    tf.keras.layers.Embedding
        Layer with ``output_dim == EMBDIM`` and ``trainable=True``.
    """
    emb_dim = EMBDIM

    # Size the table so that every index in word_to_index fits, even when
    # preprocess_data() has not populated vocab_len yet (it starts at 0).
    highest_index = max(word_to_index.values(), default=-1)
    num_rows = max(vocab_len, highest_index + 1)
    print(num_rows)

    emb_matrix = np.zeros((num_rows, emb_dim))
    for word, idx in word_to_index.items():
        # A membership test (instead of indexing) avoids inserting a default
        # entry into `embeddings` for every missing word.
        if word not in embeddings:
            continue
        vector = np.asarray(embeddings[word]).reshape(-1)
        if vector.shape[0] != emb_dim:
            # Vector of the wrong dimensionality: leave the row at zero rather
            # than hiding the mismatch behind a bare `except`.
            continue
        emb_matrix[idx, :] = vector

    layer = tf.keras.layers.Embedding(
        input_dim=num_rows,
        output_dim=emb_dim,
        weights=[emb_matrix],
        input_length=max_len,
        trainable=True,
    )
    return layer

In [15]:
def softmax_activation(x):
    """Softmax over axis 1, shifted by the per-row maximum before exponentiating.

    The row maximum and row sum are reshaped to ``(batch, 1)`` so they
    broadcast against ``x``.
    """
    column_shape = (K.shape(x)[0], 1)
    row_max = K.reshape(K.max(x, axis=1), column_shape)
    exp_shifted = K.exp(x - row_max)
    row_total = K.reshape(K.sum(exp_shifted, axis=1), column_shape)
    return exp_shifted / row_total

In [65]:
class NeuralNet:
    """Feed-forward rating classifier on top of the pre-trained embedding layer.

    Architecture (built by ``build_nn``): Embedding -> Flatten ->
    ``hidden_layer_count`` x (Dense -> Dropout(0.2) -> Activation) ->
    Dense(NUMBER_CLASSES, softmax).

    Ratings are expected to be 1-based integers; they are shifted to 0-based
    class ids for one-hot encoding, and ``predict`` shifts them back.
    """

    def __init__(self, reviews, ratings, val_reviews,val_ratings):
        """Store training/validation data and one-hot encode the ratings.

        Parameters
        ----------
        reviews : array-like
            Encoded training reviews; converted to a float32 array.
        ratings : array-like of int
            1-based training ratings.
        val_reviews : array-like
            Encoded validation reviews (stored as given).
        val_ratings : array-like of int
            1-based validation ratings.
        """
        self.reviews = np.array(reviews, dtype='float32')
        print('train_data_shape {}'.format(self.reviews.shape))
        # np.asarray lets plain lists be passed as well as arrays. The first
        # argument is passed positionally because its keyword name is not the
        # same in every Keras release.
        self.ratings = tf.keras.utils.to_categorical(np.asarray(ratings) - 1, num_classes=NUMBER_CLASSES)
        val_ratings = tf.keras.utils.to_categorical(np.asarray(val_ratings) - 1, num_classes=NUMBER_CLASSES)
        self.val_data = (val_reviews, val_ratings)
        self.model = None

    def build_nn(self):
        """Create and compile the Keras model, store it in ``self.model`` and print its summary."""
        sentence_indices = tf.keras.layers.Input(shape=(max_len,), dtype='int32')
        embedding = embedding_layer()
        X = embedding(sentence_indices)
        X = tf.keras.layers.Flatten()(X)

        for i in range(hidden_layer_count):
            X = tf.keras.layers.Dense(units=hidden_layer_numunits[i], kernel_initializer='glorot_uniform')(X)
            X = tf.keras.layers.Dropout(rate=0.2)(X)
            X = tf.keras.layers.Activation(activation=activation_name)(X)

        # Output width follows NUMBER_CLASSES so it always matches the one-hot
        # labels produced in __init__.
        X = tf.keras.layers.Dense(units=NUMBER_CLASSES, activation='softmax')(X)

        self.model = tf.keras.Model(inputs=sentence_indices, outputs=X)
        self.model.compile(
            loss='categorical_crossentropy',
            optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001),
            metrics=['accuracy'],
        )
        self.model.summary()

    def train_nn(self, batch_size, epochs, use_lr_schedule=False):
        """Fit the model with early stopping on validation loss (patience 3).

        Parameters
        ----------
        batch_size : int
        epochs : int
            Upper bound on the number of epochs; early stopping may end sooner.
        use_lr_schedule : bool, optional
            When True, also apply ``self.scheduler`` through a
            ``LearningRateScheduler`` callback. Defaults to False, which
            matches the previous behaviour (the callback used to be created
            but was never passed to ``fit``).
        """
        callbacks = [
            tf.keras.callbacks.EarlyStopping(monitor="val_loss", mode="min", verbose=1, patience=3)
        ]
        if use_lr_schedule:
            callbacks.append(tf.keras.callbacks.LearningRateScheduler(self.scheduler))
        self.model.fit(
            self.reviews,
            self.ratings,
            epochs=epochs,
            batch_size=batch_size,
            validation_data=self.val_data,
            callbacks=callbacks,
        )

    def scheduler(self, epoch, lr):
        """Learning-rate schedule: unchanged for epochs 0-4, then multiplied by exp(-0.1) each epoch.

        Returns a plain Python float from epoch 5 onward (instead of a
        TensorFlow tensor), which is the type a learning-rate schedule
        callback is documented to expect.
        """
        if epoch < 5:
            return lr
        return float(lr * math.exp(-0.1))

    def predict(self, reviews):
        """Return the predicted 1-based rating for each encoded review."""
        reviews = np.array(reviews, dtype='float32')
        return np.argmax(self.model.predict(reviews), axis=1) + 1

    def predictWithPr(self, reviews):
        """Return the model's raw per-class output (softmax probabilities) for each review."""
        reviews = np.array(reviews, dtype='float32')
        return self.model.predict(reviews)


In [57]:
class Generator(tf.keras.utils.Sequence):
    """Keras ``Sequence`` that yields class-balanced random batches.

    Examples are grouped by class (argmax of the one-hot labels). Every batch
    draws an equal share of ``batch_size`` from each class; the last class
    receives whatever remains after integer division, so the shares always
    add up to ``batch_size``.

    Batches are sampled at random on each access: the ``idx`` passed to
    ``__getitem__`` does not influence which rows are returned.
    """

    def __init__(self, x_set, y_set, batch_size=256):
        """
        Parameters
        ----------
        x_set : numpy.ndarray
            2-D array with one example per row.
        y_set : numpy.ndarray
            2-D one-hot label array with one row per example.
        batch_size : int, optional
            Total number of examples per batch.
        """
        super().__init__()
        self.x, self.y = x_set, y_set
        self.batch_size = batch_size

        print(x_set.shape,y_set.shape)

        # Partition the data by class once, so batches only need to index
        # into the per-class arrays.
        class_ids = np.argmax(y_set, axis=1)
        num_classes = y_set.shape[1]
        self._class_x = [x_set[class_ids == c, :] for c in range(num_classes)]
        self._class_y = [y_set[class_ids == c, :] for c in range(num_classes)]

        # Equal share per class; the last class absorbs the rounding
        # remainder (for 5 classes this equals int(batch_size * 0.2) for the
        # first four classes and the rest for the fifth).
        share = batch_size // num_classes
        self._class_quota = [share] * (num_classes - 1)
        self._class_quota.append(batch_size - share * (num_classes - 1))

    def __len__(self):
        """Number of batches per epoch: ceil(number of examples / batch_size)."""
        return math.ceil(self.x.shape[0] / self.batch_size)

    def __getitem__(self, idx):
        """Return one randomly sampled, class-balanced ``(batch_x, batch_y)`` pair."""
        batch_x = []
        batch_y = []
        for class_x, class_y, quota in zip(self._class_x, self._class_y, self._class_quota):
            available = len(class_x)
            if available == 0:
                # Nothing to draw from; np.random.choice would raise on an
                # empty population.
                continue
            # Sample without replacement when the class is large enough, and
            # fall back to sampling with replacement for classes smaller than
            # their share (which previously raised ValueError).
            picks = np.random.choice(np.arange(available), replace=available < quota, size=quota)
            batch_x.append(class_x[picks, :])
            batch_y.append(class_y[picks, :])

        return np.concatenate(batch_x), np.concatenate(batch_y)
    

In [18]:
# Read the training and test tables from the working directory.
train_data, test_data = (
    pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv")
)

In [19]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the training rows as a validation set.
train_df, val_df = train_test_split(
    train_data,
    test_size=0.2,
)

In [20]:
# Rating labels for the two labelled splits.
train_ratings = np.array(train_df["ratings"])
val_ratings = np.array(val_df["ratings"])

# Encoded review matrices for every split.
train_reviews = np.array(preprocess_data(train_df))
val_reviews = np.array(preprocess_data(val_df, False))
test_reviews = preprocess_data(test_data, False)

In [21]:
# train_ratings = np.array(train_data["ratings"])
# train_reviews = np.array(preprocess_data(train_data))
# test_reviews = preprocess_data(test_data, False)

In [22]:
# Spot-check: the encoded form of the training review at row 1.
print(train_reviews[1])

[10996. 10049.  1676.  4517. 16351.  8102. 10830. 11451.  6171.  2639.
  6415.  9904.     0.     0.     0.     0.     0.     0.     0.     0.
     0.     0.     0.     0.     0.     0.     0.     0.     0.     0.
     0.     0.     0.     0.     0.     0.     0.     0.     0.     0.
     0.     0.     0.     0.     0.     0.     0.     0.     0.     0.]


In [66]:
# Wrap the prepared arrays in the classifier and build/compile the Keras
# model; build_nn() also prints the model summary.
model = NeuralNet(train_reviews, train_ratings, val_reviews,val_ratings)
model.build_nn()

train_data_shape (40000, 50)
16600
Model: "model_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_6 (InputLayer)         [(None, 50)]              0         
_________________________________________________________________
embedding_5 (Embedding)      (None, 50, 300)           4980000   
_________________________________________________________________
flatten_5 (Flatten)          (None, 15000)             0         
_________________________________________________________________
dense_10 (Dense)             (None, 256)               3840256   
_________________________________________________________________
dropout_5 (Dropout)          (None, 256)               0         
_________________________________________________________________
activation_5 (Activation)    (None, 256)               0         
_________________________________________________________________
dense_11 (Dense)        

In [24]:
# Training hyper-parameters passed to NeuralNet.train_nn.
batch_size = 256
epochs = 100

In [67]:
# Fit the model; train_nn() stops early once validation loss has not improved
# for 3 epochs, so fewer than `epochs` epochs may run.
model.train_nn(batch_size, epochs)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 00011: early stopping


In [68]:
from sklearn.metrics import classification_report,confusion_matrix

print("=================Train data evaluation metrices==========================")
# evaluation_matrices(train_ratings, model.predict(train_reviews))

# Predict once and reuse the result for both metrics; previously the model
# was run over the whole training set twice.
train_predictions = model.predict(train_reviews)
print(accuracy_score(train_ratings, train_predictions))
print(precision_recall_fscore_support(train_ratings, train_predictions, average='weighted'))

0.9087
(0.9069289392005008, 0.9087, 0.9048030580895121, None)


In [78]:
print("=================Test data evaluation metrices==========================")

# Predicted ratings for the test reviews, and the reference ratings that are
# stored separately in gold_test.csv.
testPredictions = model.predict(test_reviews)
test_ground_truth = np.array(
    pd.read_csv('gold_test.csv')['ratings']
)

# Accuracy, then weighted precision / recall / F-score.
print(accuracy_score(test_ground_truth, testPredictions))
print(
    precision_recall_fscore_support(
        test_ground_truth,
        testPredictions,
        average='weighted',
    )
)


0.6831
(0.6461195182460066, 0.6831, 0.658885135673812, None)


In [80]:
# Confusion matrix of reference vs. predicted test ratings
# (first argument = true labels, second = predictions).
from sklearn.metrics import confusion_matrix
confusion_matrix(test_ground_truth, testPredictions)

array([[ 871,   73,  132,   34,  161],
       [ 245,   72,  147,   53,  113],
       [ 116,   60,  329,  180,  226],
       [  40,   21,  199,  342,  802],
       [  61,   19,  129,  358, 5217]])

In [70]:
# Score the hand-written reviews in input.csv: run them through the same
# preprocessing as the training data and print the per-class model output
# (one row per review).
ip_data = pd.read_csv("input.csv")
ip_reviews = preprocess_data(ip_data, False)
pred = model.predictWithPr(ip_reviews)

print(pred)

[[1.0889481e-05 6.6216812e-06 1.6103895e-05 1.0148172e-03 9.9895155e-01]
 [6.3093895e-01 1.1721336e-01 1.5240605e-01 5.3481828e-02 4.5959737e-02]
 [3.9870939e-03 2.7707808e-03 5.7733301e-03 4.2444628e-02 9.4502413e-01]
 [7.9690570e-01 1.4611633e-01 4.6300188e-02 8.3301505e-03 2.3475683e-03]
 [4.6778068e-01 3.1021202e-01 9.6583299e-02 1.0485898e-01 2.0564996e-02]
 [1.2224510e-01 1.8808168e-01 2.2975633e-01 3.7861428e-01 8.1302680e-02]
 [6.1102964e-02 4.4543631e-02 1.1672455e-01 3.6286518e-01 4.1476372e-01]
 [4.7890833e-01 1.9727629e-01 1.1896280e-01 1.1856668e-01 8.6285912e-02]
 [5.2285904e-01 9.0626404e-02 8.9382775e-02 2.2898847e-01 6.8143323e-02]
 [8.0277771e-01 1.4733925e-01 2.4208011e-02 1.5756914e-02 9.9181440e-03]]


In [71]:
# Convert per-class outputs to 1-based ratings (argmax + 1) and attach them
# to the input frame as a new "rating" column. Note: this modifies ip_data
# in place.
ip_data["rating"]=np.argmax(pred, axis=1) + 1
ip_data

Unnamed: 0.1,Unnamed: 0,reviews,rating
0,0,Amazing!! I love and swear by this stuff. A mu...,5
1,1,This product came in pieces .... would NOT rec...,1
2,2,This is awesome product,5
3,3,does not work disappointed.,1
4,4,Product is not bad but works.,1
5,5,Product is not good but works.,4
6,6,Product is good,5
7,7,product is not good,1
8,8,Product is bad,1
9,9,Product is not bad,1


In [46]:
# Sanity check: sum of the model's outputs across classes for each row.
# NOTE(review): the recorded output below has 2 entries while `pred` printed
# earlier has 10 rows -- it appears to come from an older run; re-execute.
np.sum(pred,axis=1)

array([1., 1.], dtype=float32)

In [77]:
# Persist the underlying Keras model (model.model) to ./saved_model.
model.model.save(filepath='./saved_model')

INFO:tensorflow:Assets written to: ./saved_model/assets


In [32]:
without_hidden_layer(word2vec_300d)
=================Train data evaluation metrices==========================
              precision    recall  f1-score   support

           1       0.89      0.92      0.90      3229
           2       0.81      0.94      0.87      1766
           3       0.72      0.86      0.78      2915
           4       0.76      0.55      0.64      5519
           5       0.92      0.94      0.93     26571

    accuracy                           0.88     40000
   macro avg       0.82      0.84      0.83     40000
weighted avg       0.88      0.88      0.87     40000

=================Test data evaluation metrices==========================
              precision    recall  f1-score   support

           1       0.63      0.64      0.64      1271
           2       0.23      0.21      0.22       630
           3       0.31      0.40      0.35       911
           4       0.36      0.22      0.27      1404
           5       0.83      0.88      0.86      5784

    accuracy                           0.67     10000
   macro avg       0.47      0.47      0.47     10000
weighted avg       0.66      0.67      0.66     10000

loss: 0.4769 - accuracy: 0.8615 - val_loss: 0.7254 - val_accuracy: 0.7344

IndentationError: unindent does not match any outer indentation level (<tokenize>, line 5)