In [1]:
import pandas as pd
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import tensorflow as tf
from tensorflow.keras import backend as K
import numpy as np
import gensim
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from collections import defaultdict

physical_devices = tf.config.experimental.list_physical_devices('GPU')
tf.config.experimental.set_memory_growth(physical_devices[0], True)

In [None]:
tf.random.set_seed(1)
np.random.seed(1)
np.set_printoptions(threshold=np.inf)

max_len=30
NUMBER_CLASSES = 5
EMBDIM=300
hidden_layer_count=1
hidden_layer_numunits=[128]
embedding_file=''
activation_name='relu'
# vocab = dict()

In [36]:
word_to_index = defaultdict(int)
embeddings = {}#defaultdict(lambda:np.zeros((50,)))
f=open(embedding_file, 'r')
lines=f.readlines()
idx=1

for line in lines:
    values = line.split()
    word = values[0]
    vector = np.asarray(values[1:], "float32")
    embeddings[word] = vector
    word_to_index[word]=idx
    idx=idx+1

In [4]:
vocab_len=len(word_to_index)+1

In [5]:
def convert_to_lower(text):
    return [i.lower() for i in text]

def remove_punctuation(text):
    return [i.translate(str.maketrans(dict.fromkeys(string.punctuation))) for i in text]

In [6]:
def encode_data(text):
    
    m=len(text)
    text_encoded = np.zeros((m,max_len))
    for i in range(m):
        sentence_words =text[i]
        j = 0
        for w in sentence_words:
            text_encoded[i, j] = word_to_index.get(w)
            j = j + 1

            
    return text_encoded


In [93]:
def remove_stopwords(text):
    newtext = []
    f=open('stopwords.txt','r')
    stopwordList=f.readlines()
#     stopwordList = (stopwords.words('english'))
#     stopwordList=set(remove_punctuation(stopwordList))
    #print('-----', stopwordList, '---------')
    for tokens in text:
        newtext.append([w for w in tokens if not w in stopwordList])
    #print(newtext[0])
    return newtext

In [8]:
def perform_tokenization(text):
    return [word_tokenize(i) for i in text]

def perform_padding(data):
    pass
#     return [list(np.pad(sent, (0, MAX_SENTENCE_LENGTH - len(sent)), 'constant', constant_values='0')) for sent in data]

In [97]:
def preprocess_data(data, isTrain=True):
    review = data["reviews"]
    review = convert_to_lower(review)
    review = remove_punctuation(review)
    review = perform_tokenization(review)
    review = remove_stopwords(review)
    review = encode_data(review)
    #review = perform_padding(review)
    return review

In [117]:
def embedding_layer():
    
    vocab_size = len(word_to_index) + 1           
    emb_dim = EMBDIM
    emb_matrix = np.zeros((vocab_len,emb_dim))
    for word, idx in word_to_index.items():
        emb_matrix[idx, :] = embeddings[word]

    embedding_layer = tf.keras.layers.Embedding(input_dim=vocab_len,output_dim=emb_dim,weights=[emb_matrix],input_length=max_len,trainable=True)
    
    return embedding_layer

In [11]:
def softmax_activation(x):
    expX = K.exp(x-K.reshape(K.max(x, axis=1), (K.shape(x)[0], 1)))
    s = K.reshape(K.sum(expX, axis=1), (K.shape(x)[0], 1))
    return expX / s

In [118]:
class NeuralNet:

    def __init__(self, reviews, ratings):
        self.reviews = np.array(reviews, dtype='float32')
        self.ratings = tf.keras.utils.to_categorical(y=ratings,num_classes=NUMBER_CLASSES)
        self.model = None

    def build_nn(self):
        
        sentence_indices = tf.keras.layers.Input(shape=(max_len,),dtype='int32')
        embedding = embedding_layer()
        X=embedding(sentence_indices)
        X = tf.keras.layers.Flatten()(X)
        for i in range(hidden_layer_count):
            X = tf.keras.layers.Dense(units=hidden_layer_numunits[i],kernel_initializer='glorot_uniform')(X)
            X=tf.keras.layers.Activation(activation=activation_name)(X)
        
        X = tf.keras.layers.Dense(units=5,activation='softmax')(X)
        
        self.model = tf.keras.Model(inputs=sentence_indices,outputs=X)
        self.model.compile(loss='categorical_crossentropy', optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001), metrics=['accuracy'])
        self.model.summary()
        
#         featureDim = self.reviews.shape[1]
#         self.model = tf.keras.Sequential()
#         self.model.add(tf.keras.layers.Dense(NUMBER_CLASSES, activation=softmax_activation, input_shape=(featureDim,)))
#         self.model.summary()
#         lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(initial_learning_rate=1e-3,decay_steps=10000,decay_rate=0.9)
#         self.model.compile(loss="sparse_categorical_crossentropy", optimizer=tf.keras.optimizers.Adam(learning_rate=lr_schedule),
#                            metrics=['sparse_categorical_accuracy'])

    def train_nn(self, batch_size, epochs):
        es = tf.keras.callbacks.EarlyStopping(monitor="val_loss", mode="min", verbose=1,patience=10)
        self.model.fit(self.reviews, self.ratings, epochs=epochs, batch_size=batch_size,validation_split=0.2,shuffle=1,callbacks=[es])

    def predict(self, reviews):
        reviews = np.array(reviews, dtype='float32')
        return np.argmax(self.model.predict(reviews), axis=1) + 1
    
    def predictWithPr(self, reviews):
        reviews = np.array(reviews, dtype='float32')
        return self.model.predict(reviews)


In [19]:
train_data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")

In [98]:
train_ratings = np.array(train_data["ratings"])
train_ratings=train_ratings-1
train_reviews = preprocess_data(train_data)
test_reviews = preprocess_data(test_data, False)

In [119]:
model = NeuralNet(train_reviews, train_ratings)
model.build_nn()

Model: "model_19"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_21 (InputLayer)        [(None, 30)]              0         
_________________________________________________________________
embedding_20 (Embedding)     (None, 30, 300)           120000300 
_________________________________________________________________
flatten_15 (Flatten)         (None, 9000)              0         
_________________________________________________________________
dense_36 (Dense)             (None, 128)               1152128   
_________________________________________________________________
activation_16 (Activation)   (None, 128)               0         
_________________________________________________________________
dense_37 (Dense)             (None, 5)                 645       
Total params: 121,153,073
Trainable params: 121,153,073
Non-trainable params: 0
____________________________________________

In [48]:
batch_size, epochs = 256, 100

In [None]:
model.train_nn(batch_size, epochs)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100

In [116]:
from sklearn.metrics import classification_report,confusion_matrix

print("=================Train data evaluation metrices==========================")
# evaluation_matrices(train_ratings, model.predict(train_reviews))


print(classification_report(train_ratings+1,model.predict(train_reviews)))

              precision    recall  f1-score   support

           1       0.83      0.79      0.81      4059
           2       0.82      0.61      0.70      2265
           3       0.80      0.68      0.73      3612
           4       0.86      0.68      0.76      6871
           5       0.91      0.98      0.94     33193

    accuracy                           0.89     50000
   macro avg       0.84      0.75      0.79     50000
weighted avg       0.88      0.89      0.88     50000



In [115]:
print("=================Test data evaluation metrices==========================")



testPredictions = model.predict(test_reviews)
test_ground_truth = np.array(pd.read_csv('gold_test.csv')['ratings'])

# evaluation_matrices(test_ground_truth, testPredictions)

print(classification_report(test_ground_truth, testPredictions))

              precision    recall  f1-score   support

           1       0.58      0.48      0.53      1271
           2       0.24      0.12      0.16       630
           3       0.29      0.22      0.25       911
           4       0.29      0.15      0.20      1404
           5       0.73      0.91      0.81      5784

    accuracy                           0.64     10000
   macro avg       0.43      0.38      0.39     10000
weighted avg       0.58      0.64      0.60     10000



In [113]:
ip_data = pd.read_csv("input.csv")
ip_reviews = preprocess_data(ip_data, False)
pred = model.predictWithPr(ip_reviews)

print(pred)

[[4.3200071e-05 3.8046779e-05 4.3965574e-05 3.1264510e-03 9.9674833e-01]
 [3.1599912e-01 7.6396711e-02 8.7443165e-02 8.5956931e-02 4.3420410e-01]]


In [111]:
ip_data["rating"]=np.argmax(pred, axis=1) + 1
ip_data

Unnamed: 0.1,Unnamed: 0,reviews,rating
0,0,Amazing!! I love and swear by this stuff. A mu...,5
1,1,This product came in pieces .... would NOT rec...,5


In [None]:
np.sum(pred,axis=1)

In [56]:
without_hidden_layer(Glove_300d):

=================Train data evaluation metrices==========================
              precision    recall  f1-score   support

           1       0.65      0.54      0.59      4059
           2       0.54      0.17      0.25      2265
           3       0.52      0.25      0.34      3612
           4       0.55      0.19      0.28      6871
           5       0.76      0.96      0.85     33193

    accuracy                           0.74     50000
   macro avg       0.61      0.42      0.46     50000
weighted avg       0.70      0.74      0.69     50000


=================Test data evaluation metrices==========================
              precision    recall  f1-score   support

           1       0.60      0.43      0.50      1271
           2       0.22      0.06      0.09       630
           3       0.30      0.15      0.20       911
           4       0.27      0.08      0.12      1404
           5       0.68      0.94      0.79      5784

    accuracy                           0.63     10000
   macro avg       0.41      0.33      0.34     10000
weighted avg       0.55      0.63      0.56     10000

loss: 0.6703 - accuracy: 0.7599 - val_loss: 1.0118 - val_accuracy: 0.6307


IndentationError: unindent does not match any outer indentation level (<tokenize>, line 6)