In [1]:
import pandas as pd

In [2]:
# Load the raw reviews table from 'Reviews.csv', resolved relative to the
# notebook's working directory.
data = pd.read_csv('Reviews.csv')

In [3]:
# Remove rows with any missing value, then keep only the first occurrence of
# each (Score, Text) pair.
data = data.dropna(how='any').drop_duplicates(subset=['Score', 'Text'])

# Drop rows whose helpfulness numerator exceeds the denominator
# (presumably inconsistent vote counts).
invalid_votes = data["HelpfulnessNumerator"] > data["HelpfulnessDenominator"]
idx = data.loc[invalid_votes].index
data = data.drop(index=idx)

In [4]:
def create_target(x):
    """Map a review score to a class label.

    Returns 2 when ``x`` is greater than 3, 0 when ``x`` is less than 3,
    and 1 in every other case.
    """
    if x > 3:
        return 2
    if x < 3:
        return 0
    return 1
# Add a 'target' column holding the class label (0, 1 or 2) that create_target
# assigns to each row's Score.
data['target'] = data['Score'].apply(create_target)

In [5]:
# Rebalance the classes: keep every neutral review (target == 1) and draw
# 50,000 rows each from the positive (target == 2) and negative (target == 0)
# reviews. The fixed random_state makes the draw repeatable, so a fresh
# Restart & Run All trains on the same rows every time.
neutral = data.loc[data.target == 1]
positive = data.loc[data.target == 2].sample(50000, random_state=1000)
negative = data.loc[data.target == 0].sample(50000, random_state=1000)
data = pd.concat([positive, negative, neutral])

In [6]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [7]:
# Pull the model inputs (review text) and the class labels out of the frame
# as plain arrays for the splitting and encoding steps that follow.
sentences = data['Text'].values
y = data['target'].values

In [8]:
import tensorflow as tf
from tensorflow.keras.utils import to_categorical
# One-hot encode the integer class labels; the model below is compiled with
# categorical_crossentropy and a 3-unit softmax output.
y_cat = to_categorical(y)

In [9]:
# Hold out 25% of the reviews for testing. The split is stratified on the
# labels and uses a fixed seed so it is repeatable.
sentences_train, sentences_test, y_train, y_test = train_test_split(
    sentences,
    y_cat,
    test_size=0.25,
    random_state=1000,
    stratify=y_cat,
)

In [10]:
# Fit the word index on the training sentences only, so the held-out test
# sentences do not influence the vocabulary.
# NOTE(review): num_words=5000 presumably restricts later encoding to the most
# frequent words — confirm against the Keras Tokenizer documentation.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(sentences_train)

In [11]:
# Encode both splits as lists of integer token ids using the tokenizer that was
# fitted on the training sentences.
X_train = tokenizer.texts_to_sequences(sentences_train)
X_test = tokenizer.texts_to_sequences(sentences_test)

In [12]:
# Peek at the number of token ids in the first encoded training review
# (before padding).
len(X_train[0])

55

In [13]:
# Size of the full fitted word index; the +1 leaves room for index 0, which the
# padding below uses to fill short sequences.
# NOTE(review): the tokenizer was created with num_words=5000, so this is likely
# much larger than the set of ids that actually occur in the sequences — confirm
# whether the embedding table could be capped (the embedding matrix built later
# must keep the same number of rows as vocab_size).
vocab_size = len(tokenizer.word_index) + 1                          
maxlen = 100
# Bring every sequence to exactly maxlen ids, with padding added at the end.
X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

In [14]:
# Sanity check: the padded training matrix should have maxlen columns.
X_train.shape

(97327, 100)

In [15]:
import numpy as np

def create_embedding_matrix(filepath, word_index, embedding_dim):
    """Build an embedding matrix from a GloVe-style text file.

    Every non-blank line of the file is expected to hold a token followed by
    its whitespace-separated vector components.

    Parameters
    ----------
    filepath : str
        Path to the UTF-8 encoded embeddings file.
    word_index : dict
        Mapping from word to the integer row it should occupy in the matrix.
        Assumes the largest value is at most ``len(word_index)`` (as with a
        1-based index) — verify against the caller.
    embedding_dim : int
        Number of leading vector components to keep for each word.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(word_index) + 1, embedding_dim)``. Rows for
        words that never appear in the file stay all-zero.

    Raises
    ------
    ValueError
        If a word present in ``word_index`` has fewer than ``embedding_dim``
        components in the file, or a kept component is not numeric.
    """
    vocab_size = len(word_index) + 1
    embedding_matrix = np.zeros((vocab_size, embedding_dim))
    with open(filepath, 'r', encoding='utf8') as f:
        for line_no, line in enumerate(f, start=1):
            fields = line.split()
            if not fields:
                # Blank (or whitespace-only) line, e.g. a trailing newline.
                continue
            word, vector = fields[0], fields[1:]
            idx = word_index.get(word)
            if idx is None:
                # Not in our vocabulary: skip before paying for float parsing.
                continue
            if len(vector) < embedding_dim:
                raise ValueError(
                    "{}:{}: vector for {!r} has {} components, expected at least {}".format(
                        filepath, line_no, word, len(vector), embedding_dim
                    )
                )
            embedding_matrix[idx] = np.asarray(vector[:embedding_dim], dtype=np.float32)

    return embedding_matrix

In [16]:
# Width of each word vector; chosen to match the 50-dimensional GloVe file
# named below.
embedding_dim = 50
# Rows are looked up by the tokenizer's word index, so the matrix lines up with
# the integer ids produced by texts_to_sequences.
embedding_matrix = create_embedding_matrix('glove.6B.50d.txt', tokenizer.word_index, embedding_dim)

In [18]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers

# Seed TensorFlow's global RNG so weight initialisation (and any TF-driven
# shuffling) is repeatable from one run of the notebook to the next.
tf.random.set_seed(1000)

# embedding_dim is deliberately not re-declared here: it must stay equal to the
# width of embedding_matrix, which was built from it in the previous cell.
model = Sequential([
    # Embedding table initialised from the pre-trained vectors in embedding_matrix.
    layers.Embedding(vocab_size, embedding_dim, input_length=maxlen, weights=[embedding_matrix]),
    # 128 filters over windows of 5 consecutive tokens.
    layers.Conv1D(128, 5, activation='relu'),
    # Keep the strongest response of each filter across the sequence.
    layers.GlobalMaxPooling1D(),
    layers.Dense(10, activation='relu'),
    # One output unit per class, matching the one-hot encoded labels.
    layers.Dense(3, activation='softmax'),
])
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
history = model.fit(X_train, y_train, epochs=10, validation_data=(X_test, y_test), batch_size=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [19]:
# Score the trained model on the held-out test split. The model is compiled
# with a single 'accuracy' metric, so evaluate returns [loss, accuracy].
# The result is unpacked into dedicated names instead of being assigned to `y`,
# which already holds the label array that the one-hot encoding cell reads.
test_loss, test_accuracy = model.evaluate(x=X_test, y=y_test)
test_accuracy



0.6898252367973328