In [6]:
import numpy as np
import pandas as pd
# Load the raw product reviews and drop every row that has a missing value
# in any column.
data = pd.read_csv("products.csv", sep=",", encoding='latin-1')
data = data.dropna()

# Keep only the three columns used by the rest of the notebook.
df = pd.DataFrame(data, columns=['Rate', 'Summary', 'Review'])


def _parse_rate(value):
    """Convert one raw ``Rate`` cell to an int, mapping unparsable cells to 0.

    Strings are accepted only when they consist purely of decimal digits.
    ``str.isdecimal`` is used instead of ``str.isnumeric`` because the latter
    is also true for characters such as '²' or '½' (both representable in
    latin-1, the encoding used to read the file) which ``int()`` rejects with
    a ValueError.

    Non-string cells (e.g. when pandas already inferred a numeric dtype for
    the column) are converted with ``int()`` directly instead of failing on a
    missing ``.isnumeric`` attribute.
    """
    if isinstance(value, str):
        return int(value) if value.isdecimal() else 0
    try:
        return int(value)
    except (TypeError, ValueError):
        return 0


df['Rate'] = df['Rate'].apply(_parse_rate)


In [7]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Fit a tokenizer on the Summary column and encode every summary as a list
# of integer word ids.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['Summary'])

# One more than the number of distinct words the tokenizer has seen.
vocab_size = 1 + len(tokenizer.word_index)

sequences = tokenizer.texts_to_sequences(df['Summary'])

# Pad every summary to the length of the longest encoded summary.
max_sequence_length1 = max(map(len, sequences))
X_s = pad_sequences(sequences, maxlen=max_sequence_length1)

In [8]:
# Same encoding steps for the Review column.
# NOTE: `tokenizer`, `vocab_size` and `sequences` are re-bound here, so from
# this point on they describe the Review column only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['Review'])
vocab_size = 1 + len(tokenizer.word_index)

sequences = tokenizer.texts_to_sequences(df['Review'])

# Pad every review to the length of the longest encoded review.
max_sequence_length2 = max(map(len, sequences))
X_r = pad_sequences(sequences, maxlen=max_sequence_length2)

# Longest padded length across both text columns.
max_sequence_length = max(max_sequence_length1, max_sequence_length2)


In [9]:
# Number of leading rows used for modelling.
N_SAMPLES = 10000

# Feature tensor: for every sample, the outer product of its padded Summary
# ids (axis 1) and padded Review ids (axis 2).
#
# The rows are sliced *before* the product is taken. Building the product for
# the whole dataset first and slicing afterwards yields the same values, but
# materialises a (rows, summary_len, review_len) array for every row, and the
# slice is only a view, so that full array would stay alive.
# NOTE(review): the product keeps the integer dtype of X_s / X_r; with a large
# vocabulary the id products could exceed that dtype's range -- confirm.
X = np.expand_dims(X_s[:N_SAMPLES], axis=2) * np.expand_dims(X_r[:N_SAMPLES], axis=1)

# Binary target: 1 for ratings above 2, otherwise 0. Rows whose rating was
# coerced to 0 when the data was loaded therefore fall into class 0.
y = np.array(df['Rate'].astype(int))
y = np.where(y > 2, 1, 0)
y = y[:N_SAMPLES]

In [10]:
# Length of X's second axis, i.e. the padded Summary length.
np.shape(X)[1]

108

In [14]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional, Dropout

# Hold out 20% of the samples; the split is reproducible via random_state.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# An LSTM over the (summary_len, review_len) feature grid followed by a stack
# of dense layers that ends in a single sigmoid unit for the binary label.
model = Sequential([
    LSTM(128, input_shape=X.shape[1:], activation='relu'),
    Dense(128, activation='relu'),
    Dense(64, activation='elu'),
    Dense(32, activation='elu'),
    Dense(12, activation='relu'),
    Dense(1, activation='sigmoid'),
])
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# NOTE(review): the validation data passed here is the same held-out split
# that is evaluated below, so the final numbers are not from unseen data in
# the strict sense.
model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=10,
    batch_size=128,
)

# NOTE(review): the recorded run printed a loss of ~894 for this evaluation,
# which is very large for binary cross-entropy -- the raw id products fed to
# the LSTM are presumably unscaled; worth investigating.
loss, accuracy = model.evaluate(X_test, y_test)
print("Loss:", loss)
print("Accuracy:", accuracy)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Loss: 894.3329467773438
Accuracy: 0.7540000081062317


In [20]:
# Persist the trained model to disk so it can be reloaded later.
model.save(filepath='resources/lstm.h5')

In [21]:
# Reload the persisted model. `load_model` is imported from `tensorflow.keras`,
# the same package the model was built and saved with in this notebook, rather
# than from the standalone `keras` package, so saving and loading go through
# one Keras implementation.
from tensorflow.keras.models import load_model
model = load_model('resources/lstm.h5')

In [37]:
# Hand-written samples to classify; each entry is [review, summary].
new_texts = [
    ['Good', 'Very nice'],
    ['Worst experience', 'Poar quality plastic material is not good.'],
]

# Split the pairs into one list per text field.
summary = [summary_text for _, summary_text in new_texts]
review = [review_text for review_text, _ in new_texts]

In [38]:

# The training features used Summary ids from a tokenizer fitted on
# df['Summary'] and Review ids from a tokenizer fitted on df['Review'].
# By this point the name `tokenizer` refers to the Review tokenizer only (it
# is re-bound when the Review column is processed), so encoding the summaries
# with it would look words up in the wrong vocabulary. Fit a dedicated Summary
# tokenizer on the same column that was used for training instead.
summary_tokenizer = Tokenizer()
summary_tokenizer.fit_on_texts(df['Summary'])

# Pad to the lengths the network was built with, rather than hard-coding them:
# the model input is (batch, summary_len, review_len).
_, summary_len, review_len = model.input_shape

new_sequenceXs = summary_tokenizer.texts_to_sequences(summary)
new_sequenceXr = tokenizer.texts_to_sequences(review)
new_Xs = pad_sequences(new_sequenceXs, maxlen=summary_len)
new_Xr = pad_sequences(new_sequenceXr, maxlen=review_len)

# Same outer-product feature construction as for the training data.
tests = np.expand_dims(new_Xs, axis=2) * np.expand_dims(new_Xr, axis=1)

In [39]:
# Score the hand-written samples and report a label for each one; a sigmoid
# output of 0.5 or more counts as positive.
predictions = model.predict(tests)
for text, prediction in zip(new_texts, predictions):
    if prediction >= 0.5:
        sentiment = 'Positive'
    else:
        sentiment = 'Negative'
    print(f"Review: {text[0]}")
    print(f"Summary: {text[1]}")
    print(f"Sentiment: {sentiment}")


Review: Good
Summary: Very nice
Sentiment: Positive
Review: Worst experience
Summary: Poar quality plastic material is not good.
Sentiment: Negative


In [12]:
# from sklearn.preprocessing import OneHotEncoder
# import numpy as np

# encoder = OneHotEncoder(sparse_output=False)

# X = encoder.fit_transform(X)
# X = X.reshape(-1, 1)


In [13]:
# from sklearn.feature_extraction.text import TfidfVectorizer
# import numpy as np
# import re
# v = TfidfVectorizer()
# X_flat = X.ravel()
# X_tfidf = v.fit_transform(X_flat)
# y_int = y.astype(int)


###### 