In [77]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from gensim.models import KeyedVectors
import gensim.downloader as api

In [79]:
# Load pre-trained Word2Vec embeddings
word2vec = api.load("word2vec-google-news-300")

In [80]:
train_data = pd.read_json('train.json')
test_data = pd.read_json('test.json')

#Drop Duplicates
train_data = train_data.drop_duplicates(subset='reviews').reset_index(drop=True)
print(train_data)

#Find Duplicates
findDuplicate = train_data.duplicated()
print(findDuplicate)

                                                reviews  sentiments
0     I bought this belt for my daughter in-law for ...           1
1     The size was perfect and so was the color.  It...           1
2     Fits and feels good, esp. for doing a swim rac...           1
3     These socks are absolutely the best. I take pi...           1
4     Thank you so much for the speedy delivery they...           1
...                                                 ...         ...
7180  I bought these shirts (black, medium) to wear ...           0
7181  At first, I thought this scarf might not be th...           1
7182  I am very picky when it comes to bras.  I want...           1
7183  This jacket is wind and water resistant, but n...           0
7184  These are extremely confortable. The material ...           1

[7185 rows x 2 columns]
0       False
1       False
2       False
3       False
4       False
        ...  
7180    False
7181    False
7182    False
7183    False
7184    False
Lengt

In [81]:
def clean_text(text):
    text = text.lower()  # convert to lowercase
    text = re.sub(r'<[^>]+>', '', text)  # remove HTML tags
    text = re.sub(r'[^a-z\s]', '', text)  # remove non-alphabet characters
    return text

train_data['cleaned_reviews'] = train_data['reviews'].apply(clean_text)
test_data['cleaned_reviews'] = test_data['reviews'].apply(clean_text)
print(train_data[['reviews', 'cleaned_reviews']].head())
print(test_data[['reviews', 'cleaned_reviews']].head())

                                             reviews  \
0  I bought this belt for my daughter in-law for ...   
1  The size was perfect and so was the color.  It...   
2  Fits and feels good, esp. for doing a swim rac...   
3  These socks are absolutely the best. I take pi...   
4  Thank you so much for the speedy delivery they...   

                                     cleaned_reviews  
0  i bought this belt for my daughter inlaw for c...  
1  the size was perfect and so was the color  it ...  
2  fits and feels good esp for doing a swim race ...  
3  these socks are absolutely the best i take pil...  
4  thank you so much for the speedy delivery they...  
                                             reviews  \
0  I bought 2 sleepers.  sleeper had holes in the...   
1  I dare say these are just about the sexiest th...   
2  everything about the transaction (price, deliv...   
3  Not bad for just a shirt.  Very durable, and m...   
4  These are truly wrinkle free and longer than t... 

In [82]:
#tokenize reviews
train_data['tokenized_reviews'] = train_data['cleaned_reviews'].apply(word_tokenize)
test_data['tokenized_reviews'] = test_data['cleaned_reviews'].apply(word_tokenize)
print(train_data[['tokenized_reviews', 'cleaned_reviews']].head())
print(test_data[['tokenized_reviews', 'cleaned_reviews']].head())

                                   tokenized_reviews  \
0  [i, bought, this, belt, for, my, daughter, inl...   
1  [the, size, was, perfect, and, so, was, the, c...   
2  [fits, and, feels, good, esp, for, doing, a, s...   
3  [these, socks, are, absolutely, the, best, i, ...   
4  [thank, you, so, much, for, the, speedy, deliv...   

                                     cleaned_reviews  
0  i bought this belt for my daughter inlaw for c...  
1  the size was perfect and so was the color  it ...  
2  fits and feels good esp for doing a swim race ...  
3  these socks are absolutely the best i take pil...  
4  thank you so much for the speedy delivery they...  
                                   tokenized_reviews  \
0  [i, bought, sleepers, sleeper, had, holes, in,...   
1  [i, dare, say, these, are, just, about, the, s...   
2  [everything, about, the, transaction, price, d...   
3  [not, bad, for, just, a, shirt, very, durable,...   
4  [these, are, truly, wrinkle, free, and, longer... 

In [83]:
# Remove Stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

train_data['tokenized_reviews'] = train_data['tokenized_reviews'].apply(lambda tokens: [t for t in tokens if t not in stop_words])
test_data['tokenized_reviews'] = test_data['tokenized_reviews'].apply(lambda tokens: [t for t in tokens if t not in stop_words])
print(train_data[['tokenized_reviews', 'cleaned_reviews']].head())
print(test_data[['tokenized_reviews', 'cleaned_reviews']].head())

                                   tokenized_reviews  \
0  [bought, belt, daughter, inlaw, christmas, loved]   
1    [size, perfect, color, looked, like, web, page]   
2  [fits, feels, good, esp, swim, race, highly, r...   
3  [socks, absolutely, best, take, pilates, class...   
4  [thank, much, speedy, delivery, came, time, re...   

                                     cleaned_reviews  
0  i bought this belt for my daughter inlaw for c...  
1  the size was perfect and so was the color  it ...  
2  fits and feels good esp for doing a swim race ...  
3  these socks are absolutely the best i take pil...  
4  thank you so much for the speedy delivery they...  
                                   tokenized_reviews  \
0  [bought, sleepers, sleeper, holes, arm, pit, a...   
1  [dare, say, sexiest, things, ive, ever, worn, ...   
2  [everything, transaction, price, delivery, tim...   
3  [bad, shirt, durable, matched, teams, colors, ...   
4  [truly, wrinkle, free, longer, average, womans... 

[nltk_data] Downloading package stopwords to C:\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [84]:
# Download required NLTK resources (if you haven't already)
nltk.download('punkt')
nltk.download('wordnet')

# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

# Function to lemmatize text
def lemmatize_text(text):
    # Lemmatize each word and join them back into a single string
    lemmatized_words = [lemmatizer.lemmatize(word) for word in text]
    return ' '.join(lemmatized_words)

# Apply the lemmatization function to the 'reviews' column
train_data['lemmatized_reviews'] = train_data['tokenized_reviews'].apply(lemmatize_text)
test_data['lemmatized_reviews'] = test_data['tokenized_reviews'].apply(lemmatize_text)

# Check the results
print(train_data[['tokenized_reviews', 'lemmatized_reviews']])
print(test_data[['tokenized_reviews', 'lemmatized_reviews']])

[nltk_data] Downloading package punkt to C:\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to C:\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


                                      tokenized_reviews  \
0     [bought, belt, daughter, inlaw, christmas, loved]   
1       [size, perfect, color, looked, like, web, page]   
2     [fits, feels, good, esp, swim, race, highly, r...   
3     [socks, absolutely, best, take, pilates, class...   
4     [thank, much, speedy, delivery, came, time, re...   
...                                                 ...   
7180  [bought, shirts, black, medium, wear, daily, b...   
7181  [first, thought, scarf, might, good, quality, ...   
7182  [picky, comes, bras, want, something, support,...   
7183  [jacket, wind, water, resistant, waterproof, s...   
7184  [extremely, confortable, material, soft, cotto...   

                                     lemmatized_reviews  
0            bought belt daughter inlaw christmas loved  
1               size perfect color looked like web page  
2     fit feel good esp swim race highly recommend c...  
3     sock absolutely best take pilate class hot foo...  
4

In [95]:
# Prepare data for training
x, y = (train_data['reviews'].values, train_data['sentiments'].values)

# Tokenize and pad the reviews
tokenizer = Tokenizer(lower=True)
tokenizer.fit_on_texts(x)
x_sequence = tokenizer.texts_to_sequences(x)
x_padding = pad_sequences(x_sequence, maxlen=32, padding='post')

# Create embedding matrix using Word2Vec
vocab_size = len(tokenizer.word_index) + 1  # +1 for padding
embedding_dim = 300
embedding_matrix = np.zeros((vocab_size, embedding_dim))

# Map the words in our tokenizer's vocabulary to Word2Vec embeddings
for word, i in tokenizer.word_index.items():
    if word in word2vec.key_to_index:
        embedding_matrix[i] = word2vec[word]
    else:
        embedding_matrix[i] = np.random.normal(size=(embedding_dim,))

# Split data into training and validation sets
x_train, x_val, y_train, y_val = train_test_split(x_padding, y, test_size=0.25, random_state=1)

# Build the RNN model
model = Sequential()

# Embedding layer with Word2Vec weights, non-trainable
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, weights=[embedding_matrix], trainable=False))

# LSTM layer (captures sequential data)
model.add(LSTM(128, return_sequences=False))

# Dense layer with ReLU activation
model.add(Dense(64, activation='relu'))

# Dropout layer to reduce overfitting
model.add(Dropout(0.5))

# Output layer with sigmoid activation for binary classification
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
batch_size = 32
epochs = 5

history = model.fit(x_train, y_train, epochs=epochs, batch_size=batch_size,
                    validation_data=(x_val, y_val))

# Evaluate the model performance on the test data
x_test_sequence = tokenizer.texts_to_sequences(test_data['reviews'])
x_test_padding = pad_sequences(x_test_sequence, maxlen=32, padding='post')
y_pred = (model.predict(x_test_padding) > 0.5).astype("int32")

# Add predictions to test_df
test_data['predicted_sentiments'] = y_pred

# Display first few predictions
print(test_data[['reviews', 'predicted_sentiments']].head())

Epoch 1/5
[1m169/169[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 16ms/step - accuracy: 0.8364 - loss: 0.4769 - val_accuracy: 0.8447 - val_loss: 0.3747
Epoch 2/5
[1m169/169[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 16ms/step - accuracy: 0.8526 - loss: 0.3709 - val_accuracy: 0.8614 - val_loss: 0.3118
Epoch 3/5
[1m169/169[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 15ms/step - accuracy: 0.8808 - loss: 0.2903 - val_accuracy: 0.8687 - val_loss: 0.3003
Epoch 4/5
[1m169/169[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 15ms/step - accuracy: 0.8937 - loss: 0.2583 - val_accuracy: 0.8765 - val_loss: 0.2846
Epoch 5/5
[1m169/169[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 16ms/step - accuracy: 0.9124 - loss: 0.2193 - val_accuracy: 0.8742 - val_loss: 0.3107
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step
                                             reviews  predicted_sentiments
0  I bought 2 sleepers.  sleeper had holes

In [97]:
# Save the results to a submission file
submission_data = test_data[['reviews', 'predicted_sentiments']]
submission_data.to_csv('Results_RNN.csv', index=False)