In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


In [2]:
path = "/kaggle/input/amazon-reviews"

In [3]:
df_train_original = pd.read_csv(f"{path}/train.csv" , header=None)
df_test_original = pd.read_csv(f'{path}/test.csv' , header = None)

In [4]:
df_train_original.columns = ['polarity', 'title', 'text']
df_test_original.columns = ['polarity', 'title', 'text']

In [6]:
from sklearn.model_selection import train_test_split

# Assuming df_train has columns like: ['text', 'polarity', ...]
df_train, _ = train_test_split(
    df_train_original,
    train_size=500_000,
    stratify=df_train_original['polarity'],  # keeps label balance
    random_state=42
)

df_test, _ = train_test_split(
    df_test_original,
    train_size=20_000,
    stratify=df_test_original['polarity'],  # keeps label balance
    random_state=42
)

print(df_train.shape)

print(df_test.shape)

(500000, 3)
(20000, 3)


In [7]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 500000 entries, 551458 to 2966877
Data columns (total 3 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   polarity  500000 non-null  int64 
 1   title     499968 non-null  object
 2   text      500000 non-null  object
dtypes: int64(1), object(2)
memory usage: 15.3+ MB


In [8]:
df_train.head()

Unnamed: 0,polarity,title,text
551458,2,Buen libro,Este libro... es una muy buena guia para dar e...
1988189,2,Great tasting salt for meats and other foods,"Try this product and you will never use ""old f..."
3333438,1,2002 RELEASE!,This is the 2002 release version and is twice ...
253226,1,Solar lights verus shady areas,Solar lighting doesn't work unless you have di...
3016970,2,Great tool,The rechargeable Dremel tool is a great multip...


In [9]:
# polarity column description
# 1 == Negative
# 2 == Positive
df_train['polarity'].unique()

array([2, 1])

In [10]:
df_train['polarity'].value_counts()

polarity
2    250000
1    250000
Name: count, dtype: int64

In [11]:
# LOWER CASE THE DATA OF COLUMN 'title'

df_train['title'] = df_train['title'].str.lower()

In [12]:
# LOWER CASE THE DATA OF COLUMN 'text'

df_train['text'] = df_train['text'].str.lower()

In [13]:
df_train['title'].sample(10)

2373965                                         not worth it
3415275                 good cheese, sausage could be better
2977235                                   five great minutes
1107519    florence fostrer jennings & friends murder on ...
1742776                        must have all breakbeat fans!
3576178    a few more calls to tech support and then it's...
1909329                   be aware if you are hypersensitive
1686298                        andy goldsworthy is a genius!
1803039                          the story starts to heat up
1637464                                     i'm only looking
Name: title, dtype: object

In [14]:
print("shape of df_train: ",df_train.shape)
print("shape of df_test: ",df_test.shape)

shape of df_train:  (500000, 3)
shape of df_test:  (20000, 3)


In [15]:
# nan values in df_train
df_train.isna().sum()

polarity     0
title       32
text         0
dtype: int64

In [16]:
#  nan values in df_test
df_test.isna().sum()

polarity    0
title       1
text        0
dtype: int64

In [17]:
print(df_train.duplicated().sum())
print(df_test.duplicated().sum())

6
0


In [18]:
df_train = df_train.drop_duplicates()
df_test = df_test.drop_duplicates()

In [19]:
# drop nan values
df_train = df_train.dropna()
df_test = df_test.dropna()

In [20]:
#  droping title as i think it is not needed for now

df_train = df_train.drop(['title'], axis=1)
df_test = df_test.drop(['title'], axis=1)

In [21]:
import string
translator = str.maketrans('','',string.punctuation)
digit = str.maketrans('','',string.digits)

In [22]:
# removing punctuations from title
# df_train['title'] = df_train['title'].apply(lambda x: x.translate(translator))

In [23]:
# removing punctuations from text
df_train['text'] = df_train['text'].apply(lambda x: x.translate(translator))

In [24]:
# removing punctuations from text
df_test['text'] = df_test['text'].apply(lambda x: x.translate(translator))

In [25]:
df_train.sample(10)

Unnamed: 0,polarity,text
2248496,2,we got this train when it was on sale for arou...
626346,1,i bought this about a year ago and never playe...
3564019,2,this book was an awesome goosebumps this book ...
475990,2,like this music great price for some tunes you...
3241186,2,i am just now taking an intro film photo class...
645860,2,this is another winner from ana ravi i absolu...
692953,2,im used to extremely plush beach towels but th...
1651428,1,i enjoyed two of john cases other novels but t...
2873979,2,this phone is exactly what i needed the flexib...
1823641,1,i love these cartoons and i always have howeve...


In [26]:
# removing the numbers from the "title"
# df_train['title'] = df_train['title'].apply(lambda x: x.translate(digit))

In [27]:
# removing the numbers from the "text"
df_train['text'] = df_train['text'].apply(lambda x: x.translate(digit))

In [28]:
# removing the numbers from the "text"
df_test['text'] = df_test['text'].apply(lambda x: x.translate(digit))

In [29]:
df_train.sample(10)

Unnamed: 0,polarity,text
911952,2,this swing is great especially if you do not h...
3459268,2,im not a fan of boy george but i was curious a...
2231607,2,most sociologists who have written about ritua...
1767858,2,it was an old picture i saw as a child so i en...
1360406,1,a poorly put together book with limited abilit...
655767,2,the sound of the album is predominantly an alt...
2419304,1,bought of them first day got the attention of...
239410,2,this pump works our entry fountain that comple...
1020770,1,product looks good but the heating element is ...
617612,2,this is a series of books written about a tri...


In [30]:
# removing extra spaces just in case
# df_train['title'] = df_train['title'].str.strip()
df_train['text'] = df_train['text'].str.strip()


In [31]:

df_test['text'] = df_test['text'].str.strip()

In [32]:
import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords


In [33]:
nltk.download('all')

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /usr/share/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to /usr/share/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /usr/share/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     /usr/share/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_eng is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /usr/share/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_ru is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package ave

True

In [34]:
stopword = set(stopwords.words('english'))
stemmer = PorterStemmer()

In [35]:
from tqdm import tqdm
tqdm.pandas()

In [36]:
# removing stop words and also applying the tokenizations

def tokenization_and_stopwords_removal(text):
    tokenized_text = word_tokenize(text)
    texts = [word for word in tokenized_text if word.lower() not in stopword ]
    stemmed_text = [stemmer.stem(word) for word in texts]


    return " ".join(stemmed_text)

df_train['text'] = df_train['text'].progress_apply(tokenization_and_stopwords_removal)

100%|██████████| 499962/499962 [07:37<00:00, 1093.25it/s]


In [37]:
df_test['text'] = df_test['text'].progress_apply(tokenization_and_stopwords_removal)

100%|██████████| 19999/19999 [00:18<00:00, 1072.64it/s]


In [38]:

df_train.sample(10)

Unnamed: 0,polarity,text
2098626,1,year first button stop work unit littl use con...
275926,2,thing comfort long period time sound superb up...
398546,1,horrend movi bore slow move charact despic clo...
545043,2,hard find decent clear mint local store mani f...
1538293,1,super high grade playback differ machin tape b...
3243646,1,obviou whoever design okay cover book never re...
524673,2,sword sheath nice littl issu sword full tang s...
1510403,1,sleep sack arriv smell strongli moth ball two ...
1221191,2,great age isnt violent get hook easili youv go...
965043,2,god ellen burstyn realli oscar pictur score ac...


In [39]:
print("df_train['polarity] ",  df_train['polarity'].value_counts())
print("")
print("df_test['polarity] ",  df_test['polarity'].value_counts())


df_train['polarity]  polarity
2    249988
1    249974
Name: count, dtype: int64

df_test['polarity]  polarity
2    10000
1     9999
Name: count, dtype: int64


In [40]:
print(f"df_test is null or not\n\n {df_test.isnull().sum()}")
print('')
print(f"df_train is null or not\n\n {df_train.isnull().sum()}")

df_test is null or not

 polarity    0
text        0
dtype: int64

df_train is null or not

 polarity    0
text        0
dtype: int64


In [41]:
# now negative = 0, positive = 1 for simplicity

df_train['polarity'] = df_train['polarity'].map({2:1, 1:0})
df_test['polarity'] = df_test['polarity'].map({2:1,1:0})

In [42]:
X_train = df_train['text'].astype('str')
Y_train = df_train['polarity']
X_val = df_test['text'].astype('str')
Y_val = df_test['polarity']

In [43]:
from gensim.models import Word2Vec
from tensorflow.keras.preprocessing.text import Tokenizer

vocab_size = 25_000
tokenizer = Tokenizer(num_words=vocab_size , oov_token="<OOV>")

word_index = tokenizer.word_index
tokenizer.fit_on_texts(X_train)


w2v_model = Word2Vec(sentences=[t.split() for t in X_train],
                     vector_size=300,
                     window=3,
                     min_count=5,
                     workers=8
                     )



embedding_dim = 300
embedding_matrix = np.zeros((vocab_size , embedding_dim))

for word , i in word_index.items():
    if i < vocab_size:
        if word in w2v_model.wv:
            embedding_matrix[i] = w2v_model.wv[word]

2025-11-14 07:28:22.804392: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1763105302.826314     330 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1763105302.832922     330 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [44]:
from tensorflow.keras.preprocessing.sequence import pad_sequences


Sequences = tokenizer.texts_to_sequences(X_train)
val_sequences = tokenizer.texts_to_sequences(X_val)



maxlen = 250
X_train = pad_sequences(sequences=Sequences, maxlen=maxlen, padding='post')

X_val = pad_sequences(sequences=val_sequences, maxlen=maxlen, padding='post')

In [48]:

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM , Embedding , Bidirectional , Dense , Dropout

model = Sequential([
    Embedding(input_dim=vocab_size,
              output_dim=300,
              weights=[embedding_matrix],
              input_length=maxlen,
              trainable=True
              ),

              Bidirectional(LSTM(32 , dropout = 0.5, recurrent_dropout=0.3)),

              Dropout(0.5),
              Dense(64, activation="relu"),
              Dense(1,activation="sigmoid")
])



In [49]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

In [None]:
model.fit(
          X_train , Y_train,
          validation_data=(X_val, Y_val),
          batch_size=32,
          epochs=3,
          verbose=1
          )

Epoch 1/3
[1m 3704/15624[0m [32m━━━━[0m[37m━━━━━━━━━━━━━━━━[0m [1m1:12:31[0m 365ms/step - accuracy: 0.4968 - loss: 0.6932

: 

In [None]:
loss, accuracy = model.evaluate(X_val, Y_val, verbose=1)
print(f"Validation Accuracy: {accuracy:.4f}")
print(f"loss fuction: {loss:.4f}")

In [None]:
plt.plot(history.history['accuracy'], label='Train Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.title('Model Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

plt.plot(history.history['loss'], label='Train Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.title('Model Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.show()

In [None]:
model.save("sentiment_bilstm_word2vec.h5")