In [5]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [6]:
df_train = pd.read_csv("/kaggle/input/amazon-reviews/train.csv" , header=None)
df_test = pd.read_csv('/kaggle/input/amazon-reviews/test.csv' , header = None)

In [7]:
df_train.columns = ['polarity', 'title', 'text']
df_test.columns = ['polarity', 'title', 'text']

In [8]:
df_train_raw = df_train.copy(deep=True)
df_test_raw = df_train.copy(deep=True)

In [9]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3600000 entries, 0 to 3599999
Data columns (total 3 columns):
 #   Column    Dtype 
---  ------    ----- 
 0   polarity  int64 
 1   title     object
 2   text      object
dtypes: int64(1), object(2)
memory usage: 82.4+ MB


In [10]:
df_train.head()

Unnamed: 0,polarity,title,text
0,2,Stuning even for the non-gamer,This sound track was beautiful! It paints the ...
1,2,The best soundtrack ever to anything.,I'm reading a lot of reviews saying that this ...
2,2,Amazing!,This soundtrack is my favorite music of all ti...
3,2,Excellent Soundtrack,I truly like this soundtrack and I enjoy video...
4,2,"Remember, Pull Your Jaw Off The Floor After He...","If you've played the game, you know how divine..."


In [11]:
# polarity column description
# 1 == Negative
# 2 == Positive
df_train['polarity'].unique()

array([2, 1])

In [12]:
df_train['polarity'].value_counts()

polarity
2    1800000
1    1800000
Name: count, dtype: int64

In [13]:
# LOWER CASE THE DATA OF COLUMN 'title'

df_train['title'] = df_train['title'].str.lower()

In [14]:
# LOWER CASE THE DATA OF COLUMN 'text'

df_train['text'] = df_train['text'].str.lower()

In [15]:
df_train['title'].sample(10)

406534                great fun for kids but not to assemble
112578     this book combined with &#65533;psychic gifts&...
3169790                                   perfect for citrus
3529103                                  love perreti books!
546390                                     mummy laid an egg
1996509                       an insightful, delightful book
1401521                         jesus christ superstar(1973)
634646                               very nice for the price
1086578                                          hidden gem.
2918966                                    a disappointment.
Name: title, dtype: object

In [16]:
print("shape of df_train: ",df_train.shape)
print("shape of df_test: ",df_test.shape)

shape of df_train:  (3600000, 3)
shape of df_test:  (400000, 3)


In [17]:
# nan values in df_train
df_train.isna().sum()

polarity      0
title       207
text          0
dtype: int64

In [18]:
#  nan values in df_test
df_test.isna().sum()

polarity     0
title       24
text         0
dtype: int64

In [20]:
print(df_train.duplicated().sum())
print(df_test.duplicated().sum())

337
0


In [21]:
df_train = df_train.drop_duplicates()
df_test = df_test.drop_duplicates()

In [22]:
# drop nan values 
df_train = df_train.dropna()
df_test = df_test.dropna()

In [23]:
#  droping title as i think it is not needed for now 

df_train = df_train.drop(['title'], axis=1)
df_test = df_test.drop(['title'], axis=1)

In [24]:
import string 
translator = str.maketrans('','',string.punctuation)
digit = str.maketrans('','',string.digits)

In [25]:
# removing punctuations from title
# df_train['title'] = df_train['title'].apply(lambda x: x.translate(translator))

In [26]:
# removing punctuations from text
df_train['text'] = df_train['text'].apply(lambda x: x.translate(translator))

In [27]:
# removing punctuations from text
df_test['text'] = df_test['text'].apply(lambda x: x.translate(translator))

In [29]:
df_train.sample(10)

Unnamed: 0,polarity,text
3201398,1,i recommend against buying this album because ...
2197633,2,this book contains much analysis not found any...
2309678,2,judith mcnaught is a great author and her book...
2415526,2,i hate all anime the only exception is cowboy...
1362314,2,i just watched another movie by nora roberts t...
336586,1,i am a fan of rohrs but reading this book did ...
1907494,2,my 2 and 4 yearolds really liked this book the...
1280478,2,great summary of where we are today with the c...
2856796,2,my own kids are grown or nearly grown but afte...
1461875,2,the effectiveness of the sonicare ps1 is most ...


In [30]:
# removing the numbers from the "title" 
# df_train['title'] = df_train['title'].apply(lambda x: x.translate(digit))

In [31]:
# removing the numbers from the "text"
df_train['text'] = df_train['text'].apply(lambda x: x.translate(digit))

In [32]:
# removing the numbers from the "text"
df_test['text'] = df_test['text'].apply(lambda x: x.translate(digit))

In [33]:
df_train.sample(10)

Unnamed: 0,polarity,text
451198,2,as for the tapes they are recommended for my h...
444033,1,this seemed watered down it says shake well on...
294396,1,boring its hard to believe that this book made...
1970621,2,i purchased this for a gift but own it myself ...
697600,2,flipped by wendelin van draanen is a really go...
3508404,1,i bought this locket and it got sent to me bro...
739904,2,it fits our old bowl and was a reasonably pric...
2408946,1,avoid this one hemmingson is normally a bette...
125004,1,ive had this trimmer for like years and i onl...
2377971,2,from day one i fell in love with ruben studdar...


In [34]:
# removing extra spaces just in case 
# df_train['title'] = df_train['title'].str.strip()
df_train['text'] = df_train['text'].str.strip()


In [35]:

df_test['text'] = df_test['text'].str.strip()

In [36]:
import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords  


In [37]:
stopword = set(stopwords.words('english'))
stemmer = PorterStemmer()

In [38]:
from tqdm import tqdm
tqdm.pandas()

In [39]:
# removing stop words and also applying the tokenizations

def tokenization_and_stopwords_removal(text):
    tokenized_text = word_tokenize(text)
    texts = [word for word in tokenized_text if word.lower() not in stopword ]
    stemmed_text = [stemmer.stem(word) for word in texts]
    

    return " ".join(stemmed_text)

df_train['text'] = df_train['text'].progress_apply(tokenization_and_stopwords_removal)

100%|██████████| 3599456/3599456 [54:17<00:00, 1105.06it/s]


In [40]:
df_test['text'] = df_test['text'].progress_apply(tokenization_and_stopwords_removal)

100%|██████████| 399976/399976 [05:53<00:00, 1130.71it/s]


In [41]:

df_train.sample(10)

Unnamed: 0,polarity,text
1748947,1,like poetri poem book written style creativ sh...
572609,1,penn relev thing say unfortun version applewoo...
1619953,2,well guysi laweath nicei went bookstor start l...
2008452,2,chimera defin great album song song bonu cd pr...
2897368,2,uhh better right go realli disappoint play gam...
2447967,1,tile suggest rastafarian exlplain rastafarian ...
1702038,2,known qualiti pictur appear life magazin still...
2930495,2,tri tub drain run slow littl difficult put dra...
214676,2,wait movi dvd tri preorder see still theatr mi...
1332818,2,hold well without crunchi sticki retain mallea...


In [42]:
print("df_train['polarity] ",  df_train['polarity'].value_counts())
print("")
print("df_test['polarity] ",  df_test['polarity'].value_counts())


df_train['polarity]  polarity
2    1799853
1    1799603
Name: count, dtype: int64

df_test['polarity]  polarity
2    199992
1    199984
Name: count, dtype: int64


In [43]:
print(f"df_test is null or not\n\n {df_test.isnull().sum()}")
print('')
print(f"df_train is null or not\n\n {df_train.isnull().sum()}")

df_test is null or not

 polarity    0
text        0
dtype: int64

df_train is null or not

 polarity    0
text        0
dtype: int64


In [44]:
# now negative = 0, positive = 1 for simplicity

df_train['polarity'] = df_train['polarity'].map({2:1, 1:0})
df_test['polarity'] = df_test['polarity'].map({2:1,1:0})

In [6]:
X_train = df_train['text'].astype('str')
Y_train = df_train['polarity']
X_val = df_test['text'].astype('str')
Y_val = df_test['polarity']

NameError: name 'df_train' is not defined

In [5]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


2025-11-12 17:55:12.426358: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1762970112.685712     252 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1762970112.756351     252 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [55]:
vocab_size = 100000
tokenizer = Tokenizer(num_words=vocab_size , oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

Sequences = tokenizer.texts_to_sequences(X_train)
val_sequences = tokenizer.texts_to_sequences(X_val)

In [4]:
word_index = tokenizer.word_index

maxlen = 800
X_train = pad_sequences(sequences=Sequences, maxlen=maxlen, padding='post')

X_val = pad_sequences(sequences=val_sequences, maxlen=maxlen, padding='post')

NameError: name 'tokenizer' is not defined

In [2]:
from gensim.models import Word2Vec

w2v_model = Word2Vec(sentences=[t.split() for t in X_train],
                     vector_size=300,
                     window=3,
                     min_count=5,
                     workers=8
                     )


import numpy as np

embedding_dim = 300
embedding_matrix = np.zeros((vocab_size , embedding_dim))

for word , i in word_index.items():
    if i < vocab_size:
        if word in w2v_model.wv:
            embedding_matrix[i] = w2v_model.wv[word]

NameError: name 'X_train' is not defined

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM , Embedding , Bidirectional , Dense , Dropout 

model = Sequential([
    Embedding(input_dim=vocab_size,
              output_dim=300,
              weights=[embedding_matrix],
              input_length=maxlen,
              trainable=True
              ),

              Bidirectional(LSTM(128, return_sequences=True, dropout = 0.5, recurrent_dropout=0.3)),
              Bidirectional(LSTM(64, dropout = 0.5, recurrent_dropout=0.3)),
              
              Dropout(0.5),
              Dense(64, activation="relu"),
              Dense(1,activation="sigmoid")
])

SyntaxError: positional argument follows keyword argument (2840052059.py, line 7)

In [None]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

In [None]:
model.fit(X_train , Y_train,
          validation_data=(X_val, Y_val),
          batch_size=1024,
          epochs=3,
          verbose=1)

In [None]:
loss, accuracy = model.evaluate(X_val, Y_val, verbose=1)
print(f"Validation Accuracy: {accuracy:.4f}")


In [None]:
import matplotlib.pyplot as plt

plt.plot(history.history['accuracy'], label='Train Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.title('Model Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

plt.plot(history.history['loss'], label='Train Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.title('Model Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.show()


In [None]:
model.save("sentiment_bilstm_word2vec.h5")
