In [41]:
import pandas as pd

In [42]:
df = pd.read_csv('/content/drive/MyDrive/Mini-Project/NLP_Spam_Detection/flipkart_product.csv', encoding='iso-8859-1', on_bad_lines='skip', lineterminator='\n')

#Data Cleaning


In [43]:
df=df.loc[:,['Rate','Summary\r']]

In [44]:
df.isnull().sum()
df.dropna(inplace=True)

In [45]:
df.head()

Unnamed: 0,Rate,Summary\r
0,5,Great cooler.. excellent air flow and for this...
1,5,Best budget 2 fit cooler. Nice cooling\r
2,3,The quality is good but the power of air is de...
3,1,Very bad product it's a only a fan\r
4,3,Ok ok product\r


In [46]:
df.rename(columns={'Summary\r':'Summary'},inplace=True)

In [47]:
set(df['Rate'])

{'1',
 '2',
 '3',
 '4',
 '5',
 'Bajaj DX 2 L/W Dry Iron',
 'Nova Plus Amaze NI 10 1100 W Dry Iron?¿èGrey & Turquoise)',
 'Pigeon Favourite Electric Kettle?¿è1.5 L, Silver, Black)',
 's'}

In [48]:
list_value=['0','1','2','3','4','5']
df.drop(df[~df['Rate'].isin(list_value)].index,inplace=True)

In [49]:
df.shape

(185545, 2)

In [50]:
for index in df.index:
    df.loc[index, 'Rate'] = 0 if int(df.loc[index, 'Rate']) <= 2 else 1

In [51]:
df['Rate'].value_counts()

Unnamed: 0_level_0,count
Rate,Unnamed: 1_level_1
1,160192
0,25353


In [52]:
import re
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [53]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [54]:
df['Summary']

Unnamed: 0,Summary
0,Great cooler.. excellent air flow and for this...
1,Best budget 2 fit cooler. Nice cooling\r
2,The quality is good but the power of air is de...
3,Very bad product it's a only a fan\r
4,Ok ok product\r
...,...
187430,Good\r
187431,Thanks\r
187432,Good\r
187433,Super\r


#Removing Stop words and Steming


In [55]:
corpus=[]
for i in df.index:
  review=re.sub('[^a-zA-z]',' ',df.loc[i,'Summary'])
  review=review.lower()
  review=review.split()
  # review=[ps.stem(word) for word in review if not word in stopwords.words('english')]
  review=[word for word in review if not word in stopwords.words('english')]
  review=' '.join(review)
  corpus.append(review)


In [56]:
# corpus

In [57]:
Y=df['Rate']

In [58]:
len(Y),len(corpus)

(185545, 185545)

#Split train-test Data

In [59]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(corpus,Y,test_size=0.20)

In [60]:
X1_train,X1_test,y1_train,y1_test=X_train,X_test,y_train,y_test

#Bag of words for Words Embeddings

In [63]:
## Create the Bag OF Words model
from sklearn.feature_extraction.text import CountVectorizer
## for Binary BOW enable binary=True
cv=CountVectorizer(max_features=2500,ngram_range=(1,2))

In [64]:
X_train=cv.fit_transform(X_train)
X_test=cv.transform(X_test)

In [65]:
X_train=X_train.toarray()
X_test=X_test.toarray()

In [66]:
len(X_train[0])

2500

In [67]:
cv.vocabulary_

{'good': 779,
 'service': 1964,
 'timely': 2224,
 'delivery': 456,
 'packing': 1479,
 'also': 28,
 'boy': 214,
 'good service': 851,
 'timely delivery': 2225,
 'packing also': 1480,
 'also good': 29,
 'good delivery': 798,
 'delivery boy': 458,
 'awesome': 82,
 'pad': 1482,
 'loved': 1180,
 'fast': 630,
 'ekart': 549,
 'fast delivery': 633,
 'product': 1629,
 'range': 1801,
 'good product': 843,
 'nice': 1347,
 'quality': 1747,
 'led': 1107,
 'good quality': 846,
 'quality product': 1778,
 'days': 433,
 'early': 532,
 'comment': 344,
 'osm': 1455,
 'suitable': 2128,
 'rough': 1893,
 'use': 2294,
 'print': 1615,
 'light': 1123,
 'weight': 2391,
 'value': 2326,
 'money': 1277,
 'nice good': 1360,
 'light weight': 1124,
 'value money': 2327,
 'worst': 2462,
 'ever': 567,
 'seen': 1951,
 'flipkart': 685,
 'bad': 99,
 'return': 1871,
 'refund': 1848,
 'option': 1446,
 'please': 1557,
 'something': 2046,
 'immediately': 1006,
 'disappointed': 498,
 'kind': 1080,
 'products': 1719,
 'take': 2

#Apply Naive Bayes

In [68]:
from sklearn.naive_bayes import MultinomialNB

In [69]:
y_train=y_train.astype('int')
y_test=y_test.astype('int')

In [70]:
spam_detect_model=MultinomialNB().fit(X_train,y_train)

In [71]:
y_pred=spam_detect_model.predict(X_test)

In [72]:
from sklearn.metrics import accuracy_score,classification_report

In [73]:
accuracy_score(y_test,y_pred)

0.9306637203912798

In [74]:
from sklearn.metrics import classification_report
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.78      0.68      0.73      5030
           1       0.95      0.97      0.96     32079

    accuracy                           0.93     37109
   macro avg       0.86      0.83      0.84     37109
weighted avg       0.93      0.93      0.93     37109



In [75]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test,y_pred)

array([[ 3435,  1595],
       [  978, 31101]])

In [76]:
X_train.shape,y_train.shape

((148436, 2500), (148436,))

In [78]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import accuracy_score

tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(corpus)
X = tokenizer.texts_to_sequences(corpus)

# Pad sequences to ensure uniform input length
max_len = max(len(seq) for seq in X)
X = pad_sequences(X, maxlen=max_len)

# Encode labels
encoder = LabelEncoder()
y = encoder.fit_transform(Y)

# Split the data into training and testing sets
X1_train, X1_test, y1_train, y1_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [79]:
# Define the LSTM model
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=64, input_length=max_len))
model.add(LSTM(100, return_sequences=False))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])




In [80]:
# Train the model
model.fit(X1_train, y1_train, epochs=10, batch_size=32, validation_data=(X1_test, y1_test), verbose=2)


Epoch 1/10
4639/4639 - 36s - 8ms/step - accuracy: 0.9355 - loss: 0.1809 - val_accuracy: 0.9430 - val_loss: 0.1602
Epoch 2/10
4639/4639 - 37s - 8ms/step - accuracy: 0.9453 - loss: 0.1541 - val_accuracy: 0.9423 - val_loss: 0.1612
Epoch 3/10
4639/4639 - 41s - 9ms/step - accuracy: 0.9489 - loss: 0.1452 - val_accuracy: 0.9428 - val_loss: 0.1640
Epoch 4/10
4639/4639 - 41s - 9ms/step - accuracy: 0.9518 - loss: 0.1383 - val_accuracy: 0.9427 - val_loss: 0.1685
Epoch 5/10
4639/4639 - 42s - 9ms/step - accuracy: 0.9535 - loss: 0.1330 - val_accuracy: 0.9410 - val_loss: 0.1698
Epoch 6/10
4639/4639 - 41s - 9ms/step - accuracy: 0.9555 - loss: 0.1279 - val_accuracy: 0.9410 - val_loss: 0.1749
Epoch 7/10
4639/4639 - 41s - 9ms/step - accuracy: 0.9575 - loss: 0.1241 - val_accuracy: 0.9412 - val_loss: 0.1780
Epoch 8/10
4639/4639 - 41s - 9ms/step - accuracy: 0.9593 - loss: 0.1200 - val_accuracy: 0.9413 - val_loss: 0.1865
Epoch 9/10
4639/4639 - 42s - 9ms/step - accuracy: 0.9607 - loss: 0.1158 - val_accuracy: 

<keras.src.callbacks.history.History at 0x7a0521fcb1f0>

In [81]:
# Predict on the test set
y_pred = model.predict(X_test)
y_pred = (y_pred > 0.5).astype(int)

# Evaluate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")


[1m1160/1160[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 31ms/step
Accuracy: 86.45%
