In [4]:

import pandas as pd
import numpy as np
import re
import os
import nltk
from nltk.corpus import stopwords

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

import joblib

# Download stopwords if missing
nltk.download('stopwords')
stop = set(stopwords.words('english'))


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\asafc\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
df = pd.read_csv("fake_reviews.csv")   # Use your dataset file name

print("Dataset Loaded Successfully")
print(df.head())
print(df.info())


Dataset Loaded Successfully
             category  rating label  \
0  Home_and_Kitchen_5     5.0    CG   
1  Home_and_Kitchen_5     5.0    CG   
2  Home_and_Kitchen_5     5.0    CG   
3  Home_and_Kitchen_5     1.0    CG   
4  Home_and_Kitchen_5     5.0    CG   

                                               text_  
0  Love this!  Well made, sturdy, and very comfor...  
1  love it, a great upgrade from the original.  I...  
2  This pillow saved my back. I love the look and...  
3  Missing information on how to use it, but it i...  
4  Very nice set. Good quality. We have had the s...  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40432 entries, 0 to 40431
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   category  40432 non-null  object 
 1   rating    40432 non-null  float64
 2   label     40432 non-null  object 
 3   text_     40432 non-null  object 
dtypes: float64(1), object(3)
memory usage: 1.2+ MB
None


In [6]:
# print(df['label'].value_counts())

In [7]:
# df['rating'].hist()
# plt.title("Rating distribution")
# plt.show()

In [8]:
# df['text_len'] = df['text_'].str.len()
# sns.histplot(df['text_len'], bins=50)
# plt.title("Review length distribution")
# plt.show()


Text preprocessing

In [9]:
# import re
# import string
# import nltk
# from nltk.corpus import stopwords
# nltk.download('stopwords')
# stop = set(stopwords.words('english'))

# def clean_text(s):
#     s = str(s).lower()
#     s = re.sub(r"http\S+|www\S+","", s)
#     s = re.sub(r"[^a-z0-9\s]", " ", s)
#     s = re.sub(r"\s+", " ", s).strip()
#     tokens = [w for w in s.split() if w not in stop]
#     return " ".join(tokens)

# df['text_clean'] = df['text_'].apply(clean_text)


In [10]:
# from sklearn.feature_extraction.text import TfidfVectorizer

# tfidf = TfidfVectorizer(max_features=30000, ngram_range=(1,2))
# X_tfidf = tfidf.fit_transform(df['text_clean'])
# y = df['label'].map({'CG':1,'OR':0}).values   # adjust mapping if needed


In [11]:
# from tensorflow.keras.preprocessing.text import Tokenizer
# from tensorflow.keras.preprocessing.sequence import pad_sequences

# MAX_WORDS = 30000
# MAX_LEN = 200

# tokenizer = Tokenizer(num_words=MAX_WORDS)
# tokenizer.fit_on_texts(df['text_clean'])
# seqs = tokenizer.texts_to_sequences(df['text_clean'])
# X_seq = pad_sequences(seqs, maxlen=MAX_LEN)


In [12]:
# ===================== TEXT CLEANING FUNCTION =====================
def clean_text(s):
    s = str(s).lower()                                      # Lowercase
    s = re.sub(r"http\S+|www\S+", "", s)                    # Remove links
    s = re.sub(r"[^a-z0-9\s]", " ", s)                      # Remove symbols
    s = re.sub(r"\s+", " ", s).strip()                      # Normalize spaces
    tokens = [w for w in s.split() if w not in stop]        # Remove stopwords
    return " ".join(tokens)


In [13]:
# ===================== APPLY CLEANING =====================
df["text_clean"] = df["text_"].apply(clean_text)

print("Cleaned text sample:")
print(df["text_clean"].head())


Cleaned text sample:
0    love well made sturdy comfortable love pretty
1    love great upgrade original mine couple years
2          pillow saved back love look feel pillow
3      missing information use great product price
4             nice set good quality set two months
Name: text_clean, dtype: object


In [14]:
# ===================== LABEL MAPPING =====================
# CG = Fake Review = 1
# OR = Real Review = 0

df["label"] = df["label"].map({"CG": 1, "OR": 0})
print(df["label"].value_counts())


label
1    20216
0    20216
Name: count, dtype: int64


In [15]:
# ===================== TF-IDF VECTORIZER =====================
tfidf = TfidfVectorizer(
    max_features=30000,
    ngram_range=(1,2)
)

X = tfidf.fit_transform(df["text_clean"])
y = df["label"]

print("TF-IDF Shape:", X.shape)


TF-IDF Shape: (40432, 30000)


In [16]:
# ===================== TRAIN TEST SPLIT =====================
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)


In [17]:
# ===================== TRAIN MODEL =====================
lr = LogisticRegression(max_iter=2000)
lr.fit(X_train, y_train)

print("Model Training Completed")


Model Training Completed


In [18]:
# ===================== MODEL EVALUATION =====================
y_pred = lr.predict(X_test)

print("\nAccuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Show sample probabilities
print("\nSample probabilities:")
print(lr.predict_proba(X_test)[:5])



Accuracy: 0.9063929763818475

Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.92      0.91      4044
           1       0.92      0.90      0.91      4043

    accuracy                           0.91      8087
   macro avg       0.91      0.91      0.91      8087
weighted avg       0.91      0.91      0.91      8087


Sample probabilities:
[[0.24644097 0.75355903]
 [0.61503535 0.38496465]
 [0.05146581 0.94853419]
 [0.78654542 0.21345458]
 [0.94409653 0.05590347]]


In [19]:
# ===================== SAVE MODEL & VECTORIZER =====================
os.makedirs("models", exist_ok=True)

joblib.dump(lr, "models/lr_tfidf_model.joblib")
joblib.dump(tfidf, "models/tfidf_vectorizer.joblib")

print("\n✅ Model and TF-IDF vectorizer saved successfully!")



✅ Model and TF-IDF vectorizer saved successfully!
