In [3]:
import re
import string
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import kagglehub
from kagglehub import KaggleDatasetAdapter
import nltk
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
from collections import Counter

<h3>Load the dataset</h3>

In [4]:

df = pd.read_csv('fake_reviews_dataset.csv')

df

print(df.columns)

Index(['category', 'rating', 'label', 'text_'], dtype='object')


<h3>Adding User into the dataset</h3>
<h5>To see the effect of fradulent user</h5>

<h3>Pre Processing - Cleanup the dataset</h3>

Download (run once)

In [None]:
nltk.download('punkt')
nltk.download('stopwords')

In [5]:


# Initialize tools
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def clean_text(text):
    
    # Convert to lowercase
    text = text.lower()

    # Remove URLs
    text = re.sub(r'http\S+|www\S+', '', text)

    # Remove special characters, digits, and punctuation EXCEPT exclamation marks
    text = re.sub(r'[^a-z\s]', '', text) # Keep '!' for exclamation mark count

    # Tokenize the text into words
    words = word_tokenize(text)

    # Remove stopwords
    words = [word for word in words if word not in stop_words]

    # Apply stemming (optional, depends on your model needs)
    words = [stemmer.stem(word) for word in words]

    # Join words back into a string
    cleaned_text = " ".join(words)

    return cleaned_text

# Example usage: Clean the 'text' column in the DataFrame
# Apply the cleaning function to the 'text_' column
df['cleaned_review'] = df['text_'].apply(clean_text)

# Display the cleaned reviews alongside the original ones
print(df[['text_', 'cleaned_review']].head())



                                               text_  \
0  Love this!  Well made, sturdy, and very comfor...   
1  love it, a great upgrade from the original.  I...   
2  This pillow saved my back. I love the look and...   
3  Missing information on how to use it, but it i...   
4  Very nice set. Good quality. We have had the s...   

                                     cleaned_review  
0  love well made sturdi comfort love itveri pretti  
1      love great upgrad origin ive mine coupl year  
2            pillow save back love look feel pillow  
3               miss inform use great product price  
4               nice set good qualiti set two month  


<h3>Feature Engineering</h3>
Contains:
1. Review Length
2. Sentiment Polarity
3. Sentiment Subjectivity
4. ! Count
5. ? Count (maybe remove)
6. All-Caps word count
7. Superlative word count
8. Unique-word ratio
9. Rating score
10. Category one-hot encoding

In [7]:
print(df.columns.tolist())

['category', 'rating', 'label', 'text_', 'cleaned_review']


In [8]:
import pandas as pd
from textblob import TextBlob

# assume df is already loaded and has columns: 
#   'text_' (raw review), 'cleaned_review' (preprocessed text), 
#   'rating' (numeric), and 'category' (string)

# 1. Review length (number of words in the cleaned text)
df['review_length'] = df['cleaned_review'].apply(lambda x: len(x.split()))

# 2. Sentiment polarity (range: -1 to +1)
df['sentiment_polarity'] = df['cleaned_review'].apply(
    lambda x: TextBlob(x).sentiment.polarity
)

# 3. Sentiment subjectivity (range: 0 to 1)
df['sentiment_subjectivity'] = df['cleaned_review'].apply(
    lambda x: TextBlob(x).sentiment.subjectivity
)

# 4. Exclamation mark count (from raw text)
# df['exclamation_count'] = df['text_'].apply(lambda x: x.count('!'))

# 5. Question mark count (from raw text)
# df['question_count'] = df['text_'].apply(lambda x: x.count('?'))

# 6. All‑CAPS word count (indicates “shouting” or emphasis)
df['allcaps_count'] = df['text_'].apply(
    lambda txt: sum(1 for w in txt.split() if w.isupper() and len(w) > 1)
)

# 7. Superlative words count
superlatives = ['best', 'worst', 'perfect', 'amazing', 
                'excellent', 'fantastic', 'unbelievable']
df['superlative_count'] = df['cleaned_review'].apply(
    lambda x: sum(1 for w in x.split() if w in superlatives)
)

# 8. Unique‑word ratio (lexical diversity)
df['unique_word_ratio'] = df['cleaned_review'].apply(
    lambda txt: len(set(txt.split())) / max(len(txt.split()), 1)
)

# 9. Rating score (use the star rating directly)
df['rating_score'] = df['rating'].astype(float)

# 10. Category one‑hot encoding
# At this point, df contains all engineered features alongside:
#  - 'cleaned_review' (for TF‑IDF vectorization)
#  - 'label'         (your target: real vs. fake)



In [9]:
print(df[['review_length',
          'sentiment_polarity',
          'sentiment_subjectivity',
          'allcaps_count',
          'superlative_count',
          'unique_word_ratio',
          'rating_score']].head())

   review_length  sentiment_polarity  sentiment_subjectivity  allcaps_count  \
0              8                0.50                   0.600              0   
1              8                0.65                   0.675              0   
2              7                0.25                   0.300              0   
3              6                0.80                   0.750              0   
4              7                0.65                   0.800              0   

   superlative_count  unique_word_ratio  rating_score  
0                  0           0.875000           5.0  
1                  0           1.000000           5.0  
2                  0           0.857143           5.0  
3                  0           1.000000           1.0  
4                  0           0.857143           5.0  


<h3>TF-IDF Vectorization of Cleand Review</h3>

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
from sklearn.model_selection import train_test_split

# 1) Vectorize text
tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1,3), stop_words='english')
X_text = tfidf.fit_transform(df['cleaned_review'])

# 2) Stack engineered features
numeric_cols = [
    'review_length','sentiment_polarity','sentiment_subjectivity',
    'allcaps_count','superlative_count',
    'unique_word_ratio','rating_score'
]
X_num = df[numeric_cols].values
X = hstack([X_text, X_num])
y = df['label'].values

# Map the string labels to numeric
df['y'] = df['label'].map({'OR': 0, 'CG': 1})

# Stack your features into X as before
X = hstack([X_text, X_num])

#Pull the numeric target
y = df['y'].values

#Split into train/test
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.30,
    random_state=42,
    stratify=y
)


<h3>Cross Validation and Advanced Parameter Tuning</h3>
<h5>To get the best parameter</h5>

In [11]:
from sklearn.model_selection import train_test_split, cross_val_score
import xgboost as xgb
import optuna
from sklearn.metrics import accuracy_score

# Cross-validation
cv_scores = cross_val_score(xgb.XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss'),
                            X, y, cv=5, scoring='accuracy', n_jobs=-1)

print(f'Cross-validation accuracy: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}')

# Hyperparameter Tuning with Optuna
def objective(trial):
    param = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),
        'learning_rate': trial.suggest_uniform('learning_rate', 0.01, 0.3),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),
        'eval_metric': 'logloss',
        'use_label_encoder': False
    }
    
    model = xgb.XGBClassifier(**param)
    
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return accuracy_score(y_test, y_pred)

# Create Optuna study
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50)

print("Best Parameters:", study.best_params)
print("Best Accuracy:", study.best_value)


[I 2025-04-21 23:09:52,886] A new study created in memory with name: no-name-a8cc6cf9-aaef-4d15-b7c1-f0e4a3a3757a


Cross-validation accuracy: 0.8278 ± 0.0205


  'learning_rate': trial.suggest_uniform('learning_rate', 0.01, 0.3),
  'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),
[I 2025-04-21 23:10:34,020] Trial 0 finished with value: 0.8436933223413026 and parameters: {'n_estimators': 422, 'learning_rate': 0.026953588573301247, 'max_depth': 5, 'subsample': 0.6057119029045536, 'colsample_bytree': 0.7554101727997871}. Best is trial 0 with value: 0.8436933223413026.
  'learning_rate': trial.suggest_uniform('learning_rate', 0.01, 0.3),
  'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),
[I 2025-04-21 23:11:27,237] Trial 1 finished with value: 0.885572959604287 and parameters: {'n_estimators': 414, 'learning_rate': 0.1960969187342621, 'max_depth': 6, 'subsample': 0.9431501804055393, 'colsample_bytree': 0.7278118627017656}. Best is trial 1 with value: 0.885572959604287.
  'learnin

Best Parameters: {'n_estimators': 459, 'learning_rate': 0.18553023284560893, 'max_depth': 7, 'subsample': 0.881496362923568, 'colsample_bytree': 0.6965242241037549}
Best Accuracy: 0.8902720527617477


<h4>Training with the best parameter</h4>

In [13]:
# Get the best parameters from the Optuna study
best_params = study.best_params

# Initialize the XGBoost model with the best parameters
model = xgb.XGBClassifier(
    n_estimators=best_params['n_estimators'],
    learning_rate=best_params['learning_rate'],
    max_depth=best_params['max_depth'],
    subsample=best_params['subsample'],
    colsample_bytree=best_params['colsample_bytree'],
    eval_metric='logloss',
    use_label_encoder=False,
    random_state=42
)

# Train the model on the full training data
model.fit(X_train, y_train)

# Predict on the test data
y_pred = model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy on test set: {accuracy:.4f}')

# Optionally, print the classification report and confusion matrix for detailed evaluation
from sklearn.metrics import classification_report, confusion_matrix

print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Accuracy on test set: 0.8881
Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.89      0.89      6065
           1       0.89      0.88      0.89      6065

    accuracy                           0.89     12130
   macro avg       0.89      0.89      0.89     12130
weighted avg       0.89      0.89      0.89     12130

Confusion Matrix:
[[5421  644]
 [ 713 5352]]


<h3>Using Random Forest</h3>

In [21]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# 1) Initialize and train
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# 2) Predict on test set
y_pred = clf.predict(X_test)

# 3) Evaluate
print(classification_report(y_test, y_pred, target_names=['Real (OR)','Fake (CG)']))

# Optional: view confusion matrix
cm = confusion_matrix(y_test, y_pred)
print("Confusion matrix:\n", cm)

              precision    recall  f1-score   support

   Real (OR)       0.85      0.86      0.86      6065
   Fake (CG)       0.86      0.85      0.86      6065

    accuracy                           0.86     12130
   macro avg       0.86      0.86      0.86     12130
weighted avg       0.86      0.86      0.86     12130

Confusion matrix:
 [[5209  856]
 [ 894 5171]]


<h3>Logistic Regression</h3>

In [7]:
# Cell: Logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# 1) Initialize and train
lr = LogisticRegression(max_iter=1000, random_state=42, class_weight='balanced')
lr.fit(X_train, y_train)

# 2) Predict on test set
y_pred_lr = lr.predict(X_test)

# 3) Evaluate
print("=== Logistic Regression ===")
print(classification_report(y_test, y_pred_lr, target_names=['Real (OR)','Fake (CG)']))
cm_lr = confusion_matrix(y_test, y_pred_lr)
print("Confusion matrix:\n", cm_lr)


=== Logistic Regression ===
              precision    recall  f1-score   support

   Real (OR)       0.86      0.86      0.86      6065
   Fake (CG)       0.86      0.85      0.86      6065

    accuracy                           0.86     12130
   macro avg       0.86      0.86      0.86     12130
weighted avg       0.86      0.86      0.86     12130

Confusion matrix:
 [[5195  870]
 [ 880 5185]]


<h3>SVM</h3>

In [None]:
# Cell: Support Vector Machine (linear kernel)
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix

# 1) Initialize and train
svc = SVC(kernel='linear', probability=True, random_state=42, class_weight='balanced')
svc.fit(X_train, y_train)

# 2) Predict on test set
y_pred_svc = svc.predict(X_test)

# 3) Evaluate
print("=== SVM (Linear Kernel) ===")
print(classification_report(y_test, y_pred_svc, target_names=['Real (OR)','Fake (CG)']))
cm_svc = confusion_matrix(y_test, y_pred_svc)
print("Confusion matrix:\n", cm_svc)


<h3>XGB Boost</h3>

In [10]:
from sklearn.model_selection import RandomizedSearchCV
import xgboost as xgb
from tqdm import tqdm

# Define parameter grid
# param_grid = {
#     'n_estimators': [100, 200, 300],
#     'learning_rate': [0.1, 0.2, 0.3],
#     'max_depth': [3, 6, 10],
#     'subsample': [0.6, 0.7, 0.8, 1.0],
#     'colsample_bytree': [0.7, 0.8, 1.0]
# }

# For RandomizedSearchCV,
param_dist = {
    'n_estimators': [100, 200, 300, 500],
    'learning_rate': [0.01, 0.1, 0.2, 0.3],
    'max_depth': [3, 6, 10, 15],
    'subsample': [0.6, 0.7, 0.8, 0.9],
    'colsample_bytree': [0.6, 0.7, 0.8, 0.9]
}

tqdm.pandas()

# GridSearch
# xgb_grid_search = GridSearchCV(xgb.XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss'),
                            #    param_grid, cv=2, scoring='accuracy', n_jobs=-1)
# xgb_grid_search.fit(X_train, y_train)


# RandomizedSearchCV
xgb_random_search = RandomizedSearchCV(xgb.XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss'),
                                       param_distributions=param_dist,
                                       n_iter=10,  # Number of random combinations to try
                                       cv=3,
                                       scoring='accuracy',
                                       n_jobs=-1)

xgb_random_search.fit(X_train, y_train)


print("Best Parameters:", xgb_random_search.best_params_)
print("Best Accuracy:", xgb_random_search.best_score_)


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Best Parameters: {'subsample': 0.8, 'n_estimators': 500, 'max_depth': 10, 'learning_rate': 0.1, 'colsample_bytree': 0.7}
Best Accuracy: 0.8632958801498128


In [10]:
# Cell: XGBoost Classifier
import xgboost as xgb
from sklearn.metrics import classification_report, confusion_matrix

# 1) Initialize and train
xgb_clf = xgb.XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42,
    scale_pos_weight=(y_train==0).sum()/(y_train==1).sum()
)
xgb_clf.fit(X_train, y_train)

# 2) Predict on test set
y_pred_xgb = xgb_clf.predict(X_test)

# 3) Evaluate
print("=== XGBoost ===")
print(classification_report(y_test, y_pred_xgb, target_names=['Real (OR)','Fake (CG)']))
cm_xgb = confusion_matrix(y_test, y_pred_xgb)
print("Confusion matrix:\n", cm_xgb)


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


=== XGBoost ===
              precision    recall  f1-score   support

   Real (OR)       0.86      0.87      0.87      6065
   Fake (CG)       0.87      0.86      0.86      6065

    accuracy                           0.86     12130
   macro avg       0.86      0.86      0.86     12130
weighted avg       0.86      0.86      0.86     12130

Confusion matrix:
 [[5294  771]
 [ 871 5194]]
