In [2]:
import re
import string
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import kagglehub
from kagglehub import KaggleDatasetAdapter
import nltk
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
from collections import Counter

  from .autonotebook import tqdm as notebook_tqdm


<h3>Load the dataset</h3>

In [3]:

df = pd.read_csv('fake_reviews_dataset.csv')

df

print(df.columns)

Index(['category', 'rating', 'label', 'text_'], dtype='object')


<h3>Pre Processing - Cleanup the dataset</h3>

In [4]:


# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')

# Initialize tools
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def clean_text(text):
    
    # Convert to lowercase
    text = text.lower()

    # Remove URLs
    text = re.sub(r'http\S+|www\S+', '', text)

    # Remove special characters, digits, and punctuation EXCEPT exclamation marks
    text = re.sub(r'[^a-z\s!]', '', text)  # Keep '!' for exclamation mark count

    # Tokenize the text into words
    words = word_tokenize(text)

    # Remove stopwords
    words = [word for word in words if word not in stop_words]

    # Apply stemming (optional, depends on your model needs)
    words = [stemmer.stem(word) for word in words]

    # Join words back into a string
    cleaned_text = " ".join(words)

    return cleaned_text

# Example usage: Clean the 'text' column in the DataFrame
# Apply the cleaning function to the 'text_' column
df['cleaned_review'] = df['text_'].apply(clean_text)

# Display the cleaned reviews alongside the original ones
print(df[['text_', 'cleaned_review']].head())



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\24bry\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\24bry\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


                                               text_  \
0  Love this!  Well made, sturdy, and very comfor...   
1  love it, a great upgrade from the original.  I...   
2  This pillow saved my back. I love the look and...   
3  Missing information on how to use it, but it i...   
4  Very nice set. Good quality. We have had the s...   

                                  cleaned_review  
0  love ! well made sturdi comfort love ! pretti  
1   love great upgrad origin ive mine coupl year  
2         pillow save back love look feel pillow  
3          miss inform use great product price !  
4            nice set good qualiti set two month  


<h3>Feature Engineering</h3>
Contains:
1. Review Length
2. Sentiment Polarity
3. Sentiment Subjectivity
4. ! Count
5. ? Count (maybe remove)
6. All-Caps word count
7. Superlative word count
8. Unique-word ratio
9. Rating score
10. Category one-hot encoding

In [6]:
import pandas as pd
from textblob import TextBlob

# assume df is already loaded and has columns: 
#   'text_' (raw review), 'cleaned_review' (preprocessed text), 
#   'rating' (numeric), and 'category' (string)

# 1. Review length (number of words in the cleaned text)
df['review_length'] = df['cleaned_review'].apply(lambda x: len(x.split()))

# 2. Sentiment polarity (range: -1 to +1)
df['sentiment_polarity'] = df['cleaned_review'].apply(
    lambda x: TextBlob(x).sentiment.polarity
)

# 3. Sentiment subjectivity (range: 0 to 1)
df['sentiment_subjectivity'] = df['cleaned_review'].apply(
    lambda x: TextBlob(x).sentiment.subjectivity
)

# 4. Exclamation mark count (from raw text)
df['exclamation_count'] = df['text_'].apply(lambda x: x.count('!'))

# 5. Question mark count (from raw text)
df['question_count'] = df['text_'].apply(lambda x: x.count('?'))

# 6. All‑CAPS word count (indicates “shouting” or emphasis)
df['allcaps_count'] = df['text_'].apply(
    lambda txt: sum(1 for w in txt.split() if w.isupper() and len(w) > 1)
)

# 7. Superlative words count
superlatives = ['best', 'worst', 'perfect', 'amazing', 
                'excellent', 'fantastic', 'unbelievable']
df['superlative_count'] = df['cleaned_review'].apply(
    lambda x: sum(1 for w in x.split() if w in superlatives)
)

# 8. Unique‑word ratio (lexical diversity)
df['unique_word_ratio'] = df['cleaned_review'].apply(
    lambda txt: len(set(txt.split())) / max(len(txt.split()), 1)
)

# 9. Rating score (use the star rating directly)
df['rating_score'] = df['rating'].astype(float)

# 10. Category one‑hot encoding
df = pd.get_dummies(df, columns=['category'], prefix='cat')

# At this point, df contains all engineered features alongside:
#  - 'cleaned_review' (for TF‑IDF vectorization)
#  - 'label'         (your target: real vs. fake)



In [9]:
print(df[['review_length',
          'sentiment_polarity',
          'sentiment_subjectivity',
          'exclamation_count',
          'question_count',
          'allcaps_count',
          'superlative_count',
          'unique_word_ratio',
          'rating_score']].head())

   review_length  sentiment_polarity  sentiment_subjectivity  \
0              9               0.625                   0.600   
1              8               0.650                   0.675   
2              7               0.250                   0.300   
3              7               1.000                   0.750   
4              7               0.650                   0.800   

   exclamation_count  question_count  allcaps_count  superlative_count  \
0                  2               0              0                  0   
1                  0               0              0                  0   
2                  0               0              0                  0   
3                  1               0              0                  0   
4                  0               0              0                  0   

   unique_word_ratio  rating_score  
0           0.777778           5.0  
1           1.000000           5.0  
2           0.857143           5.0  
3           1.000000  

<h3>TF-IDF Vectorization of Cleand Review</h3>

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
from sklearn.model_selection import train_test_split

# 1) Vectorize text
tfidf = TfidfVectorizer(max_features=1000, ngram_range=(1,2), stop_words='english')
X_text = tfidf.fit_transform(df['cleaned_review'])

# 2) Stack engineered features
numeric_cols = [
    'review_length','sentiment_polarity','sentiment_subjectivity',
    'exclamation_count','allcaps_count','superlative_count',
    'unique_word_ratio','rating_score'
]
X_num = df[numeric_cols].values
X = hstack([X_text, X_num])
y = df['label'].values

# Map the string labels to numeric
df['y'] = df['label'].map({'OR': 0, 'CG': 1})

# Stack your features into X as before
X = hstack([X_text, X_num])

#Pull the numeric target
y = df['y'].values

#Split into train/test
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.30,
    random_state=42,
    stratify=y
)


<h3>Using Random Forest</h3>

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# 1) Initialize and train
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# 2) Predict on test set
y_pred = clf.predict(X_test)

# 3) Evaluate
print(classification_report(y_test, y_pred, target_names=['Real (OR)','Fake (CG)']))

# Optional: view confusion matrix
cm = confusion_matrix(y_test, y_pred)
print("Confusion matrix:\n", cm)

              precision    recall  f1-score   support

        real       0.86      0.85      0.85      6065
        fake       0.85      0.86      0.86      6065

    accuracy                           0.85     12130
   macro avg       0.85      0.85      0.85     12130
weighted avg       0.85      0.85      0.85     12130

Confusion matrix:
 [[5170  895]
 [ 864 5201]]


<h3>Logistic Regression</h3>

In [None]:
# Cell: Logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# 1) Initialize and train
lr = LogisticRegression(max_iter=1000, random_state=42, class_weight='balanced')
lr.fit(X_train, y_train)

# 2) Predict on test set
y_pred_lr = lr.predict(X_test)

# 3) Evaluate
print("=== Logistic Regression ===")
print(classification_report(y_test, y_pred_lr, target_names=['Real (OR)','Fake (CG)']))
cm_lr = confusion_matrix(y_test, y_pred_lr)
print("Confusion matrix:\n", cm_lr)


<h3>SVM</h3>

In [None]:
# Cell: Support Vector Machine (linear kernel)
from sklearn.svm import SVC

# 1) Initialize and train
svc = SVC(kernel='linear', probability=True, random_state=42, class_weight='balanced')
svc.fit(X_train, y_train)

# 2) Predict on test set
y_pred_svc = svc.predict(X_test)

# 3) Evaluate
print("=== SVM (Linear Kernel) ===")
print(classification_report(y_test, y_pred_svc, target_names=['Real (OR)','Fake (CG)']))
cm_svc = confusion_matrix(y_test, y_pred_svc)
print("Confusion matrix:\n", cm_svc)


<h3>XGB Boost</h3>

In [None]:
# Cell: XGBoost Classifier
import xgboost as xgb

# 1) Initialize and train
xgb_clf = xgb.XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42,
    scale_pos_weight=(y_train==0).sum()/(y_train==1).sum()
)
xgb_clf.fit(X_train, y_train)

# 2) Predict on test set
y_pred_xgb = xgb_clf.predict(X_test)

# 3) Evaluate
print("=== XGBoost ===")
print(classification_report(y_test, y_pred_xgb, target_names=['Real (OR)','Fake (CG)']))
cm_xgb = confusion_matrix(y_test, y_pred_xgb)
print("Confusion matrix:\n", cm_xgb)
