# Loading and Pre-processing of Dataset

In [1]:
# Mount Google Drive at /content/drive so files stored there can be read (Colab only)
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [2]:
# Reading input file

import pandas as pd

# Labelled review CSV stored on the mounted Google Drive
reviews_csv = "/content/drive/MyDrive/Colab Notebooks/fake reviews dataset.csv"
df = pd.read_csv(reviews_csv)

In [3]:
# Initial look at Dataset: preview the first rows to check the columns and label values

df.head()

Unnamed: 0,category,rating,label,text_
0,Home_and_Kitchen_5,5.0,CG,"Love this! Well made, sturdy, and very comfor..."
1,Home_and_Kitchen_5,5.0,CG,"love it, a great upgrade from the original. I..."
2,Home_and_Kitchen_5,5.0,CG,This pillow saved my back. I love the look and...
3,Home_and_Kitchen_5,1.0,CG,"Missing information on how to use it, but it i..."
4,Home_and_Kitchen_5,5.0,CG,Very nice set. Good quality. We have had the s...


In [4]:
# Text Pre-processing

import string

import nltk
from nltk.corpus import stopwords

# Fetch the stopword corpus that text_process reads
nltk.download('stopwords')

# Function to remove punctuation, lowercase the words and drop English stopwords
def text_process(mess, stop_words=None):
  """Tokenise a review into lowercase words without punctuation or stopwords.

  Args:
    mess: Review text (a string).
    stop_words: Optional iterable of lowercase words to discard. When omitted,
      NLTK's English stopword list is used (the original behaviour).

  Returns:
    List of lowercased words, in their original order, with every character
    from string.punctuation removed and stopwords filtered out.
  """
  if stop_words is None:
    # stopwords.words() re-reads the corpus and returns a list; calling it once
    # per word made this function extremely slow. Load it once, keep it as a
    # frozenset for O(1) membership tests, and memoise it on the function.
    cached = getattr(text_process, '_english_stopwords', None)
    if cached is None:
      cached = frozenset(stopwords.words('english'))
      text_process._english_stopwords = cached
    stop_words = cached
  else:
    stop_words = frozenset(stop_words)

  # Punctuation is deleted (not replaced by a space), so "don't" becomes "dont".
  nopunc = ''.join(char for char in mess if char not in string.punctuation)
  lowered = (word.lower() for word in nopunc.split())
  return [word for word in lowered if word not in stop_words]

# Helper that turns a list of tokens back into a single string
def listToString(s):
  """Join the items of `s` into one string separated by single spaces."""
  return " ".join(s)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [5]:
# Applying text pre-processing functions (takes 5 mins)

# Tokenise/clean each review and re-join the surviving words in a single pass
df['text_'] = df['text_'].apply(lambda review: listToString(text_process(review)))

In [6]:
# Splitting the dataset into Train and Test

from sklearn.model_selection import train_test_split

# 75% of the reviews train the models, 25% are held out; the seed keeps the split stable
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(
    df['text_'],
    y,
    test_size=0.25,
    random_state=42,
)

In [7]:
# Word vector using CountVectorizer

# from sklearn.feature_extraction.text import CountVectorizer

# vector = CountVectorizer(stop_words = 'english')
# train = vector.fit_transform(X_train.values)
# test = vector.transform(X_test.values)

In [8]:
# Word vector using TF-IDF Vectorizer

from sklearn.feature_extraction.text import TfidfVectorizer

# max_df=0.7 ignores terms that occur in more than 70% of the training documents
vector = TfidfVectorizer(max_df=0.7, stop_words='english')

# Fit on the training split only; the test split reuses the learned vocabulary
train = vector.fit_transform(X_train)
test = vector.transform(X_test)

In [9]:
# Creating train dataframe

# .toarray() replaces the `.A` shortcut, which is deprecated and no longer
# available in newer SciPy releases. Note that this materialises the whole
# TF-IDF matrix densely (rows x vocabulary size), so it is memory-hungry.
train_df = pd.DataFrame(train.toarray(), columns = vector.get_feature_names_out())

# Trying Various Classification Models

In [10]:
# Logistic Regression Classifier

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# fit() returns the estimator itself, so construction and training can be chained
lr = LogisticRegression(max_iter=1000).fit(train, y_train)

# Evaluate on the held-out split
pred_lr = lr.predict(test)
score_lr = accuracy_score(y_test, pred_lr)
print("Accuracy score for Logistic Regression Classifier is: ", score_lr)

Accuracy score for Logistic Regression Classifier is:  0.8626830233478433


In [11]:
# Multinomial Naive Bayes Classifier

from sklearn.naive_bayes import MultinomialNB

# fit() returns the estimator itself, so construction and training can be chained
mnb = MultinomialNB().fit(train, y_train)

# Evaluate on the held-out split
pred_mnb = mnb.predict(test)
score_mnb = accuracy_score(y_test, pred_mnb)
print("Accuracy score for Multinomial Naive Bayes Classifier is: ", score_mnb)

Accuracy score for Multinomial Naive Bayes Classifier is:  0.8474475662841314


In [12]:
# Random Forest Classifier

from sklearn.ensemble import RandomForestClassifier

# Seeded so the forest (and therefore the reported accuracy) is reproducible
# across runs, consistent with the random_state=42 used for the data split.
rf = RandomForestClassifier(random_state=42)
rf.fit(train, y_train)
pred_rf = rf.predict(test)
score_rf = accuracy_score(y_test, pred_rf)
print("Accuracy score for Random Forest Classifier is: ", score_rf)

Accuracy score for Random Forest Classifier is:  0.8419074000791452


In [13]:
# AdaBoost Classifier

from sklearn.ensemble import AdaBoostClassifier

# Seeded so the boosted ensemble (and the reported accuracy) is reproducible
# across runs, consistent with the random_state=42 used for the data split.
ab = AdaBoostClassifier(random_state=42)
ab.fit(train, y_train)
pred_ab = ab.predict(test)
score_ab = accuracy_score(y_test, pred_ab)
print("Accuracy score for AdaBoost Classifier is: ", score_ab)

Accuracy score for AdaBoost Classifier is:  0.7526711515631184


In [14]:
# GradientBoost Classifier

from sklearn.ensemble import GradientBoostingClassifier

# NOTE: despite the `xg` name this is scikit-learn's GradientBoostingClassifier,
# not XGBoost. Seeded for run-to-run reproducibility, consistent with the
# random_state=42 used for the data split.
xg = GradientBoostingClassifier(random_state=42)
xg.fit(train, y_train)
pred_xg = xg.predict(test)
score_xg = accuracy_score(y_test, pred_xg)
print("Accuracy score for GradientBoost Classifier is: ", score_xg)

Accuracy score for GradientBoost Classifier is:  0.7763157894736842


In [15]:
# RBF-kernel Support Vector Machine Classifier (takes 10 mins)

from sklearn.svm import SVC

# SVC() defaults to the RBF kernel, i.e. this is NOT a linear SVM; the kernel is
# spelled out here and the printed label corrected so the results are not
# misattributed. (A truly linear model would need kernel='linear' or LinearSVC.)
svc = SVC(kernel='rbf')
svc.fit(train, y_train)
pred_svc = svc.predict(test)
score_svc = accuracy_score(y_test, pred_svc)
print("Accuracy score for RBF-kernel SVC is: ", score_svc)

Accuracy score for Linear SVC is:  0.87801741195093


In [16]:
# Non-Linear Support Vector Machine Classifier (takes 10 mins)

from sklearn.svm import NuSVC

# Nu-parameterised SVM with default settings; fit() returns the estimator itself
nusvc = NuSVC().fit(train, y_train)

# Evaluate on the held-out split
pred_nusvc = nusvc.predict(test)
score_nusvc = accuracy_score(y_test, pred_nusvc)
print("Accuracy score for Non-Linear SVC is: ", score_nusvc)

Accuracy score for Non-Linear SVC is:  0.8656509695290858


In [17]:
# Multi-Layer Perceptron Classifier (may take an hour)

from sklearn.neural_network import MLPClassifier

# Seeded so weight initialisation and batch shuffling (and hence the reported
# accuracy) are reproducible, consistent with the random_state=42 data split.
mlp = MLPClassifier(random_state=42)
mlp.fit(train, y_train)
pred_mlp = mlp.predict(test)
score_mlp = accuracy_score(y_test, pred_mlp)
print("Accuracy score for Multi-Layer Perceptron Classifier is: ", score_mlp)



Accuracy score for Multi-Layer Perceptron Classifier is:  0.821626434507321


In [19]:
# Stacking Random Forest and Logistic Regression Classifiers (takes 14 mins)

from sklearn.ensemble import StackingClassifier
from sklearn.pipeline import make_pipeline

# A seeded random forest is the only base learner; logistic regression is the
# final estimator that combines its outputs.
estimators = [('rf', RandomForestClassifier(random_state=42))]
stack = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(),
).fit(train, y_train)

# Evaluate on the held-out split
pred_stack = stack.predict(test)
score_stack = accuracy_score(y_test, pred_stack)
print("Accuracy score for stacking classifer is: ", score_stack)

Accuracy score for stacking classifer is:  0.8501187178472497


# Performance Metrics for Best Performing Model

In [22]:
from sklearn.metrics import confusion_matrix

# Fix the row/column order so index 0 is always 'CG' and index 1 is always 'OR'
class_order = ['CG', 'OR']

cm_lr = confusion_matrix(y_test, pred_lr, labels=class_order)
cm_ab = confusion_matrix(y_test, pred_ab, labels=class_order)
cm_rf = confusion_matrix(y_test, pred_rf, labels=class_order)
cm_mnb = confusion_matrix(y_test, pred_mnb, labels=class_order)
cm_svc = confusion_matrix(y_test, pred_svc, labels=class_order)

In [23]:
# Metrics for the SVC model, treating 'CG' (index 0 of the confusion matrix) as
# the positive class: rows are true labels, columns are predicted labels.
true_cg_pred_cg = cm_svc[0][0]
true_cg_pred_or = cm_svc[0][1]
true_or_pred_cg = cm_svc[1][0]

precision_svc = true_cg_pred_cg/(true_cg_pred_cg+true_or_pred_cg)
recall_svc = true_cg_pred_cg/(true_cg_pred_cg+true_cg_pred_or)

# The heading previously said "Logistic Regression" although every number
# printed below comes from the SVC predictions (score_svc / cm_svc).
print("For Support Vector Classifier:")
print("Accuracy: ", score_svc)
print("Precision: ", precision_svc)
print("Recall: ", recall_svc)
print("F1 score: ", 2*precision_svc*recall_svc/(precision_svc+recall_svc))

For Logistic Regression:
Accuracy:  0.87801741195093
Precision:  0.8992654774396642
Recall:  0.8505359269551409
F1 score:  0.8742221768846272


# Applying Trained Model on Main Dataset

In [24]:
# Downloading Datasets library

!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.12.0-py3-none-any.whl (474 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.6/474.6 kB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess
  Downloading multiprocess-0.70.14-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.3/134.3 kB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash
  Downloading xxhash-3.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.5/212.5 kB[0m [31m21.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0.0,>=0.11.0
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m20.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting aiohttp
  Downloadi

In [25]:
# Import Main dataset
# Loads the train split of the "Mobile_Electronics_v1_00" configuration of
# "amazon_us_reviews" through the `datasets` library (downloads on first use).

from datasets import load_dataset
dataset = load_dataset("amazon_us_reviews", "Mobile_Electronics_v1_00", split="train")

Downloading builder script:   0%|          | 0.00/7.45k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/195k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/60.0k [00:00<?, ?B/s]

Downloading and preparing dataset amazon_us_reviews/Mobile_Electronics_v1_00 to /root/.cache/huggingface/datasets/amazon_us_reviews/Mobile_Electronics_v1_00/0.1.0/17b2481be59723469538adeb8fd0a68b0ba363bbbdd71090e72c325ee6c7e563...


Downloading data:   0%|          | 0.00/22.9M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/104975 [00:00<?, ? examples/s]

Dataset amazon_us_reviews downloaded and prepared to /root/.cache/huggingface/datasets/amazon_us_reviews/Mobile_Electronics_v1_00/0.1.0/17b2481be59723469538adeb8fd0a68b0ba363bbbdd71090e72c325ee6c7e563. Subsequent calls will reuse this data.


In [26]:
# Pre-processing of main dataset (takes 13 mins)

# NOTE: `df` is rebound here from the labelled training data to the main dataset
df = pd.DataFrame(dataset)

# Clean each review body and re-join the surviving words in a single pass
df['review_body'] = df['review_body'].apply(lambda body: listToString(text_process(body)))

In [27]:
# Vectorization and Prediction on main dataset (SVC prediction is slow on ~100k reviews)

# Reuse the TF-IDF vocabulary learned from the training split
vectorized = vector.transform(df['review_body'])

# Predict on the sparse matrix directly. The model was fitted on sparse input
# without feature names, so building a dense DataFrame first (via the deprecated
# `.A` attribute) only cost memory and time and introduced a feature-name
# mismatch with the fitted model.
pred_main = svc.predict(vectorized)



In [28]:
# Counting number of Fake reviews

total = pred_main.shape[0]
# 'CG' is the label the notebook counts as fake
count = sum(1 for label in pred_main if label == "CG")
print("Number of fake reviews: ", count)
print("Ratio of fake reviews: ", count/total)

Number of fake reviews:  11214
Ratio of fake reviews:  0.10682543462729222
