### This Python notebook is part of a postgraduate research project that uses machine learning algorithms and natural language processing techniques to detect fake reviews, helping to ensure the authenticity of user-generated content.

#### Note: 1. Trained and Tested on TensorFlow version: 2.12.0 and Keras version: 2.12.0 
#### Note: 2. The dataset is uploaded to the Git repository and can be downloaded from https://osf.io/3vds7

## Importing required libraries:

In [None]:
# Numerical and data manipulation libraries
import numpy as np
import pandas as pd

# Visualization library
import matplotlib.pyplot as plt

# NLTK libraries for text processing
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Sklearn libraries for text feature extraction and machine learning
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.model_selection import train_test_split

# Regular expression and string manipulation
import re
import string

# Initialize stop words.
# The NLTK stop-word corpus is a separate download; on a fresh environment
# the lookup raises LookupError, so fetch the corpus once and retry.
try:
    stop_words = stopwords.words('english')
except LookupError:
    nltk.download('stopwords')
    stop_words = stopwords.words('english')

### Loading dataset and preprocessing:

In [None]:
# Load the raw reviews; the notebook uses the "text_" and "label" columns.
df = pd.read_csv("reviews-dataset.csv")

# Lower-case the review text with the vectorised string accessor.
df["text_"] = df["text_"].str.lower()

# Work on an explicit copy so the label assignment below modifies `data`
# itself instead of a slice of `df` (avoids pandas' SettingWithCopyWarning).
data = df[["text_", "label"]].copy()

# Label encoding: 1 for "CG" reviews, 0 for every other label value.
data["label"] = (data["label"] == "CG").astype("int64")

### Train-Test Split:

In [None]:
# Hold out 33% of the reviews for testing; the fixed seed makes the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    data["text_"],
    data["label"],
    test_size=0.33,
    random_state=42,
)

In [None]:
# Display the row labels of the training split (shown as the cell output)
X_train.index

In [None]:
# Sanity check: texts and labels must have the same length in each split
for split_texts, split_labels in ((X_train, y_train), (X_test, y_test)):
    print(len(split_texts), len(split_labels))

### TF-IDF feature extraction:

In [None]:
# TF-IDF vectorizer; its vocabulary and IDF weights are learned from the
# training split only.
tf_idf = TfidfVectorizer()

# fit_transform() fits the vectorizer and returns the training matrix in one
# pass, so a second transform() over X_train is unnecessary.
X_train_tf = tf_idf.fit_transform(X_train)

In [None]:
# Report the dimensions of the training TF-IDF matrix
train_rows, train_cols = X_train_tf.shape
print("n_samples: %d, n_features: %d" % (train_rows, train_cols))

In [None]:
# Transform the test data into a TF-IDF matrix using the vectorizer that was
# fitted on the training split (transform only, no refit)
X_test_tf = tf_idf.transform(X_test)

In [None]:
# Report the dimensions of the test TF-IDF matrix
test_rows, test_cols = X_test_tf.shape
print("n_samples: %d, n_features: %d" % (test_rows, test_cols))

## Analysis using Naive Bayes (NB) Classifier:

In [None]:
# Naive Bayes Classifier: multinomial NB trained on the TF-IDF training matrix
naive_bayes_classifier = MultinomialNB()
naive_bayes_classifier.fit(X_train_tf, y_train)

In [None]:
# Predicted labels for the test split (1 = "CG", 0 = any other label,
# following the label encoding applied during preprocessing)
y_pred = naive_bayes_classifier.predict(X_test_tf)

In [None]:
# Classification report for the Naive Bayes predictions; rows are the
# encoded labels 0 and 1
print(metrics.classification_report(y_test, y_pred))

In [None]:
# Confusion matrix of true test labels vs. Naive Bayes predictions
print("Confusion Matrix (for NB):\n", metrics.confusion_matrix(y_test, y_pred))

## Analysis using Support Vector Machine (SVM):


In [None]:
from sklearn.svm import SVC 

# Support vector classifier with a linear kernel (unfitted at this point)
clf = SVC(kernel='linear') 

In [None]:
# Train the SVM on the same TF-IDF features and labels used for Naive Bayes
clf.fit(X_train_tf, y_train)

In [None]:
# SVM predictions for the test split; this overwrites the Naive Bayes y_pred
y_pred = clf.predict(X_test_tf)

In [None]:
# target_names are matched to the sorted label values, i.e. [0, 1].
# Preprocessing encodes "CG" reviews as 1 and everything else as 0, so class 0
# is the genuine ("correct") class and class 1 is the fake one. The previous
# names ('Positive', 'Negative') did not correspond to these classes.
print(metrics.classification_report(y_test, y_pred, target_names=['correct', 'fake']))

In [None]:
# Confusion matrix of true test labels vs. SVM predictions
print("Confusion Matrix (for SVM):\n", metrics.confusion_matrix(y_test, y_pred))

## Analysis using Long Short-Term Memory (LSTM):

In [None]:
from collections import Counter
from tensorflow.keras.preprocessing.text import Tokenizer

# Count tokens across the whole training split: all reviews are joined into
# one string and split on single spaces. The number of distinct tokens is
# later used as the Embedding vocabulary size.
counter = Counter(" ".join(X_train).split(" "))

In [None]:
# Number of distinct space-separated tokens in the training split
len(counter)

In [None]:
# Fit a Keras tokenizer on the training reviews only.
# NOTE(review): num_words=300 limits which words are kept when converting
# texts to sequences, while the Embedding layers are sized from len(counter).
# Confirm that 300 is the intended vocabulary cap and not a mix-up with the
# padding length (max_words is also 300).
myTokenizer = Tokenizer(num_words=300)
myTokenizer.fit_on_texts(X_train)
# Prints the full word -> index mapping learned from the training split
print(myTokenizer.word_index)

In [None]:
# Convert each review into a list of integer word indices using the
# tokenizer fitted on the training split
X_train_seq = myTokenizer.texts_to_sequences(X_train)
X_test_seq = myTokenizer.texts_to_sequences(X_test)

In [None]:
# Import pad_sequences from the Keras bundled with TensorFlow, matching the
# Tokenizer import used earlier, instead of the standalone
# `keras_preprocessing` package (an extra dependency).
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Set the maximum number of words per document (for both training and testing)
max_words = 300

# Pad/truncate the sequences in X_train and X_test to exactly max_words
# entries so that every sample has the same length
X_train_fin = pad_sequences(X_train_seq, maxlen=max_words)
X_test_fin = pad_sequences(X_test_seq, maxlen=max_words)

In [None]:
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout

# Embedding table size: number of distinct space-separated training tokens.
# NOTE(review): the tokenizer is created with num_words=300, so the sequences
# may only use a small part of this table — confirm the intended vocabulary.
vocabulary_size = len(counter.keys())

# Embedding -> LSTM(100) -> single sigmoid unit for binary classification
model = Sequential()
model.add(Embedding(vocabulary_size, 32, input_length=max_words))
model.add(LSTM(100))
model.add(Dense(1, activation='sigmoid'))

# summary() prints the table itself and returns None; wrapping it in print()
# only added a stray "None" line to the output.
model.summary()

In [None]:
# Binary cross-entropy matches the single sigmoid output; accuracy is tracked per epoch
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Train the LSTM for 25 epochs, evaluating on the test split after each one.
# validation_data is passed as the documented (x, y) tuple, and
# use_multiprocessing is dropped: it only applies to generator/Sequence
# inputs and has no effect on in-memory arrays.
# NOTE(review): the test split doubles as the validation set here.
history = model.fit(
    X_train_fin,
    y_train,
    validation_data=(X_test_fin, y_test),
    epochs=25,
)

In [None]:
# Save the per-epoch training history as CSV; the file is named "lstm"
# (no extension)
pd.DataFrame(history.history).to_csv("lstm")

In [None]:
# Show which metrics were recorded during training
print(history.history.keys())

Fig, ax = plt.subplots(1, 2, figsize=(10, 3))

# One panel per metric: (axes, training key, validation key, title)
panels = (
    (ax[0], 'accuracy', 'val_accuracy', 'model accuracy'),
    (ax[1], 'loss', 'val_loss', 'model loss'),
)
for panel, train_key, val_key, title in panels:
    # training curve first, validation curve second (matches legend order)
    panel.plot(history.history[train_key])
    panel.plot(history.history[val_key])
    panel.set_title(title)
    panel.set_ylabel(train_key)
    panel.set_xlabel('epoch')
    panel.legend(['train', 'test'], loc='upper left')

plt.tight_layout()
plt.savefig("lstm", dpi=400)

In [None]:
# Model outputs for the padded test sequences; the network ends in a single
# sigmoid unit, so each row holds one score
pred = model.predict(X_test_fin)

In [None]:
# Round each sigmoid score to a 0/1 label before scoring.
# target_names follow the sorted label order [0, 1]: preprocessing encodes
# "CG" reviews as 1, so class 0 is 'correct' and class 1 is 'fake'
# (the names were previously listed in the opposite order).
print(metrics.classification_report(y_test, [round(i[0]) for i in pred], target_names=['correct', 'fake']))

In [None]:
# Confusion matrix of true test labels vs. rounded LSTM scores.
# The colon now precedes the newline, consistent with the other
# confusion-matrix headers in this notebook.
print("Confusion Matrix (for LSTM):\n", metrics.confusion_matrix(y_test, [round(i[0]) for i in pred]))

In [None]:
# Compute accuracy directly from the predictions rather than from
# confusion-matrix numbers copied by hand, so the value always matches the
# current run. Uses the same rounding as the report and confusion matrix.
lstm_labels = [round(i[0]) for i in pred]
print("Accuracy: ", metrics.accuracy_score(y_test, lstm_labels))

## Analysis using Multilayer Perceptron (MLP):

In [None]:
from keras.models import Sequential
from keras.layers import Dense

# Keras MLP on the TF-IDF features: two ReLU hidden layers (12 and 8 units)
# followed by a single sigmoid output unit
model = Sequential([
    Dense(12, input_dim=X_train_tf.shape[1], activation='relu'),
    Dense(8, activation='relu'),
    Dense(1, activation='sigmoid'),
])
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model.summary()

In [None]:
# Train the MLP for 25 epochs on the densified TF-IDF matrices, evaluating on
# the test split after each epoch.
# validation_data is passed as the documented (x, y) tuple, and
# use_multiprocessing is dropped: it only applies to generator/Sequence
# inputs and has no effect on in-memory arrays.
history = model.fit(
    X_train_tf.toarray(),
    y_train,
    validation_data=(X_test_tf.toarray(), y_test),
    epochs=25,
)

In [None]:
# Save the per-epoch training history as CSV; the file is named "mlp"
# (no extension)
pd.DataFrame(history.history).to_csv("mlp")

In [None]:
# Show which metrics were recorded during training
print(history.history.keys())

Fig, ax = plt.subplots(1, 2, figsize=(10, 3))

# One panel per metric: (axes, training key, validation key, title)
panels = (
    (ax[0], 'accuracy', 'val_accuracy', 'model accuracy'),
    (ax[1], 'loss', 'val_loss', 'model loss'),
)
for panel, train_key, val_key, title in panels:
    # training curve first, validation curve second (matches legend order)
    panel.plot(history.history[train_key])
    panel.plot(history.history[val_key])
    panel.set_title(title)
    panel.set_ylabel(train_key)
    panel.set_xlabel('epoch')
    panel.legend(['train', 'test'], loc='upper left')

plt.tight_layout()
plt.savefig("mlp", dpi=400)

In [None]:
# Model outputs for the densified test TF-IDF matrix; the network ends in a
# single sigmoid unit, so each row holds one score
pred = model.predict(X_test_tf.toarray())

In [None]:
# Round each sigmoid score to a 0/1 label before scoring.
# target_names follow the sorted label order [0, 1]: preprocessing encodes
# "CG" reviews as 1, so class 0 is 'correct' and class 1 is 'fake'
# (the names were previously listed in the opposite order).
print(metrics.classification_report(y_test, [round(i[0]) for i in pred], target_names=['correct', 'fake']))

In [None]:
# Confusion matrix of true test labels vs. rounded MLP scores
print("Confusion Matrix (for MLP):\n", metrics.confusion_matrix(y_test, [round(i[0]) for i in pred]))

In [None]:
# Compute accuracy directly from the predictions rather than from
# confusion-matrix numbers copied by hand, so the value always matches the
# current run. Uses the same rounding as the report and confusion matrix.
mlp_labels = [round(i[0]) for i in pred]
print("Accuracy: ", metrics.accuracy_score(y_test, mlp_labels))

## Hybrid of LSTM & MLP:

In [None]:
# Importing the necessary packages

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.layers import Conv2D
from tensorflow.keras.layers import MaxPooling2D
from tensorflow.keras.layers import Activation
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Input
from tensorflow.keras.models import Model

In [None]:
# Defining MLP

def create_mlp(regress=False, input_dim=None):
    """Build the MLP branch of the hybrid model.

    The network has two ReLU hidden layers (12 and 8 units). No
    classification layer is added here, so the 8-unit hidden output can be
    concatenated with another branch.

    Args:
        regress: When true, append a single linear output unit.
        input_dim: Number of input features. Defaults to the number of
            columns of the global TF-IDF training matrix ``X_train_tf``,
            which was previously the only option.

    Returns:
        An uncompiled ``Sequential`` model.
    """
    if input_dim is None:
        # Backward-compatible default: the width of the TF-IDF matrix
        input_dim = X_train_tf.shape[1]

    # define our MLP network
    model = Sequential()
    model.add(Dense(12, input_dim=input_dim, activation='relu'))
    model.add(Dense(8, activation='relu'))
    # check to see if the regression node should be added
    if regress:
        model.add(Dense(1, activation="linear"))
    # return our model
    return model

In [None]:
# Instantiate the MLP branch with the default regress=False, so it ends at
# the 8-unit hidden layer
mlp=create_mlp()
mlp.summary()

In [None]:
# Importing the necessary packages

from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout

# Embedding table size: number of distinct tokens counted in the training split
vocabulary_size = len(counter)

In [None]:
# Defining LSTM

def lstm():
    """Build the LSTM branch of the hybrid model.

    An Embedding layer (``vocabulary_size`` entries, 32 dimensions, inputs of
    length ``max_words``) feeds a single LSTM layer with 100 units. No
    classification layer is added, so the model's output is the LSTM output.

    Returns:
        An uncompiled ``Sequential`` model.
    """
    branch_layers = [
        Embedding(vocabulary_size, 32, input_length=max_words),
        LSTM(100),
    ]
    return Sequential(branch_layers)

In [None]:
# Instantiate the LSTM branch.
# NOTE(review): this rebinds the name `lstm` from the builder function to the
# model it returns, so running this cell a second time would call the model
# object instead of the builder — re-run the definition cell first.
lstm =lstm()
lstm.summary()

In [None]:
from tensorflow.keras.layers import concatenate

# Concatenating LSTM & MLP: join the output tensors of both branches and put
# a single sigmoid unit on top as the binary classification head.
# (An alternative head, Dense(4, relu) followed by Dense(1, linear), was
# previously kept here as disabled code.)
merged_features = concatenate([mlp.output, lstm.output])
hybrid_output = Dense(1, activation="sigmoid")(merged_features)

# The combined model takes two inputs, in this order: the MLP input, then the
# LSTM input
model = Model(inputs=[mlp.input, lstm.input], outputs=hybrid_output)
model.summary()

In [None]:
# Binary cross-entropy matches the single sigmoid output; accuracy is tracked per epoch
model.compile(loss="binary_crossentropy", optimizer="adam",metrics=['accuracy'])

In [None]:
# Train the hybrid model for 25 epochs. Inputs are given in the order the
# model was built with: TF-IDF features for the MLP branch, then the padded
# sequences for the LSTM branch.
# validation_data is passed as the documented (x, y) tuple, and
# use_multiprocessing is dropped: it only applies to generator/Sequence
# inputs and has no effect on in-memory arrays.
history = model.fit(
    x=[X_train_tf.toarray(), X_train_fin],
    y=y_train,
    validation_data=([X_test_tf.toarray(), X_test_fin], y_test),
    epochs=25,
)

In [None]:
# Save the per-epoch training history as CSV; the file is named "mlp&lstm"
# (no extension)
pd.DataFrame(history.history).to_csv("mlp&lstm")

In [None]:
# Show which metrics were recorded during training
print(history.history.keys())

Fig, ax = plt.subplots(1, 2, figsize=(10, 3))

# One panel per metric: (axes, training key, validation key, title)
panels = (
    (ax[0], 'accuracy', 'val_accuracy', 'model accuracy'),
    (ax[1], 'loss', 'val_loss', 'model loss'),
)
for panel, train_key, val_key, title in panels:
    # training curve first, validation curve second (matches legend order)
    panel.plot(history.history[train_key])
    panel.plot(history.history[val_key])
    panel.set_title(title)
    panel.set_ylabel(train_key)
    panel.set_xlabel('epoch')
    panel.legend(['train', 'test'], loc='upper left')

plt.tight_layout()
plt.savefig("proposed", dpi=400)

In [None]:
# Scores from the hybrid model for the test set; inputs are in the same order
# as in training (TF-IDF features, then padded sequences)
pred = model.predict(x=[ X_test_tf.toarray(),X_test_fin])

In [None]:
# Round each sigmoid score to a 0/1 label, then report per-class metrics
hybrid_labels = [round(score[0]) for score in pred]
print(metrics.classification_report(y_test, hybrid_labels))