In [1]:
import numpy as np

In [2]:
from nltk import word_tokenize

In [3]:
from transformers import TrainingArguments
import torch


In [4]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [5]:
# Specify the new directory for saving results
# NOTE(review): these are absolute, machine-specific paths, and neither
# variable is read by any other cell visible in this notebook.
new_output_dir = 'C:/Users/91894/OneDrive/Desktop/fake review/Results'
new_logging_dir = 'C:/Users/91894/OneDrive/Desktop/fake review/logs'


In [6]:

# Hugging Face training configuration. Checkpoints are written to ./Results
# and logs to ./logs, both relative to the current working directory.
training_args = TrainingArguments(
    # Output locations
    output_dir="./Results",
    logging_dir="./logs",
    # Schedule and (small) per-device batch sizes
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    # Log every 1000 steps; checkpoint every 2000 steps, keeping only one
    logging_steps=1000,
    save_steps=2000,
    save_total_limit=1,
    # Mixed precision only when a CUDA GPU is present
    fp16=torch.cuda.is_available(),
    # Do not report to external experiment trackers (e.g. WandB)
    report_to="none",
)

In [7]:
# NOTE(review): repeats the two assignments from the earlier
# "Specify the new directory for saving results" cell with identical values;
# neither variable is read by any other cell visible in this notebook.
new_output_dir = 'C:/Users/91894/OneDrive/Desktop/fake review/Results'
new_logging_dir = 'C:/Users/91894/OneDrive/Desktop/fake review/logs'

In [8]:
import os

# Set a new working directory for the notebook
# NOTE(review): absolute, machine-specific path. Relative paths used after
# this call (e.g. './results', './logs') resolve against this directory.
os.chdir('C:/Users/91894/OneDrive/Desktop/fake review')


### Classification using deep learning

In [9]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, LSTM, Bidirectional, SimpleRNN
from tensorflow.keras.layers import GlobalAveragePooling1D, SpatialDropout1D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import Precision, Recall
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import classification_report

#### Load the data, then encode the labels as numeric values using LabelEncoder


In [10]:
import gc

# Read only the first 10,000 rows of the review dataset
data = pd.read_csv("C:/Users/91894/OneDrive/Desktop/fake review/TP_DS.csv", nrows=10000)

# Add a `label_encoded` column holding the integer code of each `label` value
data = data.assign(label_encoded=LabelEncoder().fit_transform(data['label']))

# Trigger a garbage-collection pass; as the last expression in the cell, its
# return value is what the notebook displays
gc.collect()


20

#### Split the data into training and test sets, tokenize the text, pad the sequences, then build and train three neural network models (RNN, LSTM, and BiLSTM) for binary classification

In [11]:
# Features are the cleaned review text, cast to str so float/NaN entries do
# not break tokenization; the target is the integer-encoded label.
X = data['cleaned_text'].astype(str)
y = data['label_encoded']

# Hold out 20% of the rows for testing (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Tokenization and padding settings
vocab_size = 10000     # number of words the tokenizer keeps
max_length = 100       # every sequence is padded / truncated to this length
embedding_dim = 100    # size of the embedding vectors used by the models

# Fit the vocabulary on the training texts only; out-of-vocabulary words
# are mapped to the "<OOV>" token
tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

# Convert the texts to integer sequences
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Pad and truncate at the end of each sequence, identically for both splits
_pad_options = dict(maxlen=max_length, padding='post', truncating='post')
X_train_pad = pad_sequences(X_train_seq, **_pad_options)
X_test_pad = pad_sequences(X_test_seq, **_pad_options)

#model building
# Define a function to build models
def build_model(model_type="RNN"):
    """Build and compile a binary text classifier with the chosen recurrent core.

    Architecture: Embedding -> SpatialDropout1D(0.2) -> recurrent layer (64 units)
    -> Dense(32, relu) -> Dense(1, sigmoid).

    Args:
        model_type: One of "RNN" (SimpleRNN), "LSTM", or "BiLSTM"
            (bidirectional LSTM).

    Returns:
        A compiled Sequential model using Adam (learning rate 0.001), binary
        cross-entropy loss, and accuracy / precision / recall metrics.

    Raises:
        ValueError: If ``model_type`` is not one of the supported names.
    """
    supported_types = ("RNN", "LSTM", "BiLSTM")
    # Fail fast: previously an unknown name silently produced a network with
    # no recurrent layer at all between the embedding and the dense head.
    if model_type not in supported_types:
        raise ValueError(
            f"Unsupported model_type {model_type!r}; expected one of {supported_types}"
        )

    model = Sequential()
    model.add(Embedding(vocab_size, embedding_dim, input_length=max_length))
    model.add(SpatialDropout1D(0.2))

    # return_sequences=False: only the final state feeds the dense head
    if model_type == "RNN":
        model.add(SimpleRNN(64, return_sequences=False))
    elif model_type == "LSTM":
        model.add(LSTM(64, return_sequences=False))
    else:  # "BiLSTM"
        model.add(Bidirectional(LSTM(64, return_sequences=False)))

    model.add(Dense(32, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))  # Binary classification

    model.compile(optimizer=Adam(learning_rate=0.001),
                  loss='binary_crossentropy',
                  metrics=['accuracy', Precision(), Recall()])
    return model

# Training  of model
def train_and_evaluate(model_type, epochs=10, batch_size=64):
    """Train one model variant and print its test-set classification report.

    Reads the notebook-level arrays ``X_train_pad``, ``y_train``,
    ``X_test_pad`` and ``y_test``.

    Args:
        model_type: Architecture name passed to ``build_model``.
        epochs: Maximum number of training epochs (early stopping may end
            training sooner).
        batch_size: Mini-batch size used by ``model.fit``.

    Returns:
        A ``(model, history)`` tuple: the trained model and the History object
        returned by ``model.fit``, so callers can inspect or plot them instead
        of losing them when the function returns.
    """
    model = build_model(model_type)

    # Stop once validation loss has not improved for 3 epochs and roll back
    # to the best weights seen so far
    early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

    history = model.fit(X_train_pad, y_train,
                        epochs=epochs,
                        batch_size=batch_size,
                        validation_split=0.2,
                        callbacks=[early_stopping])

    # Threshold the sigmoid outputs at 0.5 and flatten to a 1-D label vector
    probabilities = model.predict(X_test_pad)
    y_pred = (probabilities > 0.5).astype("int32").ravel()

    print(f"Classification Report for {model_type}:")
    # assumes encoded label 0 is 'CG' and 1 is 'OR' — TODO confirm against the encoder
    print(classification_report(y_test, y_pred, target_names=['CG', 'OR']))

    return model, history
    
# Train and report on each recurrent architecture in turn
for model_type in ("RNN", "LSTM", "BiLSTM"):
    train_and_evaluate(model_type)

Epoch 1/10




[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 30ms/step - accuracy: 0.5758 - loss: 0.6692 - precision: 0.5888 - recall: 0.4638 - val_accuracy: 0.7256 - val_loss: 0.5602 - val_precision: 0.6722 - val_recall: 0.8966
Epoch 2/10
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 24ms/step - accuracy: 0.7993 - loss: 0.4690 - precision: 0.7719 - recall: 0.8394 - val_accuracy: 0.8175 - val_loss: 0.4274 - val_precision: 0.7928 - val_recall: 0.8670
Epoch 3/10
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 26ms/step - accuracy: 0.8888 - loss: 0.2998 - precision: 0.8695 - recall: 0.9130 - val_accuracy: 0.8150 - val_loss: 0.4378 - val_precision: 0.8404 - val_recall: 0.7845
Epoch 4/10
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 25ms/step - accuracy: 0.8894 - loss: 0.3015 - precision: 0.8933 - recall: 0.8791 - val_accuracy: 0.7775 - val_loss: 0.5161 - val_precision: 0.9254 - val_recall: 0.6108
Epoch 5/10
[1m100/100[0m [32



[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 43ms/step - accuracy: 0.5116 - loss: 0.6932 - precision_1: 0.5220 - recall_1: 0.4293 - val_accuracy: 0.5369 - val_loss: 0.6703 - val_precision_1: 0.8447 - val_recall_1: 0.1071
Epoch 2/10
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 43ms/step - accuracy: 0.5351 - loss: 0.7126 - precision_1: 0.5840 - recall_1: 0.3231 - val_accuracy: 0.5225 - val_loss: 0.6771 - val_precision_1: 0.9444 - val_recall_1: 0.0628
Epoch 3/10
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 45ms/step - accuracy: 0.5390 - loss: 0.6704 - precision_1: 0.7997 - recall_1: 0.1203 - val_accuracy: 0.5238 - val_loss: 0.6759 - val_precision_1: 0.9310 - val_recall_1: 0.0665
Epoch 4/10
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 41ms/step - accuracy: 0.5164 - loss: 0.6695 - precision_1: 0.5427 - recall_1: 0.5213 - val_accuracy: 0.5144 - val_loss: 0.6979 - val_precision_1: 0.8723 - val_recall_1: 0.0505



[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 61ms/step - accuracy: 0.6341 - loss: 0.6112 - precision_2: 0.6603 - recall_2: 0.5509 - val_accuracy: 0.8581 - val_loss: 0.3198 - val_precision_2: 0.8707 - val_recall_2: 0.8461
Epoch 2/10
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 56ms/step - accuracy: 0.9049 - loss: 0.2220 - precision_2: 0.9082 - recall_2: 0.9003 - val_accuracy: 0.8769 - val_loss: 0.3021 - val_precision_2: 0.8764 - val_recall_2: 0.8818
Epoch 3/10
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 67ms/step - accuracy: 0.9439 - loss: 0.1482 - precision_2: 0.9463 - recall_2: 0.9434 - val_accuracy: 0.8825 - val_loss: 0.3027 - val_precision_2: 0.9021 - val_recall_2: 0.8621
Epoch 4/10
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 64ms/step - accuracy: 0.9672 - loss: 0.0986 - precision_2: 0.9666 - recall_2: 0.9677 - val_accuracy: 0.8819 - val_loss: 0.3225 - val_precision_2: 0.9030 - val_recall_2: 0.8596

## Model Training, Prediction, and Evaluation Using BERT for Sequence Classification

### Fine-tune a BERT model for sequence classification using the Hugging Face Transformers library

In [12]:
pip install accelerate

Note: you may need to restart the kernel to use updated packages.


In [13]:
pip install transformers[torch]




In [14]:
from transformers import TrainingArguments
import torch

In [16]:

import pandas as pd
import torch
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments

# This cell deliberately pins `device` to the CPU
device = torch.device("cpu")
print(f"Using device: {device}")

# Load the first 1,000 rows of the dataset (replace the path with your own file)
# and add an integer-encoded copy of the `label` column
data = pd.read_csv("C:/Users/91894/OneDrive/Desktop/fake review/TP_DS.csv", nrows=1000)
data['label_encoded'] = LabelEncoder().fit_transform(data['label'])

# 80/20 train/test split with a fixed seed; text is cast to str first
X = data['cleaned_text'].astype(str)
y = data['label_encoded']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Tokenizer matching the pre-trained 'bert-base-uncased' checkpoint
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Custom Dataset class to handle text and labels
class ReviewDataset(Dataset):
    """Torch dataset that tokenizes one review per item.

    Args:
        texts: pandas Series of review strings (indexed positionally via ``.iloc``).
        labels: pandas Series of integer class ids, aligned with ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, max_length=...,
            padding='max_length', truncation=True, return_tensors='pt')`` that
            returns a mapping with ``'input_ids'`` and ``'attention_mask'``
            tensors whose first dimension is 1.
        max_length: Length every encoded sequence is padded / truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of reviews in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the encoded review at position ``idx``.

        Returns:
            A dict with ``'input_ids'`` and ``'attention_mask'`` (the tokenizer
            output with the leading dimension of size 1 removed) and
            ``'labels'`` (a scalar ``torch.long`` tensor).
        """
        text = self.texts.iloc[idx]
        label = self.labels.iloc[idx]
        encoding = self.tokenizer(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        # Tensors are returned as produced rather than moved to a device here:
        # the dataset no longer depends on a module-level `device` variable,
        # and device placement is left to the batching / training code, which
        # keeps the dataset usable with pinned memory and worker processes.
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Wrap the train / test splits as torch datasets
train_dataset = ReviewDataset(X_train, y_train, tokenizer)
test_dataset = ReviewDataset(X_test, y_test, tokenizer)

# Load pre-trained BERT model for sequence classification (2 output labels)
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2).to(device)

# Define training arguments
training_args = TrainingArguments(
    # Mixed precision needs a GPU: an unconditional fp16=True fails argument
    # validation on a CPU-only machine, so enable it only when CUDA is present
    fp16=torch.cuda.is_available(),
    output_dir='./results',
    evaluation_strategy='epoch',  # Evaluate at the end of each epoch
    per_device_train_batch_size=4,  # Lower batch size for CPU
    per_device_eval_batch_size=4,  # Lower batch size for CPU
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    save_total_limit=1,  # Keep at most one checkpoint on disk
    save_strategy='epoch',  # Save model at the end of each epoch
    disable_tqdm=False,  # Enable progress bar in console
)

# Define Trainer for training BERT model; the test split doubles as the
# per-epoch evaluation set
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer
)

# Train the model
trainer.train()

# Evaluate on test set: take the arg-max over the two logits of each example
predictions = trainer.predict(test_dataset)
pred_labels = torch.argmax(torch.tensor(predictions.predictions), axis=1).numpy()

# Generate classification report
# assumes encoded label 0 is 'CG' and 1 is 'OR' — TODO confirm against the encoder
print(classification_report(y_test, pred_labels, target_names=['CG', 'OR']))

import gc
gc.collect()


Using device: cpu


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,No log,0.66766
2,No log,0.893672
3,0.506800,1.338719


              precision    recall  f1-score   support

          CG       0.75      0.80      0.77       104
          OR       0.76      0.71      0.74        96

    accuracy                           0.76       200
   macro avg       0.76      0.75      0.75       200
weighted avg       0.76      0.76      0.75       200



33

In [18]:
## Persist the fine-tuned model, its tokenizer, and the training arguments
import json
import os

# Target directory for everything saved below (created if missing).
# NOTE(review): the first path component is 'BERT.' with a trailing dot,
# possibly a typo for './BERT/saved_model'. Confirm before changing it,
# since anything that loads the model must use the same path.
model_dir = 'BERT./saved_model'
os.makedirs(model_dir, exist_ok=True)

# Model weights/config and tokenizer files go into the same directory
model.save_pretrained(model_dir)
tokenizer.save_pretrained(model_dir)

# Dump the training arguments next to them as JSON
args_path = os.path.join(model_dir, 'training_args.json')
with open(args_path, 'w') as args_file:
    json.dump(training_args.to_dict(), args_file)

print(f"Model, tokenizer, and training arguments saved in {model_dir}")

Model, tokenizer, and training arguments saved in BERT./saved_model


### Conclusion:
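#### On the 200-review test split, the fine-tuned BERT model reached 76% accuracy, with similar precision and recall for both classes (CG: 0.75 / 0.80, OR: 0.76 / 0.71), so it is reasonably balanced between genuine and fake reviews. This is a promising starting point for fake review detection, but it should be read with caution: the model was fine-tuned on a 1,000-review sample (800 for training), and its validation loss rose every epoch (0.67 → 0.89 → 1.34), which suggests overfitting. Training on more data and stopping earlier are likely to improve the result.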