In [1]:
from transformers import TrainingArguments

# Destination folders for checkpoints and for training logs
new_output_dir = 'E:/Final Project/Results'
new_logging_dir = 'E:/Final Project/logs'

# NOTE(review): a later cell in this notebook assigns `training_args` again
# (with './results' / './logs') before any Trainer is created, so this object
# appears to be unused — confirm whether it is still needed.
training_args = TrainingArguments(
    # schedule and batch sizes
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # where results go
    output_dir=new_output_dir,    # model checkpoints
    logging_dir=new_logging_dir,  # log files
    # how often to log / checkpoint (in optimizer steps)
    logging_steps=500,
    save_steps=500,
    save_total_limit=2,           # keep at most two checkpoints on disk
)





In [8]:
# Same folders as in the first cell, re-declared so this cell can be run on its own
new_logging_dir = 'E:/Final Project/logs'      # training logs
new_output_dir = 'E:/Final Project/Results'    # model checkpoints

In [10]:
import os

# Switch the notebook's working directory to the project folder so that the
# relative paths used further down ('./results', './logs', 'BERT./saved_model')
# resolve inside it.
# NOTE(review): absolute, machine-specific path — adjust it when running elsewhere.
os.chdir('E:/Final Project')


### Classification using deep learning

In [12]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, LSTM, Bidirectional, SimpleRNN
from tensorflow.keras.layers import GlobalAveragePooling1D, SpatialDropout1D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import Precision, Recall
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import classification_report

#### Load the data, then encode the labels as numeric values using LabelEncoder

In [18]:
# Read the review dataset from CSV.
# NOTE(review): absolute path on one user's machine — point it at your own copy of TP_DS.csv.
data = pd.read_csv(r"C:\Users\DURGA\Desktop\final_project_fake_review\dataset\TP_DS.csv")  
# Add an integer-coded version of the 'label' column; this is the target used below.
data['label_encoded'] = LabelEncoder().fit_transform(data['label'])

#### split the data into training and testing sets, tokenize the text, pad the sequences, and build and train three types of neural network models: RNN, LSTM, and BiLSTM for binary classification

In [21]:
# Features: the cleaned review text, cast to str so float/NaN cells can be tokenized.
# Target: the integer-encoded label.
X = data['cleaned_text'].astype(str)
y = data['label_encoded']

# Hold out 20% of the rows for testing (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

# Tokenization / padding / embedding settings shared by all models below
vocab_size = 10000     # keep only the most frequent words
max_length = 100       # every sequence is padded or cut to this many tokens
embedding_dim = 100    # width of the learned word vectors

# Fit the vocabulary on the training texts only; unseen words map to "<OOV>"
tokenizer = Tokenizer(oov_token="<OOV>", num_words=vocab_size)
tokenizer.fit_on_texts(X_train)

# Texts -> lists of word indices
X_train_seq, X_test_seq = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)

# Lists of indices -> fixed-width arrays (pad and truncate at the end)
X_train_pad, X_test_pad = (
    pad_sequences(seqs, maxlen=max_length, padding='post', truncating='post')
    for seqs in (X_train_seq, X_test_seq)
)

#model building
# Define a function to build models
def build_model(model_type="RNN"):
    """Build and compile a binary text classifier around one recurrent layer.

    Layer stack: Embedding -> SpatialDropout1D(0.2) -> recurrent layer with
    64 units -> Dense(32, relu) -> Dense(1, sigmoid).

    The embedding size is taken from the notebook-level ``vocab_size`` and
    ``embedding_dim`` settings.

    Args:
        model_type: Which recurrent layer to use: "RNN" (SimpleRNN),
            "LSTM", or "BiLSTM" (bidirectional LSTM).

    Returns:
        A compiled ``Sequential`` model (Adam, learning rate 0.001, binary
        cross-entropy loss; tracks accuracy, precision and recall).

    Raises:
        ValueError: If ``model_type`` is not one of the supported names.
    """
    supported_types = ("RNN", "LSTM", "BiLSTM")
    # Fail fast: without a recurrent layer the Dense head would be applied to
    # the 3-D embedding output and silently emit one prediction per timestep.
    if model_type not in supported_types:
        raise ValueError(
            f"Unknown model_type {model_type!r}; expected one of {supported_types}"
        )

    model = Sequential()
    # `input_length` is deprecated in current Keras and not needed here:
    # the sequence length is inferred from the padded input.
    model.add(Embedding(vocab_size, embedding_dim))
    model.add(SpatialDropout1D(0.2))

    # return_sequences=False: only the final state is passed to the Dense head
    if model_type == "RNN":
        model.add(SimpleRNN(64, return_sequences=False))
    elif model_type == "LSTM":
        model.add(LSTM(64, return_sequences=False))
    else:  # "BiLSTM"
        model.add(Bidirectional(LSTM(64, return_sequences=False)))

    model.add(Dense(32, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))  # Binary classification

    model.compile(optimizer=Adam(learning_rate=0.001),
                  loss='binary_crossentropy',
                  metrics=['accuracy', Precision(), Recall()])
    return model

# Training  of model
def train_and_evaluate(model_type):
    """Train one model variant and print its classification report on the test set.

    Uses the notebook-level ``X_train_pad`` / ``y_train`` for fitting (20% of
    which is held back by Keras for validation) and ``X_test_pad`` / ``y_test``
    for the final report.

    Args:
        model_type: Name forwarded to ``build_model``.
    """
    # Stop once val_loss has not improved for 3 epochs and roll back to the best weights
    stopper = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
    model = build_model(model_type)

    model.fit(
        X_train_pad,
        y_train,
        validation_split=0.2,
        batch_size=64,
        epochs=10,
        callbacks=[stopper],
    )

    # Sigmoid outputs above 0.5 count as class 1
    probabilities = model.predict(X_test_pad)
    y_pred = (probabilities > 0.5).astype("int32")

    print(f"Classification Report for {model_type}:")
    report = classification_report(y_test, y_pred, target_names=['CG', 'OR'])
    print(report)
    
# Train and evaluate each architecture in turn: RNN, LSTM, then BiLSTM
for model_type in ("RNN", "LSTM", "BiLSTM"):
    train_and_evaluate(model_type)



Epoch 1/10
[1m405/405[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 52ms/step - accuracy: 0.5658 - loss: 0.6750 - precision: 0.5720 - recall: 0.5483 - val_accuracy: 0.7196 - val_loss: 0.5761 - val_precision: 0.7077 - val_recall: 0.7518
Epoch 2/10
[1m405/405[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 45ms/step - accuracy: 0.6518 - loss: 0.6034 - precision: 0.6451 - recall: 0.6594 - val_accuracy: 0.4962 - val_loss: 0.6903 - val_precision: 0.4965 - val_recall: 0.2593
Epoch 3/10
[1m405/405[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 49ms/step - accuracy: 0.5585 - loss: 0.6719 - precision: 0.5650 - recall: 0.5167 - val_accuracy: 0.6987 - val_loss: 0.6022 - val_precision: 0.6844 - val_recall: 0.7419
Epoch 4/10
[1m405/405[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 57ms/step - accuracy: 0.7228 - loss: 0.5632 - precision: 0.7160 - recall: 0.7431 - val_accuracy: 0.7347 - val_loss: 0.5588 - val_precision: 0.7279 - val_recall: 0.7530
Epoch 5/10
[1m4



[1m405/405[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 82ms/step - accuracy: 0.5290 - loss: 0.6800 - precision_1: 0.5489 - recall_1: 0.3670 - val_accuracy: 0.5559 - val_loss: 0.6556 - val_precision_1: 0.9652 - val_recall_1: 0.1195
Epoch 2/10
[1m405/405[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 79ms/step - accuracy: 0.7050 - loss: 0.5720 - precision_1: 0.7162 - recall_1: 0.6910 - val_accuracy: 0.7776 - val_loss: 0.5122 - val_precision_1: 0.8343 - val_recall_1: 0.6948
Epoch 3/10
[1m405/405[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 80ms/step - accuracy: 0.7019 - loss: 0.5520 - precision_1: 0.7813 - recall_1: 0.5513 - val_accuracy: 0.8147 - val_loss: 0.4336 - val_precision_1: 0.8495 - val_recall_1: 0.7666
Epoch 4/10
[1m405/405[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 81ms/step - accuracy: 0.8248 - loss: 0.4039 - precision_1: 0.8640 - recall_1: 0.7673 - val_accuracy: 0.8264 - val_loss: 0.4181 - val_precision_1: 0.8428 - val_recall_1: 0.



[1m405/405[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 145ms/step - accuracy: 0.7559 - loss: 0.4607 - precision_2: 0.7782 - recall_2: 0.6975 - val_accuracy: 0.9020 - val_loss: 0.2277 - val_precision_2: 0.8936 - val_recall_2: 0.9135
Epoch 2/10
[1m405/405[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m55s[0m 136ms/step - accuracy: 0.9221 - loss: 0.1870 - precision_2: 0.9218 - recall_2: 0.9227 - val_accuracy: 0.9059 - val_loss: 0.2123 - val_precision_2: 0.8990 - val_recall_2: 0.9153
Epoch 3/10
[1m405/405[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m55s[0m 136ms/step - accuracy: 0.9491 - loss: 0.1310 - precision_2: 0.9480 - recall_2: 0.9510 - val_accuracy: 0.9153 - val_loss: 0.2208 - val_precision_2: 0.9086 - val_recall_2: 0.9242
Epoch 4/10
[1m405/405[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m52s[0m 129ms/step - accuracy: 0.9634 - loss: 0.0955 - precision_2: 0.9663 - recall_2: 0.9606 - val_accuracy: 0.9127 - val_loss: 0.2498 - val_precision_2: 0.8944 - val_recall_2

## Model Training, Prediction, and Evaluation Using BERT for Sequence Classification

### Fine-tune a BERT model for sequence classification using the Hugging Face Transformers library

In [28]:
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124

Looking in indexes: https://download.pytorch.org/whl/cu124
Note: you may need to restart the kernel to use updated packages.


In [30]:
pip install accelerate

Note: you may need to restart the kernel to use updated packages.


In [32]:
pip install transformers[torch]

Note: you may need to restart the kernel to use updated packages.


In [34]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Use the GPU when PyTorch can see a CUDA device; otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Report which device was picked
print(f"Using device: {device}")

# Load the dataset again and rebuild the integer label column.
# NOTE(review): this rebinds `data` (already loaded from the same file in an
# earlier cell); replace the path with your own file location.
data = pd.read_csv(r"C:\Users\DURGA\Desktop\final_project_fake_review\dataset\TP_DS.csv")
data['label_encoded'] = LabelEncoder().fit_transform(data['label'])

# Same 80/20 split and seed as the Keras section above
y = data['label_encoded']
X = data['cleaned_text'].astype(str)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

# BERT tokenizer: turns raw text into the token IDs the BERT model expects.
# NOTE(review): this rebinds `tokenizer`, which held the Keras Tokenizer earlier in the notebook.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Custom Dataset class to handle text and labels
class ReviewDataset(Dataset):
    """Map-style dataset that tokenizes one review at a time.

    Items are returned as CPU tensors. Device placement is left to the
    consumer (the Hugging Face ``Trainer`` moves each batch to its own device),
    because CUDA tensors produced inside ``__getitem__`` do not work with
    DataLoader worker processes or pinned memory.

    Args:
        texts: pandas Series of review strings (accessed positionally).
        labels: pandas Series of integer class labels, aligned with ``texts``.
        tokenizer: Callable tokenizer accepting ``max_length``, ``padding``,
            ``truncation`` and ``return_tensors`` and returning a mapping with
            'input_ids' and 'attention_mask'.
        max_length: Every text is padded or truncated to this many tokens
            (128 by default; adjust to the dataset and model).

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length "
                f"(got {len(texts)} and {len(labels)})"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of reviews in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the review at position ``idx``.

        Returns:
            dict with 1-D 'input_ids' and 'attention_mask' tensors and a
            scalar int64 'labels' tensor, all on the CPU.
        """
        # .iloc: the Series keep their shuffled original index after the split
        text = self.texts.iloc[idx]
        label = self.labels.iloc[idx]
        encoding = self.tokenizer(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        return {
            # the tokenizer returns a batch of one; flatten to 1-D
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(int(label), dtype=torch.long)
        }

# Wrap the train and test splits so the Trainer can index them
train_dataset = ReviewDataset(X_train, y_train, tokenizer)
test_dataset = ReviewDataset(X_test, y_test, tokenizer)

# Pre-trained BERT encoder with a two-class classification head, placed on `device`
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
model = model.to(device)

# Training configuration.
# NOTE(review): newer transformers releases spell `evaluation_strategy` as
# `eval_strategy` — confirm against the installed version before upgrading.
training_args = TrainingArguments(
    output_dir='./results',
    logging_dir='./logs',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    evaluation_strategy='epoch',  # evaluate at the end of every epoch
)

# The Trainer drives fine-tuning; the test split doubles as the per-epoch evaluation set
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Fine-tune
trainer.train()

# Predict on the test split and take the highest-scoring class per review
predictions = trainer.predict(test_dataset)
logits = torch.tensor(predictions.predictions)
pred_labels = logits.argmax(dim=1).numpy()

from sklearn.metrics import classification_report
report = classification_report(y_test, pred_labels, target_names=['CG', 'OR'])
print(report)

Using device: cuda


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,0.3692,0.41214
2,0.2609,0.29017
3,0.1771,0.2911


              precision    recall  f1-score   support

          CG       0.92      0.93      0.93      4016
          OR       0.93      0.92      0.93      4071

    accuracy                           0.93      8087
   macro avg       0.93      0.93      0.93      8087
weighted avg       0.93      0.93      0.93      8087



In [37]:
## saving  trained model, tokenizer, and training arguments
import os
import json

# Folder that receives the model, the tokenizer and the training arguments.
# NOTE(review): the path contains a folder literally named 'BERT.' (trailing
# dot) — possibly './BERT/saved_model' was intended; confirm before changing.
model_dir = 'BERT./saved_model'
os.makedirs(model_dir, exist_ok=True)

# Fine-tuned BERT weights and config
model.save_pretrained(model_dir)

# Tokenizer files, stored next to the model
tokenizer.save_pretrained(model_dir)

# Training arguments, serialized to JSON alongside the model
with open(os.path.join(model_dir, 'training_args.json'), 'w') as args_file:
    args_file.write(json.dumps(training_args.to_dict()))

print(f"Model, tokenizer, and training arguments saved in {model_dir}")

Model, tokenizer, and training arguments saved in BERT./saved_model


### Conclusion:
#### The fine-tuned BERT model is highly effective at detecting fake reviews, with strong performance across both classes. It reaches 93% accuracy on the 8,087 held-out test reviews, and precision and recall are balanced at 0.92–0.93 for both the CG and OR classes, showing that it is both accurate and reliable in classifying reviews as genuine or fake. This result is promising for the task of fake review detection.