In [2]:
# Install the notebook's dependencies. The %pip magic installs into the
# environment of the running kernel; -q keeps the download log out of the output.
%pip install -q transformers datasets torch scikit-learn


Collecting datasets
  Downloading datasets-3.3.1-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.

In [3]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import RobertaTokenizer

In [5]:
# Load dataset
df = pd.read_csv("fake reviews dataset.csv")
df = df.dropna()  # Drop missing values

# Extract relevant columns (ensure these column names match your dataset).
# .copy() detaches the selection from the loaded frame so the column
# assignments below never trigger pandas' SettingWithCopyWarning, even when
# the CSV carries extra columns.
df = df[["category", "rating", "label", "text_"]].copy()

# Convert labels to binary (0 = Fake, 1 = Genuine).
# "CG" in any casing maps to 0; every other label maps to 1.
# Vectorized string comparison replaces the row-by-row apply(lambda ...).
df["label"] = (df["label"].str.lower() != "cg").astype(int)

# Concatenate category, rating, and review text for better context
df["input_text"] = df["category"] + " " + df["rating"].astype(str) + " " + df["text_"]

# Train-test split: 20% held out for validation; the fixed random_state keeps
# the split reproducible across runs.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df["input_text"].tolist(), df["label"].tolist(), test_size=0.2, random_state=42
)

In [6]:
# Preview the first ten rows. Ending the cell with the bare expression uses the
# notebook's rich DataFrame display instead of print()'s wrapped text table.
df.head(n=10)

             category  rating  label  \
0  Home_and_Kitchen_5     5.0      0   
1  Home_and_Kitchen_5     5.0      0   
2  Home_and_Kitchen_5     5.0      0   
3  Home_and_Kitchen_5     1.0      0   
4  Home_and_Kitchen_5     5.0      0   
5  Home_and_Kitchen_5     3.0      0   
6  Home_and_Kitchen_5     5.0      0   
7  Home_and_Kitchen_5     3.0      0   
8  Home_and_Kitchen_5     5.0      0   
9  Home_and_Kitchen_5     5.0      0   

                                               text_  \
0  Love this!  Well made, sturdy, and very comfor...   
1  love it, a great upgrade from the original.  I...   
2  This pillow saved my back. I love the look and...   
3  Missing information on how to use it, but it i...   
4  Very nice set. Good quality. We have had the s...   
5       I WANTED DIFFERENT FLAVORS BUT THEY ARE NOT.   
6  They are the perfect touch for me and the only...   
7  These done fit well and look great.  I love th...   
8  Great big numbers & easy to read, the only thi...   

In [7]:
# Tokenization
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")


def tokenize_function(texts):
    """Tokenize a list of strings with the module-level RoBERTa tokenizer.

    Every sequence is truncated and padded to exactly 512 tokens.

    Args:
        texts: The texts to encode.

    Returns:
        Whatever the tokenizer call returns for ``texts``.
    """
    encode_options = {"truncation": True, "padding": "max_length", "max_length": 512}
    return tokenizer(texts, **encode_options)


# Encode the training split first, then the validation split.
train_encodings, val_encodings = (
    tokenize_function(split) for split in (train_texts, val_texts)
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

In [8]:

# Convert to Hugging Face Dataset format
def _encodings_to_dataset(encodings, labels):
    """Build a Dataset with input_ids, attention_mask and labels columns."""
    columns = {key: encodings[key] for key in ("input_ids", "attention_mask")}
    columns["labels"] = labels
    return Dataset.from_dict(columns)


train_dataset = _encodings_to_dataset(train_encodings, train_labels)
val_dataset = _encodings_to_dataset(val_encodings, val_labels)

In [9]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments


In [10]:

# Start from the pre-trained "roberta-base" checkpoint with a two-class
# sequence-classification head.
model = RobertaForSequenceClassification.from_pretrained(
    "roberta-base",
    num_labels=2,
)


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    # `evaluation_strategy` is the deprecated spelling (superseded by
    # `eval_strategy` in transformers 4.41); use the current name so newer
    # releases keep accepting this cell.
    eval_strategy="epoch",
    # Checkpoint on the same schedule as evaluation, which
    # load_best_model_at_end requires.
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=True,
)

from sklearn.metrics import accuracy_score
import numpy as np

def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: A pair that unpacks into ``(logits, labels)``; the
            predicted class of each row is the argmax of its logits along
            axis 1.

    Returns:
        A dict with the single key ``"accuracy"``.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=1)
    return {"accuracy": accuracy_score(gold, predicted)}

# Wire the model, training configuration, both datasets and the accuracy
# metric into a Trainer.
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,  # adds accuracy to the evaluation metrics
)
trainer = Trainer(**trainer_kwargs)




In [14]:
# Train the model
# Runs fine-tuning with the Trainer configured above. Because the call is the
# cell's last expression, the returned TrainOutput summary is displayed.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.045,0.115461,0.980833
2,0.0354,0.085079,0.985532
3,0.0103,0.152545,0.980092


TrainOutput(global_step=12132, training_loss=0.03449810924916782, metrics={'train_runtime': 2288.2395, 'train_samples_per_second': 42.406, 'train_steps_per_second': 5.302, 'total_flos': 2.55309812568576e+16, 'train_loss': 0.03449810924916782, 'epoch': 3.0})

In [15]:
# Persist the trained model and its tokenizer into the same directory so the
# pair can be reloaded together.
export_dir = "roberta_fake_review_model"
model.save_pretrained(export_dir)
tokenizer.save_pretrained(export_dir)


('roberta_fake_review_model/tokenizer_config.json',
 'roberta_fake_review_model/special_tokens_map.json',
 'roberta_fake_review_model/vocab.json',
 'roberta_fake_review_model/merges.txt',
 'roberta_fake_review_model/added_tokens.json')

In [16]:
# Shell command: recursively bundle the saved model directory into a single
# zip archive.
!zip -r roberta_fake_review_model.zip roberta_fake_review_model


  adding: roberta_fake_review_model/ (stored 0%)
  adding: roberta_fake_review_model/vocab.json (deflated 68%)
  adding: roberta_fake_review_model/model.safetensors (deflated 9%)
  adding: roberta_fake_review_model/special_tokens_map.json (deflated 84%)
  adding: roberta_fake_review_model/tokenizer_config.json (deflated 76%)
  adding: roberta_fake_review_model/merges.txt (deflated 53%)
  adding: roberta_fake_review_model/config.json (deflated 51%)
