In [2]:
!pip install pandas scikit-learn joblib


Collecting pandas
  Downloading pandas-2.2.3-cp313-cp313-win_amd64.whl.metadata (19 kB)
Collecting scikit-learn
  Downloading scikit_learn-1.5.2-cp313-cp313-win_amd64.whl.metadata (13 kB)
Collecting joblib
  Downloading joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting numpy>=1.26.0 (from pandas)
  Downloading numpy-2.1.2-cp313-cp313-win_amd64.whl.metadata (59 kB)
Collecting pytz>=2020.1 (from pandas)
  Downloading pytz-2024.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Downloading tzdata-2024.2-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting scipy>=1.6.0 (from scikit-learn)
  Downloading scipy-1.14.1-cp313-cp313-win_amd64.whl.metadata (60 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Downloading threadpoolctl-3.5.0-py3-none-any.whl.metadata (13 kB)
Downloading pandas-2.2.3-cp313-cp313-win_amd64.whl (11.5 MB)
   ---------------------------------------- 0.0/11.5 MB ? eta -:--:--
   -------------- ------------------------- 4.2/11

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
import joblib

# Load the dataset
df = pd.read_csv('fake_reviews_dataset.csv')

# Label encoding (CG -> 0, OR -> 1)
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['label'])

# TF-IDF Vectorization of review text
tfidf = TfidfVectorizer(max_features=1000, stop_words='english')
X = tfidf.fit_transform(df['text'])

# Combine text features with rating
X_combined = pd.DataFrame(X.toarray())
X_combined['rating'] = df['rating']

# Target variable
y = df['label']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.2, random_state=42)


In [5]:
from sklearn.ensemble import RandomForestClassifier
import joblib

# Convert column names of X_combined to strings to avoid the error
X_combined.columns = X_combined.columns.astype(str)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.2, random_state=42)

# Train a RandomForest model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Save the model and the TF-IDF vectorizer for later use
joblib.dump(model, 'fake_review_model.pkl')
joblib.dump(tfidf, 'tfidf_vectorizer.pkl')


['tfidf_vectorizer.pkl']

In [6]:
import joblib

# Load the saved RandomForest model and TF-IDF vectorizer
model = joblib.load('fake_review_model.pkl')
tfidf_vectorizer = joblib.load('tfidf_vectorizer.pkl')


In [15]:
# Example review text
test_review = "very nice product it is very much comfortable as i use it"

# Step 1: Vectorize the input review (1000 features)
review_vectorized = tfidf_vectorizer.transform([test_review])

# Step 2: Convert to DataFrame to match the training format
import pandas as pd
review_vectorized_df = pd.DataFrame(review_vectorized.toarray())

# Step 3: Add a dummy 'rating' column (set to a reasonable value, e.g., 5)
review_vectorized_df['rating'] = 5  # Add the rating column

# Step 4: Convert all feature names to strings to avoid the error
review_vectorized_df.columns = review_vectorized_df.columns.astype(str)

# Step 5: Get model prediction (0 = Genuine, 1 = Fake)
prediction = model.predict(review_vectorized_df)[0]

# Step 6: Output the result
result = 'Fake' if prediction == 1 else 'Genuine'
print(f"Review: {test_review}\nPrediction: {result}")



Review: very nice product it is very much comfortable as i use it
Prediction: Genuine


In [10]:
# List of reviews for testing
test_reviews = [
    "This product is amazing! I love it.",
    "Terrible product, not worth the money!",
    "Best product I've ever used. Highly recommend.",
    "This review seems very fake, don't trust it.",
    "The delivery was delayed but the product is worth it."
]

# Step 1: Vectorize the reviews (1000 features)
reviews_vectorized = tfidf_vectorizer.transform(test_reviews)

# Step 2: Convert to DataFrame to match the training format
reviews_vectorized_df = pd.DataFrame(reviews_vectorized.toarray())

# Step 3: Add a dummy 'rating' column (set to a reasonable value, e.g., 5)
reviews_vectorized_df['rating'] = 5  # Add the rating column

# Step 4: Convert all feature names to strings to avoid the error
reviews_vectorized_df.columns = reviews_vectorized_df.columns.astype(str)

# Step 5: Get predictions for each review
predictions = model.predict(reviews_vectorized_df)

# Step 6: Display results for each review
for review, pred in zip(test_reviews, predictions):
    result = 'Fake' if pred == 1 else 'Genuine'
    print(f"Review: {review}\nPrediction: {result}\n")


Review: This product is amazing! I love it.
Prediction: Genuine

Review: Terrible product, not worth the money!
Prediction: Fake

Review: Best product I've ever used. Highly recommend.
Prediction: Genuine

Review: This review seems very fake, don't trust it.
Prediction: Fake

Review: The delivery was delayed but the product is worth it.
Prediction: Fake



In [11]:
from sklearn.metrics import accuracy_score

# Predict on the test set
y_pred_test = model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred_test)
print(f"Model Accuracy on Test Data: {accuracy * 100:.2f}%")



Model Accuracy on Test Data: 81.14%
