In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings, string
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

In [None]:
# Load the review dataset and preview its first rows.
# NOTE(review): absolute Colab-style path; adjust it when running elsewhere.
df = pd.read_csv('/content/pfr.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,category,rating,label,text_
0,0,Home_and_Kitchen_5,5,CG,love well made sturdi comfort i love veri pretti
1,1,Home_and_Kitchen_5,5,CG,love great upgrad origin i 've mine coupl year
2,2,Home_and_Kitchen_5,5,CG,thi pillow save back i love look feel pillow
3,3,Home_and_Kitchen_5,1,CG,miss inform use great product price i
4,4,Home_and_Kitchen_5,5,CG,veri nice set good qualiti we set two month


In [None]:
# Drop the stray index column. Rebinding with errors='ignore' keeps the cell
# idempotent: re-running it after the column is gone no longer raises KeyError.
# NOTE(review): the preview also shows an 'Unnamed: 0.1' column, which is left
# in place here and therefore still ends up in the exported CSV files.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
# Discard rows with any missing value, then record the character count of
# each review text in a new 'length' column.
df = (
    df.dropna()
      .assign(length=lambda frame: frame['text_'].map(len))
)

In [None]:
def text_process(review):
    """Tokenise a review for the bag-of-words vectoriser.

    Removes every character found in ``string.punctuation``, splits the result
    on whitespace and drops tokens whose lower-cased form is an NLTK English
    stopword. Tokens that are kept retain their original casing.

    Parameters
    ----------
    review : str
        Raw review text.

    Returns
    -------
    list of str
        The remaining tokens, in their original order.
    """
    # The stopword list used to be fetched with stopwords.words('english') once
    # per token and searched as a list. Fetch it once and keep it as a set.
    stop_set = getattr(text_process, '_stop_set', None)
    if stop_set is None:
        # Loaded lazily on first use, so the corpus only has to be available
        # by the time the function is first called, not when it is defined.
        stop_set = text_process._stop_set = frozenset(stopwords.words('english'))

    # Deleting characters through a translation table is equivalent to the
    # character-by-character filter followed by ''.join().
    nopunc = review.translate(str.maketrans('', '', string.punctuation))
    return [word for word in nopunc.split() if word.lower() not in stop_set]

In [None]:
# Hold out 35% of the reviews for evaluation. random_state pins the shuffle so
# that a fresh run reproduces the same split (and the same downstream results).
review_train, review_test, label_train, label_test = train_test_split(
    df['text_'], df['label'], test_size=0.35, random_state=42
)

In [None]:
# Three-stage text classifier:
#   bow        - token counts, using text_process as the analyzer
#   tfidf      - re-weights the counts
#   classifier - support vector classifier with default settings
pipeline = Pipeline(steps=[
    ('bow', CountVectorizer(analyzer=text_process)),
    ('tfidf', TfidfTransformer()),
    ('classifier', SVC()),
])

In [None]:
!pip install nltk
import nltk
nltk.download('stopwords')



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Fit the bow -> tfidf -> SVC pipeline on the training split.
pipeline.fit(review_train,label_train)

In [None]:
# Predict a label for every review in the held-out test split.
svc_pred = pipeline.predict(review_test)

In [None]:
# Filter the reviews that have been detected as fake (predicted label 'CG').
# The mask is computed once and reused for both the texts and the true labels.
is_predicted_fake = svc_pred == 'CG'
fake_reviews = review_test[is_predicted_fake]

# Print the fake reviews along with their true labels. The true label used to
# be unpacked from zip() but never shown.
for review, true_label in zip(fake_reviews, label_test[is_predicted_fake]):
    print("Review:", review)
    print("True label:", true_label)
    print("-----------------------")

Review: great comfort especi go i keep shelv order
-----------------------
Review: these sturdi snap place comfort the reason i gave
-----------------------
Review: well receiv comfort gave gift dialysi patient
-----------------------
Review: veri handi one kid tool includ packag i one
-----------------------
Review: thi mug veri nice i love look feel size
-----------------------
Review: not i accustom the reason i gave star i
-----------------------
Review: tini spatula unless use profession spatula need make sure
-----------------------
Review: great open look nice countertop i keep open drawer
-----------------------
Review: love great upgrad origin i 've mine coupl year
-----------------------
Review: bought friend love
-----------------------
Review: my wife i put togeth rather quickli look realli good
-----------------------
Review: these sheet thin wide open i purchas differ size
-----------------------
Review: total work i use everyday it easi clean i
-----------------------
Re

In [None]:
# Create a DataFrame for the predicted labels. review_test is a Series taken
# from df['text_'], so this frame carries the row labels of the original df.
predicted_df = pd.DataFrame({
    'text_': review_test,
    'predicted_label': svc_pred
})

# Filter the DataFrame for fake reviews by ROW INDEX, not by text. Matching on
# text with isin() also selected every other row of df (training rows
# included) that happened to share a text with a flagged test review.
fake_index = predicted_df.index[predicted_df['predicted_label'] == 'CG']
fake_reviews_df = df[df.index.isin(fake_index)]

# Save the filtered DataFrame to a new CSV file
fake_reviews_df.to_csv('fake_reviews.csv', index=False)


In [None]:
from sklearn.ensemble import IsolationForest

In [None]:
# Read the dataset of reviews flagged by the classifier. The path is relative,
# matching the to_csv('fake_reviews.csv') call that wrote the file, so the read
# and the write agree whatever the working directory is.
# Note: this rebinds `df`; from here on it holds only the flagged reviews.
df = pd.read_csv('fake_reviews.csv')

In [None]:
# Function to detect fake reviews based on keywords count
def rule_based_detection(review, keywords, threshold=2):
    """Flag a review when it contains more than ``threshold`` keyword hits.

    Matching is case-insensitive and works on whitespace-separated tokens.
    Single-word keywords are compared against each token. Multi-word keywords
    (for example ``'not happy'``) are compared against runs of consecutive
    tokens; previously such entries could never match. Every occurrence
    counts, and overlapping keywords are counted independently, so with both
    ``'not'`` and ``'not happy'`` listed the text "not happy" scores 2.

    Parameters
    ----------
    review : str
        Review text. Any non-string value (such as NaN from a CSV read) is
        treated as having no hits.
    keywords : iterable of str
        Words or phrases to count.
    threshold : int, optional
        The review is flagged when the hit count is strictly greater than
        this value. Defaults to 2, the previously hard-coded limit.

    Returns
    -------
    bool
        True when the hit count exceeds ``threshold``.
    """
    if not isinstance(review, str):
        return False

    tokens = review.lower().split()

    # Split keywords into single tokens (set lookup) and phrases (token tuples).
    single_words = set()
    phrases = set()
    for keyword in keywords:
        parts = tuple(keyword.lower().split())
        if len(parts) == 1:
            single_words.add(parts[0])
        elif parts:
            phrases.add(parts)

    count = sum(1 for token in tokens if token in single_words)

    # Slide a window of the phrase's width over the tokens to count phrase hits.
    for phrase in phrases:
        width = len(phrase)
        count += sum(
            1
            for start in range(len(tokens) - width + 1)
            if tuple(tokens[start:start + width]) == phrase
        )

    return count > threshold  # Adjust the threshold as needed

In [None]:
# Define keywords indicating fake reviews (the duplicate 'terrible' entry has
# been removed; membership tests are unaffected).
# NOTE(review): the previewed text_ values look stemmed ('veri', 'sturdi',
# 'qualiti'), so unstemmed entries such as 'terrible' or 'horrible' may never
# match - confirm against the preprocessing that produced the dataset.
# NOTE(review): 'not happy' contains a space; a matcher that compares single
# whitespace-split tokens cannot match it.
fake_keywords = ['not', 'disappointed', 'waste', 'terrible', 'poor', 'avoid', 'horrible', 'worst', 'cheap', 'junk',
                 'trash', 'awful', 'useless', 'disappointing', 'bad', 'defective', 'ruined', 'flimsy',
                 'garbage', 'unsatisfactory', 'shoddy', 'faulty', 'disgusting', 'regret', 'unsatisfied', 'crap',
                 'rubbish', 'deceptive', 'subpar', 'overpriced', 'inferior', 'unusable', 'disappoint', 'lies',
                 'displeased', 'stupid', 'not happy', 'shameful', 'unsatisfying', 'sucks', 'hate', 'unreliable',
                 'unacceptable', 'fail', 'lousy', 'poorly', 'dissatisfied']

In [None]:
# Run the keyword rule over every review text; the resulting boolean flags
# are stored in the 'fake_by_rule' column.
df['fake_by_rule'] = df['text_'].apply(rule_based_detection, args=(fake_keywords,))

In [None]:
# Keep only the rows whose 'fake_by_rule' flag is set.
fake_reviews_rule_based = df.loc[df['fake_by_rule']]

In [None]:
# Feature matrix for the anomaly detector: a one-column frame holding the
# review length.
X = df.loc[:, ['length']]

In [None]:
# Train anomaly detection model. random_state fixes the randomness used to
# build the forest, so the set of flagged rows is the same on every run.
anomaly_detector = IsolationForest(contamination=0.1, random_state=42)  # Adjust contamination as needed
anomaly_detector.fit(X)

In [None]:
# Score every row with the fitted detector and store both the raw decision
# score and the predicted label (rows labelled -1 are the ones filtered as
# anomalies afterwards).
df = df.assign(
    anomaly_score=anomaly_detector.decision_function(X),
    anomaly=anomaly_detector.predict(X),
)

In [None]:
# Keep only the rows the detector labelled -1.
anomalies_df = df.loc[df['anomaly'].eq(-1)]

In [None]:
# Combine both rule-based and anomaly-detected fake reviews with a single
# boolean mask over df. Concatenating the two filtered frames repeated every
# row flagged by both methods, and left NaN in the anomaly columns for the
# rule-based rows because that frame was built before those columns existed.
final_fake_reviews = df[df['fake_by_rule'] | (df['anomaly'] == -1)]

In [None]:
# Save to CSV
# Written relative to the current working directory, without the index column.
final_fake_reviews.to_csv('final_fake_reviews.csv', index=False)