In [1]:
import numpy as np
import pandas as pd
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [2]:
#Ensure necessary NLTK resources are downloaded
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nayak\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nayak\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nayak\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\nayak\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [3]:
df = pd.read_csv('Preprocessed_OSF_Review_ds.csv')


In [4]:
if 'Unnamed: 0' in df.columns:
    df.drop('Unnamed: 0', axis=1, inplace=True)

In [5]:
# Ensure text_ is a string and handle NaN values
df['text_'] = df['text_'].astype(str).fillna("")

# Add text length feature
df['length'] = df['text_'].apply(len)

# Add word count feature
df['word_count'] = df['text_'].apply(lambda x: len(x.split()))


In [6]:
# Improved text processing function
def enhanced_text_process(text):
    # Convert to lowercase and tokenize
    tokens = word_tokenize(text.lower())
    
    # Remove punctuation and stopwords but KEEP some crucial words for sentiment analysis
    # Keep words like "amazing", "great", "best", "worst", etc.
    sentiment_words = ['amazing', 'great', 'best', 'good', 'excellent', 'wonderful', 'bad', 
                       'worst', 'terrible', 'poor', 'love', 'hate', 'awesome', 'horrible']
    
    filtered_tokens = []
    for word in tokens:
        # Keep word if it's not punctuation and either not a stopword or a sentiment word
        if (word not in string.punctuation and 
            (word not in stopwords.words('english') or word.lower() in sentiment_words)):
            filtered_tokens.append(word)
    
    return filtered_tokens


In [7]:
# Create additional features
def add_features(df):
    # Create features from text characteristics that might help detect fake reviews
    df['exclamation_count'] = df['text_'].apply(lambda x: x.count('!'))
    df['question_count'] = df['text_'].apply(lambda x: x.count('?'))
    df['uppercase_ratio'] = df['text_'].apply(
        lambda x: sum(1 for c in x if c.isupper()) / len(x) if len(x) > 0 else 0
    )
    # Calculate average word length as a feature
    df['avg_word_length'] = df['text_'].apply(
        lambda x: sum(len(word) for word in x.split()) / len(x.split()) if len(x.split()) > 0 else 0
    )
    return df

In [8]:
# Add the new features
df = add_features(df)

# Split data into features (X) and target (y)
X = df['text_']
y = df['label']

# Splitting the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.35, random_state=42)

In [9]:
# Define parameters for SVC
svc_params = {
    'C': 10.0,               # Regularization parameter
    'kernel': 'linear',      # Kernel type
    'gamma': 'scale',        # Kernel coefficient
    'probability': True      # Enable probability estimates
}

In [10]:
# Improved pipeline with custom text processing
pipeline = Pipeline([
    ('bow', CountVectorizer(analyzer=enhanced_text_process, 
                           max_features=50000,    # Increase vocabulary size
                           ngram_range=(1, 3))),  # Include bigrams and trigrams
    ('tfidf', TfidfTransformer(use_idf=True,      # Use inverse document frequency
                              smooth_idf=True)),  # Smooth IDF weights
    ('classifier', SVC(**svc_params))             # Use optimized SVC
])

In [11]:
# Train the model
pipeline.fit(X_train, y_train)
print("Model training complete.")



Model training complete.


In [12]:
# Function to predict if a review is fake or real   explanation 
def predict_review(review_text, pipeline=pipeline):
    # Make prediction with probability
    prediction = pipeline.predict([review_text])[0]
    probabilities = pipeline.predict_proba([review_text])[0]
    
    # Get confidence score for the prediction
    confidence = probabilities[0] if prediction == "CG" else probabilities[1]
    
    # Analyze the review characteristics
    review_length = len(review_text)
    word_count = len(review_text.split())
    exclamation_count = review_text.count('!')
    
    # Logic for very short and generic reviews
    is_short = word_count < 5
    is_generic = any(word in review_text.lower() for word in ['amazing', 'great', 'good', 'excellent', 'best'])
    
    # Adjust prediction for very short, generic, enthusiastic reviews (likely real but suspicious)
    if is_short and is_generic and exclamation_count > 0:
        if confidence < 0.85:  # If confidence is not very high
            prediction = "OR"  # Change to Original
            confidence = 1 - confidence  # Adjust confidence
    
    # Provide rationale
    rationale = []
    if is_short:
        rationale.append("Review is very short")
    if is_generic:
        rationale.append("Contains generic positive terms")
    if exclamation_count > 0:
        rationale.append(f"Contains {exclamation_count} exclamation marks")
    
    # Return results as a dictionary
    return {
        "prediction": prediction,
        "confidence": round(float(confidence), 4),
        "review": review_text,
        "characteristics": {
            "length": review_length,
            "word_count": word_count,
            "exclamation_count": exclamation_count,
            "is_short": is_short,
            "is_generic": is_generic
        },
        "rationale": rationale
    }

In [13]:
# Test the model with the sample review
sample_review = "This product is amazing!"
result = predict_review(sample_review)
print(result)

{'prediction': 'OR', 'confidence': 0.2784, 'review': 'This product is amazing!', 'characteristics': {'length': 24, 'word_count': 4, 'exclamation_count': 1, 'is_short': True, 'is_generic': True}, 'rationale': ['Review is very short', 'Contains generic positive terms', 'Contains 1 exclamation marks']}


In [14]:
# Evaluate the model on the test set
y_pred = pipeline.predict(X_test)
print("\nModel Evaluation:")
print('Classification Report:\n', classification_report(y_test, y_pred))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_pred))
print('Accuracy Score:', accuracy_score(y_test, y_pred))
print('Model Prediction Accuracy:', f"{np.round(accuracy_score(y_test, y_pred)*100, 2)}%")



Model Evaluation:
Classification Report:
               precision    recall  f1-score   support

          CG       0.85      0.88      0.87      7121
          OR       0.87      0.85      0.86      7031

    accuracy                           0.86     14152
   macro avg       0.86      0.86      0.86     14152
weighted avg       0.86      0.86      0.86     14152

Confusion Matrix:
 [[6253  868]
 [1073 5958]]
Accuracy Score: 0.8628462408140192
Model Prediction Accuracy: 86.28%


In [15]:
# Function to explain predictions for multiple example reviews
def test_multiple_reviews():
    examples = [
        "This product is amazing!",
        "I absolutely love this item, it changed my life and solved all my problems!",
        "Good quality product, arrived on time. Works as described.",
        "Terrible product. Broke after two uses. Would not recommend.",
        "The delivery was fast, packaging was good. Haven't tried the product yet.",
        "This is the best purchase I have ever made in my entire life!!!!!"
    ]
    
    results = []
    for example in examples:
        results.append(predict_review(example))
    
    # Display results in a readable format
    for i, result in enumerate(results):
        print(f"\nExample {i+1}: \"{examples[i]}\"")
        print(f"Prediction: {result['prediction']} (Confidence: {result['confidence']})")
        print(f"Characteristics: {', '.join(result['rationale'])}")
    
    return results



In [16]:
# Run tests on multiple examples
test_multiple_reviews()


Example 1: "This product is amazing!"
Prediction: OR (Confidence: 0.2784)
Characteristics: Review is very short, Contains generic positive terms, Contains 1 exclamation marks

Example 2: "I absolutely love this item, it changed my life and solved all my problems!"
Prediction: OR (Confidence: 0.6795)
Characteristics: Contains 1 exclamation marks

Example 3: "Good quality product, arrived on time. Works as described."
Prediction: CG (Confidence: 0.7284)
Characteristics: Contains generic positive terms

Example 4: "Terrible product. Broke after two uses. Would not recommend."
Prediction: CG (Confidence: 0.5522)
Characteristics: 

Example 5: "The delivery was fast, packaging was good. Haven't tried the product yet."
Prediction: OR (Confidence: 0.8681)
Characteristics: Contains generic positive terms

Example 6: "This is the best purchase I have ever made in my entire life!!!!!"
Prediction: OR (Confidence: 0.8522)
Characteristics: Contains generic positive terms, Contains 5 exclamation mar

[{'prediction': 'OR',
  'confidence': 0.2784,
  'review': 'This product is amazing!',
  'characteristics': {'length': 24,
   'word_count': 4,
   'exclamation_count': 1,
   'is_short': True,
   'is_generic': True},
  'rationale': ['Review is very short',
   'Contains generic positive terms',
   'Contains 1 exclamation marks']},
 {'prediction': 'OR',
  'confidence': 0.6795,
  'review': 'I absolutely love this item, it changed my life and solved all my problems!',
  'characteristics': {'length': 75,
   'word_count': 14,
   'exclamation_count': 1,
   'is_short': False,
   'is_generic': False},
  'rationale': ['Contains 1 exclamation marks']},
 {'prediction': 'CG',
  'confidence': 0.7284,
  'review': 'Good quality product, arrived on time. Works as described.',
  'characteristics': {'length': 58,
   'word_count': 9,
   'exclamation_count': 0,
   'is_short': False,
   'is_generic': True},
  'rationale': ['Contains generic positive terms']},
 {'prediction': 'CG',
  'confidence': 0.5522,
  're

In [19]:
# Save the trained model
import pickle
model_path = "C:/Users/nayak/FakeReviewAnalyzer/backend/model.pkl"
with open(model_path, "wb") as f:
    pickle.dump(pipeline, f)
print(f"Model saved to {model_path}")

Model saved to C:/Users/nayak/FakeReviewAnalyzer/backend/model.pkl
