# In [None]:
# Product Review Rating Predictor - Complete Implementation
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import warnings
warnings.filterwarnings('ignore')

# Download NLTK data
# Fetch the 'stopwords' corpus that stopwords.words('english') reads further
# down; quiet=True keeps the downloader from printing progress output.
nltk.download('stopwords', quiet=True)

# Step 1: Create realistic Product Review Dataset (Amazon/Flipkart style)
def create_product_review_dataset(random_state=42):
    """Build a shuffled, synthetic product-review dataset.

    The dataset is produced by repeating ten hand-written template reviews
    per sentiment class, so it contains only 30 distinct review texts:
    2000 'Positive', 1000 'Neutral' and 1500 'Negative' rows (4500 total).

    Args:
        random_state: Seed forwarded to ``DataFrame.sample`` for the shuffle.
            Defaults to 42 so repeated runs yield the same row order; pass
            ``None`` for a different shuffle on every call.

    Returns:
        pandas.DataFrame with columns ``'review'`` (raw text) and
        ``'sentiment'`` (class label), shuffled and re-indexed from 0.
    """
    positive_templates = [
        "This product is absolutely amazing! Exceeded my expectations completely.",
        "Outstanding quality and fast delivery. Highly recommend to everyone.",
        "Love this product! Works perfectly and great value for money.",
        "Excellent build quality and very user-friendly. Five stars!",
        "Best purchase I've made in years. Superior quality and performance.",
        "Fantastic product with amazing features. Worth every penny spent.",
        "Perfect product! Exactly what I was looking for. Highly satisfied.",
        "Amazing quality and performance. Will definitely buy again from this seller.",
        "Superb product with excellent build quality. Highly recommended for everyone.",
        "Outstanding performance and great value. Will recommend to friends and family.",
    ]
    neutral_templates = [
        "The product is okay, nothing special but does the job adequately.",
        "Average quality product. Works fine but could be better for the price.",
        "It's an okay product. Not great but not terrible either. Average.",
        "The product works as expected but nothing extraordinary about it.",
        "Decent quality but there are better alternatives available in market.",
        "Product is fine but packaging could have been better for protection.",
        "It does what it's supposed to do. Average product for the price.",
        "The product is acceptable but I expected more for this price range.",
        "Average product quality. Works fine but nothing to write home about.",
        "It's a decent product but customer service could be more responsive.",
    ]
    negative_templates = [
        "Terrible product! Complete waste of money. Would not recommend at all.",
        "Poor quality and doesn't work properly. Very disappointed with this purchase.",
        "Awful product with cheap materials. Broke within a week of usage.",
        "Completely useless product. Nothing works as advertised. Avoid this seller.",
        "Very poor quality and terrible customer service. Regret buying this completely.",
        "Product is garbage. Cheap build quality and doesn't function properly at all.",
        "Worst purchase ever made. Product broke immediately after first use.",
        "Terrible quality and slow delivery. Product doesn't match the description given.",
        "Completely disappointed with this product. Poor quality and overpriced for features.",
        "Horrible product with many defects. Customer service is also very unhelpful.",
    ]

    # Repetition counts give the intended class imbalance.
    positive_reviews = positive_templates * 200  # 2000 samples
    neutral_reviews = neutral_templates * 100    # 1000 samples
    negative_reviews = negative_templates * 150  # 1500 samples

    reviews = positive_reviews + neutral_reviews + negative_reviews
    labels = (
        ['Positive'] * len(positive_reviews)
        + ['Neutral'] * len(neutral_reviews)
        + ['Negative'] * len(negative_reviews)
    )

    data = pd.DataFrame({'review': reviews, 'sentiment': labels})
    # Seed the shuffle: an unseeded sample() made the row order differ from
    # run to run while every later step was already pinned to a fixed seed.
    shuffled = data.sample(frac=1, random_state=random_state)
    return shuffled.reset_index(drop=True)

# Load dataset
# The data is generated in memory by create_product_review_dataset();
# nothing is read from disk.
print("Creating Product Review dataset...")
review_data = create_product_review_dataset()
print(f"Dataset created with {len(review_data)} reviews")
print(f"Sentiment distribution:\n{review_data['sentiment'].value_counts()}")

# Step 2: Text Preprocessing (Tokenization, Stopword Removal, Stemming)
# Both objects are module-level globals read by preprocess_review().
# The stopword list is wrapped in a set because it is used for membership
# tests on every token.
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def preprocess_review(text):
    """Normalise one raw review into a space-separated string of stems.

    The text is lower-cased, every character that is neither an ASCII letter
    nor whitespace is deleted (not replaced by a space), whitespace runs are
    collapsed, and the result is split on whitespace. Tokens of two characters
    or fewer and tokens found in the module-level ``stop_words`` set are
    discarded; the remaining tokens are stemmed with the module-level
    ``stemmer``.

    Args:
        text: Raw review text (a ``str``).

    Returns:
        The surviving stems joined by single spaces; an empty string when no
        token survives.
    """
    letters_only = re.sub(r'[^a-zA-Z\s]', '', text.lower())
    normalised = re.sub(r'\s+', ' ', letters_only).strip()

    stems = []
    for word in normalised.split():
        # Skip very short tokens and stopwords before stemming.
        if len(word) <= 2 or word in stop_words:
            continue
        stems.append(stemmer.stem(word))
    return ' '.join(stems)

# Apply the cleaning function to every raw review and keep the result in a
# new column next to the original text.
review_data['processed_review'] = review_data['review'].apply(preprocess_review)
print("Text preprocessing completed!")

# Step 3: Convert to TF-IDF Features
X = review_data['processed_review']
y = review_data['sentiment']

# Split the data
# 70/30 split, stratified on the label so each subset keeps the class ratio.
# NOTE(review): the dataset is built by repeating 30 template sentences, so
# identical texts end up in both the training and the test subset; the scores
# reported later presumably reflect memorisation rather than generalisation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

print(f"Training set: {len(X_train)}, Test set: {len(X_test)}")

# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer(
    max_features=10000,   # cap on vocabulary size
    ngram_range=(1, 2),   # unigrams and bigrams
    min_df=2,             # ignore terms seen in fewer than 2 documents
    max_df=0.95           # ignore terms seen in more than 95% of documents
)

# The vocabulary and IDF weights are fitted on the training texts only; the
# test texts are merely transformed with that fitted vocabulary.
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

print(f"TF-IDF features: {X_train_tfidf.shape[1]}")

# Step 4: Train Classifiers
# Both models are fitted on the training TF-IDF matrix and then used to
# predict labels for the held-out test matrix.
print("\nTraining models...")

# Logistic Regression
# `multi_class` is deliberately not passed: the parameter is deprecated since
# scikit-learn 1.5 and scheduled for removal, and the deprecation warning is
# hidden here by the global warnings filter. With the default setting the
# model is multinomial and `coef_` still has one row per class.
lr_model = LogisticRegression(random_state=42, max_iter=1000)
lr_model.fit(X_train_tfidf, y_train)
lr_predictions = lr_model.predict(X_test_tfidf)

# SVM Classifier
# Linear kernel, suited to the sparse, high-dimensional TF-IDF features.
svm_model = SVC(random_state=42, kernel='linear')
svm_model.fit(X_train_tfidf, y_train)
svm_predictions = svm_model.predict(X_test_tfidf)

# Step 5: Evaluation
# Everything below compares test-set predictions against y_test.
print("\n" + "="*60)
print("MODEL EVALUATION RESULTS")
print("="*60)

# Macro-averaged F1 scores
# average='macro' is the unweighted mean of the per-class F1 scores, so the
# smaller classes count as much as the largest one.
lr_macro_f1 = f1_score(y_test, lr_predictions, average='macro')
svm_macro_f1 = f1_score(y_test, svm_predictions, average='macro')

print(f"Logistic Regression Macro-averaged F1 Score: {lr_macro_f1:.4f}")
print(f"SVM Macro-averaged F1 Score: {svm_macro_f1:.4f}")

# Classification Reports
# Per-class precision / recall / F1 / support tables for each model.
print("\nLogistic Regression Classification Report:")
print(classification_report(y_test, lr_predictions))

print("SVM Classification Report:")
print(classification_report(y_test, svm_predictions))

# Calculate class-wise accuracy for bar chart
def calculate_class_wise_accuracy(y_true, y_pred):
    """Return, for each true class, the fraction of its samples predicted correctly.

    For every distinct label in ``y_true`` the samples carrying that true
    label are selected and the share of them whose prediction equals the
    label is computed (i.e. the per-class recall).

    Both inputs are converted to NumPy arrays and matched by position, so any
    array-like works (list, ndarray, pandas Series) and pandas index labels
    are ignored.

    Args:
        y_true: 1-D array-like of ground-truth labels.
        y_pred: 1-D array-like of predicted labels, same length as ``y_true``.

    Returns:
        dict mapping each class label (in sorted order) to a ``float`` in
        ``[0, 1]``. Empty input yields an empty dict.

    Raises:
        ValueError: If ``y_true`` and ``y_pred`` differ in shape.
    """
    true_labels = np.asarray(y_true)
    pred_labels = np.asarray(y_pred)
    if true_labels.shape != pred_labels.shape:
        raise ValueError(
            f"y_true and y_pred must have the same shape, "
            f"got {true_labels.shape} and {pred_labels.shape}"
        )

    class_accuracies = {}
    # np.unique returns the distinct labels already sorted; tolist() turns
    # them into plain Python objects so the dict keys are ordinary str/int.
    for cls in np.unique(true_labels).tolist():
        # Positional mask of the samples whose true label is this class.
        class_mask = true_labels == cls
        # Within that subset every true label equals cls, so accuracy is the
        # share of predictions equal to cls.
        class_accuracies[cls] = float(np.mean(pred_labels[class_mask] == cls))

    return class_accuracies

# Per-class accuracy of each model on the test set (fraction of each true
# class that was predicted correctly).
lr_class_acc = calculate_class_wise_accuracy(y_test, lr_predictions)
svm_class_acc = calculate_class_wise_accuracy(y_test, svm_predictions)

# Create Bar Chart for Class-wise Accuracy Comparison
plt.figure(figsize=(12, 8))

# Fixed display order, shared by the bar chart and the confusion matrix.
classes = ['Negative', 'Neutral', 'Positive']
lr_accuracies = [lr_class_acc[cls] for cls in classes]
svm_accuracies = [svm_class_acc[cls] for cls in classes]

x = np.arange(len(classes))
width = 0.35

# Top panel: grouped bars, one pair (LR, SVM) per class.
plt.subplot(2, 1, 1)
bars1 = plt.bar(x - width/2, lr_accuracies, width, label='Logistic Regression',
                color='skyblue', alpha=0.8)
bars2 = plt.bar(x + width/2, svm_accuracies, width, label='SVM',
                color='lightcoral', alpha=0.8)

plt.xlabel('Sentiment Classes')
plt.ylabel('Class-wise Accuracy')
plt.title('Class-wise Accuracy Comparison - Product Review Classification')
plt.xticks(x, classes)
plt.legend()
plt.ylim(0, 1.1)  # headroom above 1.0 for the value labels

# Add value labels on bars
for i, (lr_acc, svm_acc) in enumerate(zip(lr_accuracies, svm_accuracies)):
    plt.text(i - width/2, lr_acc + 0.02, f'{lr_acc:.3f}', ha='center', va='bottom')
    plt.text(i + width/2, svm_acc + 0.02, f'{svm_acc:.3f}', ha='center', va='bottom')

# Confusion Matrix for Best Model
# Bottom panel. The model with the strictly higher macro F1 wins; on a tie
# the SVM is reported.
plt.subplot(2, 1, 2)
best_model = "Logistic Regression" if lr_macro_f1 > svm_macro_f1 else "SVM"
best_predictions = lr_predictions if lr_macro_f1 > svm_macro_f1 else svm_predictions

# Pass labels explicitly so the matrix rows/columns are guaranteed to follow
# the same order as the tick labels, instead of relying on the default sorted
# label order happening to match the hard-coded `classes` list.
cm = confusion_matrix(y_test, best_predictions, labels=classes)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
           xticklabels=classes, yticklabels=classes)
plt.title(f'Confusion Matrix - {best_model} (Best Model)')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')

plt.tight_layout()
# Save before show(): the figure is written to the working directory.
plt.savefig('product_review_classification_results.png', dpi=300, bbox_inches='tight')
plt.show()

# Summary Statistics Table
# One row per model: macro F1, the three per-class accuracies computed
# earlier, and the overall test-set accuracy.
results_summary = pd.DataFrame({
    'Model': ['Logistic Regression', 'SVM'],
    'Macro_F1_Score': [lr_macro_f1, svm_macro_f1],
    'Negative_Accuracy': [lr_class_acc['Negative'], svm_class_acc['Negative']],
    'Neutral_Accuracy': [lr_class_acc['Neutral'], svm_class_acc['Neutral']],
    'Positive_Accuracy': [lr_class_acc['Positive'], svm_class_acc['Positive']],
    'Overall_Accuracy': [accuracy_score(y_test, lr_predictions),
                        accuracy_score(y_test, svm_predictions)]
})

print("\n" + "="*80)
print("FINAL RESULTS SUMMARY")
print("="*80)
# Rounding applies to the printed copy only; results_summary keeps full
# precision.
print(results_summary.round(4))

# Feature Importance Analysis (for Logistic Regression)
print("\n" + "="*50)
print("TOP FEATURES BY CLASS (Logistic Regression)")
print("="*50)

# Vocabulary terms, positionally aligned with the columns of the TF-IDF
# matrix and therefore with the columns of lr_model.coef_.
feature_names = tfidf_vectorizer.get_feature_names_out()
# NOTE: this rebinds the module-level name `classes` that the plotting code
# above also assigns.
classes = lr_model.classes_

# Row i of coef_ is read as the weight vector for classes[i].
for i, class_name in enumerate(classes):
    coefficients = lr_model.coef_[i]
    # argsort is ascending: take the last ten indices, then reverse so the
    # largest coefficient is printed first.
    top_indices = coefficients.argsort()[-10:][::-1]  # Top 10 features

    print(f"\nTop features for {class_name}:")
    for idx in top_indices:
        print(f"  {feature_names[idx]}: {coefficients[idx]:.4f}")

# Save results
# Writes the unrounded summary table to the working directory, without the
# DataFrame index column.
results_summary.to_csv('product_review_classification_results.csv', index=False)
print(f"\nResults saved to 'product_review_classification_results.csv'")
# Closing recap: dataset size, feature count, and the winning model with its
# macro F1 score.
print(f"\nDataset Details:")
print(f"- Total Reviews: {len(review_data)}")
print(f"- Features Generated: {X_train_tfidf.shape[1]} TF-IDF features")
print(f"- Best Model: {best_model} (Macro F1: {max(lr_macro_f1, svm_macro_f1):.4f})")
