In [None]:
# ========== PRODUCT RECOMMENDATION ENGINE - GOOGLE COLAB ==========
# 🔗 Click "Copy to Drive" to save your own copy

# ==================== SETUP ====================
!pip install scikit-learn surprise tensorflow
!pip install pandas numpy matplotlib seaborn
!pip install fastapi uvicorn -q

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Dense, Flatten, Concatenate, Dropout
from tensorflow.keras.optimizers import Adam
import warnings
warnings.filterwarnings('ignore')
from google.colab import drive, files

# Mount Google Drive at /content/drive via the google.colab `drive` helper
# imported above, so notebook code can read and write files under that path.
drive.mount('/content/drive')

# ==================== RECOMMENDATION SYSTEM ====================
class RecommendationEngine:
    """Hybrid recommendation engine with multiple algorithms"""

    def __init__(self):
        """Initialise every attribute of the engine to its empty starting value."""
        # Interaction matrix and the two similarity tables start unset.
        self.user_item_matrix = self.item_similarity = self.user_similarity = None
        # Slots for the random-forest and neural models start unset.
        self.rf_model = self.neural_model = None
        # Feature scaler plus empty containers for encoders and metrics.
        self.scaler = StandardScaler()
        self.label_encoders = dict()
        self.metrics = dict()

    def generate_sample_data(self, n_users=1000, n_items=500, n_interactions=50000):
        """Generate sample e-commerce data.

        Seeds NumPy's global RNG with 42, then builds three DataFrames:

        * ``users``: ``user_id`` (1..n_users), ``age``, ``gender``,
          ``location``, ``income``.
        * ``items``: ``item_id`` (1..n_items), ``category``, ``price``,
          ``rating``, ``popularity``.
        * ``interactions_df``: ``user_id``, ``item_id``, ``rating`` (1-5,
          drawn with fixed probabilities) and ``timestamp`` (a day in 2023),
          de-duplicated on the ``(user_id, item_id)`` pair.

        Args:
            n_users: Number of user rows to create.
            n_items: Number of item rows to create.
            n_interactions: Number of interactions drawn before duplicates
                are removed.

        NOTE(review): the tail of this method is corrupted (see the note
        further down), so its intended return value cannot be determined
        from this file — TODO confirm against the original notebook.
        """
        print("🔄 Generating sample e-commerce data...")

        # Fixed seed so repeated runs produce the same sample.
        np.random.seed(42)

        # Users data
        users = pd.DataFrame({
            'user_id': range(1, n_users + 1),
            'age': np.random.randint(18, 70, n_users),
            'gender': np.random.choice(['M', 'F'], n_users),
            'location': np.random.choice(['Urban', 'Suburban', 'Rural'], n_users),
            'income': np.random.uniform(30000, 150000, n_users)
        })

        # Items data
        categories = ['Electronics', 'Clothing', 'Books', 'Home', 'Sports', 'Beauty']
        items = pd.DataFrame({
            'item_id': range(1, n_items + 1),
            'category': np.random.choice(categories, n_items),
            'price': np.random.uniform(10, 500, n_items),
            'rating': np.random.uniform(3.0, 5.0, n_items),
            'popularity': np.random.randint(1, 100, n_items)
        })

        # Interactions data: one random (user, item, rating, day) per iteration.
        interactions = []
        for _ in range(n_interactions):
            user_id = np.random.randint(1, n_users + 1)
            item_id = np.random.randint(1, n_items + 1)
            rating = np.random.choice([1, 2, 3, 4, 5], p=[0.05, 0.1, 0.3, 0.4, 0.15])
            timestamp = pd.Timestamp('2023-01-01') + pd.Timedelta(days=np.random.randint(0, 365))

            interactions.append({
                'user_id': user_id,
                'item_id': item_id,
                'rating': rating,
                'timestamp': timestamp
            })

        interactions_df = pd.DataFrame(interactions)

        # Remove duplicates
        interactions_df = interactions_df.drop_duplicates(subset=['user_id', 'item_id'])

        print(f"✅ Generated data:")
        print(f"   • {len(users)} users")
        # NOTE(review): the next line is not valid Python — a print call is
        # followed by `= np.random.choice(...)`. It looks like two different
        # sources were spliced together here: everything from this point on
        # uses `df`, `n_rows` and `null_indices`, none of which are defined in
        # this method, and the "# 3." .. "# 6." step comments have no visible
        # steps 1-2. TODO: restore the missing end of this method and the
        # missing start of the following one from the original notebooks.
        print(f"   • {len(items)} items") = np.random.choice(df.index, size=int(n_rows * 0.1), replace=False)
        for idx in null_indices:
            col = np.random.choice(['name', 'email', 'age', 'salary'])
            df.loc[idx, col] = None

        # 3. Invalid email format (5%)
        invalid_email_indices = np.random.choice(df.index, size=int(n_rows * 0.05), replace=False)
        df.loc[invalid_email_indices, 'email'] = 'invalid_email_format'

        # 4. Outliers in salary (3%)
        outlier_indices = np.random.choice(df.index, size=int(n_rows * 0.03), replace=False)
        df.loc[outlier_indices, 'salary'] = np.random.uniform(1000000, 5000000, len(outlier_indices))

        # 5. Negative ages (2%)
        neg_age_indices = np.random.choice(df.index, size=int(n_rows * 0.02), replace=False)
        df.loc[neg_age_indices, 'age'] = np.random.randint(-10, 0, len(neg_age_indices))

        # 6. Inconsistent product categories (5%)
        inconsistent_indices = np.random.choice(df.index, size=int(n_rows * 0.05), replace=False)
        df.loc[inconsistent_indices, 'product_category'] = np.random.choice(['XYZ', 'ABC', '123'], len(inconsistent_indices))

        # Shuffle the data
        df = df.sample(frac=1).reset_index(drop=True)

        print(f"✅ Generated {len(df)} rows with intentional quality issues")
        return df

    def check_completeness(self, df):
        """Report missing values per column, scoring each column as one rule.

        A column fails its rule when more than 5% of its values are null and
        passes otherwise. Passing columns are always counted, including when
        other columns fail.

        Args:
            df: pandas DataFrame to inspect.

        Returns:
            DataFrame with one row per column of ``df`` and the columns
            ``Column``, ``Missing_Count`` and ``Missing_Percentage``.

        Side effects:
            Prints the summary, appends one message per failing column to
            ``self.issues_found`` and updates ``self.rules_failed`` and
            ``self.rules_passed``.
        """
        print("🔍 Checking data completeness...")

        missing_data = df.isnull().sum()
        # Clamp the denominator so an empty frame reports 0% instead of NaN
        # (every count is 0 in that case, so the result is exact).
        missing_percentage = (missing_data / max(len(df), 1)) * 100

        completeness_report = pd.DataFrame({
            'Column': df.columns,
            'Missing_Count': missing_data.values,
            'Missing_Percentage': missing_percentage.values
        })

        print("Missing Data Summary:")
        print(completeness_report)

        # Flag columns with >5% missing data
        high_missing = completeness_report[completeness_report['Missing_Percentage'] > 5]
        self.issues_found.extend(
            f"High missing data in {column}: {percentage:.1f}%"
            for column, percentage in zip(
                high_missing['Column'], high_missing['Missing_Percentage']
            )
        )

        # Every column is one rule: it either fails or passes.
        failed = len(high_missing)
        self.rules_failed += failed
        self.rules_passed += len(completeness_report) - failed

        return completeness_report

    def check_uniqueness(self, df):
        """Count fully duplicated rows and score the result as one rule.

        Args:
            df: pandas DataFrame to inspect.

        Returns:
            Tuple ``(duplicate_count, duplicate_percentage)``. The percentage
            is relative to ``len(df)`` and is ``0.0`` for an empty frame.

        Side effects:
            Prints the result. When duplicates exist, appends a message to
            ``self.issues_found`` and increments ``self.rules_failed``;
            otherwise increments ``self.rules_passed``.
        """
        print("🔍 Checking data uniqueness...")

        duplicate_count = df.duplicated().sum()
        total_rows = len(df)
        # Guard the empty frame: 0/0 would otherwise yield NaN.
        duplicate_percentage = (duplicate_count / total_rows) * 100 if total_rows else 0.0

        print(f"Duplicate rows found: {duplicate_count} ({duplicate_percentage:.2f}%)")

        if duplicate_count > 0:
            self.issues_found.append(f"Duplicate rows: {duplicate_count} ({duplicate_percentage:.2f}%)")
            self.rules_failed += 1
        else:
            self.rules_passed += 1

        return duplicate_count, duplicate_percentage

    def check_validity(self, df):
        """Validate value formats and ranges, scoring each check as one rule.

        Four checks run, in this order:

        1. ``email`` matches a basic address pattern (null or non-matching
           values count as invalid).
        2. ``age`` lies within 0..120.
        3. ``salary`` lies within 0..10,000,000.
        4. ``product_category`` is one of Electronics, Clothing, Books, Home.

        Each check fails when at least one row violates it. Passing checks
        are always counted, including when other checks fail.

        Args:
            df: pandas DataFrame with ``email``, ``age``, ``salary`` and
                ``product_category`` columns.

        Returns:
            List of issue messages, one per failed check (empty when all pass).

        Side effects:
            Extends ``self.issues_found`` with the messages and updates
            ``self.rules_failed`` and ``self.rules_passed``.
        """
        print("🔍 Checking data validity...")

        email_pattern = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$'
        valid_categories = ['Electronics', 'Clothing', 'Books', 'Home']

        # One (message label, mask of offending rows) pair per rule.
        checks = [
            ("Invalid email format", ~df['email'].str.match(email_pattern, na=False)),
            ("Invalid ages", (df['age'] < 0) | (df['age'] > 120)),
            ("Invalid salaries", (df['salary'] < 0) | (df['salary'] > 10000000)),
            ("Invalid product categories", ~df['product_category'].isin(valid_categories)),
        ]

        validity_issues = []
        for label, offending_rows in checks:
            offending_count = offending_rows.sum()
            if offending_count > 0:
                validity_issues.append(f"{label}: {offending_count}")

        self.issues_found.extend(validity_issues)
        # Every check is one rule: it either fails or passes.
        self.rules_failed += len(validity_issues)
        self.rules_passed += len(checks) - len(validity_issues)

        return validity_issues

    def check_consistency(self, df):
        """Check cross-column consistency, scoring each check as one rule.

        Two checks run:

        1. Rows with ``age`` below 25 and ``salary`` above 200,000; fails
           when at least one such row exists.
        2. Rows with ``purchase_amount`` above 500 and ``quantity`` equal
           to 1; fails only when they exceed 10% of all rows.

        Passing checks are always counted, including when the other fails.

        Args:
            df: pandas DataFrame with ``age``, ``salary``,
                ``purchase_amount`` and ``quantity`` columns.

        Returns:
            List of issue messages, one per failed check (empty when both pass).

        Side effects:
            Extends ``self.issues_found`` with the messages and updates
            ``self.rules_failed`` and ``self.rules_passed``.
        """
        print("🔍 Checking data consistency...")

        total_rules = 2
        consistency_issues = []

        # Age vs salary consistency (young people with very high salaries)
        young_high_salary = int(((df['age'] < 25) & (df['salary'] > 200000)).sum())
        if young_high_salary > 0:
            consistency_issues.append(f"Young people with high salaries: {young_high_salary}")

        # Purchase amount vs quantity consistency; tolerated up to 10% of rows.
        high_amount_low_quantity = int(
            ((df['purchase_amount'] > 500) & (df['quantity'] == 1)).sum()
        )
        if high_amount_low_quantity > len(df) * 0.1:
            consistency_issues.append(
                f"High amount purchases with single quantity: {high_amount_low_quantity}"
            )

        self.issues_found.extend(consistency_issues)
        # Every check is one rule: it either fails or passes.
        self.rules_failed += len(consistency_issues)
        self.rules_passed += total_rules - len(consistency_issues)

        return consistency_issues

    def detect_outliers(self, df):
        """Detect outliers with the 1.5 * IQR rule, one rule per checked column.

        The columns ``age``, ``salary``, ``purchase_amount`` and ``quantity``
        are checked when present in ``df``. A value is an outlier when it
        lies below ``Q1 - 1.5 * IQR`` or above ``Q3 + 1.5 * IQR``. Columns
        absent from ``df`` are skipped and count as neither passed nor failed.

        Args:
            df: pandas DataFrame to inspect.

        Returns:
            Dict mapping each column that has outliers to a dict with
            ``count``, ``percentage``, ``lower_bound`` and ``upper_bound``.

        Side effects:
            Appends one message per column with outliers to
            ``self.issues_found`` and updates ``self.rules_failed`` and
            ``self.rules_passed``.
        """
        print("🔍 Detecting outliers...")

        outlier_report = {}

        # Numerical columns to check
        numerical_cols = ['age', 'salary', 'purchase_amount', 'quantity']
        checked_cols = [col for col in numerical_cols if col in df.columns]

        for col in checked_cols:
            values = df[col]
            # IQR method
            q1 = values.quantile(0.25)
            q3 = values.quantile(0.75)
            iqr = q3 - q1
            lower_bound = q1 - 1.5 * iqr
            upper_bound = q3 + 1.5 * iqr

            outlier_count = int(((values < lower_bound) | (values > upper_bound)).sum())
            if outlier_count > 0:
                outlier_report[col] = {
                    'count': outlier_count,
                    'percentage': (outlier_count / len(df)) * 100,
                    'lower_bound': lower_bound,
                    'upper_bound': upper_bound
                }

        self.issues_found.extend(
            f"Outliers in {col}: {info['count']} ({info['percentage']:.2f}%)"
            for col, info in outlier_report.items()
        )
        # Only columns that were actually checked count as rules.
        self.rules_failed += len(outlier_report)
        self.rules_passed += len(checked_cols) - len(outlier_report)

        return outlier_report

    def calculate_quality_score(self):
        """Calculate overall data quality score"""
        total_rules = self.rules