In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, f1_score
from sklearn.multioutput import MultiOutputClassifier
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import random
import warnings
warnings.filterwarnings('ignore')

In [None]:
class EnhancedMobileUserCategorizer:
    def __init__(self):
        # Enhanced categories with clear definitions
        self.categories = {
            'Professional/Business User': {
                'definition': 'Users who primarily use phones for work, productivity, and business communications',
                'characteristics': ['High productivity app usage', 'Long battery life priority', 'Premium build quality']
            },
            'Student/Academic User': {
                'definition': 'Students and academic users focusing on learning, research, and social connectivity',
                'characteristics': ['Educational app usage', 'Budget-conscious', 'Social media engagement']
            },
            'Mobile Gamer': {
                'definition': 'Users who prioritize gaming performance and spend significant time playing mobile games',
                'characteristics': ['High performance requirements', 'Gaming-focused features', 'Fast refresh rates']
            },
            'Content Creator': {
                'definition': 'Users who create digital content including photos, videos, and social media posts',
                'characteristics': ['Superior camera quality', 'Video editing capabilities', 'Storage requirements']
            },
            'Entertainment Enthusiast': {
                'definition': 'Users focused on media consumption including videos, music, and streaming content',
                'characteristics': ['Media consumption priority', 'Display and audio quality', 'Streaming optimization']
            },
            'Tech Enthusiast': {
                'definition': 'Early adopters who want cutting-edge technology and latest features',
                'characteristics': ['Latest specifications', 'Innovation adoption', 'Customization options']
            },
            'Budget-Conscious User': {
                'definition': 'Users seeking essential functionality at affordable prices with good value',
                'characteristics': ['Price-sensitive', 'Essential features focus', 'Long-term reliability']
            },
            'Senior/Accessibility User': {
                'definition': 'Users who prioritize ease of use, accessibility, and simple interfaces',
                'characteristics': ['Simple interface needs', 'Accessibility features', 'Reliability focus']
            },
            'Social Media Influencer': {
                'definition': 'Users who actively create and share content across multiple social platforms',
                'characteristics': ['Multi-platform sharing', 'High-quality cameras', 'Aesthetic design preference']
            }
        }
        
        # Enhanced question set (exactly 10 questions) with better categorization power
        self.questions = {
            'Budget Range': {
                'question': 'What is your budget range for a mobile phone?',
                'options': ['₹8k-₹15k', '₹15k-₹25k', '₹25k-₹40k', '₹40k-₹60k', '₹60k+'],
                'rationale': 'Budget is a primary differentiator between categories like Budget-Conscious vs Tech Enthusiast'
            },
            'Primary Usage': {
                'question': 'What is your primary phone usage pattern?',
                'options': ['Work & Business', 'Gaming & Entertainment', 'Content Creation', 'Social Media & Communication', 'Basic Communication'],
                'rationale': 'Directly identifies the main use case and primary category alignment'
            },
            'Daily Screen Time': {
                'question': 'How much time do you spend on your phone daily?',
                'options': ['1-2 hours', '2-4 hours', '4-6 hours', '6-8 hours', '8+ hours'],
                'rationale': 'Heavy users have different needs than light users, affects battery and performance requirements'
            },
            'App Categories Used Most': {
                'question': 'Which app categories do you use most frequently?',
                'options': ['Productivity & Work', 'Games & Entertainment', 'Photo & Video Editing', 'Social Media & Messaging', 'Basic Apps Only'],
                'rationale': 'App usage patterns strongly correlate with user categories and feature requirements'
            },
            'Performance Priority': {
                'question': 'How important is high performance (speed, multitasking) to you?',
                'options': ['Critical - Need flagship performance', 'Very Important - Above average performance', 'Moderately Important', 'Not Very Important', 'Don\'t Care - Basic is fine'],
                'rationale': 'Separates performance-demanding users (Gamers, Professionals) from basic users'
            },
            'Camera Importance': {
                'question': 'How important is camera quality and features to you?',
                'options': ['Professional Level - Multiple lenses, Pro modes', 'Very Important - High quality photos/videos', 'Important - Good everyday photos', 'Moderate - Casual photography', 'Basic - Just functional'],
                'rationale': 'Critical for identifying Content Creators and Social Media Influencers vs other categories'
            },
            'Battery & Charging Needs': {
                'question': 'What are your battery and charging requirements?',
                'options': ['2+ days battery, wireless charging', 'Full day heavy use, fast charging', 'Full day moderate use', 'Half day is sufficient', 'Few hours is enough'],
                'rationale': 'Heavy users and professionals need different battery solutions than casual users'
            },
            'Feature Preferences': {
                'question': 'Which features matter most to you?',
                'options': ['Latest tech & innovation', 'Gaming features (high refresh, cooling)', 'Content tools (pro camera, editing)', 'Simple & accessible interface', 'Value for money features'],
                'rationale': 'Directly maps to specific user categories and their unique requirements'
            },
            'Design & Build Priority': {
                'question': 'What matters most in phone design and build?',
                'options': ['Premium materials & flagship design', 'Gaming aesthetics & performance build', 'Sleek design for content creation', 'Simple, easy-to-use design', 'Practical & durable build'],
                'rationale': 'Different categories have distinct design preferences and priorities'
            },
            'Purchase Decision Factor': {
                'question': 'What is the most important factor in your phone buying decision?',
                'options': ['Cutting-edge technology & specs', 'Gaming performance & features', 'Camera quality & content tools', 'Ease of use & reliability', 'Best value for money'],
                'rationale': 'Final decision factors reveal core user priorities and category alignment'
            }
        }
        
        self.category_names = list(self.categories.keys())
        self.label_encoders = {}
        self.model = None
        self.feature_importance = None
        
    def generate_enhanced_synthetic_data(self, n_samples=3000):
        """Generate enhanced synthetic data with better category representation"""
        data = []
        
        # Enhanced profiles with more nuanced patterns
        profiles = {
            'Professional/Business User': {
                'Budget Range': ['₹25k-₹40k', '₹40k-₹60k', '₹60k+'],
                'Primary Usage': ['Work & Business'],
                'Daily Screen Time': ['2-4 hours', '4-6 hours'],
                'App Categories Used Most': ['Productivity & Work'],
                'Performance Priority': ['Critical - Need flagship performance', 'Very Important - Above average performance'],
                'Camera Importance': ['Important - Good everyday photos', 'Very Important - High quality photos/videos'],
                'Battery & Charging Needs': ['2+ days battery, wireless charging', 'Full day heavy use, fast charging'],
                'Feature Preferences': ['Latest tech & innovation', 'Value for money features'],
                'Design & Build Priority': ['Premium materials & flagship design', 'Practical & durable build'],
                'Purchase Decision Factor': ['Cutting-edge technology & specs', 'Ease of use & reliability']
            },
            'Student/Academic User': {
                'Budget Range': ['₹8k-₹15k', '₹15k-₹25k'],
                'Primary Usage': ['Social Media & Communication', 'Basic Communication'],
                'Daily Screen Time': ['4-6 hours', '6-8 hours'],
                'App Categories Used Most': ['Social Media & Messaging', 'Basic Apps Only'],
                'Performance Priority': ['Moderately Important', 'Not Very Important'],
                'Camera Importance': ['Important - Good everyday photos', 'Moderate - Casual photography'],
                'Battery & Charging Needs': ['Full day moderate use', 'Full day heavy use, fast charging'],
                'Feature Preferences': ['Value for money features'],
                'Design & Build Priority': ['Practical & durable build', 'Sleek design for content creation'],
                'Purchase Decision Factor': ['Best value for money']
            },
            'Mobile Gamer': {
                'Budget Range': ['₹25k-₹40k', '₹40k-₹60k', '₹60k+'],
                'Primary Usage': ['Gaming & Entertainment'],
                'Daily Screen Time': ['6-8 hours', '8+ hours'],
                'App Categories Used Most': ['Games & Entertainment'],
                'Performance Priority': ['Critical - Need flagship performance'],
                'Camera Importance': ['Moderate - Casual photography', 'Basic - Just functional'],
                'Battery & Charging Needs': ['Full day heavy use, fast charging'],
                'Feature Preferences': ['Gaming features (high refresh, cooling)'],
                'Design & Build Priority': ['Gaming aesthetics & performance build'],
                'Purchase Decision Factor': ['Gaming performance & features']
            },
            'Content Creator': {
                'Budget Range': ['₹25k-₹40k', '₹40k-₹60k', '₹60k+'],
                'Primary Usage': ['Content Creation'],
                'Daily Screen Time': ['4-6 hours', '6-8 hours', '8+ hours'],
                'App Categories Used Most': ['Photo & Video Editing'],
                'Performance Priority': ['Critical - Need flagship performance', 'Very Important - Above average performance'],
                'Camera Importance': ['Professional Level - Multiple lenses, Pro modes', 'Very Important - High quality photos/videos'],
                'Battery & Charging Needs': ['Full day heavy use, fast charging', '2+ days battery, wireless charging'],
                'Feature Preferences': ['Content tools (pro camera, editing)'],
                'Design & Build Priority': ['Sleek design for content creation', 'Premium materials & flagship design'],
                'Purchase Decision Factor': ['Camera quality & content tools']
            },
            'Entertainment Enthusiast': {
                'Budget Range': ['₹15k-₹25k', '₹25k-₹40k'],
                'Primary Usage': ['Gaming & Entertainment', 'Social Media & Communication'],
                'Daily Screen Time': ['6-8 hours', '8+ hours'],
                'App Categories Used Most': ['Games & Entertainment', 'Social Media & Messaging'],
                'Performance Priority': ['Very Important - Above average performance', 'Moderately Important'],
                'Camera Importance': ['Important - Good everyday photos', 'Moderate - Casual photography'],
                'Battery & Charging Needs': ['Full day heavy use, fast charging', 'Full day moderate use'],
                'Feature Preferences': ['Value for money features'],
                'Design & Build Priority': ['Sleek design for content creation', 'Practical & durable build'],
                'Purchase Decision Factor': ['Best value for money']
            },
            'Tech Enthusiast': {
                'Budget Range': ['₹40k-₹60k', '₹60k+'],
                'Primary Usage': ['Work & Business', 'Gaming & Entertainment'],
                'Daily Screen Time': ['2-4 hours', '4-6 hours', '6-8 hours'],
                'App Categories Used Most': ['Productivity & Work', 'Games & Entertainment'],
                'Performance Priority': ['Critical - Need flagship performance'],
                'Camera Importance': ['Professional Level - Multiple lenses, Pro modes', 'Very Important - High quality photos/videos'],
                'Battery & Charging Needs': ['2+ days battery, wireless charging', 'Full day heavy use, fast charging'],
                'Feature Preferences': ['Latest tech & innovation'],
                'Design & Build Priority': ['Premium materials & flagship design'],
                'Purchase Decision Factor': ['Cutting-edge technology & specs']
            },
            'Budget-Conscious User': {
                'Budget Range': ['₹8k-₹15k'],
                'Primary Usage': ['Basic Communication', 'Social Media & Communication'],
                'Daily Screen Time': ['1-2 hours', '2-4 hours'],
                'App Categories Used Most': ['Basic Apps Only', 'Social Media & Messaging'],
                'Performance Priority': ['Not Very Important', 'Don\'t Care - Basic is fine'],
                'Camera Importance': ['Basic - Just functional', 'Moderate - Casual photography'],
                'Battery & Charging Needs': ['Full day moderate use', 'Half day is sufficient'],
                'Feature Preferences': ['Value for money features'],
                'Design & Build Priority': ['Practical & durable build'],
                'Purchase Decision Factor': ['Best value for money']
            },
            'Senior/Accessibility User': {
                'Budget Range': ['₹8k-₹15k', '₹15k-₹25k'],
                'Primary Usage': ['Basic Communication'],
                'Daily Screen Time': ['1-2 hours', '2-4 hours'],
                'App Categories Used Most': ['Basic Apps Only'],
                'Performance Priority': ['Not Very Important', 'Don\'t Care - Basic is fine'],
                'Camera Importance': ['Basic - Just functional', 'Moderate - Casual photography'],
                'Battery & Charging Needs': ['2+ days battery, wireless charging', 'Full day moderate use'],
                'Feature Preferences': ['Simple & accessible interface'],
                'Design & Build Priority': ['Simple, easy-to-use design'],
                'Purchase Decision Factor': ['Ease of use & reliability']
            },
            'Social Media Influencer': {
                'Budget Range': ['₹25k-₹40k', '₹40k-₹60k'],
                'Primary Usage': ['Content Creation', 'Social Media & Communication'],
                'Daily Screen Time': ['6-8 hours', '8+ hours'],
                'App Categories Used Most': ['Photo & Video Editing', 'Social Media & Messaging'],
                'Performance Priority': ['Very Important - Above average performance', 'Critical - Need flagship performance'],
                'Camera Importance': ['Professional Level - Multiple lenses, Pro modes', 'Very Important - High quality photos/videos'],
                'Battery & Charging Needs': ['Full day heavy use, fast charging'],
                'Feature Preferences': ['Content tools (pro camera, editing)'],
                'Design & Build Priority': ['Sleek design for content creation', 'Premium materials & flagship design'],
                'Purchase Decision Factor': ['Camera quality & content tools']
            }
        }
        
        # Generate samples with more realistic distribution
        samples_per_category = n_samples // len(self.category_names)
        
        for category in self.category_names:
            for _ in range(samples_per_category):
                sample = {}
                profile = profiles[category]
                
                for question_key, question_info in self.questions.items():
                    options = question_info['options']
                    if question_key in profile:
                        # Higher probability for profile-specific options
                        if random.random() < 0.85:  # 85% adherence to profile
                            sample[question_key] = random.choice(profile[question_key])
                        else:
                            sample[question_key] = random.choice(options)
                    else:
                        sample[question_key] = random.choice(options)
                
                sample['Primary Category'] = category
                
                # Add secondary category for hybrid users
                if random.random() < 0.25:  # 25% chance of hybrid
                    compatible_categories = self._get_compatible_categories(category)
                    if compatible_categories:
                        sample['Secondary Category'] = random.choice(compatible_categories)
                    else:
                        sample['Secondary Category'] = 'None'
                else:
                    sample['Secondary Category'] = 'None'
                
                data.append(sample)
        
        return pd.DataFrame(data)
    
    def _get_compatible_categories(self, primary_category):
        """Get compatible secondary categories for hybrid users"""
        compatibility_map = {
            'Professional/Business User': ['Tech Enthusiast', 'Content Creator'],
            'Student/Academic User': ['Social Media Influencer', 'Entertainment Enthusiast'],
            'Mobile Gamer': ['Tech Enthusiast', 'Entertainment Enthusiast'],
            'Content Creator': ['Social Media Influencer', 'Tech Enthusiast'],
            'Entertainment Enthusiast': ['Mobile Gamer', 'Social Media Influencer'],
            'Tech Enthusiast': ['Professional/Business User', 'Mobile Gamer'],
            'Budget-Conscious User': ['Student/Academic User'],
            'Senior/Accessibility User': [],
            'Social Media Influencer': ['Content Creator', 'Entertainment Enthusiast']
        }
        return compatibility_map.get(primary_category, [])
    
    def preprocess_data(self, df):
        """Enhanced preprocessing with feature engineering"""
        df_encoded = df.copy()
        
        # Encode categorical features
        for question_key in self.questions.keys():
            if question_key not in self.label_encoders:
                self.label_encoders[question_key] = LabelEncoder()
                df_encoded[question_key] = self.label_encoders[question_key].fit_transform(df[question_key])
            else:
                df_encoded[question_key] = self.label_encoders[question_key].transform(df[question_key])
        
        return df_encoded
    
    def train_enhanced_model(self, df):
        """Train the multi-label classifier and report per-category metrics.

        The target has one 0/1 column per entry of ``self.category_names``; a
        row is positive for a category when it is either the row's primary or
        its secondary category. The data is split 80/20 (stratified on the
        primary category), a ``MultiOutputClassifier`` wrapping gradient
        boosting is fitted, and the held-out split is scored.

        Side effects: encoders are fitted/cached through ``preprocess_data``,
        ``self.model`` and ``self.feature_importance`` are set, and a metrics
        report is printed.

        Args:
            df: DataFrame with one column per question plus
                'Primary Category' and 'Secondary Category'.

        Returns:
            Tuple ``(X_test, y_test, y_pred, metrics)`` where ``metrics`` is the
            mapping produced by ``_calculate_metrics``.
        """
        print("Training Enhanced ML Model...")
        print("="*50)

        # Preprocess data
        df_encoded = self.preprocess_data(df)

        # Prepare features
        X = df_encoded[list(self.questions.keys())]

        # Build both indicator frames on the same, complete set of category
        # columns BEFORE combining them. A category that only ever appears as a
        # secondary label (or not at all) then gets a proper column instead of
        # triggering a KeyError, and the 'None' placeholder column is dropped.
        # Casting to int keeps the dtype uniform regardless of what
        # ``get_dummies`` produces.
        y_primary = (
            pd.get_dummies(df['Primary Category'])
            .reindex(columns=self.category_names, fill_value=0)
            .astype(int)
        )
        y_secondary = (
            pd.get_dummies(df['Secondary Category'])
            .reindex(columns=self.category_names, fill_value=0)
            .astype(int)
        )

        # Positive when the category is either primary or secondary.
        y_combined = y_primary | y_secondary
        # NOTE(review): a category with no positive rows yields a constant
        # column; confirm the base estimator accepts a single-class target.

        # Split data
        X_train, X_test, y_train, y_test = train_test_split(
            X, y_combined, test_size=0.2, random_state=42, stratify=df['Primary Category']
        )

        # One gradient-boosting classifier per category column.
        self.model = MultiOutputClassifier(
            GradientBoostingClassifier(n_estimators=200, learning_rate=0.1, max_depth=6, random_state=42)
        )
        self.model.fit(X_train, y_train)

        # Feature importance: mean over the per-category estimators.
        self.feature_importance = np.mean(
            [estimator.feature_importances_ for estimator in self.model.estimators_],
            axis=0,
        )

        # Model evaluation on the held-out split
        y_pred = self.model.predict(X_test)
        metrics = self._calculate_metrics(y_test, y_pred)

        print("Model Performance Metrics:")
        print("-" * 30)
        for category, metric in metrics.items():
            print(f"{category}:")
            print(f"  Accuracy: {metric['accuracy']:.3f}")
            print(f"  Precision: {metric['precision']:.3f}")
            print(f"  Recall: {metric['recall']:.3f}")
            print(f"  F1-Score: {metric['f1']:.3f}")

        return X_test, y_test, y_pred, metrics
    
    def _calculate_metrics(self, y_true, y_pred):
        """Calculate detailed metrics for each category"""
        metrics = {}
        
        for i, category in enumerate(self.category_names):
            y_true_cat = y_true.iloc[:, i] if hasattr(y_true, 'iloc') else y_true[:, i]
            y_pred_cat = y_pred[:, i]
            
            # Avoid division by zero
            precision = accuracy_score(y_true_cat, y_pred_cat)
            recall = accuracy_score(y_true_cat, y_pred_cat)  # For binary classification
            f1 = f1_score(y_true_cat, y_pred_cat, zero_division=0)
            accuracy = accuracy_score(y_true_cat, y_pred_cat)
            
            metrics[category] = {
                'accuracy': accuracy,
                'precision': precision,
                'recall': recall,
                'f1': f1
            }
        
        return metrics
    
    def create_visualizations(self, df, metrics, X_test, y_test, y_pred):
        """Draw a 2x2 dashboard summarising the data and the trained model.

        Panels: (1) per-question feature importance, (2) accuracy and F1 per
        category, (3) metrics averaged over all categories, (4) a text box with
        dataset and model facts. The figure is displayed with ``plt.show()``;
        nothing is returned.

        Args:
            df: Survey DataFrame; only 'Primary Category' and
                'Secondary Category' are read here.
            metrics: Mapping category -> dict with 'accuracy', 'precision',
                'recall' and 'f1', covering every entry of
                ``self.category_names``.
            X_test, y_test, y_pred: Accepted but not referenced in this body.

        Note:
            The matplotlib style and several ``plt.rcParams`` entries are
            changed globally and are not restored afterwards.
        """
        # Global matplotlib configuration (affects later figures as well).
        plt.style.use('default')
        plt.rcParams['font.size'] = 11
        plt.rcParams['axes.labelsize'] = 12
        plt.rcParams['axes.titlesize'] = 14
        plt.rcParams['legend.fontsize'] = 10
        
        fig, axes = plt.subplots(2, 2, figsize=(16, 12))
        fig.suptitle('Mobile User Categorization - Key Insights Dashboard', fontsize=18, fontweight='bold', y=0.98)
        
        # 1. FEATURE IMPORTANCE - Horizontal Bar Chart
        # Left empty when no model has been trained (feature_importance is None).
        ax1 = axes[0, 0]
        if self.feature_importance is not None:
            # Shorten long question keys to their first two words (or first
            # 15 characters for a single long word) so tick labels stay readable.
            question_names = []
            for key in self.questions.keys():
                if len(key) > 15:
                    words = key.split()
                    if len(words) >= 2:
                        question_names.append(f"{words[0]} {words[1]}")
                    else:
                        question_names.append(key[:15])
                else:
                    question_names.append(key)
            
            # Ascending order, so the most important question ends up on top.
            sorted_idx = np.argsort(self.feature_importance)
            y_pos = np.arange(len(question_names))
            
            # Use gradient colors for better visual appeal
            # NOTE(review): indexing the gradient with sorted_idx ties each
            # colour to the question's original position rather than to its
            # rank in the chart - confirm this is the intended look.
            colors = plt.cm.viridis(np.linspace(0.3, 0.9, len(sorted_idx)))
            bars = ax1.barh(y_pos, self.feature_importance[sorted_idx], color=colors[sorted_idx])
            
            ax1.set_yticks(y_pos)
            ax1.set_yticklabels(np.array(question_names)[sorted_idx])
            ax1.set_xlabel('Importance Score')
            ax1.set_title('Question Importance in Categorization', fontweight='bold', pad=20)
            ax1.grid(axis='x', alpha=0.3, linestyle='--')
            
            # Add value labels just to the right of each bar
            for i, bar in enumerate(bars):
                width = bar.get_width()
                ax1.text(width + 0.005, bar.get_y() + bar.get_height()/2, 
                        f'{width:.3f}', ha='left', va='center', fontweight='bold', fontsize=9)
        
        # 2. MODEL PERFORMANCE METRICS - Grouped bar chart (no error bars are drawn)
        ax2 = axes[0, 1]
        
        # Shorten category names for x-axis: text before '/', else the first word
        categories_short = []
        for cat in self.category_names:
            if '/' in cat:
                parts = cat.split('/')
                categories_short.append(f"{parts[0]}")
            else:
                categories_short.append(cat.split()[0])
        
        accuracy_scores = [metrics[cat]['accuracy'] for cat in self.category_names]
        f1_scores = [metrics[cat]['f1'] for cat in self.category_names]
        
        # Two bars per category, centred on the tick position.
        x = np.arange(len(categories_short))
        width = 0.35
        
        bars1 = ax2.bar(x - width/2, accuracy_scores, width, label='Accuracy', 
                       color='#2E86AB', alpha=0.8, edgecolor='black', linewidth=0.5)
        bars2 = ax2.bar(x + width/2, f1_scores, width, label='F1-Score', 
                       color='#A23B72', alpha=0.8, edgecolor='black', linewidth=0.5)
        
        ax2.set_xlabel('User Categories')
        ax2.set_ylabel('Performance Score')
        ax2.set_title('Model Performance by Category', fontweight='bold', pad=20)
        ax2.set_xticks(x)
        ax2.set_xticklabels(categories_short, rotation=45, ha='right')
        ax2.legend(loc='upper right')
        # Upper limit above 1.0 leaves headroom for the value labels.
        ax2.set_ylim(0, 1.1)
        ax2.grid(axis='y', alpha=0.3, linestyle='--')
        
        # Add value labels on bars
        for bars in [bars1, bars2]:
            for bar in bars:
                height = bar.get_height()
                ax2.text(bar.get_x() + bar.get_width()/2., height + 0.02,
                        f'{height:.2f}', ha='center', va='bottom', fontweight='bold', fontsize=9)
        
        # 3. OVERALL MODEL SUMMARY - each metric averaged over all categories
        ax3 = axes[1, 0]
        
        overall_metrics = ['Accuracy', 'Precision', 'Recall', 'F1-Score']
        overall_scores = [
            np.mean([metrics[cat]['accuracy'] for cat in self.category_names]),
            np.mean([metrics[cat]['precision'] for cat in self.category_names]),
            np.mean([metrics[cat]['recall'] for cat in self.category_names]),
            np.mean([metrics[cat]['f1'] for cat in self.category_names])
        ]
        
        # One fixed colour per metric
        colors = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#FFA07A']
        bars = ax3.bar(overall_metrics, overall_scores, color=colors, 
                      edgecolor='black', linewidth=1, alpha=0.8)
        
        ax3.set_ylabel('Score')
        ax3.set_title('Overall Model Performance Summary', fontweight='bold', pad=20)
        ax3.set_ylim(0, 1.1)
        ax3.grid(axis='y', alpha=0.3, linestyle='--')
        
        # Add value labels with background
        for bar, score in zip(bars, overall_scores):
            height = bar.get_height()
            ax3.text(bar.get_x() + bar.get_width()/2, height + 0.02,
                    f'{score:.3f}', ha='center', va='bottom', 
                    fontweight='bold', fontsize=12,
                    bbox=dict(boxstyle='round,pad=0.3', facecolor='white', alpha=0.8))
        
        # 4. DATA INSIGHTS SUMMARY - Text-based Summary
        ax4 = axes[1, 1]
        ax4.axis('off')
        
        # Calculate key insights
        # NOTE(review): an empty df makes the percentage a division by zero and
        # leaves mode() without a first element - confirm callers never pass one.
        total_users = len(df)
        # A row is "hybrid" when its secondary category is not the string 'None'.
        hybrid_users = (df['Secondary Category'] != 'None').sum()
        hybrid_percentage = (hybrid_users / total_users) * 100
        most_common_category = df['Primary Category'].mode().iloc[0]
        avg_accuracy = np.mean([metrics[cat]['accuracy'] for cat in self.category_names])
        best_performing_category = max(self.category_names, key=lambda x: metrics[x]['f1'])
        
        # Create summary text
        summary_text = f"""
          KEY INSIGHTS

          Dataset Overview:
          • Total Users Analyzed: {total_users:,}
          • Hybrid Users: {hybrid_users} ({hybrid_percentage:.1f}%)
          • Categories: {len(self.category_names)}

          Model Performance:
          • Average Accuracy: {avg_accuracy:.1%}
          • Best Performing: {best_performing_category.split('/')[0]}
          • Questions Used: {len(self.questions)}

          Top Category:
          {most_common_category}

          Model Type:
          Gradient Boosting with
          Multi-Label Classification
        """
        
        ax4.text(0.05, 0.95, summary_text, transform=ax4.transAxes, fontsize=11,
                verticalalignment='top', fontfamily='monospace',
                bbox=dict(boxstyle='round,pad=0.5', facecolor='lightblue', alpha=0.3))
        
        # Adjust layout to prevent overlap; the explicit spacing is applied
        # after tight_layout, so statement order matters here.
        plt.tight_layout(rect=[0, 0.02, 1, 0.95])
        plt.subplots_adjust(hspace=0.3, wspace=0.3)
        plt.show()
    
    def predict_user_category(self, user_responses):
        """Enhanced prediction with confidence scores"""
        if self.model is None:
            raise ValueError("Model not trained yet!")
        
        # Create DataFrame from user responses
        user_df = pd.DataFrame([user_responses])
        
        # Encode responses
        user_encoded = user_df.copy()
        for question_key in self.questions.keys():
            if user_responses[question_key] not in self.label_encoders[question_key].classes_:
                # Handle unseen categories
                print(f"Warning: '{user_responses[question_key]}' not seen during training for {question_key}")
                # Use the most common category
                user_encoded[question_key] = [0]
            else:
                user_encoded[question_key] = self.label_encoders[question_key].transform([user_responses[question_key]])
        
        # Make prediction
        prediction_proba = self.model.predict_proba(user_encoded[list(self.questions.keys())])
        prediction_binary = self.model.predict(user_encoded[list(self.questions.keys())])
        
        # Get predicted categories with confidence scores
        results = []
        for i, category in enumerate(self.category_names):
            if prediction_binary[0][i] == 1:
                # Get probability from the corresponding classifier
                if len(prediction_proba[i][0]) > 1:
                    confidence = prediction_proba[i][0][1]  # Probability of positive class
                else:
                    confidence = prediction_proba[i][0][0]
                results.append((category, confidence))
        
        # If no categories predicted, get top 2 by probability
        if not results:
            all_probs = []
            for i, category in enumerate(self.category_names):
                if len(prediction_proba[i][0]) > 1:
                    prob = prediction_proba[i][0][1]
                else:
                    prob = prediction_proba[i][0][0]
                all_probs.append((category, prob))
            
            # Sort by probability and take top 2
            all_probs.sort(key=lambda x: x[1], reverse=True)
            results = all_probs[:2]
        
        # Sort results by confidence and return top 2
        results.sort(key=lambda x: x[1], reverse=True)
        return results[:2] if len(results) >= 2 else results
    
    def display_enhanced_user_profile(self, user_responses, categories, show_rationale=True):
        """Print a complete profile report for one set of answers.

        The report has four parts, printed in this order: the answers given
        (each under the wording of its question), the predicted categories
        with their definition and characteristics, the phone recommendations,
        and the risk assessment. The last two are delegated to
        ``_provide_detailed_recommendations`` and ``_provide_risk_assessment``.

        Args:
            user_responses: Mapping of question key -> chosen answer. Every key
                is looked up in ``self.questions``.
            categories: Iterable of ``(category_name, confidence)`` pairs. Every
                name is looked up in ``self.categories``.
            show_rationale: When true, the question's rationale is printed
                beneath each answer.
        """
        heavy_rule = "=" * 80
        light_rule = "-" * 50

        print("\n" + heavy_rule)
        print("ENHANCED USER PROFILE ANALYSIS")
        print(heavy_rule)

        # Part 1: echo each answer together with the question it belongs to.
        print("\nDetailed Response Analysis:")
        print(light_rule)
        for key, chosen in user_responses.items():
            details = self.questions[key]
            print(f"\n{details['question']}")
            print(f"Your Answer: {chosen}")
            if show_rationale:
                print(f"Analysis: {details['rationale']}")

        # Part 2: ranked categories, numbered from 1.
        print("\nPredicted User Categories:")
        print(light_rule)
        for rank, (name, score) in enumerate(categories, start=1):
            print(f"\n{rank}. {name} (Confidence: {score:.2f})")
            profile = self.categories[name]
            print(f"   Definition: {profile['definition']}")
            traits = ', '.join(profile['characteristics'])
            print(f"   Key Characteristics: {traits}")

        # Parts 3 and 4 are produced by the dedicated printers.
        self._provide_detailed_recommendations(categories, user_responses)
        self._provide_risk_assessment(user_responses)
    
    def _provide_detailed_recommendations(self, categories, user_responses):
        """Provide detailed phone recommendations based on categories"""
        print(f"\nPersonalized Phone Recommendations:")
        print("-" * 50)
        
        if len(categories) > 0:
            primary_category = categories[0][0]
            confidence = categories[0][1]
            
            # Budget-based recommendations
            budget = user_responses.get('Budget Range', '')
            print(f"Based on your budget ({budget}) and primary category ({primary_category}):")
            
            if "Professional" in primary_category:
                print("\n📱 PROFESSIONAL RECOMMENDATIONS:")
                print("• Samsung Galaxy S series or iPhone Pro models")
                print("• Focus on: Enterprise security, productivity apps, premium build")
                print("• Key features: Long battery life, wireless charging, premium materials")
                print("• Consider: Business-grade security features and dual-SIM capability")
                
            elif "Gamer" in primary_category:
                print("\n🎮 GAMING RECOMMENDATIONS:")
                print("• ASUS ROG Phone, RedMagic series, or iPhone Pro Max")
                print("• Focus on: High refresh rate (120Hz+), gaming triggers, cooling")
                print("• Key features: Powerful chipset, large battery, fast charging")
                print("• Consider: Gaming accessories and dedicated gaming modes")
                
            elif "Content Creator" in primary_category:
                print("\n📸 CONTENT CREATOR RECOMMENDATIONS:")
                print("• iPhone Pro series, Samsung Galaxy S Ultra, or Google Pixel Pro")
                print("• Focus on: Multiple camera lenses, video recording, editing capabilities")
                print("• Key features: Pro camera modes, high storage, color accuracy")
                print("• Consider: Gimbal support and professional video features")
                
            elif "Social Media Influencer" in primary_category:
                print("\n🌟 INFLUENCER RECOMMENDATIONS:")
                print("• iPhone Pro, Samsung Galaxy S series, or Vivo V series")
                print("• Focus on: Front camera quality, aesthetic design, social media optimization")
                print("• Key features: Portrait modes, beauty filters, sleek design")
                print("• Consider: Color options and social media integration")
                
            elif "Tech Enthusiast" in primary_category:
                print("\n⚡ TECH ENTHUSIAST RECOMMENDATIONS:")
                print("• Latest flagship models: iPhone Pro Max, Samsung Galaxy S Ultra")
                print("• Focus on: Cutting-edge features, latest processors, innovation")
                print("• Key features: 5G, wireless charging, premium materials")
                print("• Consider: Early access to new features and beta programs")
                
            elif "Entertainment" in primary_category:
                print("\n🎬 ENTERTAINMENT RECOMMENDATIONS:")
                print("• OnePlus series, Samsung Galaxy A series, or iPhone standard models")
                print("• Focus on: Large display, good speakers, streaming optimization")
                print("• Key features: High-quality display, Dolby Atmos, long battery")
                print("• Consider: Subscription services and media consumption features")
                
            elif "Budget" in primary_category:
                print("\n💰 BUDGET RECOMMENDATIONS:")
                print("• Xiaomi Redmi series, Realme series, or Samsung Galaxy M series")
                print("• Focus on: Value for money, essential features, reliability")
                print("• Key features: Decent performance, adequate camera, good battery")
                print("• Consider: Long-term software support and build quality")
                
            elif "Senior" in primary_category:
                print("\n👴 SENIOR/ACCESSIBILITY RECOMMENDATIONS:")
                print("• iPhone SE, Samsung Galaxy A series with Easy Mode, or GrandPad")
                print("• Focus on: Simple interface, large text, accessibility features")
                print("• Key features: Emergency features, hearing aid compatibility, durability")
                print("• Consider: Family sharing features and remote assistance options")
                
            else:  # Student/Academic
                print("\n🎓 STUDENT RECOMMENDATIONS:")
                print("• Xiaomi series, Samsung Galaxy A series, or iPhone standard models")
                print("• Focus on: Balanced features, educational app support, durability")
                print("• Key features: Good camera for notes, social media capabilities")
                print("• Consider: Student discounts and educational app ecosystems")
            
            # Secondary category influence
            if len(categories) > 1:
                secondary_category = categories[1][0]
                print(f"\n🔄 HYBRID CONSIDERATION ({secondary_category}):")
                print(f"• Your secondary category suggests additional focus on {secondary_category.lower()} features")
                print("• Look for phones that balance both primary and secondary needs")
                print("• Consider flagship models that excel in multiple areas")
    
    def _provide_risk_assessment(self, user_responses):
        """Provide risk assessment and alternative suggestions"""
        print(f"\n⚠️ Risk Assessment & Alternative Considerations:")
        print("-" * 50)
        
        budget = user_responses.get('Budget Range', '')
        usage = user_responses.get('Primary Usage', '')
        performance = user_responses.get('Performance Priority', '')
        
        # Budget vs performance mismatch
        if 'Critical' in performance and '₹8k-₹15k' in budget:
            print("🚨 BUDGET-PERFORMANCE MISMATCH:")
            print("• Your performance needs may not be fully met within your budget")
            print("• Consider: Refurbished flagships or last-gen premium models")
            print("• Alternative: Increase budget or adjust performance expectations")
        
        # Battery vs usage mismatch
        screen_time = user_responses.get('Daily Screen Time', '')
        battery_need = user_responses.get('Battery & Charging Needs', '')
        
        if '8+ hours' in screen_time and 'Half day' in battery_need:
            print("🔋 USAGE-BATTERY MISMATCH:")
            print("• Your screen time suggests higher battery requirements")
            print("• Consider: Power banks or phones with larger batteries")
            print("• Alternative: Fast charging capabilities for frequent top-ups")
        
        # Feature vs budget alignment
        camera_importance = user_responses.get('Camera Importance', '')
        if 'Professional Level' in camera_importance and budget in ['₹8k-₹15k', '₹15k-₹25k']:
            print("📸 CAMERA-BUDGET MISMATCH:")
            print("• Professional camera features typically require higher budget")
            print("• Consider: Mid-range phones with excellent cameras (Pixel A series)")
            print("• Alternative: External lenses or photo editing apps")

In [None]:
def _print_boxed_heading(title):
    """Print ``title`` between two 80-character rules, after a blank line."""
    rule = "=" * 80
    print("\n" + rule)
    print(title)
    print(rule)


def _ask_for_option(options):
    """Prompt on stdin until a valid 1-based option number is entered.

    Non-numeric input and out-of-range numbers each print an error and
    re-prompt. Returns the selected element of ``options``.
    """
    while True:
        try:
            number = int(input(f"\nEnter your choice (1-{len(options)}): "))
        except ValueError:
            print("❌ Please enter a valid number.")
            continue
        if not 1 <= number <= len(options):
            print("❌ Invalid choice. Please try again.")
            continue
        selected = options[number - 1]
        print(f"✅ Selected: {selected}")
        return selected


def _collect_interactive_responses(categorizer):
    """Ask every question in ``categorizer.questions`` and return the answers.

    Returns a dict mapping question key -> the option text the user picked.
    """
    answers = {}
    for key, info in categorizer.questions.items():
        print("\n" + "=" * 60)
        print(f"Question: {info['question']}")
        print(f"Context: {info['rationale']}")
        print("-" * 40)

        choices = info['options']
        for number, text in enumerate(choices, start=1):
            print(f"  {number}. {text}")

        answers[key] = _ask_for_option(choices)
    return answers


def _print_final_summary(predicted_categories):
    """Print the closing summary for up to two ``(category, confidence)`` pairs."""
    _print_boxed_heading("✨ CATEGORIZATION COMPLETE!")

    # First pair is reported as primary, second (if any) as secondary.
    labels = (
        ("🎯 Your Primary Category", "📈 Confidence Level"),
        ("🔄 Secondary Category", "📊 Secondary Confidence"),
    )
    for (name_label, score_label), (name, score) in zip(labels, predicted_categories):
        print(f"{name_label}: {name}")
        print(f"{score_label}: {score:.1%}")

    print("\n🙏 Thank you for using the Enhanced Mobile User Categorization System!")
    print("📱 Use these insights to find your perfect mobile phone match!")


def main():
    """Run the end-to-end demo of the categorizer.

    Steps, in order: print the category definitions and the question design
    rationale, generate 3000 synthetic samples, train the model, draw the
    visualizations, classify two hard-coded sample users, then interview the
    user on stdin and print their profile and a final summary.
    """
    print("🚀 ENHANCED MOBILE USER CATEGORIZATION SYSTEM")
    print("=" * 70)

    categorizer = EnhancedMobileUserCategorizer()

    # Category definitions, numbered from 1.
    print("\n📋 USER CATEGORY DEFINITIONS:")
    print("=" * 50)
    for number, (name, details) in enumerate(categorizer.categories.items(), start=1):
        print(f"\n{number}. {name}")
        print(f"   {details['definition']}")

    # Fixed explanatory text about why the questions were chosen.
    rationale_lines = (
        "\n❓ QUESTION DESIGN RATIONALE:",
        "=" * 50,
        "The 10 questions were carefully selected based on the following principles:",
        "• Budget Range: Primary constraint affecting all other choices",
        "• Primary Usage & App Categories: Direct indicators of user behavior patterns",
        "• Performance & Camera Importance: Key differentiators between user types",
        "• Daily Screen Time: Correlates with battery and performance needs",
        "• Feature Preferences: Reveals specific category alignment",
        "• Design & Purchase Factors: Final decision-making preferences",
        "\nEach question contributes unique information for accurate categorization.",
    )
    for line in rationale_lines:
        print(line)

    # Synthetic training set.
    print("\n🔧 GENERATING TRAINING DATA...")
    synthetic_data = categorizer.generate_enhanced_synthetic_data(3000)
    sample_count = len(synthetic_data)
    category_count = len(categorizer.category_names)
    print(f"Generated {sample_count} training samples with {category_count} categories")

    print("\nTraining Data Overview:")
    primary_column = synthetic_data['Primary Category']
    print(f"• Primary Categories: {primary_column.nunique()}")
    hybrid_users = (synthetic_data['Secondary Category'] != 'None').sum()
    print(f"• Hybrid Users: {hybrid_users}")
    print(f"• Most Common Category: {primary_column.mode().iloc[0]}")

    # Fit the model, then plot using the held-out split it returns.
    print("\n🤖 TRAINING MACHINE LEARNING MODEL...")
    X_test, y_test, y_pred, metrics = categorizer.train_enhanced_model(synthetic_data)

    print("\n📊 GENERATING VISUALIZATIONS...")
    categorizer.create_visualizations(synthetic_data, metrics, X_test, y_test, y_pred)

    # Two hard-coded answer sets used as a smoke test of the trained model.
    business_profile = {
        'Budget Range': '₹40k-₹60k',
        'Primary Usage': 'Work & Business',
        'Daily Screen Time': '4-6 hours',
        'App Categories Used Most': 'Productivity & Work',
        'Performance Priority': 'Critical - Need flagship performance',
        'Camera Importance': 'Important - Good everyday photos',
        'Battery & Charging Needs': '2+ days battery, wireless charging',
        'Feature Preferences': 'Latest tech & innovation',
        'Design & Build Priority': 'Premium materials & flagship design',
        'Purchase Decision Factor': 'Cutting-edge technology & specs',
    }
    creator_profile = {
        'Budget Range': '₹25k-₹40k',
        'Primary Usage': 'Content Creation',
        'Daily Screen Time': '6-8 hours',
        'App Categories Used Most': 'Photo & Video Editing',
        'Performance Priority': 'Very Important - Above average performance',
        'Camera Importance': 'Professional Level - Multiple lenses, Pro modes',
        'Battery & Charging Needs': 'Full day heavy use, fast charging',
        'Feature Preferences': 'Content tools (pro camera, editing)',
        'Design & Build Priority': 'Sleek design for content creation',
        'Purchase Decision Factor': 'Camera quality & content tools',
    }

    _print_boxed_heading("🧪 TESTING WITH ENHANCED STATIC USER PROFILES")
    side = '=' * 25
    for index, profile in enumerate((business_profile, creator_profile), start=1):
        print(f"\n{side} STATIC USER {index} {side}")
        prediction = categorizer.predict_user_category(profile)
        categorizer.display_enhanced_user_profile(profile, prediction)

    # Interview the person at the keyboard.
    _print_boxed_heading("🎯 INTERACTIVE ENHANCED USER CATEGORIZATION")
    print("Answer the following 10 carefully designed questions for accurate categorization:")
    user_responses = _collect_interactive_responses(categorizer)

    print("\n🔍 ANALYZING YOUR RESPONSES...")
    predicted_categories = categorizer.predict_user_category(user_responses)
    categorizer.display_enhanced_user_profile(
        user_responses, predicted_categories, show_rationale=False
    )

    _print_final_summary(predicted_categories)


# Run the demo only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()