# In [None]:
import random
import warnings
from collections import Counter, defaultdict
from copy import deepcopy

# Third-Party Libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

# TensorFlow / Keras
from tensorflow.keras.callbacks import Callback, EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.layers import (
    Add,
    BatchNormalization,
    Dense,
    Dropout,
    Embedding,
    GlobalAveragePooling1D,
    Input,
    LayerNormalization,
    LSTM,
    MultiHeadAttention
)
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical

# ============================================================================
# 1. LABEL MAPPING SYSTEM
# ============================================================================

class LabelMapper:
    """Consolidate fine-grained source labels into a smaller set of groups.

    Attributes:
        label_mapping: flat lookup ``original label -> consolidated label``.
        reverse_mapping: shallow copy of the grouping dict last passed to
            ``create_mapping`` (``consolidated label -> originals``).
    """

    def __init__(self):
        self.label_mapping = {}
        self.reverse_mapping = {}

    def create_mapping(self, mapping_dict):
        """Build the flat lookup from a ``{consolidated: [originals]}`` dict.

        Any previous mapping is discarded. When an original label is listed
        under several groups, the group iterated last wins. The resulting
        grouping is printed.

        Returns:
            The flat ``original -> consolidated`` dict (also kept on ``self``).
        """
        self.label_mapping = {}
        self.reverse_mapping = mapping_dict.copy()

        self.label_mapping.update(
            (source, group)
            for group, sources in mapping_dict.items()
            for source in sources
        )

        print("📋 LABEL MAPPING CREATED")
        print("=" * 30)
        for group, sources in mapping_dict.items():
            print(f"🏷️ {group}:")
            for source in sources:
                print(f"   • {source}")

        return self.label_mapping

    def map_labels(self, original_labels):
        """Translate each label through the mapping.

        Labels without a mapping entry are passed through unchanged; a
        warning with their total count is printed when there are any.

        Returns:
            A new list with one output label per input label.
        """
        lookup = self.label_mapping
        not_found = object()  # sentinel, so that falsy/None targets still count as mapped
        result = []
        misses = 0

        for label in original_labels:
            target = lookup.get(label, not_found)
            if target is not_found:
                result.append(label)
                misses += 1
            else:
                result.append(target)

        if misses > 0:
            print(f"⚠️ Warning: {misses} labels were not mapped")

        return result

    def analyze_distribution(self, original_labels, mapped_labels):
        """Print per-class counts and shares before and after consolidation.

        Both percentage columns are relative to ``len(original_labels)``.

        Returns:
            ``(original_counts, mapped_counts)`` as ``Counter`` objects.
        """
        before = Counter(original_labels)
        after = Counter(mapped_labels)

        def show(counts, denominator):
            # most_common() sorts by count, descending, keeping ties in
            # first-seen order.
            for name, n in counts.most_common():
                share = (n / denominator) * 100
                print(f"  • {name}: {n:,} ({share:.1f}%)")

        print("\n📊 LABEL DISTRIBUTION ANALYSIS")
        print("=" * 40)

        print("\n🔍 Original distribution:")
        total = len(original_labels)
        show(before, total)

        print("\n🎯 Consolidated distribution:")
        show(after, total)

        n_before, n_after = len(before), len(after)
        print(f"\n📈 Summary:")
        print(f"  • Classes: {n_before} → {n_after}")
        print(f"  • Reduction: {n_before - n_after} classes")

        return before, after

# Predefined label mappings
# Each top-level key names a consolidation scheme; its value has the
# {consolidated_label: [original_labels]} shape that LabelMapper.create_mapping
# accepts. The label strings are data keys and must match the labels present
# in the input data exactly.
SUGGESTED_MAPPINGS = {
    # Two-class scheme: every source collapses into either 'human' or 'bot'.
    'human_vs_bot': {
        'human': [
            'tax1_users', 'tax2_users', 'profilic_survey_users',
            'survey_desktop', 'hlisa_traces', 'za_proxy'
        ],
        'bot': [
            'browser_use_agents', 'skyvern_agents', 'random_mouse_with_sleep_bot',
            'random_mouse_bot', 'gremlins'
        ]
    },
    # Four user sources merge into 'human' and the two random-mouse bots merge
    # into 'random_bots'; every other source keeps its own class.
    'granular': {
        'human': ['tax1_users', 'tax2_users', 'profilic_survey_users', 'survey_desktop'],
        'hlisa_traces': ['hlisa_traces'],
        'za_proxy': ['za_proxy'],
        'browser_use_agents': ['browser_use_agents'],
        'skyvern_agents': ['skyvern_agents'],
        'gremlins': ['gremlins'],
        'random_bots': ['random_mouse_with_sleep_bot', 'random_mouse_bot']
    },
    # Near-identity scheme: only the two random-mouse bots are merged.
    'default': {
        'tax1_users': ['tax1_users'],
        'tax2_users': ['tax2_users'],
        'profilic_survey_users': ['profilic_survey_users'],
        'survey_desktop': ['survey_desktop'],
        'hlisa_traces': ['hlisa_traces'],
        'za_proxy': ['za_proxy'],
        'browser_use_agents': ['browser_use_agents'],
        'skyvern_agents': ['skyvern_agents'],
        'gremlins': ['gremlins'],
        'random_bots': ['random_mouse_with_sleep_bot', 'random_mouse_bot']
    }
}

# Progress marker printed once the definitions above have been executed.
print("✅ Label Mapping System Ready!")

# ============================================================================
# 2. DATA AUGMENTATION AND BALANCING SYSTEM
# ============================================================================

class InteractionDataAugmenter:
    """Augmentation routines for user-interaction event sequences.

    A subsession is a list of event dicts. The methods below read and write
    the keys ``event_type``, ``timestamp_relative``, ``time_since_last``,
    ``x_pos`` and ``y_pos``. Every augmentation returns a new sequence and
    leaves its input untouched.

    Randomness comes from the global ``random`` and ``numpy.random``
    generators, which are re-seeded (process-wide) on construction.
    """

    def __init__(self, random_seed=42):
        self.random_seed = random_seed
        # NOTE: seeds the *global* generators, so this affects other users of
        # random / np.random in the same process.
        random.seed(random_seed)
        np.random.seed(random_seed)

    def temporal_jitter(self, subsession, jitter_std=0.1):
        """Perturb event timestamps with Gaussian noise, keeping their order.

        Args:
            subsession: list of event dicts.
            jitter_std: noise standard deviation as a fraction of the largest
                ``timestamp_relative`` in the sequence.

        Returns:
            A deep copy with jittered ``timestamp_relative`` (the first event
            is left as is) and ``time_since_last`` recomputed for every event
            after the first. Sequences shorter than 2 are copied unchanged.
        """
        augmented = deepcopy(subsession)

        if len(augmented) < 2:
            return augmented

        max_time = max(event['timestamp_relative'] for event in augmented)

        for i in range(1, len(augmented)):
            event = augmented[i]
            jitter = np.random.normal(0, jitter_std * max_time)
            # Keep timestamps strictly increasing: at least 1 ms after the
            # (already jittered) previous event.
            min_time = augmented[i - 1]['timestamp_relative'] + 0.001
            event['timestamp_relative'] = max(min_time, event['timestamp_relative'] + jitter)

        # Recalculate time_since_last from the new timestamps
        for i in range(1, len(augmented)):
            augmented[i]['time_since_last'] = (
                augmented[i]['timestamp_relative'] - augmented[i - 1]['timestamp_relative']
            )

        return augmented

    def spatial_jitter(self, subsession, jitter_std=10):
        """Add Gaussian noise to pointer coordinates.

        Events whose ``x_pos`` and ``y_pos`` are both non-positive are left
        alone. Jittered coordinates are clamped at 0.

        Returns:
            A deep copy of ``subsession`` with perturbed positions.
        """
        augmented = deepcopy(subsession)

        for event in augmented:
            if event['x_pos'] > 0 or event['y_pos'] > 0:
                event['x_pos'] = max(0, event['x_pos'] + np.random.normal(0, jitter_std))
                event['y_pos'] = max(0, event['y_pos'] + np.random.normal(0, jitter_std))

        return augmented

    def event_type_substitution(self, subsession, substitution_prob=0.1):
        """Swap event types for a related type with a given probability.

        Args:
            subsession: list of event dicts.
            substitution_prob: per-event probability of attempting a swap.
                Event types without an entry in the table below are kept.

        Returns:
            A deep copy of ``subsession`` with some ``event_type`` values
            replaced.
        """
        similar_events = {
            'click': ['mousedown', 'mouseup'],
            'mousedown': ['click'],
            'mouseup': ['click'],
            'mousemove': ['mouseover', 'mouseout'],
            'mouseover': ['mousemove'],
            'mouseout': ['mousemove'],
            'keydown': ['keyup', 'keypress'],
            'keyup': ['keydown', 'keypress'],
            'keypress': ['keydown', 'keyup']
        }

        augmented = deepcopy(subsession)

        for event in augmented:
            if random.random() < substitution_prob:
                substitutes = similar_events.get(event['event_type'])
                if substitutes:
                    event['event_type'] = random.choice(substitutes)

        return augmented

    def sequence_cropping(self, subsession, crop_ratio_range=(0.7, 0.9)):
        """Return a random contiguous crop, re-based so it starts at time 0.

        Args:
            subsession: list of event dicts.
            crop_ratio_range: ``(low, high)`` bounds for the fraction of
                events to keep; the crop always keeps at least 2 events and
                never more than the sequence holds.

        Returns:
            A new list of copied events; the input and its event dicts are
            not modified. Sequences shorter than 3 are copied unchanged.
        """
        if len(subsession) < 3:
            return deepcopy(subsession)

        crop_ratio = random.uniform(*crop_ratio_range)
        # Clamp to the sequence length so a ratio above 1 cannot make the
        # randint range below negative.
        new_length = min(len(subsession), max(2, int(len(subsession) * crop_ratio)))
        start_idx = random.randint(0, len(subsession) - new_length)

        # Copy before editing: a plain slice shares the event dicts with the
        # caller, and the re-basing below would otherwise corrupt the source.
        cropped = deepcopy(subsession[start_idx:start_idx + new_length])

        start_time = cropped[0]['timestamp_relative']
        for event in cropped:
            event['timestamp_relative'] -= start_time

        # The new first event has no predecessor inside the crop.
        if 'time_since_last' in cropped[0]:
            cropped[0]['time_since_last'] = 0.0

        return cropped

    def speed_variation(self, subsession, speed_factor_range=(0.8, 1.2)):
        """Scale all timing fields by one random factor.

        Returns:
            A deep copy with ``timestamp_relative`` and ``time_since_last``
            multiplied by a factor drawn uniformly from
            ``speed_factor_range``.
        """
        augmented = deepcopy(subsession)
        speed_factor = random.uniform(*speed_factor_range)

        for event in augmented:
            event['timestamp_relative'] *= speed_factor
            event['time_since_last'] *= speed_factor

        return augmented

    def generate_synthetic_subsession(self, reference_subsessions, augmentation_strength=0.3):
        """Create one synthetic subsession from a randomly chosen reference.

        Each augmentation is applied independently with a probability
        derived from ``augmentation_strength``, which also scales the jitter
        and substitution magnitudes.

        Returns:
            The augmented copy, or ``None`` when ``reference_subsessions``
            is empty.
        """
        if not reference_subsessions:
            return None

        synthetic = deepcopy(random.choice(reference_subsessions))

        # Apply augmentations with probability based on strength
        if random.random() < augmentation_strength:
            synthetic = self.temporal_jitter(synthetic, jitter_std=0.05 * augmentation_strength)

        if random.random() < augmentation_strength:
            synthetic = self.spatial_jitter(synthetic, jitter_std=5 * augmentation_strength)

        if random.random() < augmentation_strength * 0.5:
            synthetic = self.event_type_substitution(
                synthetic, substitution_prob=0.05 * augmentation_strength
            )

        if random.random() < augmentation_strength * 0.7:
            synthetic = self.sequence_cropping(synthetic)

        if random.random() < augmentation_strength:
            synthetic = self.speed_variation(synthetic)

        return synthetic

class DataBalancer:
    """Balance class distribution by generating synthetic samples"""
    
    def __init__(self, target_balance_strategy='oversample_to_max', 
                 max_synthetic_ratio=2.0, augmentation_strength=0.3):
        self.target_balance_strategy = target_balance_strategy
        self.max_synthetic_ratio = max_synthetic_ratio
        self.augmentation_strength = augmentation_strength
        self.augmenter = InteractionDataAugmenter()
    
    def analyze_class_distribution(self, subsessions, labels):
        """Analyze current class distribution"""
        label_counts = Counter(labels)
        total_samples = len(labels)
        
        print("📊 CLASS DISTRIBUTION ANALYSIS")
        print("=" * 40)
        print(f"Total samples: {total_samples:,}")
        print(f"Number of classes: {len(label_counts)}")
        
        sorted_classes = sorted(label_counts.items(), key=lambda x: x[1], reverse=True)
        
        for label, count in sorted_classes:
            percentage = (count / total_samples) * 100
            print(f"  • {label}: {count:,} ({percentage:.1f}%)")
        
        max_count = max(label_counts.values())
        min_count = min(label_counts.values())
        imbalance_ratio = max_count / min_count if min_count > 0 else float('inf')
        
        print(f"\nImbalance ratio: {imbalance_ratio:.2f}:1")
        
        return label_counts, imbalance_ratio
    
    def calculate_target_counts(self, label_counts):
        """Calculate target sample counts for each class"""
        if self.target_balance_strategy == 'oversample_to_max':
            target_count = max(label_counts.values())
        elif self.target_balance_strategy == 'oversample_to_mean':
            target_count = int(np.mean(list(label_counts.values())))
        else:
            target_count = int(np.median(list(label_counts.values())))
        
        target_counts = {}
        for label, current_count in label_counts.items():
            max_allowed = int(current_count * (1 + self.max_synthetic_ratio))
            target_counts[label] = min(target_count, max_allowed)
        
        return target_counts
    
    def balance_data(self, subsessions, labels, max_size_after_balancing=20000):
        """Balance dataset by generating synthetic samples"""
        print("\n🎯 BALANCING DATASET")
        print("=" * 30)
        
        label_counts, imbalance_ratio = self.analyze_class_distribution(subsessions, labels)
        
        if imbalance_ratio < 2.0:
            print("✓ Dataset is already reasonably balanced")
            return subsessions, labels
        
        target_counts = self.calculate_target_counts(label_counts)
        
        # Group subsessions by label
        subsessions_by_label = {}
        for subsession, label in zip(subsessions, labels):
            if label not in subsessions_by_label:
                subsessions_by_label[label] = []
            subsessions_by_label[label].append(subsession)
        
        # Generate synthetic samples
        balanced_subsessions = []
        balanced_labels = []
        total_synthetic = 0
        
        for label, target_count in target_counts.items():
            current_subsessions = subsessions_by_label[label]
            current_count = len(current_subsessions)
            target_count = min(target_count, max_size_after_balancing)
            needed = target_count - current_count
            
            # Add original samples

            if current_count > max_size_after_balancing:
                balanced_subsessions.extend(current_subsessions[:max_size_after_balancing])
                balanced_labels.extend([label] * max_size_after_balancing)
            else:
                balanced_subsessions.extend(current_subsessions)
                balanced_labels.extend([label] * current_count)
            
            # Generate synthetic samples if needed
            if needed > 0:
                print(f"Generating {needed:,} synthetic samples for {label}...")
                
                for _ in range(needed):
                    synthetic_subsession = self.augmenter.generate_synthetic_subsession(
                        current_subsessions, self.augmentation_strength
                    )
                    
                    if synthetic_subsession:
                        balanced_subsessions.append(synthetic_subsession)
                        balanced_labels.append(label)
                        total_synthetic += 1
        
        print(f"\n✅ Balancing completed!")
        print(f"  • Original: {len(subsessions):,}")
        print(f"  • Synthetic: {total_synthetic:,}")
        print(f"  • Total: {len(balanced_subsessions):,}")
        
        # Shuffle the balanced dataset
        combined = list(zip(balanced_subsessions, balanced_labels))
        random.shuffle(combined)
        balanced_subsessions, balanced_labels = zip(*combined)
        
        return list(balanced_subsessions), list(balanced_labels)

# Progress marker printed once the definitions above have been executed.
print("✅ Data Augmentation and Balancing System Ready!")

# ============================================================================
# 3. TRAINING MONITORING AND DIAGNOSTICS
# ============================================================================

class TrainingMonitor(Callback):
    """Keras callback that logs per-epoch metrics and flags common issues.

    Attributes:
        model_type: label used as a prefix in the printed output.
        batch_losses: ``loss`` value recorded at the end of every batch.
        epoch_metrics: one dict per epoch with ``epoch``, ``loss``,
            ``val_loss``, ``accuracy`` and ``val_accuracy`` (missing values
            are recorded as 0).
    """

    def __init__(self, model_type="Model"):
        super().__init__()
        self.model_type = model_type
        self.batch_losses = []
        self.epoch_metrics = []

    def on_batch_end(self, batch, logs=None):
        """Record the batch loss (0 when no loss was reported)."""
        # logs defaults to None in the signature, so guard before .get().
        logs = logs or {}
        self.batch_losses.append(logs.get('loss', 0))

    def on_epoch_end(self, epoch, logs=None):
        """Store and print the epoch metrics, then run heuristic checks."""
        logs = logs or {}
        metrics = {
            'epoch': epoch + 1,
            'loss': logs.get('loss', 0),
            'val_loss': logs.get('val_loss', 0),
            'accuracy': logs.get('accuracy', 0),
            'val_accuracy': logs.get('val_accuracy', 0)
        }
        self.epoch_metrics.append(metrics)

        print(f"\n{self.model_type} Epoch {epoch + 1}:")
        print(f"  Loss: {metrics['loss']:.4f} → Val Loss: {metrics['val_loss']:.4f}")
        print(f"  Acc: {metrics['accuracy']:.4f} → Val Acc: {metrics['val_accuracy']:.4f}")

        # Issue detection: spread of the last (up to) 100 recorded batch
        # losses, checked only once epoch index exceeds 5.
        if epoch > 5:
            recent_losses = self.batch_losses[-100:]
            if len(recent_losses) > 10:
                loss_std = np.std(recent_losses)
                if loss_std < 0.001:
                    print("  ⚠️ Loss plateau detected - consider reducing learning rate")
                elif loss_std > 1.0:
                    print("  ⚠️ Loss instability - learning rate might be too high")

        # Check for improvement: none of the last three epochs beat the first
        # epoch of that window by more than 0.001.
        if epoch > 3:
            recent_val_acc = [m['val_accuracy'] for m in self.epoch_metrics[-3:]]
            if all(acc <= recent_val_acc[0] + 0.001 for acc in recent_val_acc):
                print("  📊 No improvement in validation accuracy for 3 epochs")

def diagnose_model_issues(model, X_sample, y_sample, model_type="Model"):
    """Print a diagnostic report for a model and a sample of its inputs.

    The report has four parts: summary statistics of ``X_sample``, the
    parameter count, a forward pass on the first sample, and a gradient
    check on the first 32 samples using categorical cross-entropy against
    ``y_sample``. Failures in the forward pass and gradient steps are caught
    and printed, not raised. Nothing is returned.
    """
    print(f"🔍 DIAGNOSING {model_type.upper()} ISSUES")
    print("=" * 50)

    # --- Input statistics -------------------------------------------------
    print("\n📊 Input Data Analysis:")
    print(f"  • Shape: {X_sample.shape}")
    print(f"  • Range: [{X_sample.min():.3f}, {X_sample.max():.3f}]")
    input_std = X_sample.std()
    print(f"  • Mean: {X_sample.mean():.3f}, Std: {input_std:.3f}")

    if input_std > 100:
        print("  ⚠️ Large input variance - consider normalization")

    # --- Model size -------------------------------------------------------
    print(f"\n🏗️ Model Architecture:")
    n_params = model.count_params()
    print(f"  • Parameters: {n_params:,}")

    if n_params > 1000000:
        print("  ⚠️ Large model - risk of overfitting")
    elif n_params < 10000:
        print("  ⚠️ Small model - risk of underfitting")

    # --- Forward pass on a single sample ----------------------------------
    print(f"\n🔄 Forward Pass Test:")
    try:
        outputs = model(X_sample[:1])
        print(f"  • Output shape: {outputs.shape}")
        output_values = outputs.numpy()
        print(f"  • Output range: [{output_values.min():.3f}, {output_values.max():.3f}]")

        has_nan = tf.math.reduce_any(tf.math.is_nan(outputs))
        if has_nan:
            print("  ❌ NaN outputs detected!")
        else:
            print("  ✅ Forward pass successful")

    except Exception as err:
        print(f"  ❌ Forward pass failed: {err}")

    # --- Gradient magnitudes on a batch of up to 32 samples ---------------
    print(f"\n📈 Gradient Flow Test:")
    try:
        with tf.GradientTape() as tape:
            batch_outputs = model(X_sample[:32])
            per_sample_loss = tf.keras.losses.categorical_crossentropy(y_sample[:32], batch_outputs)
            mean_loss = tf.reduce_mean(per_sample_loss)

        norms = []
        for grad in tape.gradient(mean_loss, model.trainable_variables):
            # Variables that received no gradient come back as None.
            if grad is not None:
                norms.append(tf.norm(grad).numpy())

        if norms:
            smallest, largest = min(norms), max(norms)
            print(f"  • Gradient norms: min={smallest:.6f}, max={largest:.6f}")

            if largest < 1e-6:
                print("  ⚠️ Vanishing gradients detected")
            elif largest > 10:
                print("  ⚠️ Exploding gradients detected")
            else:
                print("  ✅ Gradient flow looks healthy")

    except Exception as err:
        print(f"  ❌ Gradient computation failed: {err}")

# Progress marker printed once the definitions above have been executed.
print("✅ Training Monitoring and Diagnostics Ready!")

# ============================================================================
# 4. BASE PIPELINE CLASS
# ============================================================================

class BasePipeline:
    """Base class with common functionality for user classification pipelines.

    Covers the model-independent steps: splitting raw events into sessions
    and fixed-length subsessions, per-event feature extraction, optional
    label consolidation, numeric encoding/padding, feature scaling and
    evaluation. Subclasses are expected to set ``self.model`` and, for
    ``evaluate_model``, ``self.X_test`` / ``self.y_test`` / ``self.history``.

    The event frames are read through the columns ``userId``,
    ``timestamp_num`` (handled as milliseconds by the arithmetic below),
    ``eventName`` and ``pos``.
    """

    def __init__(self, session_gap_minutes=5, subsession_duration_seconds=30,
                 min_events_per_subsession=10, max_sequence_length=100):
        """Store the segmentation settings and create empty training state.

        Args:
            session_gap_minutes: inactivity gap that starts a new session.
            subsession_duration_seconds: length of each subsession window.
            min_events_per_subsession: windows with fewer events are dropped.
            max_sequence_length: events kept per subsession when encoding.
        """
        self.session_gap_minutes = session_gap_minutes
        self.subsession_duration_seconds = subsession_duration_seconds
        self.min_events_per_subsession = min_events_per_subsession
        self.max_sequence_length = max_sequence_length

        # Encoders and scalers
        self.event_encoder = LabelEncoder()
        self.label_encoder = LabelEncoder()
        self.feature_scaler = StandardScaler()

        # Model and training
        self.model = None
        self.history = None
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None

        # Label mapping
        self.label_mapper = None
        self.original_labels = None

    def segment_into_sessions(self, events_df, user_id):
        """Break one user's events into sessions based on inactivity gaps.

        A new session starts whenever the gap between consecutive events
        exceeds ``session_gap_minutes``.

        Returns:
            List of DataFrames (time-sorted, with helper columns
            ``time_diff_minutes`` and ``session_id``), one per session that
            has at least 2 events. Empty list when the user has no events.
        """
        user_events = events_df[events_df['userId'] == user_id]

        if len(user_events) == 0:
            return []

        user_events = user_events.sort_values('timestamp_num').copy()
        user_events['time_diff_minutes'] = user_events['timestamp_num'].diff() / (1000 * 60)

        # The first diff is NaN, which compares False, so the first event
        # always opens session 0.
        session_breaks = user_events['time_diff_minutes'] > self.session_gap_minutes
        user_events['session_id'] = session_breaks.cumsum()

        # session_id is non-decreasing, so grouping keeps chronological order.
        return [
            session_events.copy()
            for _, session_events in user_events.groupby('session_id')
            if len(session_events) >= 2
        ]

    def create_subsessions(self, session_events):
        """Cut a session into consecutive fixed-duration windows.

        The number of windows is the number of *whole*
        ``subsession_duration_seconds`` spans in the session (at least 1);
        events after the last window are not included. Windows with fewer
        than ``min_events_per_subsession`` events are dropped.

        Returns:
            List of DataFrames, one per retained window.
        """
        if len(session_events) == 0:
            return []

        session_events = session_events.sort_values('timestamp_num').copy()
        timestamps = session_events['timestamp_num']

        start_time = timestamps.min()
        end_time = timestamps.max()
        session_duration = (end_time - start_time) / 1000

        subsession_duration_ms = self.subsession_duration_seconds * 1000
        num_subsessions = max(1, int(session_duration / self.subsession_duration_seconds))

        subsessions = []

        for i in range(num_subsessions):
            subsession_start = start_time + (i * subsession_duration_ms)
            subsession_end = subsession_start + subsession_duration_ms

            # Half-open window [start, end)
            in_window = (timestamps >= subsession_start) & (timestamps < subsession_end)
            subsession_events = session_events[in_window].copy()

            if len(subsession_events) >= self.min_events_per_subsession:
                subsessions.append(subsession_events)

        return subsessions

    @staticmethod
    def _parse_position(raw_pos):
        """Return ``(x, y)`` from a position given as a mapping or JSON text.

        Falls back to ``(0, 0)`` when the value cannot be parsed or is not a
        mapping; a missing or null coordinate becomes 0.
        """
        import json

        try:
            pos_data = json.loads(raw_pos) if isinstance(raw_pos, str) else raw_pos
            x = pos_data.get('x', 0)
            y = pos_data.get('y', 0)
        except (AttributeError, TypeError, ValueError):
            # Malformed JSON (JSONDecodeError is a ValueError) or a value
            # without .get(), e.g. NaN or a list.
            return 0, 0

        # JSON null would otherwise leak None into the numeric feature array.
        return (0 if x is None else x), (0 if y is None else y)

    def extract_features_from_subsession(self, subsession_events):
        """Turn a subsession DataFrame into a list of per-event feature dicts.

        Each dict has ``event_type`` (from ``eventName``, default
        ``'unknown'``), ``timestamp_relative`` and ``time_since_last`` (both
        ``timestamp_num`` differences divided by 1000), and ``x_pos`` /
        ``y_pos`` (from ``pos``, default 0).

        Returns:
            List of dicts in chronological order, or ``None`` for an empty
            subsession.
        """
        if len(subsession_events) == 0:
            return None

        subsession_events = subsession_events.sort_values('timestamp_num').copy()

        # Read timestamps from the column once instead of .iloc per row.
        timestamps = subsession_events['timestamp_num'].to_numpy()
        start_time = subsession_events['timestamp_num'].min()
        prev_time = start_time

        features = []
        for (_, event), current_time in zip(subsession_events.iterrows(), timestamps):
            x_pos, y_pos = 0, 0

            # Extract position if available
            if 'pos' in event and event['pos']:
                x_pos, y_pos = self._parse_position(event['pos'])

            features.append({
                'event_type': event.get('eventName', 'unknown'),
                'timestamp_relative': (current_time - start_time) / 1000,
                'x_pos': x_pos,
                'y_pos': y_pos,
                'time_since_last': (current_time - prev_time) / 1000
            })
            prev_time = current_time

        return features

    def process_with_label_mapping(self, mixed_sample_events, label_mapping=None):
        """Convert raw events of every source into labelled subsessions.

        Args:
            mixed_sample_events: dict ``label -> data`` where
                ``data['events_df']`` is that source's event DataFrame.
            label_mapping: optional ``{consolidated: [originals]}`` dict.
                When given, labels are consolidated through a ``LabelMapper``
                which is stored on ``self.label_mapper``; the unmapped labels
                are stored on ``self.original_labels``.

        Returns:
            ``(subsessions, labels)``: feature lists and one label each.
        """
        print("🔄 PROCESSING EVENTS WITH LABEL MAPPING")
        print("=" * 45)

        all_subsessions = []
        all_labels = []

        for label, data in mixed_sample_events.items():
            print(f"\n📊 Processing {label}...")

            events_df = data['events_df']
            user_count = 0
            subsession_count = 0

            for user_id in events_df['userId'].unique():
                user_count += 1

                for session in self.segment_into_sessions(events_df, user_id):
                    for subsession in self.create_subsessions(session):
                        features = self.extract_features_from_subsession(subsession)

                        if features and len(features) >= self.min_events_per_subsession:
                            all_subsessions.append(features)
                            all_labels.append(label)
                            subsession_count += 1

            print(f"  ✓ {user_count} users → {subsession_count} subsessions")

        if not label_mapping:
            return all_subsessions, all_labels

        # Apply label mapping
        print(f"\n🏷️ APPLYING LABEL MAPPING")
        print("=" * 30)

        mapper = LabelMapper()
        mapper.create_mapping(label_mapping)

        original_labels = all_labels.copy()
        mapped_labels = mapper.map_labels(all_labels)

        mapper.analyze_distribution(original_labels, mapped_labels)

        self.label_mapper = mapper
        self.original_labels = original_labels

        return all_subsessions, mapped_labels

    def encode_subsessions(self, subsessions, labels):
        """Encode subsessions into a zero-padded numeric array.

        Fits ``self.event_encoder`` on all event types and
        ``self.label_encoder`` on ``labels``. Each event becomes
        ``[event_type_index, timestamp_relative, x_pos, y_pos,
        time_since_last]``; sequences are cut to ``max_sequence_length`` and
        padded with zeros at the end.

        Returns:
            ``(X, y)`` where ``X`` has shape ``(n, max_len, 5)`` and ``y`` is
            the one-hot encoding of the labels.

        Raises:
            ValueError: if ``subsessions`` is empty.
        """
        print("\n🔢 ENCODING SUBSESSIONS")
        print("=" * 25)

        if len(subsessions) == 0:
            raise ValueError("encode_subsessions requires at least one subsession")

        # Collect all event types
        all_event_types = {
            event['event_type'] for subsession in subsessions for event in subsession
        }

        self.event_encoder.fit(list(all_event_types))
        self.label_encoder.fit(labels)

        print(f"  • Event types: {len(all_event_types)}")
        print(f"  • Labels: {len(set(labels))}")

        # transform() maps a value to its position in classes_; building that
        # lookup once avoids one encoder call per event.
        event_index = {
            event_type: idx for idx, event_type in enumerate(self.event_encoder.classes_)
        }

        # Encode subsessions
        encoded_sequences = [
            [
                [
                    event_index[event['event_type']],
                    event['timestamp_relative'],
                    event['x_pos'],
                    event['y_pos'],
                    event['time_since_last']
                ]
                for event in subsession[:self.max_sequence_length]
            ]
            for subsession in subsessions
        ]

        # Pad sequences
        max_len = min(self.max_sequence_length, max(len(seq) for seq in encoded_sequences))
        X = np.zeros((len(encoded_sequences), max_len, 5))

        for i, sequence in enumerate(encoded_sequences):
            seq_len = min(len(sequence), max_len)
            if seq_len:  # an empty sequence stays all-zero padding
                X[i, :seq_len, :] = sequence[:seq_len]

        # Encode labels
        y = self.label_encoder.transform(labels)
        y = to_categorical(y)

        print(f"  • Encoded shape: {X.shape}")
        print(f"  • Labels shape: {y.shape}")

        return X, y

    def preprocess_features(self, X):
        """Rescale the continuous feature channels to the range [0, 1].

        Channel 0 (event type index) is left untouched. Every other channel
        is clipped to the 5th-95th percentile of its non-zero values and
        min-max scaled with those bounds. A channel with no non-zero values,
        or with equal percentiles, is left as is.

        Returns:
            A processed copy of ``X``; the input is not modified.
        """
        print("🔄 PREPROCESSING FEATURES")
        print("=" * 30)

        X_processed = X.copy()

        # Feature-wise normalization, skipping index 0 (categorical event type)
        for feature_idx in range(1, X.shape[2]):
            feature_data = X[:, :, feature_idx]

            # Zeros are excluded from the percentile fit (padding is zero).
            non_zero_data = feature_data[feature_data != 0]
            if len(non_zero_data) == 0:
                continue

            p5, p95 = np.percentile(non_zero_data, [5, 95])
            if p95 > p5:
                clipped = np.clip(feature_data, p5, p95)
                X_processed[:, :, feature_idx] = (clipped - p5) / (p95 - p5)

        print(f"  • Original range: [{X.min():.3f}, {X.max():.3f}]")
        print(f"  • Processed range: [{X_processed.min():.3f}, {X_processed.max():.3f}]")

        return X_processed

    def evaluate_model(self):
        """Print and plot evaluation results for ``self.model`` on the test set.

        Prints the classification report and confusion matrix, shows the
        confusion matrix as a heatmap, and, when ``self.history`` is set,
        plots the training curves.
        """
        print("\n📊 MODEL EVALUATION")
        print("=" * 25)

        y_pred = self.model.predict(self.X_test)
        y_pred_classes = y_pred.argmax(axis=1)
        y_true_classes = self.y_test.argmax(axis=1)

        class_names = self.label_encoder.classes_
        # Pin the label set explicitly: without it, a class missing from both
        # y_true and y_pred makes target_names mismatch and the report raises,
        # and the matrix would no longer line up with the tick labels.
        label_ids = np.arange(len(class_names))

        print("\nClassification Report:")
        print(classification_report(
            y_true_classes, y_pred_classes,
            labels=label_ids, target_names=class_names, zero_division=0
        ))

        # Confusion matrix
        cm = confusion_matrix(y_true_classes, y_pred_classes, labels=label_ids)
        print(cm)

        plt.figure(figsize=(10, 8))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                    xticklabels=class_names, yticklabels=class_names)
        plt.title(f'{self.__class__.__name__} - Confusion Matrix')
        plt.ylabel('True Label')
        plt.xlabel('Predicted Label')
        plt.tight_layout()
        plt.show()

        # Training history
        if self.history:
            history = self.history.history
            fig, axes = plt.subplots(1, 2, figsize=(15, 5))

            panels = (
                ('accuracy', 'Model Accuracy', 'Accuracy'),
                ('loss', 'Model Loss', 'Loss'),
            )
            for ax, (metric, title, ylabel) in zip(axes, panels):
                ax.plot(history[metric], label='Training')
                # Validation curves exist only when validation data was used.
                if f'val_{metric}' in history:
                    ax.plot(history[f'val_{metric}'], label='Validation')
                ax.set_title(title)
                ax.set_xlabel('Epoch')
                ax.set_ylabel(ylabel)
                ax.legend()

            plt.tight_layout()
            plt.show()

# Progress marker printed once the definitions above have been executed.
print("✅ Base Pipeline Class Ready!")

# ============================================================================
# 5. ADVANCED LSTM PIPELINE
# ============================================================================

class LSTMPipeline(BasePipeline):
    """Pipeline that trains a stacked-LSTM classifier on encoded subsessions.

    Relies on helpers inherited from ``BasePipeline``
    (``process_with_label_mapping``, ``encode_subsessions``,
    ``preprocess_features``, ``evaluate_model``) and adds the LSTM
    architecture, its training loop and the end-to-end ``run_*`` entry points.
    """

    def __init__(self, session_gap_minutes=5, subsession_duration_seconds=30,
                 min_events_per_subsession=10, max_sequence_length=100):
        """Forward the four settings unchanged to ``BasePipeline.__init__``."""
        super().__init__(session_gap_minutes, subsession_duration_seconds,
                         min_events_per_subsession, max_sequence_length)

    def build_lstm_model(self, input_shape, num_classes):
        """Build and compile the LSTM classifier and store it on ``self.model``.

        Args:
            input_shape: Shape of one sample without the batch axis;
                ``run_complete_pipeline`` passes ``(X.shape[1], X.shape[2])``.
            num_classes: Width of the final softmax layer.

        Returns:
            The compiled ``Sequential`` model (also kept as ``self.model``).
        """
        print("\n🏗️ BUILDING ADVANCED LSTM MODEL")
        print("=" * 40)

        model = Sequential([
            # Declare the input explicitly: passing ``input_shape=`` to the
            # first layer is deprecated in current Keras releases.
            Input(shape=input_shape),

            # Input normalization
            BatchNormalization(),

            # First LSTM layer (keeps the full sequence for the next LSTM)
            LSTM(128, return_sequences=True, dropout=0.2, recurrent_dropout=0.2),
            BatchNormalization(),

            # Second LSTM layer (collapses the sequence to one vector)
            LSTM(64, return_sequences=False, dropout=0.2, recurrent_dropout=0.2),
            BatchNormalization(),

            # Dense layers with regularization
            Dense(64, activation='relu'),
            Dropout(0.3),
            Dense(32, activation='relu'),
            Dropout(0.2),
            Dense(num_classes, activation='softmax')
        ])

        # Conservative optimizer; clipnorm caps the gradient norm at 1.0
        optimizer = Adam(
            learning_rate=0.001,
            beta_1=0.9,
            beta_2=0.999,
            clipnorm=1.0
        )

        model.compile(
            optimizer=optimizer,
            loss='categorical_crossentropy',
            metrics=['accuracy']
        )

        print(f"  • Input shape: {input_shape}")
        print(f"  • Output classes: {num_classes}")
        print(f"  • Total parameters: {model.count_params():,}")
        print("  • Regularization: Dropout + BatchNorm + Gradient clipping")

        self.model = model
        return model

    def train_model(self, X, y, validation_split=0.2, epochs=6, batch_size=32):
        """Split the data, fit ``self.model`` and return the Keras history.

        Args:
            X: Encoded feature array; passed through ``preprocess_features``
                before splitting.
            y: One-hot targets (``argmax(axis=1)`` is used for stratification).
            validation_split: Fraction handed to ``train_test_split`` as
                ``test_size``.
            epochs: Maximum number of training epochs.
            batch_size: Mini-batch size passed to ``fit``.

        Returns:
            The ``History`` object returned by ``fit`` (also ``self.history``).

        Raises:
            RuntimeError: If no model has been built yet.
        """
        # Fail before the preprocessing/splitting work instead of with an
        # opaque attribute error at fit() time.
        if getattr(self, "model", None) is None:
            raise RuntimeError("build_lstm_model() must be called before train_model()")

        print("\n🚀 TRAINING ADVANCED LSTM MODEL")
        print("=" * 40)

        # Preprocess features
        X_processed = self.preprocess_features(X)

        # Split data.
        # NOTE(review): the held-out part is used both as validation data for
        # the callbacks below and as the test set in evaluate_model, so the
        # reported test metrics are not from data unseen during model selection.
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            X_processed, y, test_size=validation_split, random_state=42,
            stratify=y.argmax(axis=1)
        )

        print(f"  • Training samples: {len(self.X_train):,}")
        print(f"  • Testing samples: {len(self.X_test):,}")

        # Callbacks.
        # NOTE(review): the default epochs=6 is shorter than both patience
        # windows (12 and 6), so neither callback can fire unless the caller
        # passes a larger ``epochs``.
        monitor = TrainingMonitor("LSTM")
        callbacks = [
            monitor,
            EarlyStopping(
                patience=12,
                restore_best_weights=True,
                monitor='val_accuracy',
                mode='max'
            ),
            ReduceLROnPlateau(
                patience=6,
                factor=0.5,
                min_lr=1e-6,
                monitor='val_accuracy',
                mode='max',
                verbose=1
            )
        ]

        # Train model
        self.history = self.model.fit(
            self.X_train, self.y_train,
            validation_data=(self.X_test, self.y_test),
            epochs=epochs,
            batch_size=batch_size,
            callbacks=callbacks,
            verbose=1
        )

        return self.history

    def run_complete_pipeline(self, mixed_sample_events, enable_balancing=True,
                            label_mapping=None, diagnose=True):
        """Run label mapping, balancing, encoding, training and evaluation.

        Args:
            mixed_sample_events: Raw events handed to
                ``process_with_label_mapping``.
            enable_balancing: When true, run ``DataBalancer.balance_data`` on
                the subsessions before encoding.
            label_mapping: Mapping forwarded to ``process_with_label_mapping``.
            diagnose: When true, call ``diagnose_model_issues`` on the first
                100 encoded samples before training.

        Returns:
            The trained model, or ``None`` when no subsessions were produced.
        """
        print("🚀 RUNNING COMPLETE ADVANCED LSTM PIPELINE")
        print("=" * 50)

        # Step 1: Process with label mapping
        subsessions, labels = self.process_with_label_mapping(mixed_sample_events, label_mapping)

        if len(subsessions) == 0:
            print("❌ No subsessions generated")
            return None

        # Step 2: Balance data.
        # NOTE(review): balancing runs before the train/test split done in
        # train_model; if the balancer adds synthetic samples they can end up
        # in the evaluation split -- confirm this is intended.
        if enable_balancing:
            balancer = DataBalancer(
                target_balance_strategy='oversample_to_max',
                max_synthetic_ratio=2.0,
                augmentation_strength=0.3
            )
            subsessions, labels = balancer.balance_data(subsessions, labels)

        # Step 3: Encode subsessions
        X, y = self.encode_subsessions(subsessions, labels)

        # Step 4: Build model
        input_shape = (X.shape[1], X.shape[2])
        num_classes = y.shape[1]
        self.build_lstm_model(input_shape, num_classes)

        # Step 5: Diagnose if requested
        if diagnose:
            diagnose_model_issues(self.model, X[:100], y[:100], "LSTM")

        # Step 6: Train model (uses train_model's default hyperparameters)
        self.train_model(X, y)

        # Step 7: Evaluate model
        self.evaluate_model()

        print("\n✅ Advanced LSTM Pipeline completed!")
        return self.model

    # Convenience methods: thin wrappers that pick a label mapping
    def run_with_human_vs_bot(self, mixed_sample_events, enable_balancing=True):
        """Run the pipeline with ``SUGGESTED_MAPPINGS['human_vs_bot']``."""
        return self.run_complete_pipeline(
            mixed_sample_events, enable_balancing, SUGGESTED_MAPPINGS['human_vs_bot']
        )

    def run_with_granular_mapping(self, mixed_sample_events, enable_balancing=True):
        """Run the pipeline with ``SUGGESTED_MAPPINGS['granular']``."""
        return self.run_complete_pipeline(
            mixed_sample_events, enable_balancing, SUGGESTED_MAPPINGS['granular']
        )

    def run_with_default_mapping(self, mixed_sample_events, enable_balancing=False):
        """Run the pipeline with ``SUGGESTED_MAPPINGS['default']``.

        Unlike the other wrappers, balancing is off by default here.
        """
        return self.run_complete_pipeline(
            mixed_sample_events, enable_balancing, SUGGESTED_MAPPINGS['default']
        )

    def run_with_custom_mapping(self, mixed_sample_events, custom_mapping, enable_balancing=True):
        """Run the pipeline with a caller-supplied label mapping."""
        return self.run_complete_pipeline(
            mixed_sample_events, enable_balancing, custom_mapping
        )

# Notebook progress message: reached once the LSTMPipeline class body above has been executed.
print("✅ Advanced LSTM Pipeline Ready!")

# ============================================================================
# 6. PIPELINE INITIALIZATION AND UTILITIES
# ============================================================================

def initialize_pipelines():
    """Create and return the LSTM pipeline used by this notebook.

    Prints a banner, builds a single ``LSTMPipeline`` with fixed settings,
    prints a confirmation and returns the instance. Only an LSTM pipeline is
    created here.
    """
    print("🚀 INITIALIZING ADVANCED PIPELINES")
    print("=" * 40)

    # Fixed configuration for the LSTM pipeline
    lstm_settings = dict(
        session_gap_minutes=5,
        subsession_duration_seconds=0.5,
        min_events_per_subsession=8,
        max_sequence_length=1000,
    )
    pipeline = LSTMPipeline(**lstm_settings)

    print("✅ Pipeline initialized successfully!")
    return pipeline

def show_available_mappings():
    """Print every scenario in ``SUGGESTED_MAPPINGS`` with its label groups.

    For each scenario the upper-cased name is printed, followed by each
    consolidated label and the original labels listed under it.
    """
    print("\n🎯 AVAILABLE LABEL MAPPINGS")
    print("=" * 35)

    for scenario_name in SUGGESTED_MAPPINGS:
        groups = SUGGESTED_MAPPINGS[scenario_name]
        print(f"\n📋 {scenario_name.upper()}:")
        print("-" * 25)
        for target_label in groups:
            print(f"🏷️ {target_label}:")
            for source_label in groups[target_label]:
                print(f"   • {source_label}")


# Build the module-level LSTM pipeline instance; the usage examples printed
# further down refer to it by this name.
lstm_pipeline = initialize_pipelines()

print("\n🎯 USAGE EXAMPLES:")
print("=" * 20)

# Usage text only: the string below is printed, never executed.
# NOTE(review): it refers to `transformer_pipeline` and `compare_models`,
# neither of which is defined in the visible part of this file -- confirm
# they exist elsewhere or trim the examples.
examples = '''
# Example 1: Quick human vs bot classification with LSTM
lstm_model = lstm_pipeline.run_with_human_vs_bot(mixed_sample_events)

# Example 2: Transformer with custom mapping
custom_mapping = {
    'legitimate_users': ['tax1_users', 'tax2_users', 'survey_desktop'],
    'automation_bots': ['browser_use_agents', 'skyvern_agents'],
    'malicious_bots': ['random_mouse_bot', 'gremlins']
}
transformer_model = transformer_pipeline.run_with_custom_mapping(
    mixed_sample_events, custom_mapping
)

# Example 3: Compare both models
results = compare_models(
    lstm_pipeline, transformer_pipeline, mixed_sample_events, 
    SUGGESTED_MAPPINGS['human_vs_bot']
)

# Example 4: Show available mappings
show_available_mappings()

# Example 5: Full pipeline with all options
model = lstm_pipeline.run_complete_pipeline(
    mixed_sample_events,
    enable_balancing=True,
    label_mapping=SUGGESTED_MAPPINGS['granular'],
    diagnose=True
)
'''

print(examples)

# Closing feature summary, emitted with a single print call.
print("\n".join((
    "🎉 ADVANCED USER CLASSIFICATION MODELS READY!",
    "=" * 50,
    "✅ Features included:",
    "  • Optimized LSTM and Transformer architectures",
    "  • Advanced data balancing and augmentation",
    "  • Flexible label mapping system",
    "  • Comprehensive training monitoring",
    "  • Built-in diagnostics and issue detection",
    "  • Easy-to-use convenience methods",
    "  • Model comparison utilities",
    "  • Complete data loading and processing pipeline",
)))

# ============================================================================
# 7. EXAMPLE
# ============================================================================

print("\n🚀 Ready to classify user interactions with state-of-the-art models!")
print("\n📋 QUICK START OPTIONS:")
print("=" * 25)

# Train model for classification (left disabled; uncomment to run training)
# lstm_model = lstm_pipeline.run_with_granular_mapping(mixed_sample_events)

# NOTE(review): `workflow` is not assigned anywhere in the visible part of
# this file; unless it is defined earlier, this line raises NameError -- confirm.
print(workflow)


In [None]:
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

# TensorFlow/Keras imports
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import (
    LSTM, Dense, Dropout, BatchNormalization, LayerNormalization,
    MultiHeadAttention, GlobalAveragePooling1D, Input, Add,
    Conv1D, MaxPooling1D, Flatten, GlobalMaxPooling1D
)
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, Callback
from tensorflow.keras.utils import to_categorical


class CNNPipeline(BasePipeline):
    """
    1D CNN pipeline that tries Conv1D first and falls back to Dense layers.

    Relies on helpers inherited from ``BasePipeline``
    (``process_with_label_mapping``, ``encode_subsessions``,
    ``preprocess_features``, ``evaluate_model``). ``use_conv1d`` records which
    of the two architectures ``build_cnn_model`` ended up with.
    """

    def __init__(self, session_gap_minutes=5, subsession_duration_seconds=1,
                 min_events_per_subsession=10, max_sequence_length=100,
                 num_filters=(64, 128, 256), kernel_sizes=(3, 5, 7),
                 pool_sizes=(2, 2, 2), dropout_rate=0.3):
        """Store the CNN hyperparameters and forward the rest to ``BasePipeline``.

        Args:
            session_gap_minutes, subsession_duration_seconds,
            min_events_per_subsession, max_sequence_length: Forwarded
                unchanged to ``BasePipeline.__init__``.
            num_filters: Filter counts per convolution block.
            kernel_sizes: Kernel sizes per convolution block.
            pool_sizes: Max-pooling sizes per convolution block.
            dropout_rate: Dropout rate used throughout the model.

        Note:
            ``_build_conv1d_model`` builds two convolution blocks, so only the
            first two entries of each sequence are read.
        """
        super().__init__(session_gap_minutes, subsession_duration_seconds,
                         min_events_per_subsession, max_sequence_length)

        # CNN hyperparameters. Copied into fresh lists so instances never share
        # state with each other or with the caller's sequences (the defaults
        # are immutable tuples for the same reason).
        self.num_filters = list(num_filters)
        self.kernel_sizes = list(kernel_sizes)
        self.pool_sizes = list(pool_sizes)
        self.dropout_rate = dropout_rate
        self.use_conv1d = True  # Will be set to False if Conv1D fails

    def build_cnn_model(self, input_shape, num_classes):
        """Build the Conv1D model, falling back to a dense model on failure.

        The Conv1D model is built and run once on a random single-sample
        batch; any exception raised by either step triggers the dense fallback.

        Args:
            input_shape: Shape of one sample without the batch axis.
            num_classes: Width of the final softmax layer.

        Returns:
            The compiled model (also stored on ``self.model``).
        """
        print("\n🏗️ BUILDING 1D CNN MODEL WITH FALLBACK")
        print("=" * 45)

        try:
            # Try to build with Conv1D layers first
            print("  🔄 Attempting Conv1D layers...")
            model = self._build_conv1d_model(input_shape, num_classes)

            # Test if Conv1D works by doing a small forward pass.
            # tuple() keeps a list-valued input_shape from raising TypeError
            # here, which would otherwise silently select the dense fallback.
            test_input = tf.random.normal((1,) + tuple(input_shape))
            _ = model(test_input)

            print("  ✅ Conv1D layers working successfully!")
            self.use_conv1d = True

        except Exception as e:
            # Deliberately broad: any failure of the Conv1D path is treated as
            # "Conv1D unavailable" and answered with the dense architecture.
            print(f"  ⚠️ Conv1D failed ({str(e)[:50]}...), falling back to Dense layers")
            model = self._build_dense_model(input_shape, num_classes)
            self.use_conv1d = False

        self.model = model
        return model

    def _build_conv1d_model(self, input_shape, num_classes):
        """Build and compile the two-block Conv1D classifier."""
        model = Sequential()

        # Declare the input explicitly: passing ``input_shape=`` to the first
        # layer is deprecated in current Keras releases.
        model.add(Input(shape=input_shape))

        # Input normalization
        model.add(BatchNormalization())

        # First CNN block
        model.add(Conv1D(
            filters=self.num_filters[0],
            kernel_size=self.kernel_sizes[0],
            activation='relu',
            padding='same'
        ))
        model.add(BatchNormalization())
        model.add(MaxPooling1D(pool_size=self.pool_sizes[0]))
        model.add(Dropout(self.dropout_rate))

        # Second CNN block
        model.add(Conv1D(
            filters=self.num_filters[1],
            kernel_size=self.kernel_sizes[1],
            activation='relu',
            padding='same'
        ))
        model.add(BatchNormalization())
        model.add(MaxPooling1D(pool_size=self.pool_sizes[1]))
        model.add(Dropout(self.dropout_rate))

        # Global pooling collapses the time axis
        model.add(GlobalMaxPooling1D())

        # Dense layers
        model.add(Dense(128, activation='relu'))
        model.add(Dropout(self.dropout_rate))
        model.add(Dense(64, activation='relu'))
        model.add(Dropout(self.dropout_rate * 0.5))
        model.add(Dense(num_classes, activation='softmax'))

        model.compile(
            optimizer=Adam(learning_rate=0.001, clipnorm=1.0),
            loss='categorical_crossentropy',
            metrics=['accuracy']
        )

        print("  • Architecture: Conv1D layers")
        return model

    def _build_dense_model(self, input_shape, num_classes):
        """Build and compile the dense fallback classifier."""
        model = Sequential()

        # Explicit input followed by flattening of each sample
        model.add(Input(shape=input_shape))
        model.add(Flatten())

        # Fully connected stack used in place of the convolution blocks
        model.add(Dense(256, activation='relu'))
        model.add(BatchNormalization())
        model.add(Dropout(self.dropout_rate))

        model.add(Dense(128, activation='relu'))
        model.add(BatchNormalization())
        model.add(Dropout(self.dropout_rate))

        model.add(Dense(64, activation='relu'))
        model.add(Dropout(self.dropout_rate))

        model.add(Dense(num_classes, activation='softmax'))

        model.compile(
            optimizer=Adam(learning_rate=0.001, clipnorm=1.0),
            loss='categorical_crossentropy',
            metrics=['accuracy']
        )

        print("  • Architecture: Dense layers (CPU fallback)")
        return model

    def train_model(self, X, y, validation_split=0.2, epochs=50, batch_size=32):
        """Split the data, fit ``self.model`` and return the Keras history.

        Args:
            X: Encoded feature array; passed through ``preprocess_features``
                before splitting.
            y: One-hot targets (``argmax(axis=1)`` is used for stratification).
            validation_split: Fraction handed to ``train_test_split`` as
                ``test_size``.
            epochs: Maximum number of training epochs.
            batch_size: Mini-batch size passed to ``fit``.

        Returns:
            The ``History`` object returned by ``fit`` (also ``self.history``).

        Raises:
            RuntimeError: If no model has been built yet.
        """
        # Fail before the preprocessing/splitting work instead of with an
        # opaque attribute error at fit() time.
        if getattr(self, "model", None) is None:
            raise RuntimeError("build_cnn_model() must be called before train_model()")

        print(f"\n🚀 TRAINING {'CONV1D' if self.use_conv1d else 'DENSE'} MODEL")
        print("=" * 40)

        # Preprocess features
        X_processed = self.preprocess_features(X)

        # Split data.
        # NOTE(review): the held-out part is used both as validation data for
        # the callbacks below and as the test set in evaluate_model.
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            X_processed, y, test_size=validation_split, random_state=42,
            stratify=y.argmax(axis=1)
        )

        print(f"  • Training samples: {len(self.X_train):,}")
        print(f"  • Testing samples: {len(self.X_test):,}")

        # Callbacks
        monitor = TrainingMonitor("Alternative CNN")
        callbacks = [
            monitor,
            EarlyStopping(
                patience=10,
                restore_best_weights=True,
                monitor='val_accuracy',
                mode='max'
            ),
            ReduceLROnPlateau(
                patience=5,
                factor=0.5,
                min_lr=1e-6,
                monitor='val_accuracy',
                mode='max',
                verbose=1
            )
        ]

        # Train model
        self.history = self.model.fit(
            self.X_train, self.y_train,
            validation_data=(self.X_test, self.y_test),
            epochs=epochs,
            batch_size=batch_size,
            callbacks=callbacks,
            verbose=1
        )

        return self.history

    def run_complete_pipeline(self, mixed_sample_events, enable_balancing=True,
                            label_mapping=None, diagnose=True):
        """Run label mapping, balancing, encoding, training and evaluation.

        Args:
            mixed_sample_events: Raw events handed to
                ``process_with_label_mapping``.
            enable_balancing: When true, run ``DataBalancer.balance_data`` on
                the subsessions before encoding.
            label_mapping: Mapping forwarded to ``process_with_label_mapping``.
            diagnose: When true, call ``diagnose_model_issues`` on the first
                100 encoded samples before training.

        Returns:
            The trained model, or ``None`` when no subsessions were produced.
        """
        print("🚀 RUNNING ALTERNATIVE 1D CNN PIPELINE")
        print("=" * 45)

        # Step 1: Process with label mapping
        subsessions, labels = self.process_with_label_mapping(mixed_sample_events, label_mapping)

        if len(subsessions) == 0:
            print("❌ No subsessions generated")
            return None

        # Step 2: Balance data.
        # NOTE(review): balancing runs before the train/test split done in
        # train_model; if the balancer adds synthetic samples they can end up
        # in the evaluation split -- confirm this is intended.
        if enable_balancing:
            balancer = DataBalancer(
                target_balance_strategy='oversample_to_max',
                max_synthetic_ratio=2.0,
                augmentation_strength=0.3
            )
            subsessions, labels = balancer.balance_data(subsessions, labels)

        # Step 3: Encode subsessions
        X, y = self.encode_subsessions(subsessions, labels)

        # Step 4: Build model
        input_shape = (X.shape[1], X.shape[2])
        num_classes = y.shape[1]
        self.build_cnn_model(input_shape, num_classes)

        # Step 5: Diagnose if requested
        if diagnose:
            diagnose_model_issues(self.model, X[:100], y[:100], "Alternative CNN")

        # Step 6: Train model (uses train_model's default hyperparameters)
        self.train_model(X, y)

        # Step 7: Evaluate model
        self.evaluate_model()

        print(f"\n✅ CNN Pipeline completed using {'Conv1D' if self.use_conv1d else 'Dense'} layers!")
        return self.model

    # Convenience methods: thin wrappers that pick a label mapping
    def run_with_human_vs_bot(self, mixed_sample_events, enable_balancing=True):
        """Run the pipeline with ``SUGGESTED_MAPPINGS['human_vs_bot']``."""
        return self.run_complete_pipeline(
            mixed_sample_events, enable_balancing, SUGGESTED_MAPPINGS['human_vs_bot']
        )

    def run_with_granular_mapping(self, mixed_sample_events, enable_balancing=True):
        """Run the pipeline with ``SUGGESTED_MAPPINGS['granular']``."""
        return self.run_complete_pipeline(
            mixed_sample_events, enable_balancing, SUGGESTED_MAPPINGS['granular']
        )

    def run_with_default_mapping(self, mixed_sample_events, enable_balancing=True):
        """Run the pipeline with ``SUGGESTED_MAPPINGS['default']``."""
        return self.run_complete_pipeline(
            mixed_sample_events, enable_balancing, SUGGESTED_MAPPINGS['default']
        )

    def run_with_custom_mapping(self, mixed_sample_events, custom_mapping, enable_balancing=True):
        """Run the pipeline with a caller-supplied label mapping."""
        return self.run_complete_pipeline(
            mixed_sample_events, enable_balancing, custom_mapping
        )

# Notebook progress message: reached once the CNNPipeline class body above has been executed.
print("✅ 1D CNN Pipeline Ready!")

# cnn_pipeline = CNNPipeline(
#    session_gap_minutes=5,
#    subsession_duration_seconds=0.5,
#    min_events_per_subsession=10,
#    max_sequence_length=100
# )

# Train normally - it will handle the DNN library issue automatically
# cnn_model = cnn_pipeline.run_with_default_mapping(mixed_sample_events)
