In [None]:
import dash
from dash import dcc, html, dash_table
from dash.dependencies import Input, Output, State
import dash_bootstrap_components as dbc
import plotly.express as px
import plotly.graph_objects as go
import pandas as pd
import numpy as np
import warnings
import os
import io
import re
from urllib.parse import urlparse
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
import seaborn as sns
import matplotlib.pyplot as plt
import threading
import webbrowser
import time
from datetime import datetime, timedelta
import json
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_error, roc_auc_score
from sklearn.metrics import silhouette_score
# Disable specific Matplotlib warnings
warnings.filterwarnings("ignore", category=UserWarning, message="Glyph 0x21d2 not found")
warnings.filterwarnings("ignore", category=FutureWarning, message="The default of observed=False is deprecated")

# --- Class and Function Definitions ---

class DataLoader:
    """Load the raw traffic CSV and derive the columns used by the analysis.

    Args:
        file_path: Path of the CSV file to read.
    """

    # id_site -> name of the website company being analysed. This is distinct
    # from the visitor's company, which comes from 'ultimate_parent_name'.
    SITE_COMPANY_MAP = {
        2: 'Company A',
        3: 'Company B',
        4: 'Company C',
        5: 'Company D',
        7: 'Company E'
    }

    def __init__(self, file_path):
        self.file_path = file_path

    def extract_website_company(self, row):
        """Return the website company name for ``row['id_site']``.

        Unmapped site ids yield ``'Unknown'``.
        """
        return DataLoader.SITE_COMPANY_MAP.get(row['id_site'], 'Unknown')

    def load_and_preprocess_data(self):
        """Load and preprocess the raw traffic data.

        The CSV is cleaned (header normalisation, invalid/'Other' sections
        removed, duplicate events dropped, unparseable timestamps dropped) and
        the columns 'company', 'website_company' and 'is_new_visitor_session'
        are added.

        Returns:
            The cleaned DataFrame sorted by ('id_visit', 'timestamp'), or an
            empty DataFrame when the file does not exist.
        """
        try:
            df_all = pd.read_csv(self.file_path)
        except FileNotFoundError:
            print(f"Error: File not found at {self.file_path}.")
            return pd.DataFrame()

        # 1. Normalise headers: trim whitespace and drop '#' markers.
        df_all.columns = [col.strip().replace('#', '').strip() for col in df_all.columns]

        for text_col in ('section', 'sub-section'):
            df_all[text_col] = df_all[text_col].astype(str).str.strip()

        # Rows with blank / 'nan' sections or sub-sections are invalid for the
        # analysis; the 'Other' section is excluded as well.
        section = df_all['section']
        sub_section = df_all['sub-section']
        valid_rows = (
            (section != '')
            & (sub_section != '')
            & (section.str.lower() != 'nan')
            & (sub_section.str.lower() != 'nan')
            & (section != 'Other')
        )
        df_all = df_all[valid_rows]

        # 2. Drop duplicated events. The explicit copy makes sure the column
        # assignments below never target a slice of the raw frame.
        core_event_identifiers = ['id_visit', 'url', 'timestamp', 'download_flag', 'time_spent']
        df_all = df_all.drop_duplicates(subset=core_event_identifiers, keep='first').copy()

        # Rows whose timestamp cannot be parsed are discarded.
        df_all['timestamp'] = pd.to_datetime(df_all['timestamp'], errors='coerce')
        df_all = df_all.dropna(subset=['timestamp'])
        df_all = df_all.sort_values(['id_visit', 'timestamp'])

        # 3. Visitor company and website company.
        df_all['company'] = df_all['ultimate_parent_name'].fillna('Unknown')

        # Element-wise lookup instead of a row-wise apply: a row-wise apply on
        # a frame with no rows returns a DataFrame, which cannot be assigned
        # to a single column.
        site_names = DataLoader.SITE_COMPANY_MAP
        df_all['website_company'] = df_all['id_site'].map(
            lambda site_id: site_names.get(site_id, 'Unknown')
        )

        # 4. Flag sessions that are the visitor's first session.
        df_all['visitor_id'] = pd.to_numeric(df_all['visitor_id'], errors='coerce')
        known_visitors = df_all.dropna(subset=['visitor_id'])

        if known_visitors.empty:
            print("Warning: No valid visitor_ids found after filtering. 'is_new_visitor_session' will be all False.")
            df_all['is_new_visitor_session'] = False
        else:
            # Earliest timestamp ever seen per visitor, and per session.
            visitor_first_seen = known_visitors.groupby('visitor_id')['timestamp'].min()
            session_start = known_visitors.groupby('id_visit')['timestamp'].min()
            session_visitor = known_visitors.groupby('id_visit')['visitor_id'].first()

            # Both sides are indexed by id_visit, so they compare directly.
            is_first_session = session_start == session_visitor.map(visitor_first_seen)
            first_session_ids = is_first_session[is_first_session].index

            # isin() always yields a plain boolean column; sessions without a
            # valid visitor_id are simply False.
            df_all['is_new_visitor_session'] = df_all['id_visit'].isin(first_session_ids)

        return df_all

class BehaviorAnalyzer:
    # Candidate feature columns for clustering and profiling visitors.
    # perform_clustering_and_profiling() uses whichever of these columns are
    # present in the visitor-profile DataFrame.
    CLUSTERING_FEATURES = [
        'avg_session_depth', 'download_count', 'investor_interest_score',
        'content_breadth', 'visit_count', 'is_repeat_visitor',
        'has_download', 'has_ir', 'esg_visitor',
        'ir_only_visitor', 'frequent_downloader', 'deep_path_visitor', 'is_high_intent'
    ]

    def __init__(self, dataframe):
        self.df = dataframe.copy()
    
    def calculate_pageview_metrics(self):
        """Calculates key metrics related to pageviews and content engagement."""
        if self.df.empty:
            return {'metrics_data': {}}
            
        pageviews_per_category = self.df['section'].value_counts()
            
        # Filter out pages with 0 time_spent for duration calculation, as these might be bounces or quick redirects
        non_end_pages = self.df[self.df['time_spent'] > 0]
        avg_duration_per_category = non_end_pages.groupby('section')['time_spent'].mean().sort_values(ascending=False)
            
        # "End pages" are defined as pages with 0 time_spent, potentially exit pages
        end_pages = self.df[self.df['time_spent'] == 0]
        end_page_count_per_category = end_pages['section'].value_counts()
            
        total_visits = self.df['id_visit'].nunique()
        # End rate is the count of exit pages in a section divided by total visits (approximation of exit rate for sections)
        end_rate_per_category = (end_page_count_per_category / total_visits).sort_values(ascending=False)
            
        alpha = 100 
        real_interest_score = avg_duration_per_category.add(alpha * end_rate_per_category, fill_value=0).sort_values(ascending=False)
            
        metrics_data = {
            'top_pageview_count': int(pageviews_per_category.iloc[0]) if not pageviews_per_category.empty else 0,
            'top_pageview_section': pageviews_per_category.index[0] if not pageviews_per_category.empty else 'N/A',
            'top_duration_section': avg_duration_per_category.index[0] if not avg_duration_per_category.empty else 'N/A',
            'top_duration_time': round(avg_duration_per_category.iloc[0], 2) if not avg_duration_per_category.empty else 0.0,
            'highest_end_rate_section': end_rate_per_category.index[0] if not end_rate_per_category.empty else 'N/A',
            'highest_end_rate_value': round(end_rate_per_category.iloc[0], 4) if not end_rate_per_category.empty else 0.0,
            'top_interest_score_section': real_interest_score.index[0] if not real_interest_score.empty else 'N/A',
            'top_interest_score_value': round(real_interest_score.iloc[0], 2) if not real_interest_score.empty else 0.0,
        }
        return {'metrics_data': metrics_data}

    def analyze_session_path_length_and_repeat_visitors(self):
        """Calculates session length and identifies new vs. repeat visitors."""
        if self.df.empty:
            return {
                'metrics_data': {
                    'short_visits_count': 0,
                    'deep_visits_count': 0,
                    'repeat_visitors_count': 0,
                    'new_visits_count': 0,
                    'returning_visits_count': 0,
                    'new_depth': 0.0,
                    'returning_depth': 0.0
                }
            }
            
        visit_counts = self.df.groupby('id_visit').size() # Number of pageviews per visit
        # Define thresholds for short/deep visits based on percentiles (e.g., 99th percentile for normal sessions)
        session_length_threshold = visit_counts.quantile(0.99) if not visit_counts.empty else 0
        normal_sessions = visit_counts[visit_counts <= session_length_threshold] if not visit_counts.empty else pd.Series(dtype='int64')
        
        # Ensure short_visits and deep_visits are calculated before being used.
        # Short visits are sessions with only 1 pageview
        short_visits = (normal_sessions == 1).sum() if not normal_sessions.empty else 0 
        # Deep visits are sessions with 5 or more pageviews
        deep_visits = (normal_sessions >= 5).sum() if not normal_sessions.empty else 0
            
        visitor_visit_counts = self.df.groupby('visitor_id')['id_visit'].nunique()
        visit_count_threshold = visitor_visit_counts.quantile(0.99) if not visitor_visit_counts.empty else 0
        normal_visitors = visitor_visit_counts[visitor_visit_counts <= visit_count_threshold] if not visitor_visit_counts.empty else pd.Series(dtype='int64')
        repeat_visitors_count = (normal_visitors > 1).sum() # Visitors with more than one unique visit

        # --- IMPORTANT: USE THE 'is_new_visitor_session' COLUMN CREATED IN DataLoader ---
        new_visits = self.df[self.df['is_new_visitor_session']]
        returning_visits = self.df[~self.df['is_new_visitor_session']]
            
        new_visits_unique_count = new_visits['id_visit'].nunique()
        returning_visits_unique_count = returning_visits['id_visit'].nunique()
            
        new_depth = new_visits.groupby('id_visit').size().mean() if new_visits_unique_count > 0 else 0.0
        returning_depth = returning_visits.groupby('id_visit').size().mean() if returning_visits_unique_count > 0 else 0.0
            
        metrics_data = {
            'short_visits_count': int(short_visits),
            'deep_visits_count': int(deep_visits),
            'repeat_visitors_count': int(repeat_visitors_count),
            'new_visits_count': int(new_visits_unique_count),
            'returning_visits_count': int(returning_visits_unique_count),
            'new_depth': round(new_depth, 2),
            'returning_depth': round(returning_depth, 2)
        }
        return {'metrics_data': metrics_data}
    
    def analyze_most_common_paths(self):
        """Identifies and counts the most common sequential paths taken by visitors."""
        if self.df.empty:
            return {'metrics_data': {'top_common_paths': {}, 'top_common_path_string': 'N/A', 'top_common_path_count': 0}}
            
        df_sorted = self.df.sort_values(['id_visit', 'timestamp'])
        # Concatenate sections within each visit to form a path string
        df_sorted['path'] = df_sorted.groupby('id_visit')['section'].transform(lambda x: ' -> '.join(x.astype(str)))
            
        path_counts = df_sorted['path'].value_counts()
        top_paths = path_counts.head(10) # Get top 10 most common paths
            
        metrics_data = {
            'top_common_paths': top_paths.to_dict(),
            'top_common_path_string': top_paths.index[0] if not top_paths.empty else 'N/A',
            'top_common_path_count': int(top_paths.iloc[0]) if not top_paths.empty else 0
        }
        return {'metrics_data': metrics_data}

    def compare_new_vs_returning(self):
        """Compares engagement metrics (depth, bounce rate) for new vs. returning visitors."""
        if self.df.empty:
            return {'metrics_data': {'new_depth_comparison': 0.0, 'returning_depth_comparison': 0.0, 'new_bounce_rate_comparison': 0.0, 'returning_bounce_rate_comparison': 0.0}}
            
        # --- IMPORTANT: USE THE 'is_new_visitor_session' COLUMN CREATED IN DataLoader ---
        new_visits_df = self.df[self.df['is_new_visitor_session']]
        returning_visits_df = self.df[~self.df['is_new_visitor_session']]
            
        new_depth = new_visits_df.groupby('id_visit').size().mean() if not new_visits_df.empty else 0.0
        returning_depth = returning_visits_df.groupby('id_visit').size().mean() if not returning_visits_df.empty else 0.0
            
        # Recursively call bounce rate calculation for new and returning segments
        new_bounce_result = BehaviorAnalyzer(new_visits_df).calculate_bounce_rate()
        returning_bounce_result = BehaviorAnalyzer(returning_visits_df).calculate_bounce_rate()
            
        metrics_data = {
            'new_depth_comparison': round(new_depth, 2),
            'returning_depth_comparison': round(returning_depth, 2),
            'new_bounce_rate_comparison': round(new_bounce_result['metrics_data']['overall_bounce_rate'], 4),
            'returning_bounce_rate_comparison': round(returning_bounce_result['metrics_data']['overall_bounce_rate'], 4),
        }
        return {'metrics_data': metrics_data}
    
    def find_optimal_clusters(self, X_scaled, k_range=range(3, 11)):
        """
        Uses the elbow method and silhouette score to suggest an optimal number of clusters.
        Returns the optimal n_clusters and a DataFrame of scores.
        """
        if X_scaled.shape[0] <= max(k_range):
            return None, "Not enough data points to test the full range of clusters."
        
        distortions = []
        silhouette_scores = []
        
        for i in k_range:
            kmeans = KMeans(n_clusters=i, random_state=42, n_init='auto')
            kmeans.fit(X_scaled)
            distortions.append(kmeans.inertia_)
            
            if i > 1:
                score = silhouette_score(X_scaled, kmeans.labels_)
                silhouette_scores.append(score)
            else: 
                silhouette_scores.append(np.nan)

        elbow_index = np.argmin(np.diff(distortions, 2)) + 2
        
        if not silhouette_scores:
            best_silhouette_k = k_range[0] 
        else:
            best_silhouette_k = k_range[np.argmax(silhouette_scores)]

        score_df = pd.DataFrame({
            'n_clusters': list(k_range),
            'Distortion (Inertia)': distortions,
            'Silhouette Score': silhouette_scores 
        })

        optimal_n_clusters = best_silhouette_k
        
        return optimal_n_clusters, score_df

    def compare_with_gmm(self, X_scaled, n_clusters, max_gmm_components=10):
        """
        Compares K-Means with a Gaussian Mixture Model (GMM) using BIC.
        Returns the BIC scores for both models.
        """
        from sklearn.mixture import GaussianMixture
        
        if X_scaled.shape[0] < n_clusters or X_scaled.shape[0] < 2:
            return {'kmeans_bic': np.nan, 'gmm_bic': np.nan}

        # K-Means is not natively compared with BIC, but we can compute it manually
        kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init='auto')
        kmeans.fit(X_scaled)
        kmeans_labels = kmeans.labels_
        kmeans_bic = np.nan # BIC is not ideal for K-Means, but can be approximated.
                        # Let's just state this in the analysis.
        
        # GMM naturally supports BIC. Iterate to find the best GMM.
        bic_scores = []
        for i in range(1, min(max_gmm_components, X_scaled.shape[0] + 1)):
            try:
                gmm = GaussianMixture(n_components=i, random_state=42, n_init=10)
                gmm.fit(X_scaled)
                bic_scores.append(gmm.bic(X_scaled))
            except ValueError: # Occurs if n_components is too large for the data
                bic_scores.append(np.nan)
                
        best_gmm_bic = min(bic_scores) if bic_scores else np.nan
        best_gmm_n_components = np.argmin(bic_scores) + 1 if bic_scores else np.nan
        
        return {'kmeans_bic': kmeans_bic, 'best_gmm_bic': best_gmm_bic, 'best_gmm_n_components': best_gmm_n_components}

    def perform_clustering_and_profiling(self, n_clusters=5):
        """Cluster visitor profiles with K-Means and describe each cluster.

        Visitor profiles come from ``generate_visitor_profiles``. Zero-variance
        features are dropped, the rest are standardised, a cluster count is
        suggested by ``find_optimal_clusters`` (falling back to *n_clusters*),
        a GMM comparison is recorded, and t-SNE coordinates are added for
        plotting when possible.

        Args:
            n_clusters: Cluster count used when no suggestion is available.

        Returns:
            Dict with 'visitor_profiles_with_clusters', 'cluster_descriptions',
            'insights' and 'metrics_data'.
        """
        # Display names are looked up by K-Means label index.
        persona_labels = (
            "Highly Engaged Core Investors",
            "Low-Engagement New Visitors",
            "High-Engagement New Visitors",
            "High-Engagement Core Investors",
            "Low-Engagement Core Investors",
            "High-Engagement New Visitors",
            "High-Engagement New Visitors",
            "High-Engagement New Visitors",
            "High-Engagement New Visitors",
            "High-Engagement New Visitors",
        )

        insights = []
        metrics_data = {'cluster_summary': {}}
        profiles = self.generate_visitor_profiles()['visitor_profiles_df']

        def build_result(descriptions):
            # Every exit path returns the same four keys.
            return {
                'visitor_profiles_with_clusters': profiles,
                'cluster_descriptions': descriptions,
                'insights': insights,
                'metrics_data': metrics_data
            }

        usable_features = [name for name in self.CLUSTERING_FEATURES if name in profiles.columns]
        X = profiles[usable_features].copy()
        X = X.fillna(0)

        if X.empty or X.shape[0] < 2:
            insights.append("Warning: Not enough valid visitor profiles to perform clustering.")
            return build_result({})

        # Constant features carry no information for clustering.
        zero_variance_cols = X.columns[X.std() < 1e-9]
        if not zero_variance_cols.empty:
            insights.append(f"Warning: Features with zero variance found and filtered: {list(zero_variance_cols)}")
            X = X.drop(columns=zero_variance_cols)

        if X.shape[1] == 0:
            insights.append("Warning: All features had zero variance. Cannot perform clustering.")
            return build_result({})

        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X)

        # Ask for a suggested cluster count; a non-DataFrame second value is
        # the reason no suggestion could be made.
        suggested_k, score_metrics = self.find_optimal_clusters(X_scaled)
        if isinstance(score_metrics, pd.DataFrame):
            insights.append(f"Optimal cluster number suggested by silhouette analysis is: {suggested_k}")
            metrics_data['clustering_scores'] = score_metrics.to_dict('records')
        else:
            insights.append(f"Warning: Could not determine optimal cluster number: {score_metrics}")
            suggested_k = n_clusters

        if suggested_k:
            n_clusters_to_use = min(suggested_k, X.shape[0] - 1)
        else:
            n_clusters_to_use = n_clusters

        # GMM comparison as a cross-check of the K-Means result.
        if X_scaled.shape[0] > 1 and n_clusters_to_use > 1:
            try:
                gmm_results = self.compare_with_gmm(X_scaled, n_clusters_to_use)
                if pd.notna(gmm_results['best_gmm_bic']):
                    insights.append(f"GMM analysis suggests a best fit with {gmm_results['best_gmm_n_components']} components (BIC: {gmm_results['best_gmm_bic']:.2f}). This provides a comparative check for the K-Means clustering assumptions.")
                    metrics_data['gmm_comparison'] = gmm_results
            except Exception as e:
                insights.append(f"Warning: GMM comparison failed due to an error: {e}")

        if n_clusters_to_use <= 1:
            insights.append("Clustering resulted in only one cluster. No meaningful segmentation.")
            profiles['cluster'] = 0
            return build_result({})

        kmeans = KMeans(n_clusters=n_clusters_to_use, random_state=42, n_init=10)
        profiles['cluster'] = kmeans.fit_predict(X_scaled)

        # Cluster centres expressed in the original feature units.
        cluster_centers = pd.DataFrame(
            scaler.inverse_transform(kmeans.cluster_centers_), columns=X.columns
        )

        cluster_descriptions = {}
        for label in range(n_clusters_to_use):
            center = cluster_centers.iloc[label]
            if label < len(persona_labels):
                persona = persona_labels[label]
            else:
                persona = f"Cluster {label}"

            desc_parts = []
            if 'avg_session_depth' in center:
                desc_parts.append(f"Avg session depth: {center['avg_session_depth']:.1f} pages")

            cluster_key = f'Cluster {label}'
            cluster_descriptions[cluster_key] = f"**{persona}:** " + " | ".join(desc_parts) + "."
            metrics_data['cluster_summary'][cluster_key] = center.to_dict()

        # 2-D embedding for visualisation; failures are reported as insights.
        try:
            can_embed = X_scaled.shape[0] > 1 and X_scaled.shape[1] >= 2 and n_clusters_to_use > 1
            if can_embed:
                perplexity = min(30, X_scaled.shape[0] - 1)
                tsne = TSNE(n_components=2, random_state=42, perplexity=perplexity)
                embedding = tsne.fit_transform(X_scaled)
                profiles['tsne_x'] = embedding[:, 0]
                profiles['tsne_y'] = embedding[:, 1]
            else:
                insights.append("Warning: Not enough data points or features to perform t-SNE visualization for clustering.")
        except Exception as e:
            insights.append(f"Error: Could not visualize clusters using t-SNE: {e}.")

        return build_result(cluster_descriptions)

    def compare_by_country(self, top_n_countries=5):
        """Compares engagement metrics across top visitor countries."""
        if self.df.empty:
            return {'metrics_data': {'highest_bounce_country': 'N/A', 'highest_bounce_country_rate': 0.0, 'deepest_engagement_country': 'N/A', 'deepest_engagement_country_depth': 0.0}}
            
        country_pageviews = self.df.groupby('location_country')['id_visit'].count().sort_values(ascending=False)
        top_countries = country_pageviews.head(top_n_countries).index.tolist()
            
        country_bounce_rates = {}
        country_avg_depth = {}
            
        for country in top_countries:
            country_df = self.df[self.df['location_country'] == country]
            if not country_df.empty:
                country_analyzer = BehaviorAnalyzer(country_df)
                country_bounce_result = country_analyzer.calculate_bounce_rate()
                country_bounce_rates[country] = country_bounce_result['metrics_data']['overall_bounce_rate']
                
                country_visits = country_df.groupby('id_visit').size()
                country_avg_depth[country] = country_visits.mean() if len(country_visits) > 0 else 0.0
            
        bounce_df = pd.DataFrame(list(country_bounce_rates.items()), columns=['Country', 'Bounce_Rate']).sort_values(by='Bounce_Rate', ascending=False)
        depth_df = pd.DataFrame(list(country_avg_depth.items()), columns=['Country', 'Avg_Depth']).sort_values(by='Avg_Depth', ascending=False)
            
        metrics_data = {
            'highest_bounce_country': bounce_df.iloc[0]['Country'] if not bounce_df.empty else 'N/A',
            'highest_bounce_country_rate': round(bounce_df.iloc[0]['Bounce_Rate'], 4) if not bounce_df.empty else 0.0,
            'deepest_engagement_country': depth_df.iloc[0]['Country'] if not depth_df.empty else 'N/A',
            'deepest_engagement_country_depth': round(depth_df.iloc[0]['Avg_Depth'], 2) if not depth_df.empty else 0.0
        }
        return {'metrics_data': metrics_data}
    
    def calculate_bounce_rate(self):
        """Calculates the overall bounce rate and bounce rate per section."""
        if self.df.empty:
            return {'metrics_data': {'overall_bounce_rate': 0.0, 'highest_bounce_section': 'N/A', 'highest_bounce_value': 0.0}}
            
        visit_counts = self.df.groupby('id_visit').size() 
        bounces = visit_counts[visit_counts == 1].count()
        total_visits = visit_counts.count()
            
        overall_bounce_rate = bounces / total_visits if total_visits > 0 else 0.0
            
        section_bounce_rates = {}
        for section, group in self.df.groupby('section'):
            section_visit_counts = group.groupby('id_visit').size()
            section_bounces = section_visit_counts[section_visit_counts == 1].count()
            section_total_visits = section_visit_counts.count()
            bounce_rate = section_bounces / section_total_visits if section_total_visits > 0 else 0
            section_bounce_rates[section] = bounce_rate
            
        bounce_df = pd.DataFrame(list(section_bounce_rates.items()), columns=['section', 'bounce_rate']).sort_values(by='bounce_rate', ascending=False)
            
        metrics_data = {
            'overall_bounce_rate': round(overall_bounce_rate, 4),
            'highest_bounce_section': bounce_df.iloc[0]['section'] if not bounce_df.empty else 'N/A',
            'highest_bounce_value': round(bounce_df.iloc[0]['bounce_rate'], 4) if not bounce_df.empty else 0.0
        }
        return {'metrics_data': metrics_data}

    def analyze_download_behavior(self):
        """Analyzes user download behavior across sections and subsections."""
        if self.df.empty:
            return {'metrics_data': {'top_download_section': 'N/A', 'top_download_count': 0, 'top_download_subsection': 'N/A', 'top_download_subsection_count': 0, 'top_section_with_downloads': 'N/A', 'top_section_with_downloads_count': 0, 'top_conv_subsection': 'N/A', 'top_conv_rate': 0.0}}
            
        download_pages = self.df[self.df['download_flag'] == True]
        download_section_counts = download_pages['section'].value_counts()
        download_subsection_counts = download_pages['sub-section'].value_counts()
            
        # Identify visits that contain any download
        download_id_visits = self.df[self.df['download_flag'] == True]['id_visit'].unique()
        related_visits = self.df[self.df['id_visit'].isin(download_id_visits)]
        section_counts_with_downloads = related_visits['section'].value_counts()
            
        # Calculate download conversion rate per subsection
        subsection_downloads = download_pages['sub-section'].value_counts()
        subsection_total = self.df['sub-section'].value_counts()
        subsection_download_rate = (subsection_downloads / subsection_total).dropna() # Handle cases where denominator is zero
            
        metrics_data = {
            'top_download_section': download_section_counts.index[0] if not download_section_counts.empty else 'N/A',
            'top_download_count': int(download_section_counts.iloc[0]) if not download_section_counts.empty else 0,
            'top_download_subsection': download_subsection_counts.index[0] if not download_subsection_counts.empty else 'N/A',
            'top_download_subsection_count': int(download_subsection_counts.iloc[0]) if not download_subsection_counts.empty else 0,
            'top_section_with_downloads': section_counts_with_downloads.index[0] if not section_counts_with_downloads.empty else 'N/A',
            'top_section_with_downloads_count': int(section_counts_with_downloads.iloc[0]) if not section_counts_with_downloads.empty else 0,
            'top_conv_subsection': subsection_download_rate.index[0] if not subsection_download_rate.empty else 'N/A',
            'top_conv_rate': round(subsection_download_rate.iloc[0], 4) if not subsection_download_rate.empty else 0.0,
        }
        return {'metrics_data': metrics_data}
    
    def analyze_content_interest_by_cluster(self):
        if self.df.empty:
            return pd.DataFrame()
        if 'cluster' not in self.df.columns:
            print("Warning: 'cluster' column not found in DataFrame for analyze_content_interest_by_cluster.")
            return pd.DataFrame()
        cluster_content_metrics = self.df.groupby(['cluster', 'section']).agg(
            pageviews=('id_visit', 'count'),
            total_time_spent=('time_spent', 'sum'),
            downloads=('download_flag', lambda x: x.sum() if pd.api.types.is_bool_dtype(x) else (x == True).sum()),
            unique_visitors=('visitor_id', 'nunique')
        ).reset_index()

        time_weight = 0.1
        download_weight = 5.0 

        cluster_content_metrics['interest_score'] = (
            cluster_content_metrics['pageviews'] +
            (cluster_content_metrics['total_time_spent'] * time_weight) +
            (cluster_content_metrics['downloads'] * download_weight)
        )

        top_sections_per_cluster = []
        for cluster_id in cluster_content_metrics['cluster'].unique():
            cluster_data = cluster_content_metrics[cluster_content_metrics['cluster'] == cluster_id].sort_values(by='interest_score', ascending=False)
            top_3_sections = cluster_data.head(3)
            for idx, row in top_3_sections.iterrows():
                top_sections_per_cluster.append({
                    'Cluster ID': cluster_id,
                    'Section': row['section'],
                    'Interest Score': round(row['interest_score'], 2),
                    'Rank': len(top_sections_per_cluster) % 3 + 1 # Rank within this cluster's top sections
                })
        
        return pd.DataFrame(top_sections_per_cluster)
        
    def perform_funnel_analysis(self):
        """Measure how many visits that land on the home page include a download.

        A visit enters the funnel when its chronologically first pageview is in
        the 'home page' section (case-insensitive). It converts when any of its
        rows has ``download_flag`` set.

        Returns:
            dict: ``{'metrics_data': {...}}`` holding ``funnel_conversion_rate``
            (converted / started, rounded to 4 decimals), ``funnel_start_visits``
            and ``funnel_converted_visits``. All are zero for an empty frame.
        """
        if self.df.empty:
            return {'metrics_data': {'funnel_conversion_rate': 0.0, 'funnel_start_visits': 0, 'funnel_converted_visits': 0}}

        # Keep the true first row of every visit. GroupBy.first() is avoided
        # because it returns the first non-null value of each column, so a
        # visit whose landing row has no section would borrow the section of
        # a later page and could be miscounted as a home-page landing.
        # Rows without a visit id cannot be attributed to a visit and are dropped.
        df_sorted = self.df.dropna(subset=['id_visit']).sort_values(['id_visit', 'timestamp'])
        landing_pages = df_sorted.drop_duplicates(subset='id_visit', keep='first')

        # Visits (not visitors) whose landing page is the home page
        landed_on_home = landing_pages['section'].str.lower() == 'home page'
        home_visits = landing_pages.loc[landed_on_home, 'id_visit'].unique()

        # Of those visits, the ones that contain at least one download
        download_visits = self.df[(self.df['id_visit'].isin(home_visits)) & (self.df['download_flag'] == True)]['id_visit'].unique()

        # Guard against division by zero when no visit started on the home page
        funnel_rate = len(download_visits) / len(home_visits) if len(home_visits) > 0 else 0.0

        metrics_data = {
            'funnel_conversion_rate': round(funnel_rate, 4),
            'funnel_start_visits': int(len(home_visits)),
            'funnel_converted_visits': int(len(download_visits))
        }
        return {'metrics_data': metrics_data}
    
    def generate_visitor_profiles(self):
        """Build one profile row per visitor with behavioral features, flags and tags.

        Rows of ``self.df`` without a ``visitor_id`` are ignored. The method reads
        the columns ``visitor_id``, ``id_visit``, ``company``, ``section``,
        ``download_flag`` and ``is_new_visitor_session``.

        Returns:
            dict: ``{'visitor_profiles_df': DataFrame, 'metrics_data': dict}``.
            When no row has a visitor id, the frame is empty (columns only) and
            ``metrics_data`` is an empty dict. Otherwise ``metrics_data`` holds
            short/deep visit counts and the number of high-intent, frequent
            downloader, IR-only and ESG visitors.
        """
        df_profiling = self.df.dropna(subset=['visitor_id'])
        if df_profiling.empty:
            empty_profiles = pd.DataFrame(columns=['visitor_id'] + self.CLUSTERING_FEATURES + ['top_visited_section', 'user_tags', 'company'])
            return {'visitor_profiles_df': empty_profiles, 'metrics_data': {}}
            
        all_visitor_ids = df_profiling['visitor_id'].unique()
            
        # Link company info to visitor_id (take the first company associated if multiple exist)
        visitor_company = df_profiling.groupby('visitor_id')['company'].first().rename('company')

        # Average number of rows (pageviews) per visit, and number of download rows, per visitor.
        # download_count is not reindexed here, so visitors without downloads are NaN after the concat.
        session_depth = df_profiling.groupby(['visitor_id', 'id_visit']).size().groupby('visitor_id').mean().rename('avg_session_depth')
        download_count = df_profiling[df_profiling['download_flag'] == True].groupby('visitor_id').size().rename('download_count')
            
        # Determine the most visited section for each visitor (highest row count wins)
        top_section_raw = df_profiling.groupby(['visitor_id', 'section']).size().reset_index(name='count')
        top_section_raw = top_section_raw.sort_values(['visitor_id', 'count'], ascending=[True, False])
        top_section = top_section_raw.groupby('visitor_id').first()['section'].rename('top_visited_section')
            
        # Investor interest score = number of rows whose section contains
        # 'Investor Relations' (case-insensitive)
        ir_score = df_profiling[df_profiling['section'].str.contains('Investor Relations', na=False, case=False)].groupby('visitor_id').size().rename('investor_interest_score')
        ir_score = ir_score.reindex(all_visitor_ids, fill_value=0) # Fill missing visitors with 0 score
            
        # Number of distinct sections seen by each visitor
        content_breadth = df_profiling.groupby('visitor_id')['section'].nunique().rename('content_breadth')
        content_breadth = content_breadth.reindex(all_visitor_ids, fill_value=0)
            
        # Number of distinct visits per visitor
        visit_count_series = df_profiling.groupby('visitor_id')['id_visit'].nunique().rename('visit_count')
        visit_count_series = visit_count_series.reindex(all_visitor_ids, fill_value=0)
            
        # Dataset-level short/deep visit counts. Sessions longer than the 99th
        # percentile of pageviews-per-visit are excluded before counting;
        # "short" means exactly 1 pageview, "deep" means 5 or more.
        visit_counts_for_profiles = df_profiling.groupby('id_visit').size()
        session_length_threshold_for_profiles = visit_counts_for_profiles.quantile(0.99) if not visit_counts_for_profiles.empty else 0
        normal_sessions_for_profiles = visit_counts_for_profiles[visit_counts_for_profiles <= session_length_threshold_for_profiles] if not visit_counts_for_profiles.empty else pd.Series(dtype='int64')
        short_visits_calc = (normal_sessions_for_profiles == 1).sum() if not normal_sessions_for_profiles.empty else 0 
        deep_visits_calc = (normal_sessions_for_profiles >= 5).sum() if not normal_sessions_for_profiles.empty else 0
        
        # A visitor is flagged as repeat (1) when at least one of their rows has
        # is_new_visitor_session == False. The ~ operator assumes the column is
        # boolean -- TODO confirm upstream dtype.
        is_new_visitor_session_series = df_profiling.groupby('visitor_id')['is_new_visitor_session'].apply(lambda x: (~x).any()).astype(int).rename('is_repeat_visitor')
        is_new_visitor_session_series = is_new_visitor_session_series.reindex(all_visitor_ids, fill_value=0)

        # Binary flags: any download, at least 2 IR pageviews, and any row whose
        # section contains 'ESG' (case-insensitive)
        has_download = (download_count > 0).astype(int).rename('has_download').reindex(all_visitor_ids, fill_value=0)
        has_ir = (ir_score >= 2).astype(int).rename('has_ir').reindex(all_visitor_ids, fill_value=0)
        esg_visitors_in_profile = df_profiling[df_profiling['section'].str.contains('ESG', na=False, case=False)]['visitor_id'].unique()
        esg_visitor_flag = pd.Series(all_visitor_ids).isin(esg_visitors_in_profile).astype(int).set_axis(all_visitor_ids).rename('esg_visitor')


        # Combine all features into a single DataFrame (aligned on visitor_id)
        visitor_profiles_df = pd.concat([
            session_depth, download_count, top_section, ir_score, content_breadth,
            visit_count_series, is_new_visitor_session_series, has_download, has_ir, esg_visitor_flag, visitor_company
        ], axis=1).reset_index(names=['visitor_id'])
            
        # Ensure all clustering features are numeric and handle NaNs
        for col in self.CLUSTERING_FEATURES:
            if col in visitor_profiles_df.columns:
                visitor_profiles_df[col] = pd.to_numeric(visitor_profiles_df[col], errors='coerce').fillna(0)
            
        # Derive composite behavioral flags
        visitor_profiles_df['is_high_intent'] = ((visitor_profiles_df['has_download'] > 0) & (visitor_profiles_df['has_ir'] > 0)).astype(int)
        # NOTE(review): exact, case-sensitive match on 'Investor Relations', unlike the
        # case-insensitive substring match used for investor_interest_score above.
        visitor_profiles_df['ir_only_visitor'] = ((visitor_profiles_df['content_breadth'] == 1) & (visitor_profiles_df['top_visited_section'] == 'Investor Relations')).astype(int)
        visitor_profiles_df['frequent_downloader'] = (visitor_profiles_df['download_count'] > 3).astype(int)
        visitor_profiles_df['deep_path_visitor'] = (visitor_profiles_df['avg_session_depth'] > 5).astype(int)
            
        # Generate user tags based on derived features
        def get_user_tags(row):
            """Return the comma-separated tag string for one profile row."""
            tags = []
            if row['ir_only_visitor']: tags.append('IR-Only Browser')
            if row['frequent_downloader']: tags.append('Frequent Downloader')
            if row['deep_path_visitor']: tags.append('Deep Path Visitor')
            if row['esg_visitor'] > 0: tags.append('ESG-Focused')
            if row['avg_session_depth'] <= 1.1 and row['avg_session_depth'] > 0: tags.append('High Bounce Rate') # Very low depth indicates high bounce
            if row['is_high_intent']: tags.append('High-Intent')
            # Every visitor gets exactly one of these two tags
            if row['is_repeat_visitor']: tags.append('Repeat Visitor')
            else: tags.append('New Visitor')
            return ','.join(tags)
            
        visitor_profiles_df['user_tags'] = visitor_profiles_df.apply(get_user_tags, axis=1)
        visitor_profiles_df = visitor_profiles_df.dropna(subset=['visitor_id']) 
            
        # The integer cast assumes visitor ids are numeric -- TODO confirm
        if not visitor_profiles_df.empty:
            visitor_profiles_df['visitor_id'] = visitor_profiles_df['visitor_id'].astype(int)

        # Aggregate metrics for profiling insights
        num_high_intent = visitor_profiles_df['is_high_intent'].sum()
        num_frequent_downloaders = visitor_profiles_df['frequent_downloader'].sum()
        num_ir_only = visitor_profiles_df['ir_only_visitor'].sum()
        num_esg_visitors = visitor_profiles_df['esg_visitor'].sum()
            
        metrics_data = {
            'short_visits_count': int(short_visits_calc), 
            'deep_visits_count': int(deep_visits_calc),
            'num_high_intent_users': int(num_high_intent),
            'num_frequent_downloaders': int(num_frequent_downloaders),
            'num_ir_only_visitors': int(num_ir_only),
            'num_esg_visitors': int(num_esg_visitors),
        }
        return {'visitor_profiles_df': visitor_profiles_df, 'metrics_data': metrics_data}

    def calculate_average_depth(self):
        """Return the mean number of rows (pageviews) per ``id_visit``.

        Returns:
            dict: ``{'metrics_data': {'avg_depth_per_visit': value}}`` where the
            value is rounded to 2 decimals, or 0.0 when there is no data.
        """
        mean_depth = 0.0
        if not self.df.empty:
            pages_per_visit = self.df.groupby('id_visit').size()
            if not pages_per_visit.empty:
                mean_depth = pages_per_visit.mean()
            mean_depth = round(mean_depth, 2)
        return {'metrics_data': {'avg_depth_per_visit': mean_depth}}

    def analyze_unique_investors(self):
        """Find the section visited by the largest number of distinct visitor ids.

        Returns:
            dict: ``{'metrics_data': {...}}`` with ``top_unique_investor_section``
            and ``top_unique_investor_count``; ``'N/A'`` and 0 when no section
            can be ranked.
        """
        best_section, best_count = 'N/A', 0
        if not self.df.empty:
            visitors_by_section = self.df.groupby('section')['visitor_id'].nunique()
            if not visitors_by_section.empty:
                best_section = visitors_by_section.idxmax()
                best_count = int(visitors_by_section.max())
        return {
            'metrics_data': {
                'top_unique_investor_section': best_section,
                'top_unique_investor_count': best_count,
            }
        }

    def analyze_sub_section_details(self):
        """Summarise sub-section traffic: most viewed, longest dwell, top report reader.

        Returns:
            dict: ``{'metrics_data': {...}}`` with the most viewed sub-section and
            its row count, the sub-section with the highest mean ``time_spent``
            (rows with ``time_spent`` <= 0 excluded) and that mean rounded to 2
            decimals, and the ``ultimate_parent_name`` seen most often in the
            'Reports & Presentations' sub-section with its count. Placeholder
            values are returned for anything that cannot be computed.
        """
        summary = {
            'top_sub_pv': 'N/A',
            'top_sub_pv_count': 0,
            'top_sub_duration': 'N/A',
            'top_sub_duration_time': 0.0,
            'top_report_visitor_type': 'N/A',
            'top_report_visitor_count': 0,
        }
        if self.df.empty:
            return {'metrics_data': summary}

        pageviews = self.df['sub-section'].value_counts()

        # Rows with zero time are excluded from the dwell-time average
        timed_rows = self.df[self.df['time_spent'] > 0]
        mean_duration = timed_rows.groupby('sub-section')['time_spent'].mean()

        # Visitor companies seen in the 'Reports & Presentations' sub-section
        in_reports = self.df['sub-section'] == 'Reports & Presentations'
        report_companies = self.df[in_reports]['ultimate_parent_name'].value_counts()

        if not pageviews.empty:
            summary['top_sub_pv'] = pageviews.index[0]
            summary['top_sub_pv_count'] = int(pageviews.iloc[0])
        if not mean_duration.empty:
            summary['top_sub_duration'] = mean_duration.idxmax()
            summary['top_sub_duration_time'] = round(mean_duration.max(), 2)
        if not report_companies.empty:
            summary['top_report_visitor_type'] = report_companies.index[0]
            summary['top_report_visitor_count'] = int(report_companies.iloc[0])

        return {'metrics_data': summary}
    
    def analyze_session_paths(self):
        """Identify and count the most common section-to-section paths per visit.

        Each visit contributes exactly one path: its sections, ordered by
        timestamp, joined with ' -> '. Missing sections appear as the string
        'nan' because values are cast with ``astype(str)``.

        Returns:
            dict: ``{'metrics_data': {...}}`` with ``top_common_paths`` (up to 10
            paths mapped to the number of visits that followed them),
            ``top_common_path_string`` and ``top_common_path_count``.
        """
        if self.df.empty:
            return {'metrics_data': {'top_common_paths': {}, 'top_common_path_string': 'N/A', 'top_common_path_count': 0}}

        df_sorted = self.df.sort_values(['id_visit', 'timestamp'])

        # Build one path per visit. Attaching the path to every row and counting
        # rows would weight each path by its own length (a single 5-page visit
        # would outrank four 1-page visits), so aggregate to visit level first.
        visit_paths = df_sorted.groupby('id_visit')['section'].agg(lambda x: ' -> '.join(x.astype(str)))

        path_counts = visit_paths.value_counts()
        top_paths = path_counts.head(10)

        metrics_data = {
            'top_common_paths': top_paths.to_dict(),
            'top_common_path_string': top_paths.index[0] if not top_paths.empty else 'N/A',
            'top_common_path_count': int(top_paths.iloc[0]) if not top_paths.empty else 0
        }
        return {'metrics_data': metrics_data}
        
    def analyze_time_distribution(self):
        """Find the busiest hour of day and day of week by row (pageview) count.

        Rows with a missing timestamp are ignored by the counts.

        Returns:
            dict: ``{'metrics_data': {...}}`` with ``peak_hour`` (0-23),
            ``peak_hour_count``, ``peak_weekday`` (English day name) and
            ``peak_weekday_count``; 0 / 'N/A' placeholders when nothing can be
            counted.
        """
        if self.df.empty:
            return {'metrics_data': {'peak_hour': 0, 'peak_hour_count': 0, 'peak_weekday': 'N/A', 'peak_weekday_count': 0}}

        hourly_counts = self.df['timestamp'].dt.hour.value_counts().sort_index()
        weekday_counts = self.df['timestamp'].dt.dayofweek.value_counts().sort_index()
        weekdays = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

        # When any timestamp is NaT the hour/dayofweek values are floats, so
        # idxmax() returns e.g. 0.0. Cast to int before using it as a list
        # index, exactly as is done for peak_hour.
        metrics_data = {
            'peak_hour': int(hourly_counts.idxmax()) if not hourly_counts.empty else 0,
            'peak_hour_count': int(hourly_counts.max()) if not hourly_counts.empty else 0,
            'peak_weekday': weekdays[int(weekday_counts.idxmax())] if not weekday_counts.empty else 'N/A',
            'peak_weekday_count': int(weekday_counts.max()) if not weekday_counts.empty else 0
        }
        return {'metrics_data': metrics_data}
        
    def generate_user_story(self, visitor_id, overall_metrics_dict=None, overall_visitor_averages=None):
        """Generate a Markdown-flavoured narrative describing one visitor's behavior.

        Args:
            visitor_id: Id of the visitor to describe; matched against the
                ``visitor_id`` column of the generated profiles and of ``self.df``.
            overall_metrics_dict: Optional nested dict of site-wide metrics. The
                keys read are ``avg_depth_results.avg_depth_per_visit``,
                ``bounce_results.overall_bounce_rate`` and
                ``new_vs_returning_results.{new,returning}_depth_comparison``.
                Comparisons against a missing value are simply omitted.
            overall_visitor_averages: Optional mapping with per-visitor averages
                (``download_count``, ``investor_interest_score``,
                ``content_breadth``); missing values default to 0.0.

        Returns:
            str: The story, lines joined with newlines, or a fixed message when
            the visitor has no profile.
        """
        visitor_profiles_results = self.generate_visitor_profiles()
        visitor_profiles = visitor_profiles_results['visitor_profiles_df']
        user_profile = visitor_profiles[visitor_profiles['visitor_id'] == visitor_id]

        if user_profile.empty:
            return "No user story generated for this visitor ID."

        # The parameter defaults to None; treat that as "no benchmarks" instead
        # of failing on None.get(...).
        if overall_metrics_dict is None:
            overall_metrics_dict = {}

        user_profile = user_profile.iloc[0]
        story_lines = [f"## User Profile Report: Visitor ID {int(user_profile['visitor_id'])}"]

        story_lines.append(f"**Visitor Company**: {user_profile['company']}")

        avg_depth_overall = overall_metrics_dict.get('avg_depth_results', {}).get('avg_depth_per_visit', None)
        overall_bounce_rate = overall_metrics_dict.get('bounce_results', {}).get('overall_bounce_rate', None)
        new_depth_overall = overall_metrics_dict.get('new_vs_returning_results', {}).get('new_depth_comparison', None)
        returning_depth_overall = overall_metrics_dict.get('new_vs_returning_results', {}).get('returning_depth_comparison', None)
        overall_avg_downloads = overall_visitor_averages.get('download_count', 0.0) if overall_visitor_averages else 0.0
        overall_avg_ir_score = overall_visitor_averages.get('investor_interest_score', 0.0) if overall_visitor_averages else 0.0

        # New vs. returning, compared with the matching segment average when available
        if user_profile['is_repeat_visitor']:
            story_lines.append("This individual is a **returning visitor** to our website.")
            if returning_depth_overall is not None:
                if user_profile['avg_session_depth'] < returning_depth_overall:
                    story_lines.append(f"However, their average session depth of {user_profile['avg_session_depth']:.2f} pages is somewhat lower than the average for returning visitors ({returning_depth_overall:.2f} pages). This might suggest a very specific goal-oriented visit or a change in their typical engagement.")
                elif user_profile['avg_session_depth'] > returning_depth_overall:
                    story_lines.append(f"Their average session depth of {user_profile['avg_session_depth']:.2f} pages is higher than the average for returning visitors ({returning_depth_overall:.2f} pages), indicating strong ongoing engagement.")
        else:
            story_lines.append("This individual is a **new visitor** to our website.")
            if new_depth_overall is not None:
                if user_profile['avg_session_depth'] > new_depth_overall:
                    story_lines.append(f"Notably, their average session depth of {user_profile['avg_session_depth']:.2f} pages is significantly higher than the average for new visitors ({new_depth_overall:.2f} pages), suggesting strong initial interest.")
                elif user_profile['avg_session_depth'] < new_depth_overall and user_profile['avg_session_depth'] > 0:
                    story_lines.append(f"Their average session depth of {user_profile['avg_session_depth']:.2f} pages is lower than the average for new visitors ({new_depth_overall:.2f} pages), which aligns with typical new user behavior of quick exploration or high bounce.")

        # Session depth versus the site-wide average
        story_lines.append(f"They typically view an average of **{user_profile['avg_session_depth']:.2f} pages per session**, indicating their typical level of engagement within each visit.")
        if avg_depth_overall is not None:
            if user_profile['avg_session_depth'] > avg_depth_overall:
                story_lines.append(f"This is **higher than the overall website average session depth** of {avg_depth_overall:.2f} pages.")
            elif user_profile['avg_session_depth'] < avg_depth_overall:
                story_lines.append(f"This is **lower than the overall website average session depth** of {avg_depth_overall:.2f} pages.")

        # Downloads (a NaN download_count compares False and falls to the else branch)
        if user_profile['download_count'] > 0:
            story_lines.append(f"A key action for this user is downloading content, as they have initiated **{int(user_profile['download_count'])} downloads**.")
            if user_profile['frequent_downloader']:
                story_lines.append("They are categorized as a **frequent downloader**, suggesting a high propensity for consuming downloadable resources.")
            if user_profile['download_count'] > overall_avg_downloads:
                story_lines.append(f"This is notably higher than the overall average of {overall_avg_downloads:.1f} downloads per visitor.")
            else:
                story_lines.append(f"This is in line with or lower than the overall average of {overall_avg_downloads:.1f} downloads per visitor.")
        else:
            story_lines.append("They have not initiated any downloads during their sessions, which might indicate different information needs or engagement patterns.")

        if user_profile['top_visited_section'] != 'No_Section_Determined':
            story_lines.append(f"Their primary area of interest on the site is the **'{user_profile['top_visited_section']}' section**, as indicated by the highest proportion of their pageviews.")

        # Investor Relations interest
        if user_profile['ir_only_visitor']:
            story_lines.append("This user **primarily browses Investor Relations content only** (tagged as an 'IR-only visitor').")
        elif user_profile['investor_interest_score'] > 0:
            story_lines.append(f"They show a notable interest in Investor Relations, with **{int(user_profile['investor_interest_score'])} pageviews** in this section, indicating they are likely tracking company performance or news.")
            if user_profile['investor_interest_score'] > overall_avg_ir_score:
                story_lines.append(f"This is higher than the overall average IR interest score of {overall_avg_ir_score:.1f}.")
            else:
                story_lines.append(f"This is in line with or lower than the overall average IR interest score of {overall_avg_ir_score:.1f}.")

        if user_profile['esg_visitor']:
            story_lines.append("They demonstrate a **strong interest in ESG-related content**.")

        # Deep browsing versus bounce-like behavior
        if user_profile['deep_path_visitor']:
            story_lines.append("This user typically engages in **deep browse paths**, exploring more than 5 pages per session, suggesting a thorough and investigative approach to content consumption.")
        elif user_profile['avg_session_depth'] <= 1.1 and user_profile['avg_session_depth'] > 0:
            story_lines.append("Conversely, this user exhibits **high bounce rate behavior**, often leaving after viewing just one page, which could signal a quick search for specific information or a lack of immediate relevance.")
            if overall_bounce_rate is not None and user_profile['avg_session_depth'] == 1:
                story_lines.append(f"Their behavior contributes to the overall website bounce rate, which is {overall_bounce_rate:.2%}.")

        # Reconstruct this visitor's section paths, one per visit, in time order
        user_visits_for_path = self.df[self.df['visitor_id'] == visitor_id].sort_values(['id_visit', 'timestamp'])
        user_visits_cleaned = user_visits_for_path.dropna(subset=['section'])
        # .copy() so the 'path' column is written to an independent frame rather
        # than to a filtered slice (avoids pandas' SettingWithCopyWarning).
        user_visits_cleaned = user_visits_cleaned[user_visits_cleaned['section'].str.lower() != 'nan'].copy()

        if not user_visits_cleaned.empty:
            user_visits_cleaned['path'] = user_visits_cleaned.groupby('id_visit')['section'].transform(lambda x: ' -> '.join(x.astype(str)))
            unique_paths = user_visits_cleaned.drop_duplicates(subset='path')['path']
            if not unique_paths.empty:
                story_lines.append("\n**Typical Session Paths Observed:**")
                story_lines.append("This visitor typically follows these paths to explore our website:")
                # Show at most three distinct paths
                for path in unique_paths.head(3):
                    story_lines.append(f"- {path}")
            else:
                story_lines.append("\nNo distinct session paths found for this user within the dataset.")
        else:
            story_lines.append("\nPath data for this visitor is incomplete and cannot be analyzed.")

        user_story = "\n".join(story_lines)
        return user_story



class PredictiveModeler:
    """Builds per-pageview features and trains a GBM to predict future downloads.

    Attributes are plain instance state:
      * ``df``       - private copy of the traffic frame passed to ``__init__``
      * ``model``    - fitted classifier, or None before training
      * ``features`` - feature column names produced by
                       ``prepare_longitudinal_data``
    """

    def __init__(self, dataframe):
        self.df = dataframe.copy()
        self.model = None
        self.features = []

    @staticmethod
    def _label_future_downloads(elapsed_seconds, is_download, horizon_seconds):
        """Label each row of ONE visitor with 1 if a later row is a download within the horizon.

        Both input arrays must be in chronological order. "Later" means a later
        position in the arrays, so the row's own download is never counted.
        """
        n_rows = len(elapsed_seconds)
        # downloads_before[k] = number of downloads among the first k rows
        downloads_before = np.concatenate(([0], np.cumsum(is_download)))
        first_later_row = np.arange(1, n_rows + 1)
        # One past the last row whose timestamp is <= own timestamp + horizon
        window_end = np.searchsorted(elapsed_seconds, elapsed_seconds + horizon_seconds, side='right')
        return (downloads_before[window_end] > downloads_before[first_later_row]).astype(int)

    def prepare_longitudinal_data(self, horizon_days=30):
        """
        Engineers time-series features and prepares the dataset for a predictive model.

        The target ``future_download_within_30d`` is 1 when the same visitor has a
        download on a later row no more than ``horizon_days`` days after the
        current row's timestamp. The column name is fixed regardless of
        ``horizon_days`` because ``train_and_evaluate_gbm`` reads it by name.

        Features are built per visitor so that nothing leaks across visitors:
          * ``lag_session_depth``  - previous row's running pageview index
          * ``lag_download_count`` - downloads made strictly before this row
          * one-hot ``day_of_week_*`` / ``hour_of_day_*`` (first level dropped)

        Rows without ``visitor_id`` or ``timestamp`` are dropped because they
        cannot be placed on a visitor's timeline. Rows flagged
        ``is_new_visitor_session`` are removed from the result.

        Args:
            horizon_days: Look-ahead window for the target, in days.

        Returns:
            DataFrame sorted by visitor and time; ``self.features`` is updated
            as a side effect.
        """
        df = self.df.dropna(subset=['visitor_id', 'timestamp']).sort_values(['visitor_id', 'timestamp'])

        # Positional grouping key: immune to duplicate index labels
        visitor_key = df['visitor_id'].to_numpy()
        # "== True" semantics, matching how download_flag is tested elsewhere in this file
        is_download = df['download_flag'].eq(True)

        # Seconds since the earliest event: a plain float axis that works for
        # tz-aware timestamps and any datetime resolution.
        elapsed = (df['timestamp'] - df['timestamp'].min()).dt.total_seconds().to_numpy()
        horizon_seconds = pd.Timedelta(days=horizon_days).total_seconds()
        download_flags = is_download.to_numpy()

        # Target: a download on a later row of the same visitor within the horizon.
        # groupby(...).indices yields ascending positions, i.e. chronological
        # order within each visitor because df is sorted above.
        target = np.zeros(len(df), dtype=int)
        for positions in df.groupby('visitor_id').indices.values():
            target[positions] = self._label_future_downloads(
                elapsed[positions], download_flags[positions], horizon_seconds
            )
        df['future_download_within_30d'] = target

        # Lagged features, shifted inside each visitor's history
        pages_so_far = df.groupby('visitor_id').cumcount()
        df['lag_session_depth'] = pages_so_far.groupby(visitor_key).shift(1).fillna(0)

        downloads_so_far = is_download.astype(int).groupby(visitor_key).cumsum()
        df['lag_download_count'] = downloads_so_far.groupby(visitor_key).shift(1).fillna(0)

        # Temporal markers (day of week, hour of day)
        df['day_of_week'] = df['timestamp'].dt.dayofweek
        df['hour_of_day'] = df['timestamp'].dt.hour

        base_features = ['lag_session_depth', 'lag_download_count']

        # One-hot encode the temporal markers
        df_final = pd.get_dummies(df, columns=['day_of_week', 'hour_of_day'], drop_first=True)
        self.features = [
            col for col in df_final.columns
            if col in base_features or col.startswith(('day_of_week_', 'hour_of_day_'))
        ]

        # Only keep rows that are not part of a visitor's first session
        df_final = df_final[df_final['is_new_visitor_session'] == False]

        return df_final

    def train_and_evaluate_gbm(self, df_longitudinal):
        """
        Trains a Gradient Boosting Machine and evaluates it with time-series
        cross-validation.

        Rows are ordered by ``timestamp`` before splitting so every fold trains
        on the past and tests on the future. Folds whose training labels
        contain a single class are skipped (the classifier cannot be fitted),
        and AUC is only recorded for folds whose test labels contain both
        classes. After evaluation the model kept on ``self.model`` is fitted on
        all supplied rows.

        Args:
            df_longitudinal: Output of ``prepare_longitudinal_data``.

        Returns:
            tuple: ``(model, metrics)`` on success, where metrics has
            ``mean_mae``, ``mean_auc`` (NaN if no fold could be scored) and
            ``last_trained_date``; ``(None, message)`` when training is not
            possible.
        """
        n_splits = 5
        # TimeSeriesSplit needs more samples than splits
        if df_longitudinal.empty or len(self.features) == 0 or len(df_longitudinal) <= n_splits:
            return None, "Not enough data or features to train the model."

        # Stable sort keeps the within-visitor order for identical timestamps
        ordered = df_longitudinal.sort_values('timestamp', kind='stable')
        X = ordered[self.features]
        y = ordered['future_download_within_30d']

        gbm_params = {'n_estimators': 100, 'learning_rate': 0.1, 'max_depth': 3, 'random_state': 42}
        tscv = TimeSeriesSplit(n_splits=n_splits)

        all_maes = []
        all_aucs = []

        for train_index, test_index in tscv.split(X):
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]

            # Skip unusable folds: empty, or only one class to learn from
            if X_train.empty or X_test.empty or y_train.nunique() < 2:
                continue

            fold_model = GradientBoostingClassifier(**gbm_params)
            fold_model.fit(X_train, y_train)

            all_maes.append(mean_absolute_error(y_test, fold_model.predict(X_test)))
            # ROC AUC is undefined when the test fold holds a single class
            if y_test.nunique() > 1:
                all_aucs.append(roc_auc_score(y_test, fold_model.predict_proba(X_test)[:, 1]))

        if not all_maes: # Check if any folds were successfully run
            return None, "Training failed, no valid cross-validation folds."

        # Final model uses every available row, including the most recent ones
        final_model = GradientBoostingClassifier(**gbm_params)
        final_model.fit(X, y)
        self.model = final_model

        metrics = {
            'mean_mae': np.mean(all_maes),
            'mean_auc': np.mean(all_aucs) if all_aucs else float('nan'),
            'last_trained_date': df_longitudinal['timestamp'].max()
        }

        return self.model, metrics

    def predict(self, new_data):
        """
        Makes a prediction using the trained model.

        Args:
            new_data: DataFrame holding the lag features plus raw
                ``day_of_week`` and ``hour_of_day`` columns.

        Returns:
            Array with the predicted probability of the positive class for each
            row, or the string "Model not trained yet." when no model exists.
        """
        if self.model is None:
            return "Model not trained yet."

        # Encode every level present in new_data, then align to the training
        # columns. drop_first must NOT be used here: the "first" level of the
        # new data is generally not the level that was dropped at training
        # time, so a real category would be silently encoded as the baseline.
        new_data_processed = pd.get_dummies(new_data, columns=['day_of_week', 'hour_of_day'])
        new_data_processed = new_data_processed.reindex(columns=self.features, fill_value=0)

        return self.model.predict_proba(new_data_processed)[:, 1]

def generate_event_ai_analysis(df_event_impact_summary):
    """Turn an event-impact summary table into a Markdown narrative.

    The frame is expected to carry 'Event Date' (datetime), 'Event Title' and
    the three percentage-change columns for visits, downloads and session
    depth. Events are processed in chronological order; an event is called out
    when any of its changes exceeds 10% in absolute value.

    Returns:
        str: The report, or a fixed message when the frame is empty.
    """
    if df_event_impact_summary.empty:
        return "No event impact data available to generate insights."

    visits_col = 'Change in Visits (%)'
    downloads_col = 'Change in Downloads (%)'
    depth_col = 'Change in Session Depth (%)'

    report = [
        "## AI Summary: Company A Event Impact Analysis",
        "---",
        "This section analyzes how key financial and regulatory events for Company A correlate with changes in investor behavior on the website.",
    ]

    # Chronological order drives both the bullet list and tie-breaking below
    events = df_event_impact_summary.sort_values(by='Event Date', ascending=True)

    # Events where at least one metric moved by more than 10%
    moved_a_lot = (
        (events[visits_col].abs() > 10)
        | (events[downloads_col].abs() > 10)
        | (events[depth_col].abs() > 10)
    )
    notable_events = events[moved_a_lot]

    if notable_events.empty:
        report.append("\n### General Observations:")
        report.append("No single event triggered a dramatic shift (more than 10% change) in visitor behavior metrics during their respective windows.")
        report.append("This could mean investor behavior is consistently engaged, or the events had a more subtle, prolonged impact not captured by direct window comparison.")
    else:
        report.append("\n### Key Observations on Event Impact:")
        report.append("We observed several events with notable shifts in investor behavior:")
        labelled_metrics = ((visits_col, 'visits'), (downloads_col, 'downloads'), (depth_col, 'session depth'))
        for _, event in notable_events.iterrows():
            event_date = event['Event Date'].strftime('%Y-%m-%d')
            event_title = event['Event Title']
            # Only metrics that actually have a value are mentioned
            changes = [
                f"{label} changed by **{event[column]:.2f}%**"
                for column, label in labelled_metrics
                if pd.notna(event[column])
            ]
            if changes:
                report.append(f"- **{event_title} ({event_date})**: {' and '.join(changes)}.")
            else:
                report.append(f"- **{event_title} ({event_date})**: No significant quantifiable changes observed for this event.")

    def extreme_event(column, largest):
        """Return the row with the largest/smallest value in `column`, or None."""
        selector = events.nlargest if largest else events.nsmallest
        candidate = selector(1, column, keep='first')
        return None if candidate.empty else candidate.iloc[0]

    top_visit_gain = extreme_event(visits_col, True)
    top_visit_loss = extreme_event(visits_col, False)
    top_download_gain = extreme_event(downloads_col, True)
    top_download_loss = extreme_event(downloads_col, False)

    report.append("\n### Key Insights & Recommendations:")

    if top_visit_gain is not None and top_visit_gain[visits_col] > 0:
        event = top_visit_gain
        report.append(f"- The event **'{event['Event Title']}'** on {event['Event Date'].strftime('%Y-%m-%d')} led to the highest increase in **Total Visits** ({event['Change in Visits (%)']:.2f}%). This indicates strong public or investor interest surrounding this type of announcement. **Recommendation:** Analyze the specific content and communication strategy for this event and replicate successful elements for similar future announcements.")

    if top_download_gain is not None and top_download_gain[downloads_col] > 0:
        event = top_download_gain
        report.append(f"- The event **'{event['Event Title']}'** on {event['Event Date'].strftime('%Y-%m-%d')} significantly boosted **Downloads** ({event['Change in Downloads (%)']:.2f}%). This suggests the associated materials (e.g., reports, presentations) were highly relevant or effectively promoted. **Recommendation:** Ensure all relevant documents for high-impact events are prominently featured and easily accessible. Consider pre-event teasers for upcoming reports.")

    if top_visit_loss is not None and top_visit_loss[visits_col] < 0:
        event = top_visit_loss
        report.append(f"- Conversely, the event **'{event['Event Title']}'** on {event['Event Date'].strftime('%Y-%m-%d')} saw the largest drop in **Total Visits** ({event['Change in Visits (%)']:.2f}%). This could indicate lower interest, saturation, or perhaps the event itself was less critical. **Recommendation:** Evaluate the type of information presented, its timing, and promotion for events that cause visitor declines to understand if adjustments are needed.")

    if top_download_loss is not None and top_download_loss[downloads_col] < 0:
        event = top_download_loss
        report.append(f"- **Downloads** decreased most around **'{event['Event Title']}'** on {event['Event Date'].strftime('%Y-%m-%d')} ({event['Change in Downloads (%)']:.2f}%). This suggests the associated documents may have been less appealing or harder to find. **Recommendation:** Review the content, relevance, and discoverability of materials released around low-engagement events.")

    # Closing advice that does not depend on the data
    report.extend([
        "\n### General Strategic Recommendations:",
        "- **Proactive Communication**: Leverage the insights from events that generated positive spikes in engagement. Plan pre-event communication, ensure clear messaging, and easy access to new content as soon as it's released.",
        "- **Content Relevance**: Continuously assess if the information provided around events truly meets the needs of your investor audience. Use download figures and session depth as indicators of content value.",
        "- **Multi-Channel Promotion**: Events are opportunities for multiple touchpoints. Ensure the website content is integrated with email alerts, social media announcements, and news releases to maximize reach.",
        "- **Segmented Analysis**: As a next step, combining this event data with the **Visitor Profiles (Clusters)** would allow for a deeper understanding of *which specific investor segments* are most responsive to different types of events. This would enable highly targeted communication strategies.",
    ])

    return "\n".join(report)

# ---  generate_website_ai_analysis ---
def _section_percentage(pageviews_df, section, source_type):
    """Return the 'percentage' of the first row matching (section, type), or 0 if there is none."""
    if pageviews_df is None or pageviews_df.empty:
        return 0
    row_mask = (pageviews_df['section'] == section) & (pageviews_df['type'] == source_type)
    matching = pageviews_df[row_mask]['percentage']
    return matching.iloc[0] if not matching.empty else 0


def generate_website_ai_analysis(selected_website, metrics_filtered, global_benchmark_metrics, tag_explanations, combined_pageviews_for_ai, cluster_content_interest_df=None):
    """Build the Markdown analysis report for one website.

    Args:
        selected_website: Website name used in the report text and to select
            the rows with ``type == selected_website`` in
            ``combined_pageviews_for_ai``.
        metrics_filtered: Mapping of the website's metrics. Reads the scalar
            keys ``avg_depth``, ``bounce_rate``, ``total_downloads`` and
            ``total_visits`` (each defaults to zero) and the optional
            ``pageview`` / ``tag_counts`` objects, whose first index label is
            reported as the top entry (presumably they are sorted descending
            by the caller -- verify).
        global_benchmark_metrics: Mapping with the benchmark values
            ``avg_depth``, ``bounce_rate`` and ``avg_downloads_per_visit``.
        tag_explanations: Mapping of tag name -> explanation text; only the
            text before the first '.' is quoted in the report.
        combined_pageviews_for_ai: DataFrame with ``section``, ``type`` and
            ``percentage`` columns; rows of type ``'Overall'`` are the
            comparison baseline. May be empty or None.
        cluster_content_interest_df: Optional DataFrame with ``Cluster Name``,
            ``Section`` and ``Interest Score`` columns.

    Returns:
        The report as one newline-joined Markdown string.
    """
    current_avg_depth = metrics_filtered.get('avg_depth', 0.0)
    current_bounce_rate = metrics_filtered.get('bounce_rate', 0.0)
    current_total_downloads = metrics_filtered.get('total_downloads', 0)
    current_total_visits = metrics_filtered.get('total_visits', 0)
    current_downloads_per_visit = current_total_downloads / current_total_visits if current_total_visits > 0 else 0.0

    global_avg_depth = global_benchmark_metrics.get('avg_depth', 0.0)
    global_bounce_rate = global_benchmark_metrics.get('bounce_rate', 0.0)
    global_avg_downloads_per_visit = global_benchmark_metrics.get('avg_downloads_per_visit', 0.0)

    # 'pageview' and 'tag_counts' are optional like every other metric: a
    # missing key is handled the same way as an empty object.
    pageview_counts = metrics_filtered.get('pageview')
    tag_counts = metrics_filtered.get('tag_counts')
    has_tags = tag_counts is not None and not tag_counts.empty
    top_tag = tag_counts.index[0] if has_tags else None

    # The report title comes first so every section renders beneath it.
    ai_analysis = [f"## AI Analysis for {selected_website} Website", "---"]

    # Per-cluster content interest (only when the caller supplies the data)
    if cluster_content_interest_df is not None and not cluster_content_interest_df.empty:
        ai_analysis.append("\n### Targeted Content & Audience Insights:")

        for cluster_name in cluster_content_interest_df['Cluster Name'].unique():
            cluster_data = cluster_content_interest_df[cluster_content_interest_df['Cluster Name'] == cluster_name].sort_values('Interest Score', ascending=False)
            if cluster_data.empty:
                continue
            top_section = cluster_data.iloc[0]['Section']
            score = cluster_data.iloc[0]['Interest Score']
            ai_analysis.append(f"- **{cluster_name}**: shows highest interest in '{top_section}' (score: {score:.1f}).")

            if 'High-Intent' in cluster_name and 'Investor Relations' in top_section:
                ai_analysis.append(f"  **Action**: Continue enriching {top_section} with detailed analytics and reports for these key users.")
            elif 'New Visitors' in cluster_name and 'Product' in top_section:
                ai_analysis.append(f"  **Action**: Optimize the onboarding experience and clear CTAs on {top_section} to convert new visitors.")
    else:
        ai_analysis.append("\n### Targeted Content & Audience Insights: (No clustered content interest data for deeper insights)")

    # 1. Overall Performance Summary and Comparison
    ai_analysis.append("### Performance Overview:")
    ai_analysis.append(f"The website for **{selected_website}** currently records **{current_total_visits} visits**, with an average session depth of **{current_avg_depth:.2f} pages** and a bounce rate of **{current_bounce_rate:.2%}**.")
    ai_analysis.append(f"Users initiated **{current_total_downloads} downloads** on this site.")

    ai_analysis.append("\n### Key Comparisons:")
    if current_avg_depth > global_avg_depth:
        ai_analysis.append(f"- **Higher Session Depth ({current_avg_depth:.2f} vs Global Avg: {global_avg_depth:.2f})**: This indicates that visitors to {selected_website} are **more engaged** and explore deeper into the site than the overall average. This is a positive sign, suggesting relevant content or good internal linking.")
    else:
        ai_analysis.append(f"- **Lower Session Depth ({current_avg_depth:.2f} vs Global Avg: {global_avg_depth:.2f})**: Visitors to {selected_website} tend to explore less deeply. Consider optimizing content layout, calls-to-action, and internal linking to encourage further exploration.")

    if current_bounce_rate < global_bounce_rate:
        ai_analysis.append(f"- **Lower Bounce Rate ({current_bounce_rate:.2%} vs Global Avg: {global_bounce_rate:.2%})**: This is excellent! It suggests landing pages are highly effective, retaining visitors and encouraging them to proceed beyond the entry page.")
    else:
        ai_analysis.append(f"- **Higher Bounce Rate ({current_bounce_rate:.2%} vs Global Avg: {global_bounce_rate:.2%})**: A high bounce rate indicates that initial landing pages might not be meeting visitor expectations or engagement is low. Focus on improving first-page experience and clarity.")

    if current_downloads_per_visit > global_avg_downloads_per_visit:
        ai_analysis.append(f"- **Higher Download-per-Visit Ratio ({current_downloads_per_visit:.2f} vs Global Avg: {global_avg_downloads_per_visit:.2f})**: This website is highly effective at converting visits into tangible interest (downloads). Highlight key resources and ensure smooth download processes.")
    else:
        ai_analysis.append(f"- **Lower Download-per-Visit Ratio ({current_downloads_per_visit:.2f} vs Global Avg: {global_avg_downloads_per_visit:.2f})**: Consider improving the visibility, accessibility, or perceived value of downloadable content to boost conversion rates.")

    # 2. Content & Audience Insights
    ai_analysis.append("\n### Content and Audience Insights:")
    has_pageviews = pageview_counts is not None and not pageview_counts.empty
    top_pageview_section = pageview_counts.index[0] if has_pageviews else 'N/A'
    if top_pageview_section != 'N/A':
        ai_analysis.append(f"- The most popular content area is **'{top_pageview_section}'**, indicating strong visitor interest here. Continue to invest in and promote content within this section.")

        # Compare the top section's share on this website with its overall share
        website_top_section_pct = _section_percentage(combined_pageviews_for_ai, top_pageview_section, selected_website)
        overall_top_section_pct = _section_percentage(combined_pageviews_for_ai, top_pageview_section, 'Overall')

        if website_top_section_pct > overall_top_section_pct:
            ai_analysis.append(f"  - This section is disproportionately popular ({website_top_section_pct:.2f}%) compared to its overall website average ({overall_top_section_pct:.2f}%), highlighting a unique strength or focus of {selected_website}.")

    if has_tags:
        # Quote only the first sentence of the tag's explanation
        tag_summary = tag_explanations.get(top_tag, 'No explanation available.').split('.')[0].strip()
        ai_analysis.append(f"- A dominant segment of visitors are categorized as **'{top_tag}'**. This group likely drives significant interactions. ({tag_summary}).")

        # Suggest actions based on top tag
        if 'High-Intent' in top_tag:
            ai_analysis.append("  - **Action**: Prioritize personalized outreach or direct engagement strategies for these high-value users.")
        elif 'High Bounce Rate' in top_tag:
            ai_analysis.append("  - **Action**: Rework landing pages and initial content to better capture the attention of this segment and reduce immediate exits.")
        elif 'IR-Only Browser' in top_tag:
            ai_analysis.append("  - **Action**: Ensure IR content is easily accessible and consider cross-promoting related non-IR content subtly to broaden their engagement.")

    # 3. Actionable Recommendations
    ai_analysis.append("\n### Actionable Recommendations:")
    ai_analysis.append("- **Content Strategy**: Given the popular sections and visitor tags, consider developing more in-depth content or interactive tools within high-interest areas.")
    ai_analysis.append("- **User Experience**: Analyze the paths of high-bounce visitors. Are there broken links, slow loading times, or confusing navigation preventing deeper engagement?")
    ai_analysis.append("- **Conversion Optimization**: If download rates are lower than desired, A/B test different call-to-action placements, button designs, or resource descriptions to improve conversions.")
    ai_analysis.append("- **Personalization**: For repeat visitors or specific clusters, explore personalized content recommendations or targeted messaging to enhance their experience.")
    if current_bounce_rate > global_bounce_rate and has_tags and 'New Visitor' in top_tag:
        ai_analysis.append("- **Targeted UX Improvement**: Given high overall bounce rate and a large 'New Visitor' segment, prioritize A/B testing of landing page designs and content clarity to immediately engage new users.")

    # A low download ratio despite a 'Frequent Downloader' tag hints at a discoverability problem
    if current_downloads_per_visit < global_avg_downloads_per_visit and has_tags and 'Frequent Downloader' in tag_counts.index:
        ai_analysis.append("- **Content Accessibility**: Despite having frequent downloaders, the overall download ratio is low. This suggests a discoverability issue. Ensure top downloadable resources are prominently linked and easily navigable, especially for repeat visitors.")
    return "\n".join(ai_analysis)

# --- NEW HELPER FUNCTION: Full Cluster Descriptions for Modal/Bottom Section ---
def generate_cluster_full_description_content():
    """Build the detailed persona listing as a single ``html.Div``.

    The Div starts with an introductory paragraph and then, for every entry
    of the module-level ``cluster_personas_data`` list, contains a heading
    ("Cluster <num>: <name>"), the entry's full description rendered as
    Markdown, and a horizontal rule.
    """
    intro = html.P("Here are the detailed profiles for each visitor segment identified by our clustering analysis. These personas can help you tailor content and engagement strategies.", className="mb-3")
    children = [intro]
    for entry in cluster_personas_data:
        heading = html.H5(f"Cluster {entry['num']}: {entry['name']}", className="mt-4")
        children.extend([heading, dcc.Markdown(entry['full']), html.Hr()])
    return html.Div(children)


# --- Dashboard Data Preparation ---
def _attach_cluster_info(df_all, df_profiles):
    """Left-join the per-visitor 'cluster' and 'user_tags' columns onto the traffic rows."""
    if df_profiles.empty:
        enriched = df_all.copy()
        enriched['cluster'] = -1  # Default cluster for all if no profiles
        enriched['user_tags'] = ''  # Default empty tags
        return enriched

    # Clustering may not have produced a 'cluster' column, so join only what exists.
    profile_columns = ['visitor_id'] + [col for col in ('cluster', 'user_tags') if col in df_profiles.columns]
    enriched = df_all.merge(df_profiles[profile_columns], on='visitor_id', how='left')
    if 'cluster' in df_profiles.columns:
        enriched['cluster'] = enriched['cluster'].fillna(-1).astype(int)  # -1 for unclustered
    else:
        enriched['cluster'] = -1
    if 'user_tags' not in df_profiles.columns:
        enriched['user_tags'] = ''
    return enriched


def _count_true_downloads(flags):
    """Count the True entries of a download-flag column, whatever its dtype."""
    # Non-boolean columns are compared element-wise against True on purpose.
    return flags.sum() if pd.api.types.is_bool_dtype(flags) else (flags == True).sum()  # noqa: E712


def _pageviews_per_visit(visit_ids):
    """Return rows per distinct visit id (0 when there are no visits)."""
    distinct_visits = visit_ids.nunique()
    return visit_ids.count() / distinct_visits if distinct_visits > 0 else 0


def _aggregate_daily_behavior(df_site, extra_keys=None):
    """Aggregate visits, pageviews, downloads and session depth per calendar day."""
    day_key = df_site['timestamp'].dt.date
    group_keys = ([day_key] + list(extra_keys)) if extra_keys else day_key
    daily = df_site.groupby(group_keys).agg(
        total_visits=('id_visit', 'nunique'),
        total_pageviews=('id_visit', 'count'),
        total_downloads=('download_flag', _count_true_downloads),
        avg_session_depth=('id_visit', _pageviews_per_visit),
    ).reset_index()
    # The day key inherits the name of the 'timestamp' column it was derived from.
    daily.rename(columns={'timestamp': 'date'}, inplace=True)
    daily['date'] = pd.to_datetime(daily['date'])
    return daily


def _load_sitemap(sitemap_file_path):
    """Read the section -> subsections JSON into a (section, subsection) DataFrame."""
    try:
        with open(sitemap_file_path, 'r', encoding='utf-8') as f:
            sitemap_data = json.load(f)
        sitemap_list = [
            {'section': section, 'subsection': subsection}
            for section, subsections in sitemap_data.items()
            for subsection in subsections
        ]
        return pd.DataFrame(sitemap_list)
    except FileNotFoundError:
        print(f"Warning: Sitemap file not found at {sitemap_file_path}. Sitemap visualization will be based on raw data.")
    except Exception as e:
        print(f"Error loading sitemap JSON: {e}")
    return pd.DataFrame()


def prepare_dashboard_data(file_path, sitemap_file_path):
    """Load and preprocess the data and compute all initial dashboard metrics.

    Args:
        file_path: Path of the traffic CSV handed to ``DataLoader``.
        sitemap_file_path: Path of the sitemap JSON mapping each section to
            its subsections.

    Returns:
        An 11-tuple ``(df_all, df_all_with_profiles, df_profiles,
        detailed_metrics, cluster_descriptions, overall_visitor_averages,
        sitemap_df, df_events, df_event_impact_summary,
        df_daily_company_a_behavior_clustered,
        df_daily_company_a_behavior_overall)``. When the processed traffic
        frame is empty, every element is an empty DataFrame or empty dict.
    """
    print("--- Preparing Dashboard Data ---")
    data_loader = DataLoader(file_path)
    df_all = data_loader.load_and_preprocess_data()

    if df_all.empty:
        print("Error: Processed DataFrame is empty. Cannot proceed with analysis.")
        return (pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), {}, {}, {},
                pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame())

    analyzer = BehaviorAnalyzer(df_all)

    pageview_metrics = analyzer.calculate_pageview_metrics()
    bounce_results = analyzer.calculate_bounce_rate()
    download_results = analyzer.analyze_download_behavior()
    funnel_results = analyzer.perform_funnel_analysis()
    avg_depth_results = analyzer.calculate_average_depth()
    unique_investors_results = analyzer.analyze_unique_investors()
    session_path_results = analyzer.analyze_session_paths()
    time_distribution_results = analyzer.analyze_time_distribution()
    session_depth_repeat_results = analyzer.analyze_session_path_length_and_repeat_visitors()
    new_vs_returning_results = analyzer.compare_new_vs_returning()
    country_comparison_results = analyzer.compare_by_country()
    common_paths_results = analyzer.analyze_most_common_paths()

    visitor_profiles_results = analyzer.generate_visitor_profiles()
    df_profiles = visitor_profiles_results['visitor_profiles_df']

    # Prefer the clustered profiles whenever clustering produced a 'cluster' column
    clustering_results = analyzer.perform_clustering_and_profiling(n_clusters=5)
    if 'cluster' in clustering_results.get('visitor_profiles_with_clusters', pd.DataFrame()).columns:
        df_profiles = clustering_results['visitor_profiles_with_clusters']

    if 'gmm_comparison' in clustering_results.get('metrics_data', {}):
        gmm_data = clustering_results['metrics_data']['gmm_comparison']
        print("\n--- GMM and BIC Comparison Results ---")
        if pd.notna(gmm_data.get('best_gmm_n_components')):
            print(f"Best GMM model has {int(gmm_data['best_gmm_n_components'])} components with a BIC score of {gmm_data['best_gmm_bic']:.2f}")
        else:
            print("GMM comparison could not be performed or did not find a valid model.")

    df_all_with_profiles = _attach_cluster_info(df_all, df_profiles)

    # Event data is optional: a missing or unreadable file leaves an empty frame.
    # Each file is processed in a temporary so a mid-way failure cannot leak a
    # half-processed frame into the result.
    df_events = pd.DataFrame()
    try:
        loaded_events = pd.read_csv('combined_company_events.csv')
        loaded_events['event_date'] = pd.to_datetime(loaded_events['event_date'], errors='coerce')
        loaded_events = loaded_events.dropna(subset=['event_date'])
        df_events = loaded_events[loaded_events['company'] == 'Vodafone'].copy()
        print(f"Loaded {len(df_events)} Vodafone events.")
    except FileNotFoundError:
        print("Warning: 'combined_company_events.csv' not found. Event data will not be available.")
    except Exception as e:
        print(f"Error loading combined_company_events.csv: {e}")

    df_event_impact_summary = pd.DataFrame()
    try:
        loaded_summary = pd.read_csv('company_a_event_impact_summary.csv')
        loaded_summary['Event Date'] = pd.to_datetime(loaded_summary['Event Date'], errors='coerce')
        df_event_impact_summary = loaded_summary.dropna(subset=['Event Date'])
        print(f"Loaded {len(df_event_impact_summary)} event impact summaries.")
    except FileNotFoundError:
        print("Warning: 'company_a_event_impact_summary.csv' not found. Event impact summary will not be available.")
    except Exception as e:
        print(f"Error loading company_a_event_impact_summary.csv: {e}")

    # Daily Company A behaviour, overall and per cluster; the frames keep
    # their column layout when there is nothing to aggregate.
    daily_metric_columns = ['total_visits', 'total_pageviews', 'total_downloads', 'avg_session_depth']
    df_daily_company_a_behavior_overall = pd.DataFrame(columns=['date'] + daily_metric_columns)
    df_daily_company_a_behavior_clustered = pd.DataFrame(columns=['date', 'cluster'] + daily_metric_columns)

    if not df_all_with_profiles.empty:
        df_company_a_behavior_raw_with_profiles = df_all_with_profiles[df_all_with_profiles['website_company'] == 'Company A'].copy()
        if not df_company_a_behavior_raw_with_profiles.empty:
            df_daily_company_a_behavior_clustered = _aggregate_daily_behavior(df_company_a_behavior_raw_with_profiles, extra_keys=['cluster'])
            print(f"Prepared daily Company A behavior data with clusters: {len(df_daily_company_a_behavior_clustered)} rows.")
            df_daily_company_a_behavior_overall = _aggregate_daily_behavior(df_company_a_behavior_raw_with_profiles)
        else:
            print("No Company A behavior data found in df_all_with_profiles for daily aggregation.")

    # Consolidate all detailed metrics into a single dictionary
    detailed_metrics = {
        'pageview_metrics': pageview_metrics['metrics_data'],
        'bounce_results': bounce_results['metrics_data'],
        'download_results': download_results['metrics_data'],
        'funnel_results': funnel_results['metrics_data'],
        'avg_depth_results': avg_depth_results['metrics_data'],
        'unique_investors_results': unique_investors_results['metrics_data'],
        'session_path_results': session_path_results['metrics_data'],
        'time_distribution_results': time_distribution_results['metrics_data'],
        'session_depth_repeat_results': session_depth_repeat_results['metrics_data'],
        'new_vs_returning_results': new_vs_returning_results['metrics_data'],
        'country_comparison_results': country_comparison_results['metrics_data'],
        'common_paths_results': common_paths_results['metrics_data'],
        'visitor_profiles_data': visitor_profiles_results['metrics_data'],
        'clustering_results': clustering_results['metrics_data'],
    }

    cluster_descriptions = clustering_results.get('cluster_descriptions', {})

    # Mean of every clustering feature across all visitors (zeros without profiles)
    if not df_profiles.empty:
        relevant_profile_features = [f for f in BehaviorAnalyzer.CLUSTERING_FEATURES if f in df_profiles.columns]
        overall_visitor_averages = df_profiles[relevant_profile_features].mean().to_dict()
    else:
        overall_visitor_averages = {feature: 0.0 for feature in BehaviorAnalyzer.CLUSTERING_FEATURES}

    sitemap_df = _load_sitemap(sitemap_file_path)

    print("--- Dashboard Data Preparation Complete ---")
    return (df_all, df_all_with_profiles, df_profiles, detailed_metrics, cluster_descriptions,
            overall_visitor_averages, sitemap_df, df_events, df_event_impact_summary,
            df_daily_company_a_behavior_clustered, df_daily_company_a_behavior_overall)

    

def open_browser_after_startup():
    """Wait briefly, then open the local Dash URL in a new browser tab.

    Any failure to launch the browser is reported on stdout together with
    the address to visit manually; nothing is raised to the caller.
    """
    dash_url = "http://127.0.0.1:8050/"
    startup_grace_seconds = 1.5  # Give the server a moment to start
    time.sleep(startup_grace_seconds)
    try:
        webbrowser.open_new_tab(dash_url)
        print(f"Dash app has been opened in the browser: {dash_url}")
    except Exception as err:
        print(f"Unable to open browser automatically, please visit manually: {dash_url}")
        print(f"Error message: {err}")

# --- Main Program Logic and Dash App Initialization ---

data_file_path = 'all_traffic_data_merged.csv'
sitemap_framework_path = 'universal_website_category_framework.json'

# Prepare all data and metrics once at startup
(
    df_processed,
    df_all_with_profiles,
    df_profiles,
    overall_metrics,
    cluster_descriptions,
    overall_visitor_averages,
    sitemap_df,
    df_events,
    df_event_impact_summary,
    df_daily_vodafone_behavior_clustered,
    df_daily_vodafone_behavior_overall,
) = prepare_dashboard_data(data_file_path, sitemap_framework_path)

# Extract unique companies and website companies for dropdowns.
# Missing values are dropped first: they cannot be ordered against strings
# by sorted() and are not meaningful dropdown entries.
companies = (
    sorted(df_processed['company'].dropna().unique().tolist())
    if 'company' in df_processed.columns and not df_processed.empty
    else ['All Companies']
)
website_companies = (
    sorted(df_processed['website_company'].dropna().unique().tolist())
    if 'website_company' in df_processed.columns and not df_processed.empty
    else ['All Websites']
)

# Get all unique user tags for display and tooltips.
# 'user_tags' holds comma-separated tag lists; missing values are skipped and
# fragments that are blank after stripping are ignored.
all_tags = set()
if not df_profiles.empty and 'user_tags' in df_profiles.columns:
    all_tags = {
        tag.strip()
        for tags_str in df_profiles['user_tags'].dropna().unique()
        for tag in str(tags_str).split(',')
        if tag.strip()
    }

# Human-readable explanation for each user tag, keyed by tag name
tag_explanations = {
    "IR-Only Browser":
        "Only visits Investor Relations (IR) content, indicating a focus on company financials and strategy.",
    "Frequent Downloader":
        "Downloads multiple files, showing a high demand for downloadable resources like reports and whitepapers.",
    "Deep Path Visitor":
        "Average session depth is over 5 pages, indicating in-depth research and high engagement.",
    "ESG-Focused":
        "Visited ESG (Environmental, Social, and Governance) related pages, showing interest in sustainable investing.",
    "High Bounce Rate":
        "Average session depth is close to 1, often leaving after visiting just one page.",
    "High-Intent":
        "Exhibits both download behavior and IR content visits, suggesting a high-intent investor or partner.",
    "Repeat Visitor":
        "Visited the website multiple times, indicating continued interest or a specific need.",
    "New Visitor":
        "This is their first recorded session on the website.",
}

# ---  Cluster Glossary/Personas ---
def _build_cluster_persona(cluster_key, persona_text):
    """Turn one ("Cluster N", description) pair into the persona dict used for display."""
    # The key is expected to look like "Cluster 0", "Cluster 1", ...
    number = int(cluster_key.split(' ')[1])

    # A leading "**Name:**" marker supplies the concise persona name;
    # without it the generic "Cluster N" label is used.
    heading = re.match(r'\*\*(.*?):\*\*', persona_text)
    if heading:
        name = heading.group(1).strip()
    else:
        name = f"Cluster {number}"

    # Brief overview: everything up to and including the first period
    if '.' in persona_text:
        brief = persona_text.split('.')[0] + '.'
    else:
        brief = persona_text

    return {
        'id': cluster_key,  # e.g., "Cluster 0"
        'num': number,      # e.g., 0
        'name': name,
        'brief': brief,
        'full': persona_text,
    }


# Personas ordered by cluster number so the display order is consistent
cluster_personas_data = sorted(
    [_build_cluster_persona(key, text) for key, text in cluster_descriptions.items()],
    key=lambda persona: persona['num'],
)


# Dash application instance, styled with the Bootstrap CERULEAN theme
app = dash.Dash(
    __name__,
    external_stylesheets=[dbc.themes.CERULEAN],
)

# --- Optimized App Layout ---
# Crucial change: Buttons that are 'Input' in callbacks must be STATICALLY in the layout.
# Their visibility can be controlled by another callback.
app.layout = dbc.Container([
    dbc.Row(dbc.Col(html.H1("Investor Behavior Intelligence", className="text-center text-primary my-4"))),
    html.Hr(className="my-2"),

    # === Static Buttons for Modals (Controlled by visibility callbacks) ===
    # These buttons must be in the initial layout for callbacks to find them.
    # We will control their 'display' style based on the active tab.
    # Placed globally so they are always in the DOM for callbacks.
    html.Div([
        dbc.Button(
            "Tag Glossary (Detailed Explanations)",
            id="open-tag-glossary-modal-website",
            size="sm",
            className="mb-3",
            style={'display': 'none'} # Initially hidden, visibility controlled by callback
        ),
        dbc.Button(
            "What are these clusters? (Detailed Glossary)",
            id="open-cluster-glossary-modal-website-tab",
            size="sm",
            className="mb-3 ms-2", # Add margin for separation
            style={'display': 'none'} # Initially hidden, visibility controlled by callback
        ),
        dbc.Button(
            "View Detailed Personas", # For visitor tab's cluster overview
            id="open-cluster-glossary-modal-visitor-tab",
            size="sm",
            className="mb-3 ms-2", # Add margin for separation
            style={'display': 'none'} # Initially hidden, visibility controlled by callback
        ),
    ], className="text-end mb-4"), # Container for global buttons, aligned right


    dbc.Tabs(id="tabs-main", active_tab="tab-website-company", children=[
        # --- Website Company-Level Dashboard ---
        dbc.Tab(label="Website Company Insights", tab_id="tab-website-company", children=[
            dbc.Row([
                dbc.Col(html.Div(id='website-executive-summary-container'), md=8),
                dbc.Col(dcc.Dropdown(
                    id='website-company-dropdown',
                    options=[{'label': 'All Websites', 'value': 'All'}] + [{'label': wc, 'value': wc} for wc in website_companies],
                    value='All',
                    clearable=False,
                    className="mt-4"
                ), md=4)
            ], align="center"),

            # website-company-dashboard-content will be the MAIN output for this tab's dynamic content
            html.Div(id='website-company-dashboard-content'),

            # This div will contain the DataTable for Tag Behavior, no button here directly
            # The button for tag glossary is now global (id="open-tag-glossary-modal-website")
            dbc.Row([
                dbc.Col(html.Hr(className="my-4")),
                dbc.Col(html.H3("Visitor Tag Group Behavior Comparison", className="mb-3")),
                dbc.Col(html.Div(id='tag-behavior-table-container'), width=12),
            ], className="mt-5 mb-4"),

            # Cluster Persona Description is now only via modal, triggered by global button
            # Removed the direct display container for cluster description here.
        ]),

        # --- Visitor Company-Level Dashboard ---
        dbc.Tab(label="Visitor Company Insights", tab_id="tab-visitor-company", children=[
            dbc.Row([
                dbc.Col(html.H2("Overall Website Performance & Visitor Profiling", className="my-4")),
                dbc.Col(dcc.Dropdown(
                    id='visitor-company-dropdown',
                    options=[{'label': 'All Companies', 'value': 'All'}] + [{'label': c, 'value': c} for c in companies],
                    value='All',
                    clearable=False,
                    className="mt-4"
                ), md=4)
            ], align="center"),

            html.Div(id='visitor-company-dashboard-content'), # Main content updated by callback

            # --- Cluster Overview moved to bottom of this tab for consistency ---
            # Now fully removed the compact cluster overview here as requested, only the button remains global.
            dbc.Row([
                dbc.Col(html.Hr(className="my-4")),
                dbc.Col(html.H3("Understand Visitor Segments and Their Personas", className="mb-3")),
                # The button to view detailed personas is now global (id="open-cluster-glossary-modal-visitor-tab")
                # This dbc.Card containing concise overview is now moved inside the visitor_dashboard_content callback IF needed.
                dbc.Col(dbc.Card([
                    dbc.CardHeader("Clustering Diagnostics: Elbow Method (Distortion)"),
                    dbc.CardBody(dcc.Graph(id='elbow-method-chart'))
                ]), md=6, className="mb-4"),
                dbc.Col(dbc.Card([
                    dbc.CardHeader("Clustering Diagnostics: Silhouette Scores"),
                    dbc.CardBody(dcc.Graph(id='silhouette-score-chart'))
                ]), md=6, className="mb-4"),
            ]),
            dbc.Row([
                dbc.Col(dbc.Card([ # This card now represents the concise overview
                    dbc.CardHeader("Visitor Cluster Overview (Concise)"),
                    dbc.CardBody([
                        html.P("Each cluster represents a distinct group of visitors based on their browse and download behaviors. Hover over each for a brief description."),
                        dbc.Row(
                            [dbc.Col(dbc.Card(
                                [
                                    dbc.CardHeader(f"Cluster {p['num']}: {p['name']}"),
                                    dbc.CardBody(html.P(p['brief'], className="card-text"), id=f"cluster-tooltip-target-{p['id']}"),
                                    dbc.Tooltip(dcc.Markdown(p['full']), target=f"cluster-tooltip-target-{p['id']}", placement="top")
                                ], className="mb-2"
                            ), md=int(12/len(cluster_personas_data))) for p in cluster_personas_data]
                        ) if cluster_personas_data else html.P("No cluster data available.")
                    ])
                ], className="mb-4")),
            ], className="mt-5 mb-4"),
        ]),

        # --- Individual Visitor Dashboard ---
        dbc.Tab(label="Individual Visitor Analysis", tab_id="tab-visitor", children=[
            dbc.Row([
                dbc.Col(html.H2("Deep Dive into Individual Visitor Behavior", className="my-4")),
            ]),
            dbc.Row([
                dbc.Col(dbc.Card([
                    dbc.CardHeader("Visitor Filtering & Search", className="bg-primary text-white"),
                    dbc.CardBody([
                        html.P("Use the table below to filter, sort, or search for a specific visitor.", className="text-muted"),
                        dash_table.DataTable(
                            id='visitor-table',
                            columns=[
                                {"name": "Visitor ID", "id": "visitor_id", "deletable": False, "selectable": True},
                                {"name": "Company", "id": "company", "deletable": False, "selectable": True, "type": "text"},
                                {"name": "Avg Session Depth", "id": "avg_session_depth", "deletable": False, "selectable": True, "type": "numeric"},
                                {"name": "Download Count", "id": "download_count", "deletable": False, "selectable": True, "type": "numeric"},
                                {"name": "Visit Count", "id": "visit_count", "deletable": False, "selectable": True, "type": "numeric"},
                                {"name": "IR Interest Score", "id": "investor_interest_score", "deletable": False, "selectable": True, "type": "numeric"},
                                {"name": "User Tags", "id": "user_tags", "deletable": False, "selectable": True, "type": "text"}
                            ],
                            data=df_profiles.to_dict('records') if not df_profiles.empty else [],
                            style_table={'overflowX': 'auto'},
                            page_action="native",
                            page_current=0,
                            page_size=15,
                            sort_action="native",
                            sort_mode="multi",
                            filter_action="native",
                            row_selectable="single",
                            style_header={'backgroundColor': 'rgb(230, 230, 230)', 'fontWeight': 'bold'},
                            style_cell={'textAlign': 'left', 'minWidth': '100px', 'width': '100px', 'maxWidth': '180px'},
                        )
                    ])
                ]), width=12, className="mb-4")
            ]),
            html.Div(id='individual-visitor-content')
        ]),

        # --- Event Impact Analysis Dashboard ---
        dbc.Tab(label="Event Impact Analysis", tab_id="tab-event-impact", children=[
            dbc.Row([
                dbc.Col(html.H2("Company A Event Impact on Investor Behavior", className="my-4")),
            ]),
            dbc.Row([
                dbc.Col(dbc.Card([
                    dbc.CardHeader("AI Generated Event Impact Summary"),
                    dbc.CardBody(dcc.Markdown(id='event-impact-ai-summary', className="ai-summary-text"))
                ]), width=12, className="mb-4")
            ]),
            dbc.Row([
                dbc.Col(dbc.Card([
                    dbc.CardHeader("Company A Daily Total Visits Over Time with Financial Events"),
                    dbc.CardBody(dcc.Graph(id='company_a-visits-time-series'))
                ]), md=6, className="mb-4"),
                dbc.Col(dbc.Card([
                    dbc.CardHeader("Company A Daily Total Downloads Over Time with Financial Events"),
                    dbc.CardBody(dcc.Graph(id='company_a-downloads-time-series'))
                ]), md=6, className="mb-4")
            ]),
            dbc.Row([
                dbc.Col(dbc.Card([
                    dbc.CardHeader("Company A Event Impact Summary Table"),
                    dbc.CardBody(dash_table.DataTable(
                        id='company_a-event-summary-table',
                        columns=[{"name": i, "id": i} for i in df_event_impact_summary.columns] if not df_event_impact_summary.empty else [],
                        data=df_event_impact_summary.to_dict('records') if not df_event_impact_summary.empty else [],
                        style_table={'overflowX': 'auto'},
                        page_action="native",
                        page_current=0,
                        page_size=10,
                        sort_action="native",
                        sort_mode="multi",
                        style_header={'backgroundColor': 'rgb(230, 230, 230)', 'fontWeight': 'bold'},
                        style_cell={'textAlign': 'left', 'minWidth': '100px', 'width': '100px', 'maxWidth': '180px'},
                    ))
                ]), width=12, className="mb-4")
            ]),
            dbc.Row([
                dbc.Col(dbc.Card([
                    dbc.CardHeader("Investor Cluster Response to Financial Events (Change in Visits %)"),
                    dbc.CardBody(dcc.Graph(id='cluster-event-impact-heatmap'))
                ]), width=12, className="mb-4")
            ]),
            dbc.Row([
                dbc.Col(dbc.Card([
                    dbc.CardHeader("Detailed Cluster Event Impact Data"),
                    dbc.CardBody(id='cluster-event-impact-table')
                ]), width=12, className="mb-4")
            ]),
            dbc.Row([
                dbc.Col(dbc.Card([
                    dbc.CardHeader("Top Content Sections by Investor Cluster Interest"),
                    dbc.CardBody(dcc.Graph(id='cluster-content-interest-chart'))
                ]), width=12, className="mb-4")
            ]),
        ]) # Closes the last dbc.Tab
    ]), # Closes the children list of dbc.Tabs AND the dbc.Tabs component itself.

    ## --- NEW FEATURE: Glossary Modals (Global, outside tabs) ---
    # Tag Glossary Modal
    dbc.Modal(
        [
            dbc.ModalHeader("Glossary of Visitor Tags"),
            dbc.ModalBody(id="tag-glossary-modal-body"),
            dbc.ModalFooter(
                dbc.Button("Close", id="close-tag-glossary-modal", className="ms-auto")
            ),
        ],
        id="tag-glossary-modal",
        size="lg",
        centered=True,
        scrollable=True,
    ),

    # Cluster Glossary Modal
    dbc.Modal(
        [
            dbc.ModalHeader("Detailed Cluster Personas"),
            dbc.ModalBody(id="cluster-glossary-modal-body"),
            dbc.ModalFooter(
                dbc.Button("Close", id="close-cluster-glossary-modal", className="ms-auto")
            ),
        ],
        id="cluster-glossary-modal",
        size="lg",
        centered=True,
        scrollable=True,
    ),

], fluid=True)

## --- NEW HELPER FUNCTION: Executive Summary for Website Insights ---
def _benchmark_relation(value, benchmark):
    """Classify *value* against *benchmark* as 'above', 'below' or 'level'.

    Numerically indistinguishable values (per np.isclose) count as 'level' so
    that a selection equal to the benchmark is not reported as better or worse.
    """
    if np.isclose(value, benchmark):
        return 'level'
    return 'above' if value > benchmark else 'below'


def generate_website_executive_summary_content(selected_website, metrics_filtered, global_benchmark_metrics):
    """Build the "Executive Summary: Quick Insights" card for a website selection.

    Args:
        selected_website: Label of the selection; only used in the heading text.
        metrics_filtered: Dict of metrics for the selection. The scalar keys
            'total_visits', 'avg_depth', 'bounce_rate' and 'total_downloads'
            are optional and default to zero. The optional 'pageview' and
            'tag_counts' entries are expected to be pandas Series whose first
            index label is the top item (the caller in this file builds them
            with value_counts()).
        global_benchmark_metrics: Dict with the optional keys 'avg_depth',
            'bounce_rate' and 'avg_downloads_per_visit' (default zero).

    Returns:
        A dbc.Card whose body is a dcc.Markdown rendering of the summary lines.
    """
    summary_lines = []

    # Current Performance snapshot
    total_visits = metrics_filtered.get('total_visits', 0)
    avg_depth = metrics_filtered.get('avg_depth', 0.0)
    bounce_rate = metrics_filtered.get('bounce_rate', 0.0)
    total_downloads = metrics_filtered.get('total_downloads', 0)

    summary_lines.append(f"**Key Performance for {selected_website}:**")
    summary_lines.append(f"- **{total_visits}** total visits; users view **{avg_depth:.2f} pages** per session.")
    summary_lines.append(f"- **{bounce_rate:.2%}** bounce rate indicates a need to engage first-page visitors.")
    summary_lines.append(f"- Recorded **{total_downloads} downloads**, showing explicit content interest.")

    # Comparisons with Overall/Benchmark
    bench_avg_depth = global_benchmark_metrics.get('avg_depth', 0.0)
    bench_bounce_rate = global_benchmark_metrics.get('bounce_rate', 0.0)
    bench_downloads_per_visit = global_benchmark_metrics.get('avg_downloads_per_visit', 0.0)
    # Guard against a selection with no visits.
    current_downloads_per_visit = total_downloads / total_visits if total_visits > 0 else 0

    summary_lines.append("\n**Compared to Overall Averages:**")

    depth_relation = _benchmark_relation(avg_depth, bench_avg_depth)
    if depth_relation == 'above':
        summary_lines.append(f"- **Higher Session Depth ({avg_depth:.2f} vs Global Avg: {bench_avg_depth:.2f} pages)**: This website keeps users more engaged per session.")
    elif depth_relation == 'below':
        summary_lines.append(f"- **Lower Session Depth ({avg_depth:.2f} vs Global Avg: {bench_avg_depth:.2f} pages)**: Consider internal linking and richer content to encourage deeper exploration.")
    else:
        summary_lines.append(f"- **Session Depth On Par ({avg_depth:.2f} vs Global Avg: {bench_avg_depth:.2f} pages)**: Engagement per session matches the overall average.")

    # For bounce rate, lower is the favourable direction.
    bounce_relation = _benchmark_relation(bounce_rate, bench_bounce_rate)
    if bounce_relation == 'below':
        summary_lines.append(f"- **Lower Bounce Rate ({bounce_rate:.2%} vs Global Avg: {bench_bounce_rate:.2%})**: This is excellent! It suggests landing pages are highly effective, retaining visitors and encouraging them to proceed beyond the entry page.")
    elif bounce_relation == 'above':
        summary_lines.append(f"- **Higher Bounce Rate ({bounce_rate:.2%} vs Global Avg: {bench_bounce_rate:.2%})**: Optimize initial page content and load times for new visitors.")
    else:
        summary_lines.append(f"- **Bounce Rate On Par ({bounce_rate:.2%} vs Global Avg: {bench_bounce_rate:.2%})**: Landing-page retention matches the overall average.")

    downloads_relation = _benchmark_relation(current_downloads_per_visit, bench_downloads_per_visit)
    if downloads_relation == 'above':
        summary_lines.append(f"- **Higher Download-per-Visit Ratio ({current_downloads_per_visit:.2f} vs Global Avg: {bench_downloads_per_visit:.2f} downloads/visit)**: High-value content is effectively driving conversions.")
    elif downloads_relation == 'below':
        summary_lines.append(f"- **Lower Download-per-Visit Ratio ({current_downloads_per_visit:.2f} vs Global Avg: {bench_downloads_per_visit:.2f} downloads/visit)**: Improve visibility or appeal of downloadable resources.")
    else:
        summary_lines.append(f"- **Download-per-Visit Ratio On Par ({current_downloads_per_visit:.2f} vs Global Avg: {bench_downloads_per_visit:.2f} downloads/visit)**: Download conversion matches the overall average.")

    # Top Content and Audience Insight. Bullets are collected first so the
    # section header is emitted whenever at least one bullet exists.
    audience_lines = []

    pageview_counts = metrics_filtered.get('pageview')
    if pageview_counts is not None and not pageview_counts.empty:
        top_pageview_section = pageview_counts.index[0]
        if top_pageview_section != 'N/A':
            audience_lines.append(f"- **'{top_pageview_section}'** is the most visited section, a key area for engagement.")

    tag_counts = metrics_filtered.get('tag_counts')
    if tag_counts is not None and not tag_counts.empty:
        top_tag = tag_counts.index[0]
        # Only tags with a glossary entry can be explained; others are skipped.
        if top_tag in tag_explanations:
            summary_meaning = tag_explanations[top_tag].split(',')[0].strip()
            audience_lines.append(f"- A significant portion of users are **'{top_tag}'** (meaning: {summary_meaning}). Tailor content or CTAs for this audience.")

    if audience_lines:
        summary_lines.append("\n**Top Content & Audience:**")
        summary_lines.extend(audience_lines)

    summary_lines.append("\n**Actionable Insight:** Focus on optimizing the **user journey from landing pages to deeper content** to reduce bounce rate and increase session depth. For **downloadable assets**, ensure their visibility and relevance are maximized.")

    return dbc.Card([
        dbc.CardHeader("Executive Summary: Quick Insights", className="bg-info text-white"),
        dbc.CardBody(dcc.Markdown("\n".join(summary_lines), className="ai-summary-text")),
    ], className="mb-4")

# --- Callback for Website Company Insights Dashboard ---
@app.callback(
    # Outputting to three distinct components:
    Output('website-company-dashboard-content', 'children'),
    Output('website-executive-summary-container', 'children'),
    Output('tag-behavior-table-container', 'children'),
    Input('website-company-dropdown', 'value')
)
def update_website_company_dashboard(selected_website):
    """Rebuild the Website Company Insights tab for the chosen website.

    Args:
        selected_website: Value of the website dropdown. 'All' keeps every row
            of df_all_with_profiles; any other value filters on the
            'website_company' column.

    Returns:
        A 3-tuple matching the callback outputs: the dashboard body (KPI cards,
        AI analysis card and charts), the executive-summary card, and the
        per-tag behaviour table (or a placeholder paragraph when no tag data
        is available). When the selection has no rows, an alert, an empty div
        and a placeholder paragraph are returned instead.
    """
    if selected_website == 'All':
        df_filtered_website = df_all_with_profiles
    else:
        df_filtered_website = df_all_with_profiles[df_all_with_profiles['website_company'] == selected_website]

    if df_filtered_website.empty:
        return (
            dbc.Alert(f"No data available for website: {selected_website}", color="warning"),
            html.Div(),  # Return an empty div for Executive Summary
            html.P("No data for tag behavior table.")  # Return text for tag table
        )

    analyzer_filtered_website = BehaviorAnalyzer(df_filtered_website)

    # Map Cluster ID to Cluster Name for readability in the AI summary.
    cluster_content_interest_df = analyzer_filtered_website.analyze_content_interest_by_cluster()
    if not cluster_content_interest_df.empty and cluster_personas_data:
        cluster_id_to_name_map = {p['num']: p['name'] for p in cluster_personas_data}
        cluster_content_interest_df['Cluster Name'] = cluster_content_interest_df['Cluster ID'].map(cluster_id_to_name_map).fillna('Unclustered')
    else:
        # No cluster data or personas: hand downstream code an empty frame with the expected columns.
        cluster_content_interest_df = pd.DataFrame(columns=['Cluster ID', 'Section', 'Interest Score', 'Cluster Name'])

    # Downloads KPI benchmark: mean of total downloads across the known websites.
    # Only the download totals are needed here, so no per-website analyzer is built.
    per_site_total_downloads = []
    for wc in website_companies:
        df_site = df_all_with_profiles[df_all_with_profiles['website_company'] == wc]
        if not df_site.empty:
            per_site_total_downloads.append(df_site['download_flag'].sum())
    global_bench_downloads_for_kpi = np.mean(per_site_total_downloads) if per_site_total_downloads else 0.0

    # Global benchmarks computed over the full processed data set.
    global_analyzer = BehaviorAnalyzer(df_processed)
    global_bench_avg_depth = global_analyzer.calculate_average_depth()['metrics_data']['avg_depth_per_visit']
    global_bench_bounce_rate = global_analyzer.calculate_bounce_rate()['metrics_data']['overall_bounce_rate']
    global_visit_count = df_processed['id_visit'].nunique()
    global_avg_downloads_per_visit_for_ai_text = df_processed['download_flag'].sum() / global_visit_count if global_visit_count > 0 else 0.0

    benchmark_for_text = {
        'avg_depth': global_bench_avg_depth,
        'bounce_rate': global_bench_bounce_rate,
        'avg_downloads_per_visit': global_avg_downloads_per_visit_for_ai_text
    }

    filtered_visitor_ids = df_filtered_website['visitor_id'].unique()
    df_profiles_filtered_website = df_profiles[df_profiles['visitor_id'].isin(filtered_visitor_ids)]

    if df_profiles_filtered_website.empty:
        tag_counts = pd.Series(dtype='int64')
    else:
        # user_tags is a comma-separated string; count each tag individually.
        tag_counts = df_profiles_filtered_website['user_tags'].str.split(',').explode().str.strip().value_counts()

    metrics_filtered_website = {
        'pageview': analyzer_filtered_website.df['section'].value_counts(),
        'time': analyzer_filtered_website.df['timestamp'].dt.hour.value_counts().sort_index(),
        'bounce_rate': analyzer_filtered_website.calculate_bounce_rate()['metrics_data']['overall_bounce_rate'],
        'avg_depth': analyzer_filtered_website.calculate_average_depth()['metrics_data']['avg_depth_per_visit'],
        'total_visits': df_filtered_website['id_visit'].nunique(),
        'unique_visitors': df_filtered_website['visitor_id'].nunique(),
        'total_downloads': df_filtered_website['download_flag'].sum(),
        'new_vs_returning_comp': analyzer_filtered_website.compare_new_vs_returning()['metrics_data'],
        'tag_counts': tag_counts
    }

    # Section share (%) for the selection vs. the whole data set.
    section_counts = metrics_filtered_website['pageview']
    if not section_counts.empty and section_counts.sum() > 0:
        company_pageviews_pct = (section_counts / section_counts.sum() * 100).reset_index(name='percentage')
    else:
        company_pageviews_pct = pd.DataFrame(columns=['section', 'percentage'])
    # Older pandas names the reset index column 'index'; normalise it to 'section'.
    company_pageviews_pct = company_pageviews_pct.rename(columns={'index': 'section'})
    company_pageviews_pct['type'] = selected_website

    if len(df_processed) > 0:
        overall_pageviews_pct = (df_processed['section'].value_counts() / len(df_processed) * 100).reset_index(name='percentage')
    else:
        overall_pageviews_pct = pd.DataFrame(columns=['section', 'percentage'])
    overall_pageviews_pct = overall_pageviews_pct.rename(columns={'index': 'section'})
    overall_pageviews_pct['type'] = 'Overall'

    combined_pageviews_for_ai = pd.concat([company_pageviews_pct, overall_pageviews_pct], ignore_index=True)

    ai_analysis_text = generate_website_ai_analysis(
        selected_website,
        metrics_filtered_website,
        dict(benchmark_for_text),
        tag_explanations,
        combined_pageviews_for_ai,
        cluster_content_interest_df=cluster_content_interest_df
    )

    # Per-tag behaviour comparison for the selected website.
    tag_behavior_data = []
    for tag in all_tags:
        # regex=False: tag names are literal text, not regular expressions.
        has_tag = df_profiles_filtered_website['user_tags'].str.contains(tag, na=False, regex=False)
        visitors_with_tag_profiles = df_profiles_filtered_website[has_tag]
        if visitors_with_tag_profiles.empty:
            continue

        visitors_with_tag_ids = visitors_with_tag_profiles['visitor_id'].unique()
        df_tag_filtered = df_filtered_website[df_filtered_website['visitor_id'].isin(visitors_with_tag_ids)]
        if df_tag_filtered.empty:
            continue

        tag_analyzer = BehaviorAnalyzer(df_tag_filtered)
        tag_avg_depth = tag_analyzer.calculate_average_depth()['metrics_data']['avg_depth_per_visit']
        tag_bounce_rate = tag_analyzer.calculate_bounce_rate()['metrics_data']['overall_bounce_rate']
        tag_total_downloads = tag_analyzer.df['download_flag'].sum()
        tag_unique_visitors = visitors_with_tag_profiles['visitor_id'].nunique()

        tag_behavior_data.append({
            'Tag': tag,
            'Unique Visitors': tag_unique_visitors,
            'Avg Session Depth': round(tag_avg_depth, 2),
            'Bounce Rate': f"{tag_bounce_rate:.2%}",
            'Total Downloads': int(tag_total_downloads)
        })

    df_tag_behavior = pd.DataFrame(tag_behavior_data)
    if df_tag_behavior.empty:
        tag_comparison_table_content = html.P("No sufficient tag behavior data available for analysis for the selected website.")
    else:
        df_tag_behavior = df_tag_behavior.sort_values(by='Avg Session Depth', ascending=False)
        tag_comparison_table_content = dash_table.DataTable(
            id='tag-behavior-table',  # Keep ID for potential future interactivity
            columns=[{"name": col, "id": col} for col in df_tag_behavior.columns],
            data=df_tag_behavior.to_dict('records'),
            style_table={'overflowX': 'auto'},
            page_action="native",
            page_current=0,
            page_size=10,
            sort_action="native",
            sort_mode="multi",
            style_header={'backgroundColor': 'rgb(230, 230, 230)', 'fontWeight': 'bold'},
            style_cell={'textAlign': 'left', 'minWidth': '120px', 'width': '120px', 'maxWidth': '180px'},
        )

    # --- Figures ---
    if not combined_pageviews_for_ai.empty:
        pageview_fig = px.bar(
            combined_pageviews_for_ai,
            x='section',
            y='percentage',
            color='type',
            barmode='group',
            title="Pageview Distribution by Section (%)",
            labels={'percentage': 'Pageviews (%)', 'section': 'Website Section', 'type': 'Data Set'},
        )
    else:
        pageview_fig = go.Figure().update_layout(title="Not enough data for this plot")

    hourly_visits = metrics_filtered_website['time']
    if not hourly_visits.empty:
        hourly_fig = px.line(
            x=hourly_visits.index,
            y=hourly_visits.values,
            title="Visitor Trend by Hour",
            labels={'x': 'Hour of Day', 'y': 'Visits'},
        )
    else:
        hourly_fig = go.Figure().update_layout(title="Not enough data for this plot")

    if not tag_counts.empty:
        tag_fig = px.pie(
            names=tag_counts.index,
            values=tag_counts.values,
            title="Visitor Tag Proportions",
        )
    else:
        tag_fig = go.Figure().update_layout(title="No visitor tags to display")

    new_vs_returning = metrics_filtered_website['new_vs_returning_comp']
    new_vs_returning_fig = px.bar(
        x=['New Visitors', 'Returning Visitors'],
        y=[new_vs_returning['new_depth_comparison'], new_vs_returning['returning_depth_comparison']],
        title="Average Session Depth Comparison",
        labels={'x': 'Visitor Type', 'y': 'Average Depth'},
    )

    # No `values` column: sector size is the number of rows (clicks). Passing
    # 'id_visit' would size sectors by the sum of visit identifiers.
    # df_filtered_website is known to be non-empty here (see early return).
    sitemap_fig = px.sunburst(
        df_filtered_website,
        path=['section', 'sub-section'],
        title='Sitemap Clicks Heatmap'
    )

    # Require at least two distinct clusters; an all-missing cluster column
    # (nunique() == 0) cannot form a hierarchy either.
    if 'cluster' in df_filtered_website.columns and df_filtered_website['cluster'].nunique() > 1:
        cluster_fig = px.sunburst(
            df_filtered_website,
            path=['cluster', 'section'],
            title='Cluster-Based Content Engagement'
        )
    else:
        cluster_fig = go.Figure().update_layout(title="Not enough data for this plot")

    # Benchmarks are only shown for a single website; for 'All' they would compare the data to itself.
    show_benchmark = selected_website != 'All'

    # --- Dashboard Layout Structure for Website Company Insights ---
    # This is the content returned for 'website-company-dashboard-content'.
    # It contains all the KPIs and plots.
    website_dashboard_content = html.Div([
        # 1. KPI Card Layout with Benchmarks
        dbc.Row([
            dbc.Col(dbc.Card([
                dbc.CardHeader("Total Visits"),
                dbc.CardBody(html.H4(f"{metrics_filtered_website['total_visits']}", className="text-center text-secondary")),
                dbc.CardFooter(f"Total sessions for {selected_website}", className="text-muted")
            ], className="h-100"), md=3),
            dbc.Col(dbc.Card([
                dbc.CardHeader("Avg Session Depth"),
                dbc.CardBody([
                    html.H4(f"{metrics_filtered_website['avg_depth']:.2f} Pages", className="text-center text-info"),
                    html.P(f"(Benchmark Avg: {global_bench_avg_depth:.2f})", className="text-center text-muted") if show_benchmark else None
                ]),
                dbc.CardFooter("Avg pages visited per session", className="text-muted")
            ], className="h-100"), md=3),
            dbc.Col(dbc.Card([
                dbc.CardHeader("Bounce Rate"),
                dbc.CardBody([
                    html.H4(f"{metrics_filtered_website['bounce_rate']:.2%}", className="text-center text-danger"),
                    html.P(f"(Benchmark Avg: {global_bench_bounce_rate:.2%})", className="text-center text-muted") if show_benchmark else None
                ]),
                dbc.CardFooter("Single-page visit percentage", className="text-muted")
            ], className="h-100"), md=3),
            dbc.Col(dbc.Card([
                dbc.CardHeader("Downloads"),
                dbc.CardBody([
                    html.H4(f"{int(metrics_filtered_website['total_downloads'])}", className="text-center text-success"),
                    html.P(f"(Benchmark Avg: {global_bench_downloads_for_kpi:.1f})", className="text-center text-muted") if show_benchmark else None
                ]),
                dbc.CardFooter("Total file downloads", className="text-muted")
            ], className="h-100"), md=3),
        ], className="mb-4"),
        # AI Analysis Card
        dbc.Row([
            dbc.Col(dbc.Card([
                dbc.CardHeader("AI Website Insight Analysis", className="bg-primary text-white"),
                dbc.CardBody(dcc.Markdown(ai_analysis_text, className="ai-analysis-text")),
            ]), width=12, className="mb-4")
        ]),
        # Row 1: Pageviews Comparison & Time Distribution
        dbc.Row([
            dbc.Col(dbc.Card([
                dbc.CardHeader("Pageview Percentage Comparison (Website vs. Overall)"),
                dbc.CardBody(dcc.Graph(figure=pageview_fig)),
            ]), md=6, className="mb-4"),
            dbc.Col(dbc.Card([
                dbc.CardHeader("Visits by Hour of Day"),
                dbc.CardBody(dcc.Graph(figure=hourly_fig)),
            ]), md=6, className="mb-4"),
        ]),
        # Row 2: Visitor Tags & New vs. Returning
        dbc.Row([
            dbc.Col(dbc.Card([
                dbc.CardHeader("Visitor Behavior Tag Distribution"),
                dbc.CardBody(dcc.Graph(
                    id='tag-distribution-graph-website',
                    figure=tag_fig
                )),
            ]), md=6, className="mb-4"),
            dbc.Col(dbc.Card([
                dbc.CardHeader("New vs. Returning Visitor Comparison"),
                dbc.CardBody(dcc.Graph(figure=new_vs_returning_fig)),
            ]), md=6, className="mb-4"),
        ]),
        # Row 3: Sitemap Heatmap & Hierarchical Chart
        dbc.Row([
            dbc.Col(dbc.Card([
                dbc.CardHeader("Website Navigation Heatmap"),
                dbc.CardBody(dcc.Graph(figure=sitemap_fig)),
            ]), md=6, className="mb-4"),
            dbc.Col(dbc.Card([
                dbc.CardHeader("Investor Profile Engagement by Content Area"),
                dbc.CardBody(dcc.Graph(figure=cluster_fig)),
            ]), md=6, className="mb-4"),
        ]),
    ])

    return (
        website_dashboard_content,
        generate_website_executive_summary_content(selected_website, metrics_filtered_website, dict(benchmark_for_text)),
        tag_comparison_table_content
    )


# --- Callback for Visitor Company Insights Dashboard ---
@app.callback(
    # Only one output for the entire dashboard content
    Output('visitor-company-dashboard-content', 'children'),
    Input('visitor-company-dropdown', 'value')
)
def update_visitor_company_dashboard(selected_company):
    """Rebuild the Visitor Company Insights tab for the chosen visitor company.

    Args:
        selected_company: Value of the visitor-company dropdown. 'All' keeps
            every row of df_all_with_profiles; any other value filters on the
            'company' column.

    Returns:
        An html.Div holding the KPI cards and charts for the selection, or a
        warning dbc.Alert when the selection has no rows.
    """
    if selected_company == 'All':
        df_filtered = df_all_with_profiles
    else:
        df_filtered = df_all_with_profiles[df_all_with_profiles['company'] == selected_company]

    if df_filtered.empty:
        return dbc.Alert(f"No data available for company: {selected_company}", color="warning")

    filtered_visitor_ids = df_filtered['visitor_id'].unique()
    df_profiles_filtered = df_profiles[df_profiles['visitor_id'].isin(filtered_visitor_ids)]

    analyzer_filtered = BehaviorAnalyzer(df_filtered)

    if df_profiles_filtered.empty:
        tag_counts = pd.Series(dtype='int64')
    else:
        # user_tags is a comma-separated string; count each tag individually.
        tag_counts = df_profiles_filtered['user_tags'].str.split(',').explode().str.strip().value_counts()

    metrics_filtered = {
        'pageview': analyzer_filtered.df['section'].value_counts(),
        'time': analyzer_filtered.df['timestamp'].dt.hour.value_counts().sort_index(),
        'bounce_rate': analyzer_filtered.calculate_bounce_rate()['metrics_data']['overall_bounce_rate'],
        'avg_depth': analyzer_filtered.calculate_average_depth()['metrics_data']['avg_depth_per_visit'],
        'total_visits': df_filtered['id_visit'].nunique(),
        'unique_visitors': df_filtered['visitor_id'].nunique(),
        'total_downloads': df_filtered['download_flag'].sum(),
        'new_vs_returning_comp': analyzer_filtered.compare_new_vs_returning()['metrics_data'],
        'tag_counts': tag_counts
    }

    # Section share (%) for the selection vs. the whole data set.
    section_counts = metrics_filtered['pageview']
    if not section_counts.empty and section_counts.sum() > 0:
        company_pageviews_pct = (section_counts / section_counts.sum() * 100).reset_index(name='percentage')
    else:
        company_pageviews_pct = pd.DataFrame(columns=['section', 'percentage'])
    # Older pandas names the reset index column 'index'; normalise it to 'section'.
    company_pageviews_pct = company_pageviews_pct.rename(columns={'index': 'section'})
    company_pageviews_pct['type'] = selected_company

    if len(df_processed) > 0:
        overall_pageviews_pct = (df_processed['section'].value_counts() / len(df_processed) * 100).reset_index(name='percentage')
    else:
        overall_pageviews_pct = pd.DataFrame(columns=['section', 'percentage'])
    overall_pageviews_pct = overall_pageviews_pct.rename(columns={'index': 'section'})
    overall_pageviews_pct['type'] = 'Overall'

    combined_pageviews = pd.concat([company_pageviews_pct, overall_pageviews_pct], ignore_index=True)

    # --- Figures ---
    if not combined_pageviews.empty:
        pageview_fig = px.bar(
            combined_pageviews,
            x='section',
            y='percentage',
            color='type',
            barmode='group',
            title="Pageview Distribution by Section (%)",
            labels={'percentage': 'Pageviews (%)', 'section': 'Website Section', 'type': 'Data Set'},
        )
    else:
        pageview_fig = go.Figure().update_layout(title="Not enough data for this plot")

    hourly_visits = metrics_filtered['time']
    if not hourly_visits.empty:
        hourly_fig = px.line(
            x=hourly_visits.index,
            y=hourly_visits.values,
            title="Visitor Trend by Hour",
            labels={'x': 'Hour of Day', 'y': 'Visits'},
        )
    else:
        hourly_fig = go.Figure().update_layout(title="Not enough data for this plot")

    if not tag_counts.empty:
        tag_fig = px.pie(
            names=tag_counts.index,
            values=tag_counts.values,
            title="Visitor Tag Proportions",
        )
    else:
        tag_fig = go.Figure().update_layout(title="No visitor tags to display")

    new_vs_returning = metrics_filtered['new_vs_returning_comp']
    new_vs_returning_fig = px.bar(
        x=['New Visitors', 'Returning Visitors'],
        y=[new_vs_returning['new_depth_comparison'], new_vs_returning['returning_depth_comparison']],
        title="Average Session Depth Comparison",
        labels={'x': 'Visitor Type', 'y': 'Average Depth'},
    )

    # No `values` column: sector size is the number of rows (clicks). Passing
    # 'id_visit' would size sectors by the sum of visit identifiers.
    # df_filtered is known to be non-empty here (see early return).
    sitemap_fig = px.sunburst(
        df_filtered,
        path=['section', 'sub-section'],
        title='Sitemap Clicks Heatmap'
    )

    # Require at least two distinct clusters; an all-missing cluster column
    # (nunique() == 0) cannot form a hierarchy either.
    if 'cluster' in df_filtered.columns and df_filtered['cluster'].nunique() > 1:
        cluster_fig = px.sunburst(
            df_filtered,
            path=['cluster', 'section'],
            title='Cluster-Based Content Engagement'
        )
    else:
        cluster_fig = go.Figure().update_layout(title="Not enough data for this plot")

    overall_avg_depth = overall_metrics['avg_depth_results']['avg_depth_per_visit']
    overall_bounce_rate = overall_metrics['bounce_results']['overall_bounce_rate']

    # This is the complete content returned for 'visitor-company-dashboard-content'
    visitor_dashboard_content = html.Div([
        # 1. KPI Card Layout with Benchmarks
        dbc.Row([
            dbc.Col(dbc.Card([
                dbc.CardHeader("Total Visits"),
                dbc.CardBody(html.H4(f"{metrics_filtered['total_visits']}", className="text-center text-secondary")),
                dbc.CardFooter("Total sessions for this company", className="text-muted")
            ], className="h-100"), md=3),
            dbc.Col(dbc.Card([
                dbc.CardHeader("Avg Session Depth"),
                dbc.CardBody([
                    html.H4(f"{metrics_filtered['avg_depth']:.2f} Pages", className="text-center text-info"),
                    html.P(f"(Overall Avg: {overall_avg_depth:.2f})", className="text-center text-muted")
                ]),
                dbc.CardFooter("Avg pages visited per session", className="text-muted")
            ], className="h-100"), md=3),
            dbc.Col(dbc.Card([
                dbc.CardHeader("Bounce Rate"),
                dbc.CardBody([
                    html.H4(f"{metrics_filtered['bounce_rate']:.2%}", className="text-center text-danger"),
                    # Same percent format as the headline so both numbers are comparable.
                    html.P(f"(Overall Avg: {overall_bounce_rate:.2%})", className="text-center text-muted")
                ]),
                dbc.CardFooter("Single-page visit percentage", className="text-muted")
            ], className="h-100"), md=3),
            dbc.Col(dbc.Card([
                dbc.CardHeader("Downloads"),
                dbc.CardBody([
                    html.H4(f"{int(metrics_filtered['total_downloads'])}", className="text-center text-success"),
                    html.P(f"(Overall Avg: {overall_visitor_averages.get('download_count', 0):.1f})", className="text-center text-muted")
                ]),
                dbc.CardFooter("Total file downloads", className="text-muted")
            ], className="h-100"), md=3),
        ], className="mb-4"),

        # 2. Interactive Charts with Benchmarks
        dbc.Row([
            dbc.Col(dbc.Card([
                dbc.CardHeader("Pageview Percentage Comparison (Company vs. Overall)"),
                dbc.CardBody(dcc.Graph(figure=pageview_fig)),
            ]), md=6, className="mb-4"),
            dbc.Col(dbc.Card([
                dbc.CardHeader("Visits by Hour of Day"),
                dbc.CardBody(dcc.Graph(figure=hourly_fig)),
            ]), md=6, className="mb-4"),
        ]),
        dbc.Row([
            dbc.Col(dbc.Card([
                dbc.CardHeader("Visitor Behavior Tag Distribution"),
                dbc.CardBody(dcc.Graph(
                    id='tag-distribution-graph-visitor',
                    figure=tag_fig
                )),
            ]), md=6, className="mb-4"),
            dbc.Col(dbc.Card([
                dbc.CardHeader("New vs. Returning Visitor Comparison"),
                dbc.CardBody(dcc.Graph(figure=new_vs_returning_fig)),
            ]), md=6, className="mb-4"),
        ]),
        dbc.Row([
            dbc.Col(dbc.Card([
                dbc.CardHeader("Website Navigation Heatmap"),
                dbc.CardBody(dcc.Graph(figure=sitemap_fig)),
            ]), md=6, className="mb-4"),
            dbc.Col(dbc.Card([
                dbc.CardHeader("Investor Profile Engagement by Content Area"),
                dbc.CardBody(dcc.Graph(figure=cluster_fig)),
            ]), md=6, className="mb-4"),
        ]),
    ])
    return visitor_dashboard_content  # Return the complete content for this tab

@app.callback(
    Output('elbow-method-chart', 'figure'),
    Output('silhouette-score-chart', 'figure'),
    Input('tabs-main', 'active_tab')
)
def update_clustering_diagnostics(active_tab):
    """Build the elbow and silhouette diagnostic charts.

    Reads ``overall_metrics['clustering_results']['clustering_scores']`` and
    returns ``(elbow_figure, silhouette_figure)``. When that entry is missing,
    both figures are empty placeholders titled "Clustering data not available".
    ``active_tab`` only triggers the callback; its value is not inspected.
    """
    scores_available = (
        'clustering_results' in overall_metrics
        and 'clustering_scores' in overall_metrics['clustering_results']
    )
    if not scores_available:
        unavailable_title = "Clustering data not available"
        return (
            go.Figure().update_layout(title=unavailable_title),
            go.Figure().update_layout(title=unavailable_title),
        )

    scores = pd.DataFrame(overall_metrics['clustering_results']['clustering_scores'])
    k_axis_title = "Number of Clusters (k)"

    # Elbow chart: inertia against the number of clusters.
    elbow_chart = px.line(
        scores,
        x='n_clusters',
        y='Distortion (Inertia)',
        markers=True,
        title='Elbow Method for Optimal K',
        labels={'n_clusters': 'Number of Clusters (k)', 'Distortion (Inertia)': 'Inertia'},
        template='plotly_white',
    )
    elbow_chart.update_layout(xaxis_title=k_axis_title, yaxis_title="Inertia (Distortion)")

    # Silhouette chart: rows containing any missing value are left out of the line.
    silhouette_chart = px.line(
        scores.dropna(),
        x='n_clusters',
        y='Silhouette Score',
        markers=True,
        title='Silhouette Scores for Optimal K',
        labels={'n_clusters': 'Number of Clusters (k)', 'Silhouette Score': 'Silhouette Score'},
        template='plotly_white',
    )
    silhouette_chart.update_layout(xaxis_title=k_axis_title, yaxis_title="Silhouette Score")

    # Mark the k with the highest silhouette score, when at least one score exists.
    silhouette_values = scores['Silhouette Score']
    if silhouette_values.notna().any():
        best_k = scores.loc[silhouette_values.idxmax(), 'n_clusters']
        silhouette_chart.add_vline(
            x=best_k,
            line_width=2,
            line_dash="dash",
            line_color="red",
            annotation_text=f"Optimal k = {best_k}",
            annotation_position="top right",
        )

    return elbow_chart, silhouette_chart

def _profile_number(value, default=0.0):
    """Return ``value`` as a float, or ``default`` when it is missing or non-numeric."""
    try:
        number = float(value)
    except (TypeError, ValueError):
        return default
    return default if np.isnan(number) else number


def _build_visitor_sankey(user_visits, selected_visitor_id):
    """Return a Sankey figure of section-to-section moves inside each visit.

    ``user_visits`` holds one visitor's rows, already ordered by visit and
    timestamp. Rows whose section is missing or the literal text 'nan' are
    ignored. Each link's value is the number of times that transition occurs
    across the visitor's visits.
    """
    with_section = user_visits.dropna(subset=['section'])
    with_section = with_section[with_section['section'].astype(str).str.lower() != 'nan']
    if with_section.empty:
        return go.Figure().update_layout(title="No valid path data")

    visit_paths = [
        sections.astype(str).tolist()
        for _, sections in with_section.groupby('id_visit')['section']
    ]

    # Count transitions directly from the ordered section lists instead of
    # round-tripping through ' -> '-joined strings (which would break on a
    # section name that itself contains the separator).
    node_ids = {}
    link_totals = {}
    for path in visit_paths:
        for section in path:
            node_ids.setdefault(section, len(node_ids))
        for origin, destination in zip(path, path[1:]):
            link = (node_ids[origin], node_ids[destination])
            link_totals[link] = link_totals.get(link, 0) + 1

    figure = go.Figure(go.Sankey(
        node=dict(
            pad=15, thickness=20,
            line=dict(color="black", width=0.5),
            label=list(node_ids),
        ),
        link=dict(
            source=[origin for origin, _ in link_totals],
            target=[destination for _, destination in link_totals],
            value=list(link_totals.values()),
        ),
    ))
    figure.update_layout(title_text=f"Visitor ID {selected_visitor_id} Path Flow", font_size=10)
    return figure


def _build_visitor_radar(user_profile, overall_averages):
    """Return a radar chart comparing one visitor with the overall averages.

    Every axis is scaled by the larger of the two values for that metric, so
    the two traces are directly comparable on each axis. Scaling each trace
    across the four unrelated metrics (as was done before) made the shapes
    independent of one another and the comparison meaningless.
    """
    axes = [
        ('Session Depth', 'avg_session_depth'),
        ('Downloads', 'download_count'),
        ('IR Interest', 'investor_interest_score'),
        ('Content Breadth', 'content_breadth'),
    ]
    axis_labels = []
    visitor_scaled = []
    average_scaled = []
    for label, key in axes:
        visitor_value = _profile_number(user_profile.get(key, 0.0))
        average_value = _profile_number(overall_averages.get(key, 0.0))
        peak = max(abs(visitor_value), abs(average_value))
        axis_labels.append(label)
        visitor_scaled.append(visitor_value / peak if peak else 0.0)
        average_scaled.append(average_value / peak if peak else 0.0)

    figure = go.Figure()
    figure.add_trace(go.Scatterpolar(
        r=visitor_scaled,
        theta=axis_labels,
        fill='toself',
        name='This Visitor'
    ))
    figure.add_trace(go.Scatterpolar(
        r=average_scaled,
        theta=axis_labels,
        fill='toself',
        name='Overall Average'
    ))
    figure.update_layout(
        polar=dict(
            radialaxis=dict(visible=True, range=[-0.1, 1.1]),
            bgcolor='rgba(255, 255, 255, 0)'
        ),
        showlegend=True,
        title='Visitor Behavior vs. Overall Average (Normalized)'
    )
    return figure


def _build_visitor_tag_badges(user_profile, selected_visitor_id):
    """Return badge + tooltip components for the visitor's comma-separated tags.

    Each tooltip shows the tag's entry from ``tag_explanations`` and up to five
    other visitors from ``df_profiles`` whose ``user_tags`` contain the tag.
    """
    raw_tags = user_profile.get('user_tags', '')
    if not isinstance(raw_tags, str):
        # A missing tag cell (e.g. NaN) has no .split(); treat it as "no tags".
        return []

    components = []
    for position, raw_tag in enumerate(raw_tags.split(',')):
        tag = raw_tag.strip()
        if not tag:
            continue
        badge_id = f"tag-badge-{selected_visitor_id}-{position}"
        components.append(
            dbc.Badge(
                tag,
                color="info",
                className="me-1",
                id=badge_id
            )
        )

        tooltip_content = [html.P(tag_explanations.get(tag, "No explanation available."))]

        # regex=False: the tag is literal text, not a pattern.
        similar_visitors = df_profiles[
            df_profiles['user_tags'].str.contains(tag, case=False, na=False, regex=False)
        ]
        similar_visitors = similar_visitors[similar_visitors['visitor_id'] != selected_visitor_id]

        if not similar_visitors.empty:
            tooltip_content.append(html.Hr())
            tooltip_content.append(html.P(f"Other visitors with '{tag}' tag:", className="mb-1"))
            tooltip_content.append(html.Ul([
                html.Li(f"ID: {row['visitor_id']} (Company: {row['company']})")
                for _, row in similar_visitors.head(5).iterrows()
            ]))

        components.append(
            dbc.Tooltip(
                html.Div(tooltip_content),
                target=badge_id,
                placement="top",
            )
        )
    return components


@app.callback(
    Output('individual-visitor-content', 'children'),
    Input('visitor-table', 'selected_rows')
)
def update_individual_visitor_stats(selected_rows):
    """Render the detail card for the visitor selected in the visitor table.

    Args:
        selected_rows: Row indices selected in the 'visitor-table' component;
            only the first is used, as a positional index into ``df_profiles``.

    Returns:
        A prompt when nothing is selected, otherwise a card with the generated
        visitor story, a visitor-vs-average table, tag badges, a Sankey path
        diagram and a radar chart. Any exception raised while building the
        card is reported to the user as an error message instead of
        propagating out of the callback.
    """
    if not selected_rows:
        return html.Div([html.H4("No Visitor Selected"), html.P("Please select a row in the table above to view the visitor's details.")])

    try:
        # Inside the try so that a stale/out-of-range row index is reported
        # through the same error panel as every other failure.
        selected_visitor_id = df_profiles.iloc[selected_rows[0]]['visitor_id']

        user_profile_df = df_profiles[df_profiles['visitor_id'] == selected_visitor_id]
        if user_profile_df.empty:
            return html.Div([html.H4("Visitor Profile Not Found"), html.P(f"No detailed profile found for Visitor ID: {selected_visitor_id}.")])
        user_profile = user_profile_df.iloc[0].copy()

        for col in BehaviorAnalyzer.CLUSTERING_FEATURES:
            if col not in user_profile:
                user_profile[col] = 0.0

        analyzer = BehaviorAnalyzer(df_processed)
        user_story = analyzer.generate_user_story(selected_visitor_id, overall_metrics_dict=overall_metrics, overall_visitor_averages=overall_visitor_averages)

        visitor_company = user_profile.get('company', 'N/A')

        user_visits = df_processed[df_processed['visitor_id'] == selected_visitor_id].sort_values(['id_visit', 'timestamp'])
        sankey_fig = _build_visitor_sankey(user_visits, selected_visitor_id)

        def visitor_metric(key):
            # Missing/NaN metrics count as 0 so int()/format never raise.
            return _profile_number(user_profile.get(key, 0.0))

        def average_metric(key):
            return _profile_number(overall_visitor_averages.get(key, 0.0))

        profile_data = {
            'Metric': ['Average Session Depth', 'Download Count', 'IR Interest Score', 'Content Breadth', 'Visit Count', 'Repeat Visitor'],
            'This Visitor': [
                f"{visitor_metric('avg_session_depth'):.2f}",
                f"{int(visitor_metric('download_count'))}",
                f"{int(visitor_metric('investor_interest_score'))}",
                f"{int(visitor_metric('content_breadth'))}",
                f"{int(visitor_metric('visit_count'))}",
                'Yes' if visitor_metric('is_repeat_visitor') > 0 else 'No',
            ],
            'Overall Average': [
                f"{average_metric('avg_session_depth'):.2f}",
                f"{average_metric('download_count'):.1f}",
                f"{average_metric('investor_interest_score'):.1f}",
                f"{average_metric('content_breadth'):.1f}",
                f"{average_metric('visit_count'):.1f}",
                f"{average_metric('is_repeat_visitor'):.2%}",
            ],
        }

        radar_fig = _build_visitor_radar(user_profile, overall_visitor_averages)
        tag_badges = _build_visitor_tag_badges(user_profile, selected_visitor_id)

        return dbc.Card(
            dbc.CardBody([
                dbc.Row([
                    dbc.Col([
                        html.H3(f'Detailed Profile for Visitor ID: {selected_visitor_id}'),
                        # html.P does not interpret Markdown, so bold the label
                        # with html.Strong rather than literal asterisks.
                        html.P([html.Strong("Visitor Company"), f": {visitor_company}"], className="lead")
                    ], width=12, className="mb-4")
                ]),
                dbc.Row([
                    dbc.Col([
                        dbc.Card([
                            dbc.CardHeader("AI-Generated Visitor Story"),
                            dbc.CardBody(dcc.Markdown(user_story, className="story-text"))
                        ], className="mb-4"),
                        dbc.Card([
                            dbc.CardHeader("Visitor vs. Overall Behavior Comparison"),
                            dbc.CardBody(dash_table.DataTable(
                                id='comparison-table',
                                columns=[{"name": name, "id": name} for name in profile_data],
                                data=pd.DataFrame(profile_data).to_dict('records'),
                                style_cell={'textAlign': 'left'},
                            ))
                        ]),
                    ], md=6),
                    dbc.Col([
                        dbc.Card([
                            dbc.CardHeader("Visitor Tags"),
                            dbc.CardBody(tag_badges),
                        ], className="mb-4"),
                        dbc.Card([
                            dbc.CardHeader("Visitor Path Flow (Sankey Diagram)"),
                            dbc.CardBody(dcc.Graph(figure=sankey_fig))
                        ], className="mb-4"),
                        dbc.Card([
                            dbc.CardHeader("Visual Behavior Comparison (Normalized)"),
                            dbc.CardBody(dcc.Graph(figure=radar_fig))
                        ])
                    ], md=6),
                ]),
            ])
        )
    except Exception as e:
        # Callback boundary: show the failure in the UI rather than letting the
        # callback error out.
        return html.Div([
            html.H4("An error occurred while displaying visitor details."),
            html.P(f"Error: {e}"),
            html.P("This might be due to incomplete data for the selected visitor.")
        ])
        
## --- NEW CALLBACK: Control Visibility of Global Modal Buttons ---
@app.callback(
    Output("open-tag-glossary-modal-website", "style"),
    Output("open-cluster-glossary-modal-website-tab", "style"),
    Output("open-cluster-glossary-modal-visitor-tab", "style"),
    Input("tabs-main", "active_tab")
)
def control_modal_button_visibility(active_tab):
    """Show only the glossary buttons that belong to the active tab.

    Returns three style dicts, in output order: tag glossary button (website
    tab), cluster glossary button (website tab), cluster glossary button
    (visitor tab). On 'tab-website-company' the first two are visible, on
    'tab-visitor-company' only the third is, and on any other tab all three
    are hidden.
    """
    # Dash style dicts use camelCased CSS property names (marginLeft rather
    # than margin-left).
    shown = {'display': 'inline-block', 'marginLeft': '8px'}  # inline-block keeps buttons on one line
    hidden = {'display': 'none'}

    website_buttons = shown if active_tab == 'tab-website-company' else hidden
    visitor_button = shown if active_tab == 'tab-visitor-company' else hidden

    return website_buttons, website_buttons, visitor_button


## --- NEW CALLBACKS FOR GLOSSARY MODALS ---
# Tag Glossary Modal Toggle
@app.callback(
    Output("tag-glossary-modal", "is_open"),
    Output("tag-glossary-modal-body", "children"),
    Input("open-tag-glossary-modal-website", "n_clicks"),
    Input("close-tag-glossary-modal", "n_clicks"),
    State("tag-glossary-modal", "is_open"),
)
def toggle_tag_glossary_modal(n_open, n_close, is_open):
    """Open or close the tag glossary modal and supply its body.

    Returns ``(is_open, body_children)``. The open button yields ``True`` and
    one heading + paragraph per entry of ``tag_explanations``; the close
    button yields ``False`` and an empty body. When nothing triggered the
    callback, or the trigger is not one of the two buttons, the current open
    state is kept and the body is empty.
    """
    triggered = dash.callback_context.triggered
    if not triggered:
        return is_open, []

    # prop_id has the form "<component-id>.<property>"; keep the component id.
    trigger_id = triggered[0]['prop_id'].split('.')[0]

    if trigger_id == "close-tag-glossary-modal":
        return False, []
    if trigger_id != "open-tag-glossary-modal-website":
        return is_open, []

    glossary_entries = []
    for tag_name, tag_text in tag_explanations.items():
        glossary_entries.extend([
            html.H5(tag_name, className="mt-3"),
            html.P(tag_text),
        ])
    return True, glossary_entries


# Cluster Glossary Modal Toggle
@app.callback(
    Output("cluster-glossary-modal", "is_open"),
    Output("cluster-glossary-modal-body", "children"),
    Input("open-cluster-glossary-modal-website-tab", "n_clicks"),
    Input("open-cluster-glossary-modal-visitor-tab", "n_clicks"),
    Input("close-cluster-glossary-modal", "n_clicks"),
    State("cluster-glossary-modal", "is_open"),
)
def toggle_cluster_glossary_modal(n_open_website, n_open_visitor, n_close, is_open):
    """Open or close the cluster glossary modal and supply its body.

    Returns ``(is_open, body)``. Either open button yields ``True`` and the
    content returned by ``generate_cluster_full_description_content()``; the
    close button yields ``False``. In every case other than opening, the body
    is a default "no personas available" message.
    """
    default_body = html.Div("No detailed cluster personas available.")

    triggered = dash.callback_context.triggered
    if not triggered:
        return is_open, default_body

    # prop_id has the form "<component-id>.<property>"; keep the component id.
    trigger_id = triggered[0]['prop_id'].split('.')[0]
    opener_ids = (
        "open-cluster-glossary-modal-website-tab",
        "open-cluster-glossary-modal-visitor-tab",
    )

    if trigger_id in opener_ids:
        return True, generate_cluster_full_description_content()
    if trigger_id == "close-cluster-glossary-modal":
        return False, default_body
    return is_open, default_body

# --- Callback for Event Impact Analysis Tab ---
def _event_tab_placeholder(title):
    """Return an empty figure that carries only an explanatory title."""
    return go.Figure().update_layout(title=title)


def _add_event_markers(fig, events, color):
    """Draw one dashed, annotated vertical line per row of ``events`` on ``fig``."""
    for _, event in events.iterrows():
        # add_vline is given epoch milliseconds. pd.Timestamp.timestamp()
        # treats a naive timestamp as UTC, whereas datetime.timestamp() would
        # apply the machine's local zone and shift the marker by the UTC offset.
        event_ms = pd.Timestamp(event['event_date']).timestamp() * 1000
        fig.add_vline(x=event_ms, line_width=1, line_dash="dash", line_color=color,
                      annotation_text=event['event_title'], annotation_position="top right",
                      annotation_font_size=10, annotation_font_color=color)


def _percent_change(event_value, baseline_value):
    """Return the percentage change from baseline, or NaN when the baseline is 0."""
    if baseline_value == 0:
        return np.nan
    return (event_value - baseline_value) / baseline_value * 100


def _compute_cluster_event_impact(daily_clustered, events, personas,
                                  pre_event_window=7, post_event_window=7,
                                  baseline_offset=20, baseline_duration=10):
    """Return per-event, per-cluster percentage changes versus a baseline window.

    For every event the event window is ``[date - pre, date + post]`` days.
    The baseline window starts ``baseline_offset`` days before the event
    window and lasts ``baseline_duration`` days (both ends inclusive). Daily
    means of visits, downloads and session depth are compared per cluster;
    a cluster with no rows in a window contributes 0 for that window.
    """
    # First persona wins when several share a number, matching a linear search.
    persona_names = {}
    for persona in personas:
        persona_names.setdefault(persona['num'], persona['name'])

    aggregations = dict(
        avg_visits=('total_visits', 'mean'),
        avg_downloads=('total_downloads', 'mean'),
        avg_session_depth=('avg_session_depth', 'mean'),
    )

    rows = []
    for _, event in events.iterrows():
        event_date = event['event_date']
        event_title = event['event_title']

        event_window_start = event_date - timedelta(days=pre_event_window)
        event_window_end = event_date + timedelta(days=post_event_window)

        # The baseline must end AFTER it starts. The previous formula put the
        # end before the start, so the baseline was always empty and every
        # percentage change came out as NaN.
        baseline_start = event_window_start - timedelta(days=baseline_offset)
        baseline_end = baseline_start + timedelta(days=baseline_duration - 1)

        dates = daily_clustered['date']
        event_window_behavior = daily_clustered[(dates >= event_window_start) & (dates <= event_window_end)]
        baseline_behavior = daily_clustered[(dates >= baseline_start) & (dates <= baseline_end)]

        print(f"  Processing event: {event_title} ({event_date.strftime('%Y-%m-%d')})")
        print(f"    Event window behavior rows: {len(event_window_behavior)}")
        print(f"    Baseline behavior rows: {len(baseline_behavior)}")

        combined_metrics = (
            event_window_behavior.groupby('cluster').agg(**aggregations)
            .join(baseline_behavior.groupby('cluster').agg(**aggregations),
                  how='outer', lsuffix='_event', rsuffix='_baseline')
            .fillna(0)
        )

        for cluster_id in combined_metrics.index.unique():
            cluster_name = persona_names.get(cluster_id, f"Cluster {int(cluster_id)}")
            metrics = combined_metrics.loc[cluster_id]

            visits_change_pct = _percent_change(metrics['avg_visits_event'], metrics['avg_visits_baseline'])
            downloads_change_pct = _percent_change(metrics['avg_downloads_event'], metrics['avg_downloads_baseline'])
            session_depth_change_pct = _percent_change(metrics['avg_session_depth_event'], metrics['avg_session_depth_baseline'])

            print(f"    Cluster {cluster_id} ({cluster_name}) changes for event {event_title}:")
            print(f"      Visits change: {visits_change_pct:.2f}% (Event: {metrics['avg_visits_event']:.2f}, Baseline: {metrics['avg_visits_baseline']:.2f})")
            print(f"      Downloads change: {downloads_change_pct:.2f}% (Event: {metrics['avg_downloads_event']:.2f}, Baseline: {metrics['avg_downloads_baseline']:.2f})")
            print(f"      Session Depth change: {session_depth_change_pct:.2f}% (Event: {metrics['avg_session_depth_event']:.2f}, Baseline: {metrics['avg_session_depth_baseline']:.2f})")

            rows.append({
                'Event Title': event_title,
                'Event Date': event_date.strftime('%Y-%m-%d'),
                'Cluster ID': int(cluster_id),
                'Cluster Name': cluster_name,
                'Change in Visits (%)': round(visits_change_pct, 2) if pd.notna(visits_change_pct) else np.nan,
                'Change in Downloads (%)': round(downloads_change_pct, 2) if pd.notna(downloads_change_pct) else np.nan,
                'Change in Session Depth (%)': round(session_depth_change_pct, 2) if pd.notna(session_depth_change_pct) else np.nan,
            })

    impact = pd.DataFrame(rows)
    # Keep the change columns numeric (NaN for undefined changes) for pivoting.
    for col_name in ['Change in Visits (%)', 'Change in Downloads (%)', 'Change in Session Depth (%)']:
        if col_name in impact.columns:
            impact[col_name] = pd.to_numeric(impact[col_name], errors='coerce')
    return impact


def _build_cluster_impact_heatmap(cluster_impact, event_summary):
    """Return a cluster x event heatmap of 'Change in Visits (%)'.

    Falls back to a titled placeholder when there is no defined value to plot.
    Columns follow the 'Event Date' order of ``event_summary`` when it provides
    'Event Date' and 'Event Title' and at least one title matches.
    """
    value_col = 'Change in Visits (%)'
    no_data = "No sufficient clustered event impact data or all values are NaN for heatmap."
    if cluster_impact.empty or value_col not in cluster_impact.columns:
        return _event_tab_placeholder(no_data)

    plottable = cluster_impact.dropna(subset=[value_col])
    if plottable.empty:
        return _event_tab_placeholder(no_data)

    heatmap_data = plottable.pivot_table(index='Cluster Name', columns='Event Title', values=value_col)

    if not event_summary.empty and {'Event Date', 'Event Title'} <= set(event_summary.columns):
        chronological = event_summary.sort_values('Event Date')['Event Title'].tolist()
        # dict.fromkeys de-duplicates while keeping chronological order.
        ordered_titles = [title for title in dict.fromkeys(chronological) if title in heatmap_data.columns]
        if ordered_titles:
            heatmap_data = heatmap_data[ordered_titles]

    # plotly.express has no `heatmap` function; `imshow` is its heatmap
    # constructor for a 2-D matrix with labelled axes.
    heatmap_fig = px.imshow(
        heatmap_data.values,
        x=[str(title) for title in heatmap_data.columns],
        y=[str(name) for name in heatmap_data.index],
        title='Change in Visits (%) by Investor Cluster Around Financial Events',
        labels={'x': 'Event Title', 'y': 'Investor Cluster', 'color': 'Change in Visits (%)'},
        color_continuous_scale=px.colors.sequential.RdBu,
        color_continuous_midpoint=0,
        aspect='auto',
        height=400
    )
    heatmap_fig.update_xaxes(tickangle=45)
    # Print the value on each cell for readability.
    heatmap_fig.update_traces(texttemplate="%{z:.2f}%", textfont_size=10)
    return heatmap_fig


# --- Callback for Event Impact Analysis Tab ---
@app.callback(
    Output('event-impact-ai-summary', 'children'),
    Output('company_a-visits-time-series', 'figure'),
    Output('company_a-downloads-time-series', 'figure'),
    Output('company_a-event-summary-table', 'data'),
    Output('company_a-event-summary-table', 'columns'),
    Output('cluster-event-impact-heatmap', 'figure'),
    Output('cluster-event-impact-table', 'children'),
    Output('cluster-content-interest-chart', 'figure'),
    Input('tabs-main', 'active_tab')
)
def update_event_impact_tab(active_tab):
    """Populate every component of the Event Impact Analysis tab.

    Reads the module-level ``df_events``, ``df_event_impact_summary``,
    ``df_daily_vodafone_behavior_overall``,
    ``df_daily_vodafone_behavior_clustered``, ``df_all_with_profiles`` and
    ``cluster_personas_data``.

    Returns an 8-tuple, in output order: AI summary text, visits figure,
    downloads figure, summary-table data, summary-table columns, cluster
    heatmap, cluster impact table container, and the cluster content-interest
    figure (currently always a placeholder). When another tab is active, or
    there is no site data, every element is an empty/placeholder value.
    """
    if active_tab != 'tab-event-impact':
        idle_title = "Select 'Event Impact Analysis' tab to view"
        return (
            "",                                   # 1. AI summary
            _event_tab_placeholder(idle_title),   # 2. visits figure
            _event_tab_placeholder(idle_title),   # 3. downloads figure
            [],                                   # 4. event summary table data
            [],                                   # 5. event summary table columns
            _event_tab_placeholder(idle_title),   # 6. heatmap
            html.Div(),                           # 7. cluster impact table (empty container)
            _event_tab_placeholder(idle_title),   # 8. cluster content interest chart
        )

    # Debug output. It reads the same frames the logic below uses; it used to
    # reference differently named variables.
    print("\n--- Debugging update_event_impact_tab ---")
    print(f"df_events shape: {df_events.shape if not df_events.empty else 'Empty'}")
    print(f"df_event_impact_summary shape: {df_event_impact_summary.shape if not df_event_impact_summary.empty else 'Empty'}")
    print(f"df_daily_company_a_behavior_clustered shape: {df_daily_vodafone_behavior_clustered.shape if not df_daily_vodafone_behavior_clustered.empty else 'Empty'}")
    print(f"df_daily_company_a_behavior_overall shape: {df_daily_vodafone_behavior_overall.shape if not df_daily_vodafone_behavior_overall.empty else 'Empty'}")
    print(f"cluster_personas_data length: {len(cluster_personas_data) if cluster_personas_data else 'Empty'}")

    # NOTE(review): DataLoader.extract_website_company at the top of this file
    # only produces 'Company A'..'Company E' / 'Unknown', so if website_company
    # comes from it a bare 'Vodafone' filter matches nothing. Both labels are
    # accepted until the naming is confirmed.
    site_labels = ('Company A', 'Vodafone')
    site_rows = df_all_with_profiles[df_all_with_profiles['website_company'].isin(site_labels)]
    if site_rows.empty:
        return (
            "No Company A data available for analysis in Event Impact tab.",
            _event_tab_placeholder("No Company A data for Visits plot."),
            _event_tab_placeholder("No Company A data for Downloads plot."),
            [],
            [],
            _event_tab_placeholder("No Company A data for Heatmap."),
            html.Div("No Company A data for Detailed Cluster Event Impact."),
            _event_tab_placeholder("No Company A data for Content Interest Chart."),
        )

    # 1. AI summary
    if df_event_impact_summary.empty:
        ai_summary = "No event impact data available to generate insights. Please ensure 'company_a_event_impact_summary.csv' is present and contains data."
    else:
        ai_summary = generate_event_ai_analysis(df_event_impact_summary)

    fig_cluster_content_interest = _event_tab_placeholder("No content interest data by cluster available.")

    # 2. Daily time series with event markers
    if not df_daily_vodafone_behavior_overall.empty:
        fig_visits = px.line(df_daily_vodafone_behavior_overall, x='date', y='total_visits',
                             title='Company A Daily Total Visits Over Time with Financial Events',
                             labels={'date': 'Date', 'total_visits': 'Total Unique Visits'},
                             template='plotly_white')
        fig_downloads = px.line(df_daily_vodafone_behavior_overall, x='date', y='total_downloads',
                                title='Company A Daily Total Downloads Over Time with Financial Events',
                                labels={'date': 'Date', 'total_downloads': 'Total Downloads'},
                                template='plotly_white')

        if not df_events.empty:  # only add markers if event data exists
            _add_event_markers(fig_visits, df_events, "red")
            _add_event_markers(fig_downloads, df_events, "blue")

        fig_visits.update_layout(hovermode="x unified")
        fig_downloads.update_layout(hovermode="x unified")
    else:
        fig_visits = _event_tab_placeholder("No daily Vodafone behavior data available for Visits plot.")
        fig_downloads = _event_tab_placeholder("No daily Vodafone behavior data available for Downloads plot.")

    # 3. Event summary table
    if df_event_impact_summary.empty:
        table_data = []
        table_columns = []
    else:
        table_data = df_event_impact_summary.to_dict('records')
        table_columns = [{"name": col, "id": col} for col in df_event_impact_summary.columns]

    # 4. Cluster-level event impact
    if not df_daily_vodafone_behavior_clustered.empty and not df_events.empty and cluster_personas_data:
        df_cluster_event_impact = _compute_cluster_event_impact(
            df_daily_vodafone_behavior_clustered, df_events, cluster_personas_data
        )
    else:
        df_cluster_event_impact = pd.DataFrame()

    # 5. Heatmap of the visit changes
    heatmap_fig = _build_cluster_impact_heatmap(df_cluster_event_impact, df_event_impact_summary)

    # 6. Detail table of all cluster changes
    cluster_impact_table_content = html.Div([
        html.H4("Detailed Investor Cluster Behavior Changes Around Events"),
        html.P("This table shows the percentage change in key metrics for each investor cluster around specific Vodafone events compared to a baseline period."),
        dash_table.DataTable(
            id='cluster-event-impact-detail-table',
            columns=[{"name": col, "id": col} for col in df_cluster_event_impact.columns],
            data=df_cluster_event_impact.to_dict('records'),
            style_table={'overflowX': 'auto'},
            page_action="native",
            page_current=0,
            page_size=10,
            sort_action="native",
            sort_mode="multi",
            style_header={'backgroundColor': 'rgb(230, 230, 230)', 'fontWeight': 'bold'},
            style_cell={'textAlign': 'left', 'minWidth': '100px', 'width': '100px', 'maxWidth': '180px'},
        ) if not df_cluster_event_impact.empty else html.P("No detailed cluster event impact data available.")
    ])

    # The callback declares 8 outputs, so exactly 8 values are returned.
    return ai_summary, fig_visits, fig_downloads, table_data, table_columns, heatmap_fig, cluster_impact_table_content, fig_cluster_content_interest

if __name__ == '__main__':
    # suppress_callback_exceptions turns off Dash's check that every callback
    # ID exists in the initial layout. NOTE(review): presumably required
    # because some callback targets are rendered by other callbacks - confirm
    # before removing.
    app.config.suppress_callback_exceptions = True

    # Plain tracebacks are only configured when running under IPython/Jupyter.
    # The import is guarded so the script also starts where IPython is not
    # installed.
    try:
        from IPython import get_ipython
    except ImportError:
        ipython_shell = None
    else:
        ipython_shell = get_ipython()
    if ipython_shell is not None:
        ipython_shell.run_line_magic('config', 'InteractiveShell.xmode = "Plain"')

    # --- Instantiate and train the predictive model here ---
    # PredictiveModeler must be defined earlier in the script.
    predictive_modeler = PredictiveModeler(df_processed)
    df_longitudinal_data = predictive_modeler.prepare_longitudinal_data()
    gbm_model, gbm_metrics = predictive_modeler.train_and_evaluate_gbm(df_longitudinal_data)

    if gbm_model is not None:
        print("\n--- Predictive Model Training Complete ---")
        print("Model Performance Metrics (from Time-Series Cross-Validation):")
        print(f"Mean AUC: {gbm_metrics['mean_auc']:.4f}")
        print(f"Mean MAE: {gbm_metrics['mean_mae']:.4f}")
        # Keep the metrics alongside the other dashboard metrics.
        overall_metrics['predictive_model_metrics'] = gbm_metrics
    else:
        # When no model is returned, the second value is printed as the reason.
        print("\n--- Predictive Model Training Failed ---")
        print(f"Reason: {gbm_metrics}")

    threading.Thread(target=open_browser_after_startup).start()
    app.run(debug=True)