In [17]:
import pandas as pd
import matplotlib.pyplot as plt
from urllib.parse import urlparse
import seaborn as sns
import io
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
import warnings

# Set a style for matplotlib plots and default DPI
sns.set_theme(style="whitegrid")
plt.rcParams['figure.dpi'] = 100

class DataLoader:
    """Load a page-view CSV export and prepare it for behaviour analysis.

    The loader reads the file given at construction time, removes duplicate
    and unusable rows, and derives the ``company`` and
    ``is_new_visitor_session`` columns.
    """

    def __init__(self, file_path):
        """Remember where the CSV lives.

        Args:
            file_path: Anything ``pd.read_csv`` accepts (path string or
                file-like object).
        """
        self.file_path = file_path

    def load_and_preprocess_data(self):
        """Read the CSV and return the cleaned event frame.

        Steps, in order:
          1. Drop duplicate events (same visit, url, timestamp, download flag
             and time spent).
          2. Parse ``timestamp``; rows whose timestamp cannot be parsed are
             dropped. Rows are then ordered by visit and time.
          3. Drop rows whose ``section`` is ``'Other'``, missing or blank, and
             rows whose ``sub-section`` is missing or blank.
          4. Add ``company`` (``ultimate_parent_name`` with missing values
             replaced by ``'Unknown'``).
          5. Add the boolean ``is_new_visitor_session``: True for every row of
             a visit whose first timestamp equals the earliest timestamp seen
             for that visit's visitor. Visits without any ``visitor_id`` get
             False.

        Returns:
            pandas.DataFrame: the cleaned frame. The index keeps the labels
            assigned after sorting, so it may have gaps where rows were
            filtered out.

        Raises:
            KeyError: if one of the columns used above is absent from the CSV.
        """
        df_all = pd.read_csv(self.file_path)

        core_event_identifiers = ['id_visit', 'url', 'timestamp', 'download_flag', 'time_spent']
        # .copy() so the column assignment below writes to an independent frame
        # instead of a slice (avoids SettingWithCopyWarning).
        df_all = df_all.drop_duplicates(subset=core_event_identifiers, keep='first').copy()

        df_all['timestamp'] = pd.to_datetime(df_all['timestamp'], errors='coerce')
        df_all = df_all.dropna(subset=['timestamp'])
        df_all = df_all.sort_values(['id_visit', 'timestamp']).reset_index(drop=True)

        df_all = df_all[df_all['section'] != 'Other']
        df_all = df_all[df_all['section'].notna() & (df_all['section'].astype(str).str.strip() != '')]
        df_all = df_all[df_all['sub-section'].notna() & (df_all['sub-section'].astype(str).str.strip() != '')]
        df_all = df_all.copy()  # detach from the filtered views before adding columns

        df_all['company'] = df_all['ultimate_parent_name'].fillna('Unknown')

        # Rows without a visitor_id cannot be attributed to a visitor; leave
        # them out of the grouping (a NaN group key would otherwise break it).
        df_filtered_visitors = df_all.dropna(subset=['visitor_id'])

        if df_filtered_visitors.empty:
            print("Warning: No valid visitor_ids found after filtering. 'is_new_visitor_session' will be all False.")
            df_all['is_new_visitor_session'] = False
        else:
            # Earliest timestamp ever seen for each visitor (index: visitor_id).
            first_visit_time_per_visitor = df_filtered_visitors.groupby('visitor_id')['timestamp'].min()
            # Earliest timestamp of each visit (index: id_visit).
            first_timestamp_per_visit = df_filtered_visitors.groupby('id_visit')['timestamp'].min()
            # Visitor that owns each visit (index: id_visit).
            visitor_id_per_visit_map = df_filtered_visitors.groupby('id_visit')['visitor_id'].first()

            # Re-index the per-visitor minimum by id_visit so both sides of the
            # comparison carry identical labels.
            visitor_overall_first_timestamp_for_session = visitor_id_per_visit_map.map(first_visit_time_per_visitor)
            is_first_session_for_visitor = (
                first_timestamp_per_visit == visitor_overall_first_timestamp_for_session
            )

            # isin() always yields a real bool column; mapping + fillna(False)
            # produced an object column whenever a visit had no visitor_id.
            first_session_ids = is_first_session_for_visitor[is_first_session_for_visitor].index
            df_all['is_new_visitor_session'] = df_all['id_visit'].isin(first_session_ids)

            # Store visitor ids as numbers only when that loses nothing.
            # Coercing unconditionally would turn every non-numeric id
            # (e.g. a hash string) into NaN and erase the visitor information.
            numeric_ids = pd.to_numeric(df_all['visitor_id'], errors='coerce')
            if numeric_ids.notna().sum() == df_all['visitor_id'].notna().sum():
                df_all['visitor_id'] = numeric_ids

        return df_all

    @staticmethod
    def get_company_category(url):
        """Derive ``(section, subsection)`` from the path of a URL.

        Args:
            url: URL string, with or without a scheme.

        Returns:
            tuple[str, str]:
              * ``('invalid_url', 'invalid_url')`` when ``url`` is not a string
                or cannot be parsed;
              * ``('home page', 'N/A')`` when the path is empty or ``/``;
              * otherwise the first path segment and the second one, with
                ``'N/A'`` standing in for a missing second segment.
        """
        if not isinstance(url, str):
            return 'invalid_url', 'invalid_url'
        if '://' not in url:
            # urlparse only recognises the host when a scheme is present.
            url = 'http://' + url

        try:
            path = urlparse(url).path
        except ValueError:
            # urlparse rejects some malformed hosts (e.g. an unclosed '[').
            return 'invalid_url', 'invalid_url'

        path_parts = path.strip('/').split('/')

        # str.split never returns an empty list; an empty path gives [''].
        if path_parts == ['']:
            return 'home page', 'N/A'

        company_section = path_parts[0]
        company_subsection = path_parts[1] if len(path_parts) > 1 else 'N/A'

        return company_section, company_subsection

In [18]:
# BehaviorAnalyzer relies on the imports and the DataLoader class defined in the previous cell.

class BehaviorAnalyzer:
    # Column names that make up the clustering feature set, kept as a class
    # attribute so they can be read without creating an instance.
    # NOTE(review): none of the methods visible in this part of the class read
    # this list or create these columns; presumably a later clustering step
    # does both — confirm before renaming or reordering entries.
    CLUSTERING_FEATURES = [
        'avg_session_depth', 'download_count', 'investor_interest_score',
        'content_breadth', 'visit_count', 'is_repeat_visitor',
        'has_download', 'has_ir', 'esg_visitor',
        'ir_only_visitor', 'frequent_downloader', 'deep_path_visitor', 'is_high_intent'
    ]

    def __init__(self, dataframe):
        self.df = dataframe.copy()
        self.all_chart_infos = []

    def _collect_chart_info(self, fig_obj, title, description):
        img_buffer = io.BytesIO()
        fig_obj.savefig(img_buffer, format='png', bbox_inches='tight', dpi=150)
        img_buffer.seek(0)
        self.all_chart_infos.append({
            'title': title,
            'description': description,
            'image_buffer': img_buffer
        })
        plt.close(fig_obj) # Close the figure to free up memory and prevent it from displaying multiple times in Jupyter.

    def calculate_pageview_metrics(self) -> dict:
        """
        Calculates pageview-related metrics like pageviews per category,
        average duration, end rate, and a 'Real Interest Score'.
        Generates insights and collects key metrics.
        Returns:
            dict: A dictionary containing calculated metrics, text insights, and key numeric data.
        """
        print("\n--- Pageview Metrics & User Interest Analysis ---")
        insights = []
        metrics_data = {}

        # Pageviews per Category
        pageviews_per_category = self.df['section'].value_counts()
        print("\nPageviews per Category:\n", pageviews_per_category.to_string())
        if not pageviews_per_category.empty:
            top_pageview_section = pageviews_per_category.index[0]
            top_pageview_count = pageviews_per_category.iloc[0]
            insights.append(f"**Insight:** The '{top_pageview_section}' section is the most frequently viewed, with {top_pageview_count} pageviews. This strong interest in Investor Relations content suggests our website successfully attracts and serves users seeking financial information, indicating high potential for engagement with institutional investors. [cite: 6, 91]")
            metrics_data['top_pageview_section'] = top_pageview_section
            metrics_data['top_pageview_count'] = int(top_pageview_count)

        
            fig1 = plt.figure(figsize=(12, 7))
            sns.barplot(x=pageviews_per_category.index, y=pageviews_per_category.values, palette='viridis')
            plt.title('Total Pageviews per Section: Investor Relations Dominates Engagement') 
            plt.xlabel('Website Section') 
            plt.ylabel('Total Pageviews')
            plt.xticks(rotation=45, ha='right')
            plt.tight_layout()
            self._collect_chart_info(fig1, 'Total Pageviews per Section', f"Bar chart illustrating pageviews across website sections. 'Investor Relations' significantly leads with {top_pageview_count} views, underscoring its central role in user engagement.")

        # Average Duration per Category (excluding end pages)
        non_end_pages = self.df[self.df['time_spent'] > 0]
        avg_duration_per_category = non_end_pages.groupby('section')['time_spent'].mean().sort_values(ascending=False)
        print("\nAverage Duration per Category (non-terminal pages):\n", avg_duration_per_category.to_string())
        if not avg_duration_per_category.empty:
            top_duration_section = avg_duration_per_category.index[0]
            top_duration_time = avg_duration_per_category.iloc[0]
            insights.append(f"**Insight:** Users spend the longest average time ({top_duration_time:.2f} seconds) on pages within the '{top_duration_section}' section. This indicates that content in this area is highly engaging and valuable, or requires more time for users to process, suggesting its importance in communicating complex information. ")
            metrics_data['top_duration_section'] = top_duration_section
            metrics_data['top_duration_time'] = round(top_duration_time, 2)

            fig2 = plt.figure(figsize=(12, 7))
            sns.barplot(x=avg_duration_per_category.index, y=avg_duration_per_category.values, palette='plasma')
            plt.title('Average Time Spent per Section: Deeper Engagement Areas')
            plt.xlabel('Website Section')
            plt.ylabel('Average Time Spent (seconds)')
            plt.xticks(rotation=45, ha='right')
            plt.tight_layout()
            self._collect_chart_info(fig2, 'Average Time Spent per Section (excluding exit pages)', f"Bar chart illustrating the average time users spend on each section, excluding exit pages. '{top_duration_section}' shows the highest engagement duration of {top_duration_time:.2f} seconds, indicating deep user interest. ")

        end_pages = self.df[self.df['time_spent'] == 0]
        end_page_count_per_category = end_pages['section'].value_counts()
        total_visits = self.df['id_visit'].nunique()
        
        end_rate_per_category = pd.Series(dtype='float64') 
        if total_visits > 0:
            end_rate_per_category = (end_page_count_per_category / total_visits).sort_values(ascending=False)
        
        print("\nEnd Rate per Category:\n", end_rate_per_category.to_string())
        if not end_rate_per_category.empty:
            highest_end_rate_section = end_rate_per_category.index[0]
            highest_end_rate_value = end_rate_per_category.iloc[0]
            insights.append(f"**Insight:** The '{highest_end_rate_section}' section has the highest end rate ({highest_end_rate_value:.2%})[cite: 37, 91]. [cite_start]This means a significant proportion of user sessions conclude here, indicating users often found the information they sought or reached a natural stopping point, making it a highly effective 'conversion' or 'information fulfillment' point. [cite: 37]")
            metrics_data['highest_end_rate_section'] = highest_end_rate_section
            metrics_data['highest_end_rate_value'] = round(highest_end_rate_value, 4)

        real_interest_score = pd.Series(dtype='float64')
        if not avg_duration_per_category.empty or not end_rate_per_category.empty:
            alpha = 100 
            combined_series = avg_duration_per_category.reindex(self.df['section'].unique(), fill_value=0) \
                            .add(alpha * end_rate_per_category.reindex(self.df['section'].unique(), fill_value=0), fill_value=0)
            real_interest_score = combined_series.sort_values(ascending=False)

        print("\nReal Interest Score:\n", real_interest_score.to_string())
        if not real_interest_score.empty:
            top_interest_section = real_interest_score.index[0]
            top_interest_score_value = real_interest_score.iloc[0]
            insights.append(f"**Insight:** Our 'Real Interest Score', which combines engagement time and session completion, identifies '{top_interest_section}' ({top_interest_score_value:.2f}) [cite: 91] as the section with the highest overall user focus. This suggests it's a critical area for our content strategy, effectively capturing and fulfilling user needs.")
            metrics_data['top_interest_score_section'] = top_interest_section
            metrics_data['top_interest_score_value'] = round(top_interest_score_value, 2)

        return {
            'pageviews_per_category': pageviews_per_category.to_dict(),
            'avg_duration_per_category': avg_duration_per_category.to_dict(),
            'end_page_count_per_category': end_page_count_per_category.to_dict(),
            'end_rate_per_category': end_rate_per_category.to_dict(),
            'real_interest_score': real_interest_score.to_dict(),
            'insights': insights,
            'metrics_data': metrics_data
        }

    def calculate_bounce_rate(self) -> dict:
        """
        Calculates the overall bounce rate and bounce rate per section.
        Generates insights and collects key metrics.
        Returns:
            dict: A dictionary containing bounce rates, text insights, and key numeric data.
        """
        print("\n--- Bounce Rate Analysis ---")
        insights = []
        metrics_data = {}
        
        # Check if df is empty before proceeding
        if self.df.empty:
            insights.append("**Warning:** DataFrame is empty. Cannot perform bounce rate analysis.")
            print("Warning: DataFrame is empty. Cannot perform bounce rate analysis.")
            return {
                'overall_bounce_rate': 0.0,
                'section_bounce_rates_df': [],
                'insights': insights,
                'metrics_data': {}
            }

        visit_counts = self.df.groupby('id_visit').size()
        total_visits = visit_counts.count()
        bounces = visit_counts[visit_counts == 1].count()
        overall_bounce_rate = bounces / total_visits if total_visits > 0 else 0
        print(f"Overall Bounce Rate: {overall_bounce_rate:.2%}")
        insights.append(f"**Insight:** The overall website bounce rate is {overall_bounce_rate:.2%}, indicating that a significant percentage of visitors leave after viewing only one page. This suggests potential issues with initial page relevance or user experience, directly impacting our ability to engage new users. [cite: 8, 95]")
        metrics_data['overall_bounce_rate'] = round(overall_bounce_rate, 4)

        section_bounce_rates_dict = {}
        
        # IMPORTANT FIX: Reset index so 'id_visit' becomes a regular column again
        landing_pages = self.df.groupby('id_visit').first().reset_index() 
        
        # Check if landing_pages is empty after grouping
        if landing_pages.empty:
            insights.append("**Warning:** No landing page data found for section-wise bounce rate analysis.")
            print("Warning: No landing page data found for section-wise bounce rate analysis.")
        else:
            for section in self.df['section'].unique():
                # Now 'id_visit' is a column in landing_pages
                visits_starting_in_section = landing_pages[landing_pages['section'] == section]['id_visit']
                total_section_start_visits = visits_starting_in_section.count()

                df_section_visits = self.df[self.df['id_visit'].isin(visits_starting_in_section)]
                
                # Only proceed if there are visits for this section
                if not df_section_visits.empty:
                    section_visit_lengths = df_section_visits.groupby('id_visit').size()
                    section_bounces = (section_visit_lengths == 1).sum()
                    bounce_rate = section_bounces / total_section_start_visits if total_section_start_visits > 0 else 0
                else:
                    bounce_rate = 0 # No visits for this section, so bounce rate is 0

                section_bounce_rates_dict[section] = bounce_rate
            
            bounce_df = pd.DataFrame(list(section_bounce_rates_dict.items()), columns=['section', 'bounce_rate']).sort_values(by='bounce_rate', ascending=False)
            print("\nBounce Rate per Section (based on visits starting in section):\n", bounce_df.to_string())
            
            if not bounce_df.empty:
                highest_bounce_section = bounce_df.iloc[0]['section']
                highest_bounce_value = bounce_df.iloc[0]['bounce_rate']
                # Corrected citation format
                insights.append(f"**Insight:** The '{highest_bounce_section}' section exhibits the highest bounce rate ({highest_bounce_value:.2%})[cite: 9, 95], implying that visitors landing here quickly disengage. [cite_start]This is a crucial area for immediate optimization to prevent early user abandonment. [cite: 9]")
                metrics_data['highest_bounce_section'] = highest_bounce_section
                metrics_data['highest_bounce_value'] = round(highest_bounce_value, 4)
                
                fig3 = plt.figure(figsize=(12, 7))
                sns.barplot(x=bounce_df['section'], y=bounce_df['bounce_rate'], palette='Reds_d')
                plt.title('Bounce Rate per Section: Identifying Areas of User Disengagement')
                plt.xlabel('Website Section')
                plt.ylabel('Bounce Rate')
                plt.xticks(rotation=45, ha='right')
                plt.tight_layout()
                # Corrected citation format in chart description
                self._collect_chart_info(fig3, 'Bounce Rate per Section', f"Bar chart showing the bounce rate for each website section. '{highest_bounce_section}' stands out with the highest bounce rate of {highest_bounce_value:.2%}, highlighting where users are leaving our site prematurely. ")
            else:
                insights.append("**Warning:** No valid section data to calculate bounce rates per section.")

        return {
            'overall_bounce_rate': overall_bounce_rate,
            'section_bounce_rates_df': bounce_df.to_dict('records') if 'bounce_df' in locals() else [], # Ensure bounce_df exists
            'insights': insights,
            'metrics_data': metrics_data
        }

    def analyze_download_behavior(self) -> dict:
        """
        Analyzes user download behavior, including download counts by section/subsection
        and download conversion rates. Generates insights and collects key metrics.
        Returns:
            dict: A dictionary containing download-related metrics, text insights, and key numeric data.
        """
        print("\n--- Download Behavior Analysis ---")
        insights = []
        metrics_data = {}

        download_pages = self.df[self.df['download_flag'] == True]
        
        download_section_counts = download_pages['section'].value_counts().sort_values(ascending=False)
        print("\nDownload counts by section:\n", download_section_counts.to_string())
        if not download_section_counts.empty:
            top_download_section = download_section_counts.index[0]
            top_download_count = download_section_counts.iloc[0]
            insights.append(f"**Insight:** Downloads are overwhelmingly concentrated in the '{top_download_section}' section with {top_download_count} downloads[cite: 7, 97]. [cite_start]This confirms its status as a high-value content area for users seeking specific resources, driving key conversion actions. [cite: 7]")
            metrics_data['top_download_section'] = top_download_section
            metrics_data['top_download_count'] = int(top_download_count)
            
            fig4 = plt.figure(figsize=(12, 7))
            sns.barplot(x=download_section_counts.head(5).index, y=download_section_counts.head(5).values, palette='Blues_d')
            plt.title('Top 5 Sections by Download Count: Key Content for High-Intent Users')
            plt.xlabel('Website Section')
            plt.ylabel('Number of Downloads')
            plt.xticks(rotation=45, ha='right')
            plt.tight_layout()
            self._collect_chart_info(fig4, 'Top 5 Sections by Download Count', f"Bar chart showing the top sections where downloads occur. 'Investor Relations' is the clear leader with {top_download_count} downloads, signifying its importance for users seeking actionable information. ")
        else:
            insights.append("**Warning:** No download events found in the data.")

        download_subsection_counts = download_pages['sub-section'].value_counts().sort_values(ascending=False)
        print("\nDownload counts by sub-section:\n", download_subsection_counts.to_string())
        if not download_subsection_counts.empty:
            top_download_subsection = download_subsection_counts.index[0]
            top_download_subsection_count = download_subsection_counts.iloc[0]
            insights.append(f"**Insight:** Within the download-heavy sections, '{top_download_subsection}' is the primary source of downloads with {top_download_subsection_count} occurrences[cite: 38, 98]. [cite_start]This indicates these specific reports or presentations are highly sought after by our audience, essential for informing their decisions. [cite: 38]")
            metrics_data['top_download_subsection'] = top_download_subsection
            metrics_data['top_download_subsection_count'] = int(top_download_subsection_count)

        download_id_visits = self.df[self.df['download_flag'] == True]['id_visit'].unique()
        related_visits = self.df[self.df['id_visit'].isin(download_id_visits)]
        section_counts_with_downloads = related_visits['section'].value_counts().sort_values(ascending=False)
        print("\nSection counts for visits with download behavior (pages viewed by users who downloaded something during their visit):\n", section_counts_with_downloads.to_string())
        if not section_counts_with_downloads.empty:
            top_section_with_downloads = section_counts_with_downloads.index[0]
            top_section_with_downloads_count = section_counts_with_downloads.iloc[0]
            insights.append(f"**Insight:** Analyzing sections visited by users who eventually download reveals a strong correlation with '{top_section_with_downloads}', further emphasizing its role as a high-intent area. [cite_start]For example, '{top_section_with_downloads}' accounts for {top_section_with_downloads_count} pageviews in visits where a download occurred, showcasing its effectiveness in supporting user journeys towards valuable resources. [cite: 40, 98]")
            metrics_data['top_section_with_downloads'] = top_section_with_downloads
            metrics_data['top_section_with_downloads_count'] = int(top_section_with_downloads_count)

        subsection_counts_with_downloads = related_visits['sub-section'].value_counts().sort_values(ascending=False)
        print("\nSub-section counts for visits with download behavior (pages viewed by users who downloaded something during their visit):\n", subsection_counts_with_downloads.to_string())

        subsection_downloads = download_pages['sub-section'].value_counts()
        subsection_total = self.df['sub-section'].value_counts()
        
        subsection_download_rate = pd.Series(dtype='float64')
        if not subsection_total.empty:
            subsection_downloads_aligned = subsection_downloads.reindex(subsection_total.index, fill_value=0)
            subsection_download_rate = (subsection_downloads_aligned / subsection_total).sort_values(ascending=False).dropna()
            subsection_download_rate = subsection_download_rate.replace([np.inf, -np.inf], np.nan).dropna()


        print("\nDownload conversion rate by sub-section (downloads per pageview of that sub-section):\n", subsection_download_rate.to_string())
        if not subsection_download_rate.empty:
            top_conv_subsection = subsection_download_rate.index[0]
            top_conv_rate = subsection_download_rate.iloc[0]
            insights.append(f"**Insight:** '{top_conv_subsection}' shows an exceptional download conversion rate ({top_conv_rate:.2%}) per pageview[cite: 41, 98]. [cite_start]This indicates that users who interact with this specific sub-section are highly motivated and likely to complete a download, suggesting a very effective call-to-action or content match. [cite: 41]")
            metrics_data['top_conv_subsection'] = top_conv_subsection
            metrics_data['top_conv_rate'] = round(top_conv_rate, 4)

        return {
            'download_section_counts': download_section_counts.to_dict(),
            'download_subsection_counts': download_subsection_counts.to_dict(),
            'section_counts_with_downloads': section_counts_with_downloads.to_dict(),
            'subsection_counts_with_downloads': subsection_counts_with_downloads.to_dict(),
            'subsection_download_rate': subsection_download_rate.to_dict(),
            'insights': insights,
            'metrics_data': metrics_data
        }
    
    def perform_funnel_analysis(self, home_company_section_name: str = "home_page") -> dict:
        """
        Performs a simple funnel analysis from a specified landing section to a download event.
        Generates insights and collects key metrics.
        Args:
            home_company_section_name (str): The name of the landing section to start the funnel.
        Returns:
            dict: A dictionary containing funnel conversion rate, text insights, and key numeric data.
        """
        print(f"\n--- Funnel Analysis (starting from '{home_company_section_name}' company_section) ---")
        insights = []
        metrics_data = {}

        if 'company_section' not in self.df.columns:
            self.df.loc[:, ['company_section', 'company_subsection']] = self.df['url'].apply(
                lambda url: pd.Series(DataLoader.get_company_category(url))
            )
            print("Note: 'company_section' was missing and has been generated for funnel analysis.")

        home_visits_ids = self.df[self.df['company_section'] == home_company_section_name]['id_visit'].unique()
        download_visits_ids = self.df[(self.df['id_visit'].isin(home_visits_ids)) & (self.df['download_flag'] == True)]['id_visit'].unique()
        
        num_home_visits = len(home_visits_ids)
        num_download_visits = len(download_visits_ids)

        print(f"Number of visits with '{home_company_section_name}' as landing company_section: {num_home_visits}")
        print(f"Number of those visits with download: {num_download_visits}")
        
        funnel_rate = num_download_visits / num_home_visits if num_home_visits > 0 else 0
        print(f"Funnel conversion rate ({home_company_section_name} company_section -> Download): {funnel_rate:.2%}")
        insights.append(f"**Insight:** The conversion rate from the '{home_company_section_name}' landing page to a download is {funnel_rate:.2%} ({num_download_visits} downloads from {num_home_visits} home page visits)[cite: 54, 101]. [cite_start]This metric is crucial for evaluating the effectiveness of our initial user journey in guiding visitors towards key conversion actions. [cite: 54]")
        metrics_data['funnel_start_visits'] = int(num_home_visits)
        metrics_data['funnel_converted_visits'] = int(num_download_visits)
        metrics_data['funnel_conversion_rate'] = round(funnel_rate, 4)
        
        return {
            'funnel_conversion_rate': funnel_rate,
            'insights': insights,
            'metrics_data': metrics_data
        }

    def calculate_average_depth(self) -> dict:
        """
        Calculates the average number of pages viewed per visit (session depth).
        Generates insights and collects key metrics.
        Returns:
            dict: A dictionary containing average session depth, text insights, and key numeric data.
        """
        print("\n--- Average Depth per Visit ---")
        insights = []
        metrics_data = {}
        visit_counts = self.df.groupby('id_visit').size()
        avg_depth_per_visit = visit_counts.mean() if not visit_counts.empty else 0
        print(f"Average Depth per Visit: {avg_depth_per_visit:.2f}")
        insights.append(f"**Insight:** The average user session depth is {avg_depth_per_visit:.2f} pages[cite: 47, 104]. [cite_start]This metric indicates that, on average, users view slightly more than two pages per session, suggesting a moderate level of engagement with our content. [cite: 47]")
        metrics_data['avg_depth_per_visit'] = round(avg_depth_per_visit, 2)
        return {
            'avg_depth_per_visit': avg_depth_per_visit,
            'insights': insights,
            'metrics_data': metrics_data
        }

    def analyze_unique_investors(self) -> dict:
        """
        Analyzes the distribution of unique investors across different sections.
        Generates insights and collects key metrics.
        Returns:
            dict: A dictionary containing unique investor counts per category, text insights, and key numeric data.
        """
        print("\n--- Unique Investor per Category ---")
        insights = []
        metrics_data = {}
        
        df_filtered = self.df.dropna(subset=['visitor_id'])

        if df_filtered.empty:
            insights.append("**Warning:** No valid 'visitor_id' data found for unique investor analysis.")
            print("No valid 'visitor_id' data found for unique investor analysis.")
            return {'unique_investor_per_category': {}, 'insights': insights, 'metrics_data': {}}

        unique_investor_per_category = df_filtered.groupby('section')['visitor_id'].nunique()
        print(unique_investor_per_category.to_string())
        if not unique_investor_per_category.empty:
            top_unique_investor_section = unique_investor_per_category.idxmax()
            top_unique_investor_count = unique_investor_per_category.max()
            insights.append(f"**Insight:** '{top_unique_investor_section}' attracts the most unique investors ({top_unique_investor_count})[cite: 107]. [cite_start]This reinforces its critical importance for our investor relations efforts and suggests it's a primary point of interest for this key stakeholder segment. [cite: 107]")
            metrics_data['top_unique_investor_section'] = top_unique_investor_section
            metrics_data['top_unique_investor_count'] = int(top_unique_investor_count)
        else:
            insights.append("**Warning:** No unique investor data to analyze per category.")

        return {
            'unique_investor_per_category': unique_investor_per_category.to_dict(),
            'insights': insights,
            'metrics_data': metrics_data
        }
    
    def analyze_sub_section_details(self) -> dict:
        """
        Provides detailed analysis for subsections: pageviews, end pages,
        average duration, and the 'ultimate_parent_name' distribution for the
        'Reports & Presentations' subsection.

        Reads ``self.df`` and uses the columns 'sub-section' and 'time_spent';
        'ultimate_parent_name' is optional. Progress is printed to stdout.

        Returns:
            dict: Always contains the keys 'subsection_pageviews',
                'end_page_count_subsection', 'avg_duration_subsection',
                'insights' and 'metrics_data'. When no row has a usable
                'sub-section' value, the three metric entries and
                'metrics_data' are empty dicts.
        """
        print("\n--- Detailed Sub-section Analysis ---")
        insights = []
        metrics_data = {}

        # NOTE(review): the '[cite: ...]' / '[cite_start]' markers inside the insight
        # strings below look like leftover citation markup. They are kept verbatim
        # because consumers of these strings are not visible here -- confirm and clean up.

        sub_section = self.df['sub-section']
        has_sub_section = sub_section.notna() & (sub_section.astype(str).str.strip() != '')
        df_filtered_subsection = self.df[has_sub_section]

        if df_filtered_subsection.empty:
            insights.append("**Warning:** No valid 'sub-section' data found for detailed analysis.")
            print("No valid 'sub-section' data found for detailed analysis.")
            # Same key set as the normal return so callers can index the result
            # without special-casing the "no data" path.
            return {
                'subsection_pageviews': {},
                'end_page_count_subsection': {},
                'avg_duration_subsection': {},
                'insights': insights,
                'metrics_data': {}
            }

        subsection_pageviews = df_filtered_subsection['sub-section'].value_counts().sort_values(ascending=False)
        print("\nPageviews by sub-section:\n", subsection_pageviews.head(5).to_string())
        if not subsection_pageviews.empty:
            top_sub_pv = subsection_pageviews.index[0]
            top_sub_pv_count = subsection_pageviews.iloc[0]
            insights.append(f"**Insight:** The sub-section '{top_sub_pv}' receives the highest number of pageviews ({top_sub_pv_count})[cite: 38, 110], making it a critical content area that users frequently access and indicating its high relevance.")
            metrics_data['top_sub_pv'] = top_sub_pv
            metrics_data['top_sub_pv_count'] = int(top_sub_pv_count)
        else:
            insights.append("**Warning:** No subsection pageview data.")

        # Rows with time_spent == 0 are treated as end (exit) pages; rows with a
        # missing time_spent fall into neither the end-page nor the duration set.
        end_pages = df_filtered_subsection[df_filtered_subsection['time_spent'] == 0]
        end_page_count_subsection = end_pages['sub-section'].value_counts().sort_values(ascending=False)
        print("\nEnd page count by sub-section:\n", end_page_count_subsection.head(5).to_string())

        non_end_pages = df_filtered_subsection[df_filtered_subsection['time_spent'] > 0]
        avg_duration_subsection = non_end_pages.groupby('sub-section')['time_spent'].mean().sort_values(ascending=False)
        print("\nAverage duration by sub-section (excluding exit pages):\n", avg_duration_subsection.head(5).to_string())
        if not avg_duration_subsection.empty:
            top_sub_duration = avg_duration_subsection.index[0]
            top_sub_duration_time = avg_duration_subsection.iloc[0]
            insights.append(f"**Insight:** Users spend the most time on average on '{top_sub_duration}' sub-section ({top_sub_duration_time:.2f} seconds)[cite: 109, 110], indicating deep engagement with this specific content and suggesting its high value for our audience.")
            metrics_data['top_sub_duration'] = top_sub_duration
            # Plain Python float, mirroring the int() casts used for the count metrics.
            metrics_data['top_sub_duration_time'] = round(float(top_sub_duration_time), 2)
        else:
            insights.append("**Warning:** No subsection duration data.")

        if 'Reports & Presentations' in df_filtered_subsection['sub-section'].unique() and 'ultimate_parent_name' in df_filtered_subsection.columns:
            report_rows = df_filtered_subsection[df_filtered_subsection['sub-section'] == 'Reports & Presentations']
            report_visitors = report_rows['ultimate_parent_name'].value_counts()
            print("\nVisitor type distribution for 'Reports & Presentations' sub-section (Top 3):\n", report_visitors.head(3).to_string())
            if not report_visitors.empty:
                top_report_visitor_type = report_visitors.index[0]
                top_report_visitor_count = report_visitors.iloc[0]
                insights.append(f"**Insight:** For the critical 'Reports & Presentations' sub-section, '{top_report_visitor_type}' is the most frequent visitor type ({top_report_visitor_count} occurrences)[cite: 39, 110]. [cite_start]This clearly shows the content strongly resonates with this specific institutional segment, which is vital for our investor outreach. [cite: 39]")
                metrics_data['top_report_visitor_type'] = top_report_visitor_type
                metrics_data['top_report_visitor_count'] = int(top_report_visitor_count)
            else:
                insights.append("**Warning:** No visitor type data for 'Reports & Presentations' subsection.")
        else:
            insights.append("**Note:** 'Reports & Presentations' sub-section or 'ultimate_parent_name' column not found for specific visitor type analysis.")

        return {
            'subsection_pageviews': subsection_pageviews.to_dict(),
            'end_page_count_subsection': end_page_count_subsection.to_dict(),
            'avg_duration_subsection': avg_duration_subsection.to_dict(),
            'insights': insights,
            'metrics_data': metrics_data
        }


    def analyze_session_paths(self) -> dict:
        """
        Analyzes session paths: landing pages and transitions between sections,
        plus section counts broken down by 'ultimate_parent_name' and
        'location_country' when those columns exist.

        Reads ``self.df`` and uses the columns 'id_visit', 'timestamp',
        'section' and 'url'. Progress is printed to stdout.

        Returns:
            dict: 'landing_page_section_counts', 'landing_page_url_counts',
                'transition_counts' (keyed by (from_section, to_section) tuples),
                'investor_section_counts', 'country_section_counts', 'insights'
                and 'metrics_data'. The investor/country entries are empty dicts
                when the corresponding column is absent.
        """
        print("\n--- Session Path Analysis ---")
        insights = []
        metrics_data = {}

        # NOTE(review): the '[cite: ...]' / '[cite_start]' markers inside the insight
        # strings below look like leftover citation markup. They are kept verbatim
        # because consumers of these strings are not visible here -- confirm and clean up.

        df_sorted = self.df.sort_values(['id_visit', 'timestamp'])

        # The landing page is the chronologically first ROW of each session.
        # GroupBy.first() would instead take the first non-null value of every
        # column independently, so a missing 'url' on the first hit would be
        # silently replaced by a later page of the same session. Rows without an
        # id_visit are dropped to match groupby's default handling of NaN keys.
        landing_pages = df_sorted.dropna(subset=['id_visit']).drop_duplicates(subset='id_visit', keep='first')

        landing_page_section_counts = landing_pages['section'].value_counts().sort_values(ascending=False)
        print("\nLanding page section counts:\n", landing_page_section_counts.head(5).to_string())
        if not landing_page_section_counts.empty:
            top_landing_section = landing_page_section_counts.index[0]
            top_landing_section_count = landing_page_section_counts.iloc[0]
            insights.append(f"**Insight:** '{top_landing_section}' is the most frequent landing section ({top_landing_section_count} times)[cite: 29, 113]. This indicates it's a primary entry point for users. [cite_start]Optimizing this initial experience is crucial for guiding new visitors into deeper engagement. [cite: 40]")
            metrics_data['top_landing_section'] = top_landing_section
            metrics_data['top_landing_section_count'] = int(top_landing_section_count)
        else:
            insights.append("**Warning:** No landing page section data.")

        landing_page_url_counts = landing_pages['url'].value_counts().sort_values(ascending=False)
        print("\nLanding page URL counts (Top 3):\n", landing_page_url_counts.head(3).to_string())
        if not landing_page_url_counts.empty:
            metrics_data['top_landing_urls'] = landing_page_url_counts.head(3).to_dict()
        else:
            metrics_data['top_landing_urls'] = {}

        # A transition is a consecutive pair of hits within one session whose
        # sections differ; the first hit of a session has no predecessor.
        df_sorted['prev_section'] = df_sorted.groupby('id_visit')['section'].shift(1)
        transitions = df_sorted.dropna(subset=['prev_section'])
        transitions = transitions[transitions['prev_section'] != transitions['section']]

        transition_counts = transitions.groupby(['prev_section', 'section']).size().sort_values(ascending=False)
        print("\nTransition counts (from_section -> to_section - Top 3):\n", transition_counts.head(3).to_string())
        if not transition_counts.empty:
            top_transition_from, top_transition_to = transition_counts.index[0]
            top_transition_count = transition_counts.iloc[0]
            insights.append(f"**Insight:** The most common internal navigation path is from '{top_transition_from}' to '{top_transition_to}' ({top_transition_count} occurrences)[cite: 42, 114]. [cite_start]This highlights a key user flow, suggesting these two content areas are often related in a user's research journey and can be further optimized for seamless navigation. [cite: 42]")
            metrics_data['top_transition_path'] = f"{top_transition_from} -> {top_transition_to}"
            metrics_data['top_transition_count'] = int(top_transition_count)
        else:
            insights.append("**Warning:** No significant transitions between sections found.")

        # Optional breakdowns; defaults keep the return statement unconditional.
        investor_section_counts_dict = {}
        country_section_counts_dict = {}

        if 'ultimate_parent_name' in self.df.columns:
            investor_section_counts = self.df.groupby(['ultimate_parent_name', 'section']).size().unstack(fill_value=0)
            # head(3) is the first three rows in group-key order, not the three largest.
            print("\nInvestor section counts (Sample for top 3 investors):\n", investor_section_counts.head(3).to_string())
            insights.append("**Insight:** Analyzing section views by 'ultimate_parent_name' reveals specific content interests of different institutional investors[cite: 39]. [cite_start]This provides a foundation for tailored engagement strategies and understanding which information resonates with key financial firms. [cite: 39]")
            metrics_data['sample_investor_section_counts'] = investor_section_counts.head(3).to_dict('index')
            investor_section_counts_dict = investor_section_counts.to_dict()
        else:
            insights.append("**Note:** 'ultimate_parent_name' column not found for investor section analysis.")
            metrics_data['sample_investor_section_counts'] = {}

        if 'location_country' in self.df.columns:
            country_section_counts = self.df.groupby(['location_country', 'section']).size().unstack(fill_value=0)
            print("\nCountry section counts (Sample for top 3 countries):\n", country_section_counts.head(3).to_string())
            insights.append("**Insight:** Geographical analysis of section views (by 'location_country') uncovers distinct regional content preferences[cite: 172]. For example, some countries might prioritize 'News & Press' while others focus on 'Our Business'. [cite_start]This insight is crucial for content localization and geo-targeted marketing campaigns, maximizing regional impact. [cite: 172]")
            metrics_data['sample_country_section_counts'] = country_section_counts.head(3).to_dict('index')
            country_section_counts_dict = country_section_counts.to_dict()
        else:
            insights.append("**Note:** 'location_country' column not found for country section analysis.")
            metrics_data['sample_country_section_counts'] = {}

        return {
            'landing_page_section_counts': landing_page_section_counts.to_dict(),
            'landing_page_url_counts': landing_page_url_counts.to_dict(),
            'transition_counts': transition_counts.to_dict(),
            'investor_section_counts': investor_section_counts_dict,
            'country_section_counts': country_section_counts_dict,
            'insights': insights,
            'metrics_data': metrics_data
        }
    
    def analyze_time_distribution(self) -> dict:
        """
        Analyzes website traffic distribution by hour of day and day of week.

        Reads ``self.df['timestamp']`` (must be a datetime column), prints the
        counts, draws one chart per distribution and hands each figure to
        ``self._collect_chart_info``.

        Returns:
            dict: 'hourly_counts' (hour -> rows), 'weekday_counts'
                (0=Monday .. 6=Sunday -> rows, observed days only), 'insights'
                and 'metrics_data'. The count dicts are empty when there is no
                usable timestamp data.
        """
        print("\n--- Time Distribution Analysis ---")
        insights = []
        metrics_data = {}

        # NOTE(review): the '[cite: ...]' / '[cite_start]' markers inside the insight
        # strings below look like leftover citation markup. They are kept verbatim
        # because consumers of these strings are not visible here -- confirm and clean up.

        if self.df.empty or 'timestamp' not in self.df.columns or not pd.api.types.is_datetime64_any_dtype(self.df['timestamp']):
            insights.append("**Warning:** No valid timestamp data for time distribution analysis.")
            print("No valid timestamp data for time distribution analysis.")
            return {'hourly_counts': {}, 'weekday_counts': {}, 'insights': insights, 'metrics_data': {}}

        # Drop NaT first: with missing values .dt.hour / .dt.dayofweek become
        # float, which breaks list indexing by weekday and yields "9.0:00" labels.
        timestamps = self.df['timestamp'].dropna()

        hourly_counts = timestamps.dt.hour.value_counts().sort_index()
        print("\nVisits by hour:\n", hourly_counts.to_string())

        if not hourly_counts.empty:
            fig5 = plt.figure(figsize=(10, 6))
            plt.plot(hourly_counts.index, hourly_counts.values, marker='o')
            plt.xlabel('Hour of Day')
            plt.ylabel('Number of Visits')
            plt.title('Visits by Hour of Day: Optimizing Content Release Times')
            plt.grid(True)
            peak_hour = hourly_counts.idxmax()
            peak_hour_count = hourly_counts.max()
            insights.append(f"**Insight:** The website experiences its highest traffic volume around {peak_hour}:00, with {peak_hour_count} visits[cite: 13, 49, 161]. [cite_start]This aligns with typical business hours, providing an optimal window for releasing new content, conducting website updates, or launching promotional activities to maximize immediate reach and engagement. [cite: 13]")
            metrics_data['peak_hour'] = int(peak_hour)
            metrics_data['peak_hour_count'] = int(peak_hour_count)
            self._collect_chart_info(fig5, 'Visits by Hour of Day', f"Line plot showing website visits per hour of the day. A clear peak is observed around {peak_hour}:00 with {peak_hour_count} visits, indicating the best times for content updates. ")
        else:
            insights.append("**Warning:** No hourly visit data.")

        weekday_counts = timestamps.dt.dayofweek.value_counts().sort_index()
        print("\nVisits by weekday (0=Monday):\n", weekday_counts.to_string())

        if not weekday_counts.empty:
            fig6 = plt.figure(figsize=(10, 6))
            weekdays_labels = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
            # Plot all seven days (missing ones as 0). The bar plot places its
            # categories at positions 0..k-1, so plotting only the observed days
            # while labelling ticks 0..6 would attach the wrong day name to bars.
            weekday_plot_counts = weekday_counts.reindex(range(len(weekdays_labels)), fill_value=0)
            sns.barplot(x=weekday_plot_counts.index, y=weekday_plot_counts.values, palette='viridis')
            plt.xlabel('Day of Week (0=Monday)')
            plt.ylabel('Number of Visits')
            plt.title('Visits by Weekday: Strategic Scheduling for Maximum Impact')
            plt.xticks(range(len(weekdays_labels)), weekdays_labels)
            plt.grid(axis='y')
            peak_weekday_index = int(weekday_counts.idxmax())
            peak_weekday_count = weekday_counts.max()
            insights.append(f"**Insight:** Traffic is highest on {weekdays_labels[peak_weekday_index]}, with {peak_weekday_count} visits[cite: 50, 161]. [cite_start]This strong weekday focus, likely during professional hours, emphasizes the importance of aligning our content and outreach efforts with these active periods to capture maximum audience attention. [cite: 50]")
            metrics_data['peak_weekday'] = weekdays_labels[peak_weekday_index]
            metrics_data['peak_weekday_count'] = int(peak_weekday_count)
            self._collect_chart_info(fig6, 'Visits by Weekday', f"Bar chart illustrating website visits per day of the week. {weekdays_labels[peak_weekday_index]} shows the highest traffic with {peak_weekday_count} visits. ")
        else:
            insights.append("**Warning:** No weekday visit data.")

        return {
            'hourly_counts': hourly_counts.to_dict(),
            'weekday_counts': weekday_counts.to_dict(),
            'insights': insights,
            'metrics_data': metrics_data
        }
    
    def generate_visitor_profiles(self) -> dict:
        """
        Generates one behavioral profile row per unique visitor_id, including
        aggregated metrics, derived 0/1 flags and a comma-separated tag string.

        Reads ``self.df`` (columns 'visitor_id', 'id_visit', 'section',
        'download_flag') and ``self.CLUSTERING_FEATURES``; rows without a
        visitor_id are ignored. Progress is printed to stdout.

        Returns:
            dict: 'visitor_profiles_df' (the profile DataFrame), 'insights'
                (list of text insights) and 'metrics_data' (segment counts).
                When no row has a visitor_id, the DataFrame is empty (columns
                only) and 'metrics_data' is an empty dict.
        """
        print("\n--- Generating Visitor Profiles (for individual user stories & clustering) ---")
        df_profiling = self.df.copy()
        insights = []
        metrics_data = {}

        # NOTE(review): the '[cite: ...]' / '[cite_start]' markers inside the insight
        # string below look like leftover citation markup. They are kept verbatim
        # because consumers of these strings are not visible here -- confirm and clean up.

        df_profiling = df_profiling.dropna(subset=['visitor_id'])
        if df_profiling.empty:
            warning_msg = "**Warning:** No valid visitor_ids found for profiling. Skipping visitor profile generation."
            print(warning_msg)
            insights.append(warning_msg)
            empty_profile_df = pd.DataFrame(columns=['visitor_id'] + self.CLUSTERING_FEATURES + ['top_visited_section', 'user_tags'])
            return {
                'visitor_profiles_df': empty_profile_df,
                'insights': insights,
                'metrics_data': {}
            }

        # Every per-visitor series below is reindexed to this id list so that
        # visitors with no matching rows get an explicit 0 instead of NaN.
        all_visitor_ids = df_profiling['visitor_id'].unique()
        by_visitor = df_profiling.groupby('visitor_id')

        # Mean number of rows per session, per visitor.
        session_depth = df_profiling.groupby(['visitor_id', 'id_visit']).size().groupby('visitor_id').mean().rename('avg_session_depth')
        download_count = df_profiling[df_profiling['download_flag'] == True].groupby('visitor_id').size().rename('download_count').reindex(all_visitor_ids, fill_value=0)

        # Most frequently viewed section per visitor (highest row count first).
        top_section_raw = df_profiling.groupby(['visitor_id', 'section']).size().reset_index(name='count')
        top_section_raw = top_section_raw.sort_values(['visitor_id', 'count'], ascending=[True, False])
        top_section = top_section_raw.groupby('visitor_id').first()['section'].rename('top_visited_section')
        top_section = top_section.reindex(all_visitor_ids, fill_value='No_Section_Determined')

        is_ir_row = df_profiling['section'].str.contains('Investor Relations', na=False, case=False)
        ir_score = df_profiling[is_ir_row].groupby('visitor_id').size().rename('investor_interest_score').reindex(all_visitor_ids, fill_value=0)
        content_breadth = by_visitor['section'].nunique().rename('content_breadth').reindex(all_visitor_ids, fill_value=0)
        visit_count_series = by_visitor['id_visit'].nunique().rename('visit_count').reindex(all_visitor_ids, fill_value=0)
        is_repeat_visitor = (visit_count_series > 1).astype(int).rename('is_repeat_visitor').reindex(all_visitor_ids, fill_value=0)

        has_download = (download_count > 0).astype(int).rename('has_download').reindex(all_visitor_ids, fill_value=0)
        has_ir = (ir_score >= 1).astype(int).rename('has_ir').reindex(all_visitor_ids, fill_value=0)

        is_esg_row = df_profiling['section'].str.contains('ESG', na=False, case=False)
        esg_visitors_in_profile = df_profiling[is_esg_row]['visitor_id'].unique()
        esg_visitor_flag = pd.Series(all_visitor_ids).isin(esg_visitors_in_profile).astype(int).set_axis(all_visitor_ids).rename('esg_visitor')

        visitor_profiles_df = pd.concat([
            session_depth, download_count, top_section, ir_score, content_breadth,
            visit_count_series, is_repeat_visitor, has_download, has_ir, esg_visitor_flag
        ], axis=1).reset_index(names=['visitor_id'])

        # Guarantee every clustering feature exists and is numeric. Features that
        # are only derived below are created as 0 here and overwritten afterwards.
        for col in self.CLUSTERING_FEATURES:
            if col in visitor_profiles_df.columns:
                visitor_profiles_df[col] = pd.to_numeric(visitor_profiles_df[col], errors='coerce').fillna(0)
            else:
                visitor_profiles_df[col] = 0

        visitor_profiles_df['is_high_intent'] = ((visitor_profiles_df['has_download'] > 0) & (visitor_profiles_df['has_ir'] > 0)).astype(int)
        visitor_profiles_df['ir_only_visitor'] = ((visitor_profiles_df['content_breadth'] == 1) & (visitor_profiles_df['top_visited_section'] == 'Investor Relations')).astype(int)
        visitor_profiles_df['frequent_downloader'] = (visitor_profiles_df['download_count'] > 3).astype(int)
        visitor_profiles_df['deep_path_visitor'] = (visitor_profiles_df['avg_session_depth'] > 5).astype(int)

        def get_user_tags(row):
            """Build the comma-separated behavioral tag string for one profile row."""
            tags = []
            if row['ir_only_visitor']:
                tags.append('IR-Only Browser')
            if row['frequent_downloader']:
                tags.append('Frequent Downloader')
            if row['deep_path_visitor']:
                tags.append('Deep Path Visitor')
            if row['esg_visitor'] > 0:
                tags.append('ESG-Focused')
            if 0 < row['avg_session_depth'] <= 1.1:
                tags.append('High Bounce Rate Tendency')

            # Exactly one of these is always added, so the tag list is never empty.
            tags.append('Returning Visitor' if row['is_repeat_visitor'] else 'New Visitor')
            return ', '.join(tags)

        visitor_profiles_df['user_tags'] = visitor_profiles_df.apply(get_user_tags, axis=1)

        # Numeric ids arrive as floats whenever the raw column contained NaN, so
        # normalise them to int. Non-numeric ids (e.g. hashed strings) cannot be
        # cast and are kept as they are instead of aborting the whole profiling.
        try:
            visitor_profiles_df['visitor_id'] = visitor_profiles_df['visitor_id'].astype(int)
        except (ValueError, TypeError, OverflowError):
            pass

        print("\n--- Generated Visitor Profiles (Sample for top 5 visitor_ids) ---")
        print(visitor_profiles_df.head().to_string())

        num_high_intent = visitor_profiles_df['is_high_intent'].sum()
        num_frequent_downloaders = visitor_profiles_df['frequent_downloader'].sum()
        num_ir_only = visitor_profiles_df['ir_only_visitor'].sum()
        num_esg_visitors = visitor_profiles_df['esg_visitor'].sum()

        insights.append(f"**Insight:** The visitor profiling reveals diverse user segments based on their engagement patterns, such as {num_frequent_downloaders} 'Frequent Downloaders' [cite: 12, 178][cite_start], {num_esg_visitors} 'ESG-Focused' users[cite: 12, 178], and {num_high_intent} 'High-Intent Investors' (downloaders with IR interest). [cite_start]This comprehensive segmentation allows for highly targeted content strategies and personalized outreach efforts. [cite: 11]")
        metrics_data['num_high_intent_users'] = int(num_high_intent)
        metrics_data['num_frequent_downloaders'] = int(num_frequent_downloaders)
        metrics_data['num_ir_only_visitors'] = int(num_ir_only)
        metrics_data['num_esg_visitors'] = int(num_esg_visitors)

        return {
            'visitor_profiles_df': visitor_profiles_df,
            'insights': insights,
            'metrics_data': metrics_data
        }

    def perform_clustering_and_profiling(self, n_clusters: int = 5) -> dict:
        """
        Segments visitors with K-Means on their behavioral profiles, writes a
        textual description for every cluster and draws a t-SNE scatter plot.

        The profiles come from ``self.generate_visitor_profiles()`` and the
        feature columns from ``self.CLUSTERING_FEATURES``. Clustering is skipped
        (with a warning recorded in the insights) when there are too few
        profiles or the features carry no variance.

        Args:
            n_clusters (int): The number of clusters to form.
        Returns:
            dict: 'visitor_profiles_with_clusters' (profile DataFrame, with a
                'cluster' column when clustering ran), 'cluster_descriptions',
                'insights' and 'metrics_data' (holding 'cluster_summary').
        """
        print(f"\n--- Visitor Clustering Analysis (K-Means, {n_clusters} clusters) ---")
        insights = []
        metrics_data = {'cluster_summary': {}}

        profile_results = self.generate_visitor_profiles()
        visitor_profiles_df = profile_results['visitor_profiles_df']
        insights.extend(profile_results.get('insights', []))

        def _warn(message):
            # Echo a message to stdout and record it alongside the other insights.
            print(message)
            insights.append(message)

        def _result(descriptions):
            # Assemble the return payload; shared by the skip paths and the normal exit.
            return {
                'visitor_profiles_with_clusters': visitor_profiles_df,
                'cluster_descriptions': descriptions,
                'insights': insights,
                'metrics_data': metrics_data
            }

        # Feature matrix for clustering; any remaining NaN is treated as 0.
        features = visitor_profiles_df[self.CLUSTERING_FEATURES].copy().fillna(0)
        sample_count = features.shape[0]

        # At least n_clusters samples are needed for K-Means and at least 2 for t-SNE.
        if features.empty or sample_count < n_clusters or sample_count < 2:
            _warn(f"**Warning:** Not enough valid visitor profiles ({sample_count}) to perform clustering with {n_clusters} clusters. Skipping clustering.")
            return _result({})

        # Nothing to separate when every column is constant (all zeros included).
        everything_zero = (features == 0).all().all()
        if everything_zero or (features.nunique() <= 1).all():
            _warn("**Warning:** All numerical features for all visitor profiles are zero or have no variance. Cannot perform meaningful clustering. Skipping clustering.")
            return _result({})

        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(features)

        # Defensive path: if scaling produced non-finite values, drop the
        # (near-)constant columns and scale again.
        if not np.isfinite(X_scaled).all():
            _warn("**Warning:** Scaling resulted in NaN or Inf values. This usually means some features have zero variance. Attempting to filter such features.")

            varying_columns = features.columns[features.std() > 1e-9]
            if varying_columns.empty:
                _warn("**Warning:** After removing zero-variance features, no features remain for clustering. Skipping clustering.")
                return _result({})

            features = features[varying_columns]
            scaler = StandardScaler()
            X_scaled = scaler.fit_transform(features)

            if X_scaled.shape[1] == 0:
                _warn("**Warning:** No valid features left for clustering after standardization. Skipping clustering.")
                return _result({})

        kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init='auto')
        visitor_profiles_df['cluster'] = kmeans.fit_predict(X_scaled)

        print("\n--- Cluster Sizes ---")
        cluster_sizes = visitor_profiles_df['cluster'].value_counts().sort_index()
        print(cluster_sizes.to_string())

        # Centroids mapped back to the original feature units.
        cluster_centers = pd.DataFrame(
            scaler.inverse_transform(kmeans.cluster_centers_),
            columns=features.columns
        )

        print("\n--- Cluster Characteristics (Mean Values) ---")
        print(cluster_centers.to_string())

        # More descriptive cluster names, keyed by K-Means label index.
        # NOTE(review): the label index is assigned by the fit and is not derived
        # from the centroid values -- confirm these names and recommendations
        # still match the clusters whenever the data or feature set changes.
        names_by_index = {
            0: "Highly Engaged Core Investors",
            1: "Low-Engagement New Visitors",
            2: "Deep-Dive Potential Investors",
            3: "Frequent Professional Downloaders",
            4: "IR-Only Browsers",
        }
        recommendation_by_index = {
            0: " These are our most valuable users, consistently engaging with and downloading Investor Relations content. We should continue to provide premium content and consider exclusive reports to further solidify their loyalty.",
            1: " This group represents new visitors with minimal engagement. A key focus should be on optimizing their initial experience, such as improving homepage clarity and blog content, to encourage deeper exploration and reduce bounce rates.",
            2: " These repeat visitors engage deeply and download content, indicating a strong investigative interest. We should provide easy navigation to related in-depth analysis and perhaps offer personalized content recommendations to guide their research.",
            3: " This segment actively downloads multiple files, suggesting a professional need for our data and reports. Ensuring easy access, clear categorization, and timely updates of downloadable resources is paramount for this group.",
            4: " These users are exclusively interested in Investor Relations content. While serving their direct needs, we might gently suggest other relevant sections (e.g., ESG if applicable) through subtle cross-promotion to broaden their engagement.",
        }
        # (feature, format template) for the always-present core statistics.
        core_stat_templates = [
            ('avg_session_depth', "Avg session depth: {:.1f} pages"),
            ('download_count', "Avg downloads: {:.1f} times"),
            ('investor_interest_score', "Avg IR interest score: {:.1f}"),
            ('content_breadth', "Content breadth: {:.1f} sections"),
            ('visit_count', "Total visits: {:.1f} times"),
        ]
        # (feature, threshold, phrase): the phrase is added when the centroid value
        # exceeds the threshold. ESG deliberately uses a much smaller threshold.
        flag_phrases = [
            ('has_download', 0.5, "Shows **download behavior**"),
            ('frequent_downloader', 0.5, "Are **frequent downloaders**"),
            ('has_ir', 0.5, "Highly interested in **IR content**"),
            ('ir_only_visitor', 0.5, "Are **IR-only browsers**"),
            ('esg_visitor', 0.01, "Focused on **ESG content**"),
            ('is_high_intent', 0.5, "Exhibit **high intent**"),
        ]

        cluster_descriptions = {}
        for cluster_idx in range(n_clusters):
            centroid = cluster_centers.iloc[cluster_idx]
            cluster_key = f'Cluster {cluster_idx}'
            cluster_name = names_by_index.get(cluster_idx, f"Cluster {cluster_idx}")

            desc_parts = [
                template.format(centroid.get(feature, 0.0))
                for feature, template in core_stat_templates
            ]

            if centroid.get('is_repeat_visitor', 0.0) > 0.5:
                desc_parts.append("Primarily **repeat visitors**")
            else:
                desc_parts.append("Primarily **new visitors**")

            for feature, threshold, phrase in flag_phrases:
                if centroid.get(feature, 0.0) > threshold:
                    desc_parts.append(phrase)

            mean_depth = centroid.get('avg_session_depth', 0.0)
            if centroid.get('deep_path_visitor', 0.0) > 0.5:
                desc_parts.append("Are **deep path visitors**")
            elif 0 < mean_depth <= 1.1:
                desc_parts.append("Tend to have a **high bounce rate**")

            members = visitor_profiles_df[visitor_profiles_df['cluster'] == cluster_idx]
            top_section_mode = members['top_visited_section'].mode()
            if not top_section_mode.empty:
                desc_parts.append(f"Most frequently visited section: '{top_section_mode.iloc[0]}'")

            full_description = f"**{cluster_name}:** " + " | ".join(desc_parts) + "."
            full_description += recommendation_by_index.get(cluster_idx, "")

            cluster_descriptions[cluster_key] = full_description

            print(f"**Cluster {cluster_idx} (N={cluster_sizes.get(cluster_idx,0)} users):** {full_description}")
            insights.append(f"**Insight:** User cluster analysis identifies Cluster {cluster_idx} as: {full_description}")
            metrics_data['cluster_summary'][cluster_key] = centroid.to_dict()

        # Best-effort 2D visualisation; any failure is reported as an insight.
        try:
            n_rows, n_cols = X_scaled.shape
            # X_scaled may have fewer columns if zero-variance features were removed.
            if n_rows > 1 and n_cols >= 2:
                tsne = TSNE(n_components=2, random_state=42, perplexity=min(30, n_rows - 1))
                embedding = tsne.fit_transform(X_scaled)
                visitor_profiles_df['tsne_x'] = embedding[:, 0]
                visitor_profiles_df['tsne_y'] = embedding[:, 1]

                chart_title = f'Visitor Clusters (t-SNE visualization, {n_clusters} clusters)'
                fig12 = plt.figure(figsize=(10, 8))
                sns.scatterplot(
                    x='tsne_x', y='tsne_y', hue='cluster',
                    palette='tab10', legend='full',
                    data=visitor_profiles_df
                )
                plt.title(chart_title)
                plt.xlabel('t-SNE Component 1')
                plt.ylabel('t-SNE Component 2')
                plt.legend(title='Cluster')
                plt.tight_layout()
                self._collect_chart_info(fig12, chart_title, f"2D scatter plot visualizing {n_clusters} visitor clusters after t-SNE dimensionality reduction. Different colors represent different clusters.")
            else:
                insights.append("**Warning:** Not enough data points or features (must be >1 sample, >=2 features for t-SNE if input has >0 variance) to perform t-SNE visualization for clustering.")
        except Exception as e:
            insights.append(f"**Error:** Could not visualize clusters using t-SNE: {e}. This might be due to very small clusters, specific data characteristics, or all features having zero variance after selection.")

        return _result(cluster_descriptions)
    
    def analyze_session_path_length_and_repeat_visitors(self) -> dict:
        """
        Analyzes session path lengths and differentiates between new and returning visitors.
        Generates insights, collects key metrics, and creates visualizations.

        Sessions (pages per visit) and visitors (visits per visitor) above their own
        99th percentile are dropped before summarizing. Narrative insights and chart
        captions only state a direction ("majority", "deeper", "positive sign") when
        the computed numbers actually support it.

        Reads the 'id_visit' and 'visitor_id' columns of self.df and uses
        'is_new_visitor_session' as a boolean mask. Charts are registered through
        self._collect_chart_info.

        Returns:
            dict: Session-depth and repeat-visitor statistics, the binned depth
                distribution, new vs. returning visit counts and average depths,
                plus 'insights' (list of str) and 'metrics_data' (dict).
        """
        print("\n--- Session Path Length & Repeat Visitors Analysis ---")
        insights = []
        metrics_data = {}

        # Session Path Length Analysis: Number of pages viewed per visit.
        visit_counts = self.df.groupby('id_visit').size()
        # Filter out extremely long sessions (top 1%) as outliers to focus on typical behavior.
        session_length_threshold = visit_counts.quantile(0.99) if not visit_counts.empty else 0
        normal_sessions = visit_counts[visit_counts <= session_length_threshold] if not visit_counts.empty else pd.Series(dtype='int64') # Ensure empty series has a dtype
        print("\nNormal Session Path Length (pages per visit):\n", normal_sessions.describe().to_string())
        metrics_data['normal_sessions_mean'] = round(normal_sessions.mean(), 2) if not normal_sessions.empty else 0.0
        metrics_data['normal_sessions_max'] = int(normal_sessions.max()) if not normal_sessions.empty else 0

        # Bin session lengths for easier interpretation (1 page, 2-3 pages, 4-10 pages, 11+ pages).
        # The edges are fixed and the last bin is open-ended, so every label always
        # describes the interval it is attached to, whatever the observed maximum is.
        if not normal_sessions.empty:
            depth_bins = [1, 2, 4, 11, float('inf')]
            depth_labels = ['1', '2-3', '4-10', '11+']
            visit_depth_group = pd.cut(normal_sessions, bins=depth_bins, right=False, labels=depth_labels)
            depth_dist = visit_depth_group.value_counts().sort_index()
        else:
            depth_dist = pd.Series(dtype='int64')

        print("\nSession Path Length (binned, filtered):\n", depth_dist.to_string())

        # Visualize session path length distribution (bar plot).
        if not depth_dist.empty:
            single_page_visits = int(depth_dist.get('1', 0))
            total_binned_visits = int(depth_dist.sum())
            # Only call single-page sessions "the majority" when they really are more than half.
            if single_page_visits * 2 > total_binned_visits:
                single_page_note = f"The majority of visits ({single_page_visits}) are single-page sessions."
            else:
                single_page_note = f"{single_page_visits} of {total_binned_visits} visits are single-page sessions."

            fig7 = plt.figure(figsize=(10, 6))
            depth_dist.plot(kind='bar')
            plt.xlabel('Pages per Visit')
            plt.ylabel('Number of Visits')
            plt.title('Session Path Length Distribution (Binned, Filtered)')
            plt.xticks(rotation=0)
            plt.grid(axis='y')
            self._collect_chart_info(fig7, 'Session Path Length Distribution (Binned, Filtered)', f"Bar chart showing the distribution of session path lengths (pages per visit) after filtering outliers. {single_page_note}")
        else:
            insights.append("**Warning:** No data to generate binned session path length distribution chart.")

        # Visualize session path length distribution (histogram with log scale y-axis).
        if not normal_sessions.empty:
            fig8 = plt.figure(figsize=(10, 6))
            # A single bin is used when every session has the same length (no range to split).
            if normal_sessions.max() > normal_sessions.min():
                plt.hist(normal_sessions, bins=50, edgecolor='black', log=True)
            else:
                plt.hist(normal_sessions, bins=1, edgecolor='black', log=True)
            plt.xlabel('Pages per Visit')
            plt.ylabel('Number of Visits (log scale)')
            plt.title('Session Path Length Distribution (Log Scale, Filtered)')
            plt.grid(True)
            self._collect_chart_info(fig8, 'Session Path Length Distribution (Log Scale, Filtered)', "Histogram showing the distribution of session path lengths on a logarithmic scale, highlighting a long tail of highly engaged sessions beyond a few pages.")
        else:
            insights.append("**Warning:** No data to generate log-scale session path length distribution chart.")

        # Count short visits (1 page) and deep visits (>= 5 pages).
        short_visits = (normal_sessions == 1).sum() if not normal_sessions.empty else 0
        deep_visits = (normal_sessions >= 5).sum() if not normal_sessions.empty else 0
        print(f"Short visits (1 page, filtered): {short_visits}")
        print(f"Deep visits (>= 5 pages, filtered): {deep_visits}")
        insights.append(f"**Insight:** A high number of short visits ({short_visits}) combined with a tail of deep visits ({deep_visits}) indicates a mix of quick information seekers and highly engaged users. This duality requires different engagement strategies.")
        metrics_data['short_visits_count'] = int(short_visits)
        metrics_data['deep_visits_count'] = int(deep_visits)

        # Repeat Visitors Analysis: Analyze how many times unique visitors return.
        visitor_visit_counts = self.df.groupby('visitor_id')['id_visit'].nunique()
        # Filter out extreme outliers (top 1%)
        visit_count_threshold = visitor_visit_counts.quantile(0.99) if not visitor_visit_counts.empty else 0
        normal_visitors = visitor_visit_counts[visitor_visit_counts <= visit_count_threshold] if not visitor_visit_counts.empty else pd.Series(dtype='int64')
        print("\nNormal Repeat Visitors (visits per visitor):\n", normal_visitors.describe().to_string())
        repeat_visitors_count = (normal_visitors > 1).sum() if not normal_visitors.empty else 0
        print(f"Number of repeat visitors (filtered): {repeat_visitors_count}")
        if repeat_visitors_count > 0:
            insights.append(f"**Insight:** The site has {repeat_visitors_count} repeat visitors (filtered to exclude outliers), which is a positive sign of user retention. Cultivating this returning audience is key for long-term engagement.")
        else:
            # Zero repeat visitors is not a retention success; say so instead of praising it.
            insights.append("**Insight:** No repeat visitors were found (filtered to exclude outliers). Retention is currently not visible in the data, so giving first-time visitors a reason to return should be a priority.")
        metrics_data['repeat_visitors_count'] = int(repeat_visitors_count)

        # Visualize repeat visitor distribution.
        if not normal_visitors.empty:
            fig9 = plt.figure(figsize=(10, 6))
            plt.hist(normal_visitors, bins=range(1, int(normal_visitors.max()) + 2), edgecolor='black')
            plt.xlabel('Visits per Visitor')
            plt.ylabel('Number of Visitors')
            plt.title('Repeat Visitor Distribution (Filtered)')
            plt.grid(axis='y')
            self._collect_chart_info(fig9, 'Repeat Visitor Distribution (Filtered)', "Histogram displaying the distribution of visits per unique visitor, after removing outliers. Shows how many visitors return multiple times.")
        else:
            insights.append("**Warning:** No data to generate repeat visitor distribution chart.")

        # New vs. Returning Visitors Analysis: Compare behavior of first-time vs. repeat visitors.
        # Uses the 'is_new_visitor_session' column directly as the row mask.
        new_visits = self.df[self.df['is_new_visitor_session']]
        returning_visits = self.df[~self.df['is_new_visitor_session']]

        new_visits_unique_count = new_visits['id_visit'].nunique()
        returning_visits_unique_count = returning_visits['id_visit'].nunique()
        print(f"\nNew visits: {new_visits_unique_count}")
        print(f"Returning visits: {returning_visits_unique_count}")

        new_depth = new_visits.groupby('id_visit').size().mean() if new_visits_unique_count > 0 else 0
        returning_depth = returning_visits.groupby('id_visit').size().mean() if returning_visits_unique_count > 0 else 0

        print(f"Average depth (new): {new_depth:.2f}")
        print(f"Average depth (returning): {returning_depth:.2f}")

        # Word the comparison from the numbers. A group without visits has depth 0 by
        # construction, so a "deeper/shallower" statement would be meaningless there.
        both_groups_present = new_visits_unique_count > 0 and returning_visits_unique_count > 0
        if not both_groups_present:
            depth_caption = f"Only one visitor group has visits in this dataset (new: {new_depth:.2f} pages, returning: {returning_depth:.2f} pages)."
            if new_visits_unique_count > 0 or returning_visits_unique_count > 0:
                insights.append(f"**Warning:** New and returning visitors cannot be compared because one group has no visits (new: {new_visits_unique_count} visits, returning: {returning_visits_unique_count} visits).")
        elif returning_depth > new_depth:
            depth_caption = f"Returning visitors show higher engagement ({returning_depth:.2f} pages) compared to new visitors ({new_depth:.2f} pages)."
            insights.append(f"**Insight:** Returning visitors ({returning_visits_unique_count} visits, average depth {returning_depth:.2f} pages) show significantly deeper engagement compared to new visitors ({new_visits_unique_count} visits, average depth {new_depth:.2f} pages). This highlights a critical need to improve the onboarding and initial content experience for new users to encourage them to delve deeper into the site.")
        elif returning_depth < new_depth:
            depth_caption = f"New visitors show higher engagement ({new_depth:.2f} pages) compared to returning visitors ({returning_depth:.2f} pages)."
            insights.append(f"**Insight:** New visitors ({new_visits_unique_count} visits, average depth {new_depth:.2f} pages) view more pages per visit than returning visitors ({returning_visits_unique_count} visits, average depth {returning_depth:.2f} pages). Returning users may arrive with a specific target in mind, so quick access to frequently revisited content matters for them.")
        else:
            depth_caption = f"New and returning visitors show the same average depth ({new_depth:.2f} pages)."
            insights.append(f"**Insight:** New visitors ({new_visits_unique_count} visits) and returning visitors ({returning_visits_unique_count} visits) have the same average depth of {new_depth:.2f} pages per visit.")

        metrics_data['new_visits_count'] = int(new_visits_unique_count)
        metrics_data['returning_visits_count'] = int(returning_visits_unique_count)
        metrics_data['new_depth'] = round(new_depth, 2)
        metrics_data['returning_depth'] = round(returning_depth, 2)

        # Visualize average session depth comparison between new and returning visitors.
        if new_visits_unique_count > 0 or returning_visits_unique_count > 0: # Only plot if there's data for at least one group
            fig10 = plt.figure(figsize=(8, 6))
            sns.barplot(x=['New Visitors', 'Returning Visitors'], y=[new_depth, returning_depth], palette='coolwarm')
            plt.title('Average Session Depth: New vs. Returning Visitors')
            plt.xlabel('Visitor Type')
            plt.ylabel('Average Pages per Visit')
            plt.tight_layout()
            self._collect_chart_info(fig10, 'Average Session Depth: New vs. Returning Visitors', f"Bar chart comparing the average session depth (pages per visit) between new and returning visitors. {depth_caption}")
        else:
            insights.append("**Warning:** No data to compare new vs. returning visitor session depth.")

        return {
            'normal_sessions_describe': normal_sessions.describe().to_dict() if not normal_sessions.empty else {},
            'depth_dist': depth_dist.to_dict(),
            'short_visits': short_visits,
            'deep_visits': deep_visits,
            'normal_visitors_describe': normal_visitors.describe().to_dict() if not normal_visitors.empty else {},
            'repeat_visitors_count': repeat_visitors_count,
            'new_visits_count': new_visits_unique_count,
            'returning_visits_count': returning_visits_unique_count,
            'new_depth': new_depth,
            'returning_depth': returning_depth,
            'insights': insights,
            'metrics_data': metrics_data
        }

    def compare_new_vs_returning(self):
        """
        Compares key behavior metrics (avg session depth, bounce rate, top sections)
        between new and returning visitors.
        Generates insights and collects key metrics.

        Rows are split on the 'is_new_visitor_session' column. When that column is
        missing, an equivalent mask is derived locally from the row-level
        'is_new_visitor' flag (see the NOTE in the body). Insight sentences state a
        direction only when the computed values support it.

        Side effect: adds an 'is_new_visitor' column to self.df when it is absent.

        Returns:
            dict: A dictionary containing comparison metrics, text insights, and key numeric data.
        """
        print("\n--- Comparison: New vs. Returning Visitors ---")
        insights = []
        metrics_data = {}

        # Legacy row-level flag: True on the row(s) carrying a visitor's earliest timestamp.
        # Computed with a vectorized lookup instead of a per-row apply; rows without a
        # visitor_id map to a missing timestamp and therefore compare as False.
        if 'is_new_visitor' not in self.df.columns:
            df_filtered_visitors = self.df.dropna(subset=['visitor_id'])
            first_visit_time = df_filtered_visitors.groupby('visitor_id')['timestamp'].min()
            first_seen = self.df['visitor_id'].map(first_visit_time)
            self.df.loc[:, 'is_new_visitor'] = self.df['timestamp'].eq(first_seen)

        # The comparison itself is session-level, so this is the flag that must exist.
        if 'is_new_visitor_session' in self.df.columns:
            is_new_session = self.df['is_new_visitor_session']
        else:
            # NOTE(review): fallback treats a visit as "new" when it contains the visitor's
            # earliest recorded event - confirm this matches DataLoader's definition.
            first_visit_ids = self.df.loc[self.df['is_new_visitor'].astype(bool), 'id_visit'].dropna().unique()
            is_new_session = self.df['id_visit'].isin(first_visit_ids)

        new_visits_df = self.df[is_new_session]
        returning_visits_df = self.df[~is_new_session]

        # Compare average session depth.
        # Ensure that if there are no visits, the mean is 0 to prevent errors
        new_depth = new_visits_df.groupby('id_visit').size().mean() if not new_visits_df.empty else 0
        returning_depth = returning_visits_df.groupby('id_visit').size().mean() if not returning_visits_df.empty else 0
        print(f"Average Session Depth - New: {new_depth:.2f}, Returning: {returning_depth:.2f}")
        if new_visits_df.empty or returning_visits_df.empty:
            # A missing group has depth 0 by construction; a directional claim would be noise.
            insights.append(f"**Warning:** Session depth cannot be compared because at least one visitor group has no pageviews (new avg. depth {new_depth:.2f}, returning avg. depth {returning_depth:.2f}).")
        elif returning_depth > new_depth:
            insights.append(f"**Insight:** Returning visitors (avg. depth {returning_depth:.2f} pages) demonstrate significantly deeper engagement than new visitors (avg. depth {new_depth:.2f} pages), suggesting that familiarity with the site leads to more extensive exploration.")
        elif returning_depth < new_depth:
            insights.append(f"**Insight:** New visitors (avg. depth {new_depth:.2f} pages) view more pages per visit than returning visitors (avg. depth {returning_depth:.2f} pages), suggesting that returning users navigate more directly to the content they already know.")
        else:
            insights.append(f"**Insight:** New and returning visitors have the same average session depth ({new_depth:.2f} pages).")
        metrics_data['new_depth_comparison'] = round(new_depth, 2)
        metrics_data['returning_depth_comparison'] = round(returning_depth, 2)

        # Compare bounce rates (creating temporary Analyzer instances to avoid state issues).
        print("\n--- New Visitors Bounce Rate ---")
        new_bounce_result = BehaviorAnalyzer(new_visits_df).calculate_bounce_rate()
        print("\n--- Returning Visitors Bounce Rate ---")
        returning_bounce_result = BehaviorAnalyzer(returning_visits_df).calculate_bounce_rate()
        new_bounce_rate = new_bounce_result['overall_bounce_rate']
        returning_bounce_rate = returning_bounce_result['overall_bounce_rate']

        print(f"Overall Bounce Rate - New: {new_bounce_rate:.2%}, Returning: {returning_bounce_rate:.2%}")
        if new_bounce_rate > returning_bounce_rate:
            insights.append(f"**Insight:** New visitors have a higher overall bounce rate ({new_bounce_rate:.2%}) compared to returning visitors ({returning_bounce_rate:.2%}). This is expected but highlights an opportunity to improve the initial experience for new users, potentially through clearer navigation or more compelling above-the-fold content.")
        elif new_bounce_rate < returning_bounce_rate:
            insights.append(f"**Insight:** Returning visitors have a higher overall bounce rate ({returning_bounce_rate:.2%}) compared to new visitors ({new_bounce_rate:.2%}). This is the opposite of the usual pattern and suggests reviewing the pages returning users land on and whether they still meet their needs.")
        else:
            insights.append(f"**Insight:** New and returning visitors have the same overall bounce rate ({new_bounce_rate:.2%}).")
        metrics_data['new_bounce_rate_comparison'] = round(new_bounce_rate, 4)
        metrics_data['returning_bounce_rate_comparison'] = round(returning_bounce_rate, 4)

        # Compare top visited sections.
        new_top_sections = new_visits_df['section'].value_counts(normalize=True).head(3) if not new_visits_df.empty else pd.Series(dtype='float64')
        returning_top_sections = returning_visits_df['section'].value_counts(normalize=True).head(3) if not returning_visits_df.empty else pd.Series(dtype='float64')

        print("The following section contains the top 3 visited sections for New and Returning Visitors.")
        print("This data is useful for understanding initial content interests and how they evolve.")
        print("\nTop 3 Visited Sections (Proportion) - New Visitors:\n", new_top_sections.to_string())
        print("\nTop 3 Visited Sections (Proportion) - Returning Visitors:\n", returning_top_sections.to_string())

        # Safely access top sections for insights
        new_top_section_text = 'N/A'
        new_top_section_proportion = 0.0
        if not new_top_sections.empty:
            new_top_section_text = new_top_sections.index[0]
            new_top_section_proportion = new_top_sections.iloc[0]

        returning_top_section_text = 'N/A'
        returning_top_section_proportion = 0.0
        if not returning_top_sections.empty:
            returning_top_section_text = returning_top_sections.index[0]
            returning_top_section_proportion = returning_top_sections.iloc[0]

        if new_top_sections.empty or returning_top_sections.empty:
            insights.append(f"**Warning:** Top visited sections cannot be compared because at least one visitor group has no pageviews (new: '{new_top_section_text}', returning: '{returning_top_section_text}').")
        elif new_top_section_text != returning_top_section_text:
            insights.append(f"**Insight:** There are differences in the top visited sections between new and returning visitors. For example, new visitors' top section is '{new_top_section_text}' ({new_top_section_proportion:.2%}) while returning visitors' is '{returning_top_section_text}' ({returning_top_section_proportion:.2%}). This suggests new users might focus on introductory content, while returning users delve into more specific or deeper sections.")
        else:
            insights.append(f"**Insight:** New and returning visitors share the same top visited section, '{new_top_section_text}' ({new_top_section_proportion:.2%} of new-visitor pageviews vs. {returning_top_section_proportion:.2%} of returning-visitor pageviews), so initial and repeat interest is concentrated in the same content.")
        metrics_data['new_top_sections_comparison'] = new_top_sections.to_dict()
        metrics_data['returning_top_sections_comparison'] = returning_top_sections.to_dict()

        return {
            'new_vs_returning_depth': {'new': new_depth, 'returning': returning_depth},
            'new_vs_returning_bounce_rate': {'new': new_bounce_rate, 'returning': returning_bounce_rate},
            'new_top_sections': new_top_sections.to_dict(),
            'returning_top_sections': returning_top_sections.to_dict(),
            'insights': insights,
            'metrics_data': metrics_data # Ensure this is returned with all keys.
        }

    def compare_by_country(self, top_n_countries=5):
        """
        Compares key behavior metrics (bounce rate, avg session depth, top sections)
        across the top N countries by pageviews.
        Generates insights and collects key metrics.

        When no country has any pageviews the per-country results are empty, a
        warning insight is recorded instead of the comparison insights, and no
        chart is drawn.

        Args:
            top_n_countries (int): Number of top countries to analyze.
        Returns:
            dict: 'country_bounce_rates' and 'country_avg_depth' (lists of records,
                sorted descending), 'country_top_sections' (country -> {section: share}),
                'insights' (list of str) and 'metrics_data' (dict).
        """
        print(f"\n--- Comparison: Behavior by Top {top_n_countries} Countries ---")
        insights = []
        metrics_data = {}

        # Identify top N countries by total pageviews.
        country_pageviews = self.df.groupby('location_country')['id_visit'].count().sort_values(ascending=False)
        top_pageviews = country_pageviews.head(top_n_countries)
        top_countries = top_pageviews.index.tolist()
        print(f"\nTop {top_n_countries} Countries by Pageviews: {top_countries}")
        if top_countries:
            # map(str, ...) keeps the join safe if country values are not plain strings.
            insights.append(f"**Insight:** The analysis focuses on the top {top_n_countries} countries by pageviews, which are {', '.join(map(str, top_countries))}, representing the primary geographical audience for the website.")
        else:
            insights.append("**Warning:** No country data available to compare behavior by country.")
        metrics_data['top_countries_list'] = top_countries
        metrics_data['top_countries_pageviews'] = top_pageviews.to_dict()

        country_bounce_rates = {}
        country_avg_depth = {}
        country_top_sections = {}

        # Loop through top countries and perform sub-analysis for each.
        for country in top_countries:
            country_df = self.df[self.df['location_country'] == country]
            if not country_df.empty:
                # Use a temporary Analyzer instance for country-specific analysis.
                country_analyzer = BehaviorAnalyzer(country_df)
                # Calculate bounce rate for the country.
                country_bounce_result = country_analyzer.calculate_bounce_rate()
                country_bounce_rates[country] = country_bounce_result['overall_bounce_rate']
                # Calculate average session depth for the country.
                country_visits = country_df.groupby('id_visit').size()
                country_avg_depth[country] = country_visits.mean() if len(country_visits) > 0 else 0
                # Identify the top visited section for the country.
                country_top_sections[country] = country_df['section'].value_counts(normalize=True).head(1).to_dict()

        bounce_df = pd.DataFrame(list(country_bounce_rates.items()), columns=['Country', 'Bounce_Rate']).sort_values(by='Bounce_Rate', ascending=False)
        depth_df = pd.DataFrame(list(country_avg_depth.items()), columns=['Country', 'Avg_Depth']).sort_values(by='Avg_Depth', ascending=False)

        # Defined up front so the chart caption below can never hit an unbound name.
        highest_bounce_country = None
        deepest_engagement_country = None

        print("\nBounce Rate by Country:\n", bounce_df.to_string())
        if not bounce_df.empty:
            highest_bounce_country = bounce_df.iloc[0]['Country']
            highest_bounce_country_rate = bounce_df.iloc[0]['Bounce_Rate']
            insights.append(f"**Insight:** There are notable variations in bounce rates across countries. For instance, '{highest_bounce_country}' has the highest bounce rate ({highest_bounce_country_rate:.2%}), suggesting potential cultural or content relevance issues in that region that need investigation.")
            metrics_data['highest_bounce_country'] = highest_bounce_country
            metrics_data['highest_bounce_country_rate'] = round(highest_bounce_country_rate, 4)

        print("\nAverage Session Depth by Country:\n", depth_df.to_string())
        if not depth_df.empty:
            deepest_engagement_country = depth_df.iloc[0]['Country']
            deepest_engagement_country_depth = depth_df.iloc[0]['Avg_Depth']
            insights.append(f"**Insight:** Session depth also varies by country, with users from '{deepest_engagement_country}' showing the deepest engagement (avg. depth {deepest_engagement_country_depth:.2f} pages). This could indicate a strong content fit or higher intent from these users.")
            metrics_data['deepest_engagement_country'] = deepest_engagement_country
            metrics_data['deepest_engagement_country_depth'] = round(deepest_engagement_country_depth, 2)

        print("\nTop Visited Section by Country:\n", pd.DataFrame(country_top_sections).T.to_string())
        if country_top_sections:
            insights.append("**Insight:** Top visited sections differ across countries. This suggests regional content preferences or specific information needs. For example, some countries may prioritize 'News & Press' while others focus on 'Our Business'. This insight is crucial for content localization and targeted campaigns.")
        metrics_data['country_top_sections'] = pd.DataFrame(country_top_sections).to_dict()

        # Visualize country-wise behavior comparison (skipped when there is nothing to plot).
        if not bounce_df.empty and not depth_df.empty:
            fig11, axes = plt.subplots(1, 2, figsize=(16, 7))
            sns.barplot(x='Country', y='Bounce_Rate', data=bounce_df, palette='viridis', ax=axes[0])
            axes[0].set_title('Bounce Rate by Country')
            axes[0].set_ylabel('Bounce Rate')
            axes[0].set_xticklabels(axes[0].get_xticklabels(), rotation=45, ha='right') # Rotate x-axis labels for readability.

            sns.barplot(x='Country', y='Avg_Depth', data=depth_df, palette='plasma', ax=axes[1])
            axes[1].set_title('Average Session Depth by Country')
            axes[1].set_ylabel('Average Pages per Visit')
            axes[1].set_xticklabels(axes[1].get_xticklabels(), rotation=45, ha='right')
            plt.tight_layout() # Adjust layout to prevent labels from overlapping.
            self._collect_chart_info(fig11, f'Behavior Comparison by Top {top_n_countries} Countries (Bounce Rate & Avg Depth)', f"Bar charts comparing bounce rates and average session depth across the top {top_n_countries} countries. Shows significant regional variations, with '{highest_bounce_country}' having the highest bounce rate and '{deepest_engagement_country}' the deepest engagement.")
        else:
            insights.append("**Warning:** No data to generate the country behavior comparison chart.")

        return {
            'country_bounce_rates': bounce_df.to_dict('records'),
            'country_avg_depth': depth_df.to_dict('records'),
            'country_top_sections': country_top_sections,
            'insights': insights,
            'metrics_data': metrics_data
        }

    def generate_user_story(self, visitor_id):
        """
        Builds a markdown narrative for one visitor from their profile row.

        Profiles are obtained from self.generate_visitor_profiles() on every call.
        The story covers visitor type, average session depth, downloads, top
        section, Investor Relations / ESG interest, path-depth tendency, the
        aggregated tags and up to three distinct section paths taken from self.df.
        The finished story is printed as well as returned.

        Args:
            visitor_id (int/float): The unique ID of the visitor.
        Returns:
            str: The story text, or "No user story generated." when the visitor
                has no profile row.
        """
        print(f"\n--- Generating User Story for Visitor ID: {visitor_id} ---")

        profiles = self.generate_visitor_profiles()['visitor_profiles_df']
        matching_rows = profiles[profiles['visitor_id'] == visitor_id]
        if matching_rows.empty:
            print(f"No profile found for visitor_id: {visitor_id}")
            return "No user story generated."

        # Exactly one profile row is expected per visitor; use the first match.
        profile = matching_rows.iloc[0]
        story = [f"## User Story: Visitor ID {int(profile['visitor_id'])}"]

        # Visitor type.
        story.append(
            "This individual is a **returning visitor** to our website."
            if profile['is_repeat_visitor']
            else "This individual is a **new visitor** to our website."
        )

        # Average session depth.
        depth = profile['avg_session_depth']
        story.append(f"They typically view an average of **{depth:.2f} pages per session**, indicating their typical level of engagement within each visit.")

        # Download behavior.
        if not profile['download_count'] > 0:
            story.append("They have not initiated any downloads during their sessions, which might indicate different information needs or engagement patterns.")
        else:
            story.append(f"A key action for this user is downloading content, as they have initiated **{int(profile['download_count'])} downloads**.")
            if profile['frequent_downloader']:
                story.append("They are categorized as a **frequent downloader**, suggesting a high propensity for consuming downloadable resources.")

        # Top visited section (0 marks "no section assigned").
        if profile['top_visited_section'] != 0:
            story.append(f"Their primary area of interest on the site is the **'{profile['top_visited_section']}' section**, as indicated by the highest proportion of their pageviews.")

        # Investor Relations interest: the IR-only tag takes precedence over the score.
        if profile['ir_only_visitor']:
            story.append("This user **primarily browses Investor Relations content only** (tagged as an 'IR-only visitor').")
        elif profile['investor_interest_score'] > 0:
            story.append(f"They show a notable interest in Investor Relations, with **{int(profile['investor_interest_score'])} pageviews** in this section, indicating they are likely tracking company performance or news.")

        # ESG interest.
        if profile['esg_visitor']:
            story.append("They demonstrate a **strong interest in ESG-related content**.")

        # Deep-path versus bounce-like behavior.
        if profile['deep_path_visitor']:
            story.append("This user typically engages in **deep Browse paths**, exploring more than 5 pages per session, suggesting a thorough and investigative approach to content consumption.")
        elif 0 < depth <= 1.1:
            story.append("Conversely, this user exhibits **high bounce rate behavior**, often leaving after viewing just one page, which could signal a quick search for specific information or a lack of immediate relevance.")

        # Aggregated tags.
        story.append(f"\n**Aggregated User Tags:** {profile['user_tags']}.")

        # Session paths: every row carries its visit's full path; de-duplicating keeps
        # the paths in order of first appearance in the timestamp-sorted rows.
        visitor_events = self.df[self.df['visitor_id'] == visitor_id].sort_values('timestamp')
        if not visitor_events.empty:
            row_paths = visitor_events.groupby('id_visit')['section'].transform(lambda x: ' -> '.join(x.astype(str)))
            distinct_paths = row_paths.drop_duplicates()
            if distinct_paths.empty:
                story.append("\nNo distinct session paths found for this user within the dataset.")
            else:
                story.append("\n**Typical Session Paths Observed:**")
                # Show up to 3 typical paths.
                story.extend(f"- {path}" for path in distinct_paths.head(3))

        user_story = "\n".join(story)
        print(user_story)
        return user_story

    def get_top_n_paths(self, n=5):
        """
        Helper function to get the top N most common session paths across all visits.

        A visit's path is its 'section' values joined with ' -> ' in timestamp
        order. Each visit contributes exactly one path to the tally, so the counts
        are numbers of visits (not pageviews) and long paths are not inflated by
        their own length.

        Args:
            n (int): Number of top paths to return.
        Returns:
            pandas.Series: Path strings (index) mapped to the number of visits that
                followed that path, most frequent first, at most n entries.
        """
        df_sorted = self.df.sort_values(['id_visit', 'timestamp'])
        # Collapse to one row per visit before counting; counting the per-row
        # broadcast of the path would weight every path by its number of pages.
        visit_paths = (
            df_sorted.groupby('id_visit')['section']
            .agg(lambda sections: ' -> '.join(sections.astype(str)))
            .rename('path')
        )
        path_counts = visit_paths.value_counts()
        return path_counts.head(n)

    def analyze_most_common_paths(self):
        """
        Reports the ten most frequent session paths.

        Prints the ranking obtained from self.get_top_n_paths(n=10) and, when it is
        not empty, records an insight plus the leading path and its count.

        Returns:
            dict: 'top_common_paths' (path -> count), 'insights' (list of str)
                and 'metrics_data' (leading path string and count, or empty).
        """
        print("\n--- Most Common Session Paths ---")
        top_paths = self.get_top_n_paths(n=10)
        print("Top 10 Most Common Session Paths:\n", top_paths.to_string())

        result = {
            'top_common_paths': top_paths.to_dict(),
            'insights': [],
            'metrics_data': {},
        }

        if len(top_paths) > 0:
            # The ranking is sorted, so the first entry is the most common path.
            leading_path, leading_count = next(iter(top_paths.items()))
            result['insights'].append(f"**Insight:** Identifying the most common session paths, such as the top path `'{leading_path}'` with {leading_count} occurrences, provides a clear understanding of typical user navigation patterns.")
            result['metrics_data'].update(
                top_common_path_string=leading_path,
                top_common_path_count=int(leading_count),
            )

        return result

In [19]:
%pip install markdown
%pip install WeasyPrint
import os
import pprint
import google.generativeai as genai
from dotenv import load_dotenv
import io
import sys
import matplotlib.pyplot as plt
import pandas as pd
import markdown
import base64 # NEW: Import base64 for embedding images
from weasyprint import HTML # NEW: Import HTML from weasyprint
# Assuming DataLoader and BehaviorAnalyzer classes are defined above.

def run_behavior_analysis(file_path, home_company_section="home page", n_clusters=5,
                          processed_csv_path='processed_data.csv'):
    """
    Orchestrates the entire behavior analysis process.
    Loads data, preprocesses it, runs various analysis modules,
    and collects all generated insights, metrics, and chart information.
    Args:
        file_path (str): Path to the raw traffic data CSV file.
        home_company_section (str): The default landing section name for funnel analysis.
        n_clusters (int): Number of clusters passed to
            BehaviorAnalyzer.perform_clustering_and_profiling. Defaults to 5.
        processed_csv_path (str): Where the preprocessed DataFrame is written as CSV.
            Defaults to 'processed_data.csv'.
    Returns:
        dict: A comprehensive dictionary containing the processed DataFrame ("df"),
              all collected insights ("all_insights"), detailed metrics
              ("detailed_metrics"), chart information ("all_chart_infos"), the
              clustered visitor profiles ("df_visitor_profiles_with_clusters_full")
              and the cluster descriptions ("cluster_descriptions").
    """
    print("--- Starting Data Loading and Preprocessing ---")
    data_loader = DataLoader(file_path)
    df = data_loader.load_and_preprocess_data()

    # Keep only rows with a non-blank section. The explicit .copy() guarantees the
    # column assignment below writes to an independent frame rather than to a
    # filtered slice of the loader's frame (which pandas may flag with
    # SettingWithCopyWarning whenever the filter actually drops rows).
    has_section = df['section'].notna() & (df['section'].astype(str).str.strip() != '')
    df = df[has_section].copy()
    df[['company_section', 'company_subsection']] = df['url'].apply(
        lambda url: pd.Series(DataLoader.get_company_category(url))
    )
    print("\nAdded 'company_section' and 'company_subsection'. First 5 rows:\n", df.head().to_string())
    df.to_csv(processed_csv_path, index=False, encoding='utf-8-sig')
    print(f"Saved {processed_csv_path}")

    print("\n--- Starting Behavior Analysis ---")
    analyzer = BehaviorAnalyzer(df)

    all_insights = []
    detailed_metrics = {}

    # (key in detailed_metrics, zero-argument callable) pairs.
    # The order is significant: it fixes the order of printed output, of the
    # collected insights and of the charts accumulated on the analyzer, and the
    # visitor profiles are generated before clustering runs (as in the original flow).
    analysis_steps = [
        ('pageview_metrics', analyzer.calculate_pageview_metrics),
        ('bounce_results', analyzer.calculate_bounce_rate),
        ('download_results', analyzer.analyze_download_behavior),
        ('funnel_results',
         lambda: analyzer.perform_funnel_analysis(home_company_section_name=home_company_section)),
        ('avg_depth_results', analyzer.calculate_average_depth),
        ('unique_investors_results', analyzer.analyze_unique_investors),
        ('subsection_details_results', analyzer.analyze_sub_section_details),
        ('session_path_results', analyzer.analyze_session_paths),
        ('time_distribution_results', analyzer.analyze_time_distribution),
        ('session_depth_repeat_results', analyzer.analyze_session_path_length_and_repeat_visitors),
        ('new_vs_returning_results', analyzer.compare_new_vs_returning),
        ('country_comparison_results', analyzer.compare_by_country),
        ('common_paths_results', analyzer.analyze_most_common_paths),
        ('visitor_profiles_data', analyzer.generate_visitor_profiles),
    ]
    for metrics_key, run_step in analysis_steps:
        step_result = run_step()
        all_insights.extend(step_result.get('insights', []))
        detailed_metrics[metrics_key] = step_result.get('metrics_data', {})

    # Clustering is handled separately because extra entries of its result
    # (profiles and descriptions) are forwarded to the caller.
    clustering_results = analyzer.perform_clustering_and_profiling(n_clusters=n_clusters)
    all_insights.extend(clustering_results.get('insights', []))
    detailed_metrics['clustering_results'] = clustering_results.get('metrics_data', {})

    df_visitor_profiles_with_clusters_full = clustering_results.get('visitor_profiles_with_clusters', pd.DataFrame())

    print("\n--- Behavior Analysis Complete ---")
    return {
        "df": df,
        "all_insights": all_insights,
        "detailed_metrics": detailed_metrics,
        "all_chart_infos": analyzer.all_chart_infos,
        "df_visitor_profiles_with_clusters_full": df_visitor_profiles_with_clusters_full,
        # Cluster descriptions are exposed at the top level so the report
        # generator can read them without digging into the clustering metrics.
        "cluster_descriptions": clustering_results.get('cluster_descriptions', {})
    }


def generate_executive_summary(analysis_results: dict) -> str:
    """
    Generates a high-level executive summary based on the analysis results.
    The prose is hand-written; specific metrics from the analysis are substituted in.

    Each numbered highlight is emitted only when the metrics it quotes are present
    (non-default). The bounce-rate highlight names the section reported in
    ``highest_bounce_section``, and the new-vs-returning highlight only claims that
    returning visitors are more engaged when both their depth is higher and their
    bounce rate is lower; otherwise a neutral comparison is written.

    The summary is printed to stdout as well as returned.

    Args:
        analysis_results (dict): The complete results dictionary from run_behavior_analysis.
            Only its 'detailed_metrics' entry is read; missing keys fall back to defaults.
    Returns:
        str: A formatted (Markdown-style, newline-joined) string representing the executive summary.
    """
    print("\n--- Executive Summary: Key Highlights & Recommendations ---")
    summary_lines = []

    summary_lines.append("## Executive Summary")
    # Markup tags were removed entirely from the summary text below.
    summary_lines.append("This analysis provides a comprehensive overview of user behavior on our website, identifying critical engagement patterns, areas ripe for improvement, and significant opportunities to enhance our investor relations and overall business objectives. Understanding these dynamics is key to driving targeted strategies and maximizing our digital presence's impact.")
    summary_lines.append("\n**Key Highlights:**")

    # Safely get metrics using .get() with default values
    detailed_metrics = analysis_results.get('detailed_metrics', {})
    pageview_metrics = detailed_metrics.get('pageview_metrics', {})
    bounce_results = detailed_metrics.get('bounce_results', {})
    download_results = detailed_metrics.get('download_results', {})
    new_vs_returning_results = detailed_metrics.get('new_vs_returning_results', {})
    clustering_results_metrics = detailed_metrics.get('clustering_results', {})
    time_distribution_results = detailed_metrics.get('time_distribution_results', {})
    country_comparison_results = detailed_metrics.get('country_comparison_results', {})

    top_pageview_count = pageview_metrics.get('top_pageview_count', 'N/A')
    top_download_count = download_results.get('top_download_count', 'N/A')
    overall_bounce_rate = bounce_results.get('overall_bounce_rate', 0.0)
    highest_bounce_section = bounce_results.get('highest_bounce_section', 'N/A')
    highest_bounce_value = bounce_results.get('highest_bounce_value', 0.0)

    new_depth = new_vs_returning_results.get('new_depth_comparison', 0.0)
    returning_depth = new_vs_returning_results.get('returning_depth_comparison', 0.0)
    new_bounce_rate = new_vs_returning_results.get('new_bounce_rate_comparison', 0.0)
    returning_bounce_rate = new_vs_returning_results.get('returning_bounce_rate_comparison', 0.0)

    num_frequent_downloaders = clustering_results_metrics.get('num_frequent_downloaders', 'N/A')
    num_esg_visitors = clustering_results_metrics.get('num_esg_visitors', 'N/A')
    num_clusters = len(clustering_results_metrics.get('cluster_summary', {}))

    peak_weekday = time_distribution_results.get('peak_weekday', 'N/A')
    peak_hour = time_distribution_results.get('peak_hour', 'N/A')

    relevant_highlights = []

    if top_pageview_count != 'N/A' and top_download_count != 'N/A':
        # NOTE(review): the section name is still hard-coded here; no key naming the
        # top pageview/download section is visible from this function — confirm.
        relevant_highlights.append(f"**1. Investor Relations Dominance:** The 'Investor Relations' section is our most popular area, with {top_pageview_count} pageviews and {top_download_count} downloads. This clearly demonstrates strong user interest in our financial and operational performance, indicating this section successfully attracts high-intent visitors.")

    if overall_bounce_rate != 0.0 and highest_bounce_section != 'N/A':
        # Name the section the metrics actually report instead of a fixed 'Blog'.
        relevant_highlights.append(f"**2. High Overall Bounce Rate:** A significant {overall_bounce_rate:.2%} overall website bounce rate suggests many visitors leave after viewing only one page. The '{highest_bounce_section}' section exhibits the highest bounce rate at {highest_bounce_value:.2%}, highlighting a critical area for improving initial user experience and content relevance.")

    if new_depth != 0.0 or returning_depth != 0.0:
        # Only assert "returning visitors are more engaged" when both measures agree.
        returning_more_engaged = (returning_depth > new_depth
                                  and returning_bounce_rate < new_bounce_rate)
        if returning_more_engaged:
            relevant_highlights.append(f"**3. Returning Visitors are More Engaged:** Our analysis shows returning visitors are significantly more engaged (average depth {returning_depth:.2f} pages) and have a much lower bounce rate ({returning_bounce_rate:.2%}) compared to new visitors (average depth {new_depth:.2f} pages, bounce rate {new_bounce_rate:.2%}). This underscores the value of nurturing repeat visits and the opportunity to better onboard new users.")
        else:
            relevant_highlights.append(f"**3. New vs. Returning Visitor Engagement:** Returning visitors average {returning_depth:.2f} pages per session with a {returning_bounce_rate:.2%} bounce rate, compared with {new_depth:.2f} pages and a {new_bounce_rate:.2%} bounce rate for new visitors. Returning visitors do not consistently outperform new visitors on these measures, so retention and onboarding efforts should each be evaluated on their own data rather than assuming repeat visits are more engaged.")

    if num_clusters > 0:
        freq_dl_count = f"({num_frequent_downloaders} users)" if num_frequent_downloaders != 'N/A' else ""
        esg_count = f"({num_esg_visitors} users)" if num_esg_visitors != 'N/A' else ""
        relevant_highlights.append(f"**4. User Segmentation Reveals Diverse Behaviors:** We've identified {num_clusters} distinct user clusters, each with unique engagement patterns. For example, we have segments characterized by 'Frequent Downloaders' {freq_dl_count} and 'ESG-Focused' users {esg_count}. This segmentation provides a powerful tool for developing highly targeted content and communication strategies.")

    if peak_weekday != 'N/A' and peak_hour != 'N/A':
        relevant_highlights.append(f"**5. Peak Traffic during Business Hours:** Website traffic is highest on {peak_weekday}, with peak activity around {peak_hour}:00. This insight allows us to strategically time content updates and promotional activities for maximum immediate impact and reach.")

    if country_comparison_results and 'deepest_engagement_country' in country_comparison_results:
        deepest_engagement_country = country_comparison_results.get('deepest_engagement_country', 'N/A')
        deepest_engagement_country_depth = country_comparison_results.get('deepest_engagement_country_depth', 0.0)
        if deepest_engagement_country != 'N/A':
            relevant_highlights.append(
                f"**6. Country-Specific Behavior:** Users from '{deepest_engagement_country}' exhibit the deepest engagement (average depth {deepest_engagement_country_depth:.2f} pages), suggesting strong regional content preferences. This opens opportunities for localized content and targeted campaigns."
            )

    summary_lines.extend(relevant_highlights)

    summary_lines.append("\n**Overarching Recommendations:**")
    summary_lines.append("1. **Content Optimization & Conversion Pathways:** Further enhance high-value content areas like 'Investor Relations' and 'Reports & Presentations'. Ensure clear, intuitive pathways for users to access and download these critical resources, and explore ways to guide them to other relevant content.")
    summary_lines.append("2. **Improve First Impressions & Reduce Bounce:** Systematically analyze and optimize landing pages, especially those with high bounce rates. Focus on improving initial user experience, clarity of purpose, and clear calls-to-action to encourage deeper exploration for all visitors, particularly new ones.")
    summary_lines.append("3. **Cross-Device User Experience:** Implement continuous rigorous testing and optimization for mobile devices. Given the prevalence of mobile users, a seamless and responsive design is paramount for all content, especially high-engagement areas.")
    summary_lines.append("4. **Targeted Engagement Strategies:** Utilize generated user profiles and segment-specific insights (e.g., frequent downloaders, ESG visitors, country-specific interests) to personalize content recommendations and communications, fostering higher engagement and conversion rates.")
    summary_lines.append("5. **Strategic Content Scheduling:** Align content releases, website updates, and promotional activities with identified peak traffic hours and weekdays to maximize reach and immediate impact.")
    summary_lines.append("6. **Deepen User Understanding:** Continue to leverage user profiling, user testing, and heatmap analysis to gain even finer-grained insights into user motivations and pain points.")

    executive_summary_text = "\n".join(summary_lines)
    print(executive_summary_text)
    return executive_summary_text

def generate_analysis_report(analysis_results, executive_summary_text):
    """
    Generates a comprehensive analysis report in HTML and PDF format, including:
    - Human-generated executive summary.
    - AI-generated business narrative.
    - Structured detailed analysis from metrics.
    - All generated charts (embedded as Base64).

    Files written to the current working directory:
    - gemini_narrative.txt (only when the Gemini call succeeds)
    - results_full.txt (pretty-printed dump of the results)
    - analysis_report.html
    - analysis_report.pdf (best effort; a failure is reported, not raised)

    ``analysis_results`` is not modified by this function.

    Args:
        analysis_results (dict): The complete results dictionary from run_behavior_analysis.
            Reads 'all_chart_infos', 'all_insights', 'detailed_metrics' and, when
            present, 'cluster_descriptions' and 'df_visitor_profiles_with_clusters_full'.
        executive_summary_text (str): The executive summary text, embedded verbatim
            (HTML-escaped) in a <pre> block.
    Raises:
        ValueError: If GOOGLE_API_KEY is not set in the environment / .env file.
    """
    # Function-scope import: only this function needs HTML escaping.
    from html import escape as html_escape

    load_dotenv()
    GOOGLE_API_KEY = os.environ.get('GOOGLE_API_KEY')
    if not GOOGLE_API_KEY:
        raise ValueError("GOOGLE_API_KEY is not set. Please set it in your .env file.")
    genai.configure(api_key=GOOGLE_API_KEY)

    pp = pprint.PrettyPrinter(indent=2, width=120, compact=False)

    def _format_metric_value(value):
        """Render one metric value as an HTML fragment for a table cell."""
        if isinstance(value, np.ndarray):
            value = value.tolist()
        if isinstance(value, (dict, list)):
            return f"<pre>{html_escape(pp.pformat(value), quote=False)}</pre>"
        if isinstance(value, float):
            return f"{value:.4f}"
        if isinstance(value, (bool, np.bool_)):
            return str(bool(value))
        # Plain scalars / strings come from the data (e.g. 'Products & Innovation',
        # 'A -> B'), so escape them before placing them in markup.
        return html_escape(str(value), quote=False)

    def _render_metrics_table(table_metrics):
        """Render a metric-name -> value dict as the 'Core Metrics' HTML table."""
        rows = "".join(
            f"<tr><td>{html_escape(metric_name.replace('_', ' ').title(), quote=False)}</td>"
            f"<td>{_format_metric_value(value)}</td></tr>"
            for metric_name, value in table_metrics.items()
        )
        return ("<h4>Core Metrics:</h4>\n<table><tr><th>Metric</th><th>Value</th></tr>"
                + rows + "</table>\n")

    # Prepare images as Base64 for embedding in HTML
    embedded_images_html = []
    for chart_info in analysis_results['all_chart_infos']:
        img_base64 = base64.b64encode(chart_info['image_buffer'].getvalue()).decode('utf-8')
        img_src = f"data:image/png;base64,{img_base64}"
        embedded_images_html.append({
            'title': chart_info['title'],
            'description': chart_info['description'],
            'src': img_src
        })

    # Generate Gemini narrative
    gemini_input_data_for_prompt = {
        "all_generated_insights": analysis_results['all_insights'],
        "detailed_metrics_summary": analysis_results['detailed_metrics'],
        "chart_descriptions": [f"Chart Title: {info['title']}. Description: {info['description']}" for info in embedded_images_html]
    }

    gemini_prompt = f"""
        You are a data analysis expert, tasked with creating a highly compelling and actionable business narrative based on provided web user behavior analytics. Your goal is to synthesize the insights, use specific data points to support your claims, and present a clear story for non-technical stakeholders, driving strategic decisions.

        **Here is a comprehensive summary of the analysis, including pre-generated insights, detailed metrics, and descriptions of accompanying charts:**

        ```json
        {pp.pformat(gemini_input_data_for_prompt)}
        ```
        
        **Instructions:**
        -   **Start with an Executive Summary (1-2 paragraphs):** Provide a high-level overview of the most critical findings and their strategic importance.
        -   **Key Strategic Areas & Insights (2-4 sections):**
            -   For each area (e.g., Investor Relations Performance, User Engagement & Retention, Conversion Funnel, Audience Insights), deep dive into the findings.
            -   **CRUCIALLY, support every key point with specific data (numbers, percentages, top items) from the provided `detailed_metrics_summary` and `all_generated_insights`.** For example: "The 'Investor Relations' section is the most viewed with X pageviews, confirming its critical importance." or "Bounce rate is particularly high in the 'Blog' section at Y%."
            -   Integrate insights from chart descriptions to explain visual trends (e.g., "As seen in the 'Visits by Hour of Day' chart, traffic peaks at X:00...").
            -   Explain the business implication ("So What?").
        -   **Actionable Recommendations (Bulleted List):** Provide concrete, prioritized, and measurable actions based directly on your analysis. Ensure each recommendation links back to a problem or opportunity identified.
        -   **Next Steps/Further Exploration (Bulleted List):** Suggest what other data, analysis, or testing could be valuable for deeper understanding or validation.
        -   **Tone:** Maintain a professional, persuasive, and clear tone, suitable for business decision-makers. Avoid jargon where possible, or explain it briefly.
        -   **DO NOT include direct Python variable names or dict/list indexing (e.g., do not write `analysis_results['detailed_metrics']['bounce_results']['overall_bounce_rate']`). Just state the value.**
        -   **DO NOT write explicit citations like. All data and insights are assumed to be synthesized from the provided analysis results.**
        """

    model = genai.GenerativeModel('gemini-1.5-flash')
    print("\n\n===== Gemini AI Narrative for Business Stakeholders =====")
    # Placeholder that ends up in the report if the API call fails.
    gemini_narrative = "Gemini narrative could not be generated due to an error."
    try:
        response = model.generate_content(gemini_prompt)
        gemini_narrative = response.text
        print(gemini_narrative)

        with open('gemini_narrative.txt', 'w', encoding='utf-8') as f:
            f.write(gemini_narrative)
        print("\nGemini narrative saved to gemini_narrative.txt")

    except Exception as e:
        # Best effort: the report is still produced, with the placeholder narrative.
        print(f"Error generating Gemini narrative: {e}")

    # Shallow copy so the DataFrame -> records conversion does not touch the caller's dict.
    results_to_save_full = dict(analysis_results)
    if isinstance(results_to_save_full.get('df_visitor_profiles_with_clusters_full'), pd.DataFrame):
        results_to_save_full['df_visitor_profiles_with_clusters_full'] = results_to_save_full['df_visitor_profiles_with_clusters_full'].to_dict('records')

    with open('results_full.txt', 'w', encoding='utf-8') as f:
        f.write(pp.pformat(results_to_save_full))
    print("Full analysis data saved to results_full.txt")


    # --- Start building structured HTML content ---
    html_content = "<html><head><title>Website User Behavior Analysis Report</title><style>"
    html_content += "body { font-family: Arial, sans-serif; margin: 20px; line-height: 1.6; color: #333; }"
    html_content += "h1, h2, h3, h4 { color: #0056b3; border-bottom: 2px solid #eee; padding-bottom: 5px; margin-top: 30px; }"
    html_content += "pre { background-color: #f4f4f4; border: 1px solid #ddd; padding: 15px; overflow-x: auto; border-radius: 5px; white-space: pre-wrap; word-wrap: break-word; }"
    html_content += ".gemini-narrative { background-color: #e8f0f8; border-left: 5px solid #0056b3; padding: 20px; margin-top: 20px; border-radius: 5px; box-shadow: 2px 2px 5px rgba(0,0,0,0.1); }"
    html_content += "img { max-width: 100%; height: auto; display: block; margin: 20px auto; border: 1px solid #ddd; border-radius: 4px; padding: 5px; box-shadow: 2px 2px 5px rgba(0,0,0,0.1); }"
    html_content += "table { width: 100%; border-collapse: collapse; margin-top: 10px; }"
    html_content += "th, td { border: 1px solid #ddd; padding: 8px; text-align: left; }"
    html_content += "th { background-color: #f2f2f2; }"
    html_content += "</style></head><body>"

    html_content += "<h1>Website User Behavior Analysis Report</h1>\n"
    html_content += "<h2>Executive Summary (Human-Generated)</h2>\n"
    html_content += f"<pre>{html_escape(executive_summary_text, quote=False)}</pre>\n"

    html_content += "<h2>Gemini AI Business Narrative</h2>\n"
    html_content += f"<div class='gemini-narrative'>{markdown.markdown(gemini_narrative)}</div>\n"

    html_content += "<h2>Detailed Analysis & Metrics Summary</h2>\n"
    html_content += "<p>This section provides a detailed breakdown of the key metrics and insights derived from the analysis. All values are direct extractions from the data processing pipeline.</p>\n"

    for section_name, metrics in analysis_results['detailed_metrics'].items():
        html_content += f"<h3>{html_escape(section_name.replace('_', ' ').title(), quote=False)}</h3>\n"

        if isinstance(metrics, dict):
            has_insight_list = isinstance(metrics.get('insights'), list)
            if has_insight_list:
                html_content += "<h4>Key Insights:</h4>\n<ul>"
                for insight_text in metrics['insights']:
                    cleaned_insight = insight_text.replace('**Insight:** ', '').strip()
                    html_content += f"<li>{html_escape(cleaned_insight, quote=False)}</li>\n"
                html_content += "</ul>\n"
            # Always work on a copy so nothing below can alter analysis_results.
            metrics_for_table = {
                k: v for k, v in metrics.items()
                if not (has_insight_list and k == 'insights')
            }
        else:
            metrics_for_table = metrics

        # Special handling for 'Clustering Results' to display descriptions instead of raw dict
        if section_name == 'clustering_results':
            if analysis_results.get('cluster_descriptions'):  # Check top-level for cluster_descriptions
                html_content += "<h4>User Cluster Profiles:</h4>\n<ul>"
                for cluster_name, desc in analysis_results['cluster_descriptions'].items():
                    # NOTE(review): left unescaped because it is unclear from here whether
                    # the descriptions carry intentional markup — confirm and escape if not.
                    html_content += f"<li><b>{cluster_name}:</b> {desc}</li>\n"
                html_content += "</ul>\n"
            # Exclude the large raw cluster summary from the general metrics table
            # (filtered on the copy, so the caller's metrics keep 'cluster_summary').
            if isinstance(metrics_for_table, dict):
                metrics_for_table.pop('cluster_summary', None)

        if isinstance(metrics_for_table, dict):
            if metrics_for_table:
                html_content += _render_metrics_table(metrics_for_table)
        elif metrics_for_table:
            html_content += f"<p>Additional data for {html_escape(str(section_name), quote=False)}: <pre>{html_escape(pp.pformat(metrics_for_table), quote=False)}</pre></p>"

    html_content += "<h2>Supporting Charts</h2>\n"
    if not embedded_images_html:
        html_content += "<p>No charts were generated during this analysis.</p>\n"
    for chart_info in embedded_images_html:
        # NOTE(review): title/description are inserted as-is in the body text (they may
        # contain intentional markup — confirm); the alt attribute is always escaped so a
        # quote character in a title cannot break the <img> tag.
        html_content += f"<h3 style='margin-top:30px;'>{chart_info['title']}</h3>\n"
        html_content += f"<p>{chart_info['description']}</p>\n"
        html_content += f'<img src="{chart_info["src"]}" alt="{html_escape(str(chart_info["title"]), quote=True)}">\n'

    html_content += """
    <h2>Glossary of Key Terms</h2>
    <ul>
        <li><b>Bounce Rate:</b> The percentage of visits in which a user leaves the site after viewing only one page. High bounce rate may indicate low engagement or poor landing page relevance.</li>
        <li><b>Session Depth:</b> The average number of pages viewed per visit. Higher session depth indicates deeper engagement.</li>
        <li><b>Download Count:</b> The total number of times users downloaded files from the site.</li>
        <li><b>Investor Interest Score:</b> A metric representing the number of pageviews in Investor Relations sections, indicating interest in company performance.</li>
        <li><b>Content Breadth:</b> The number of unique sections a user visited, reflecting how widely they explored the site.</li>
        <li><b>Repeat Visitor:</b> A user who has visited the site more than once.</li>
        <li><b>High Intent:</b> A user who both downloads files and shows interest in Investor Relations content.</li>
        <li><b>Frequent Downloader:</b> A user who downloaded more than 3 files.</li>
        <li><b>Deep Path Visitor:</b> A user whose average session depth is greater than 5 pages.</li>
        <li><b>IR-Only Visitor:</b> A user who only browses Investor Relations content.</li>
        <li><b>ESG-Focused:</b> A user who visited ESG-related content.</li>
    </ul>
    """
    html_content += "</body></html>"

    with open('analysis_report.html', 'w', encoding='utf-8') as f:
        f.write(html_content)
    print("\nSaved analysis_report.html with embedded figures and structured metrics.")

    try:
        # NOTE(review): the previous `timeout=60` keyword was dropped; it does not appear
        # to be a documented write_pdf option (the HTTP timeout belongs to the URL
        # fetcher) — confirm against the installed WeasyPrint version.
        HTML(string=html_content).write_pdf('analysis_report.pdf')
        print("\nSuccessfully generated analysis_report.pdf with embedded figures and structured metrics!")
    except Exception as e:
        # PDF output is optional: the HTML report above has already been saved.
        print(f"\nError generating PDF report using WeasyPrint: {e}")
        print("Please ensure WeasyPrint and its system dependencies (like GTK+) are correctly installed.")
        print("You can try opening analysis_report.html in a browser to view the report.")


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [20]:
# Make sure you have python-dotenv installed for loading API key
# %pip install python-dotenv

# Path to the merged raw traffic CSV that is handed to run_behavior_analysis.
file_path = 'all_traffic_data_merged.csv'

# Step 1: run the full pipeline (load + preprocess, then every analysis step).
# Prints the per-step output and returns the collected insights, metrics and chart info.
results = run_behavior_analysis(file_path)

# Step 2: build the hand-written executive summary from the collected metrics
# (the summary is printed as well as returned).
executive_summary_text = generate_executive_summary(results)

# Step 3: request the Gemini narrative and write the report files
# (gemini_narrative.txt, results_full.txt, analysis_report.html, analysis_report.pdf).
# Raises ValueError if GOOGLE_API_KEY is not set in the environment / .env file.
generate_analysis_report(analysis_results=results, executive_summary_text=executive_summary_text)

# You can still generate individual user stories if needed
# df_for_profiling = results['df'] # Get the processed dataframe from results
# analyzer_for_profiling = BehaviorAnalyzer(df_for_profiling) # Re-instantiate to access methods like generate_user_story
# # If you want to use the already computed visitor_profiles_df:
# # visitor_profiles_df_res = results['visitor_profiles_data']['visitor_profiles_df']
# example_visitor_id = 52.0 # Use an actual visitor_id from your data, ensure it's float if that's its type in the dataframe
# if example_visitor_id in analyzer_for_profiling.df['visitor_id'].unique():
#      analyzer_for_profiling.generate_user_story(example_visitor_id)
# else:
#      print(f"\nVisitor ID {example_visitor_id} not found for story generation.")

--- Starting Data Loading and Preprocessing ---


  df_all['is_new_visitor_session'] = df_all['id_visit'].map(is_first_session_for_visitor).fillna(False)



Added 'company_section' and 'company_subsection'. First 5 rows:
             timestamp  id_site  id_visit  time_spent                                                                            url search_keyword location_country             referrer_url                                  user_agent  time_since_last_page  total_hits next_session_start_timestamp session_source referred_by bot_flag  view_count  session_event_time_minute  download_flag download_filename  match_company_id     match_type  is_home_only_session  visitor_id ultimate_parent_name             section              sub-section          company  is_new_visitor_session company_section                 company_subsection
0 2025-03-05 18:36:58        5   1055801           0                 harbourenergy.com/investors/results-reports-and-presentations/            NaN               us                      NaN   Chrome 89.0 (GNU/Linux ; generic desktop)                1772.0           0          2025-03-05 18:39:53     Bookm


Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=pageviews_per_category.index, y=pageviews_per_category.values, palette='viridis')

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=avg_duration_per_category.index, y=avg_duration_per_category.values, palette='plasma')



Average Duration per Category (non-terminal pages):
 section
Products & Innovation    169.411765
Blog                     142.612903
Sustainability           123.778877
Our Business             120.078431
Legal & Compliance       109.043250
News & Press              99.791159
About Us                  99.036615
Investor Relations        91.576573
Support                   91.215784
Resources                 85.189873

End Rate per Category:
 section
Investor Relations       0.502558
About Us                 0.145582
News & Press             0.118249
Our Business             0.028786
Sustainability           0.022455
Support                  0.020305
Legal & Compliance       0.003563
Resources                0.000776
Products & Innovation    0.000478
Blog                     0.000239

Real Interest Score:
 section
Products & Innovation    169.459542
Blog                     142.636792
Investor Relations       141.832380
Sustainability           126.024413
Our Business             122.9


Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=bounce_df['section'], y=bounce_df['bounce_rate'], palette='Reds_d')

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=download_section_counts.head(5).index, y=download_section_counts.head(5).values, palette='Blues_d')



Download counts by sub-section:
 sub-section
Reports & Presentations            8426
Financial Information              1235
ESG                                 569
Corporate Governance                242
Press Releases                      126
Social                               77
Regulatory Compliance                72
News & Alerts                        63
Sustainability & Responsibility      59
Environment                          46
Projects                             45
Events & Calendar                    27
Diversity & Inclusion                24
Shareholder Services                 23
Patient Resources                    23
Educational Materials                22
Stock & Dividend Information         13
Suppliers                            12
Working with integrity               11
Markets & Operations                 10
Terms of use                          8
News & Articles                       7
Business Model                        7
Our Culture                       


Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=weekday_counts.index, y=weekday_counts.values, palette='viridis')


Short visits (1 page, filtered): 31635
Deep visits (>= 5 pages, filtered): 3253

Normal Repeat Visitors (visits per visitor):
 count    1675.000000
mean        7.595821
std        24.838635
min         1.000000
25%         1.000000
50%         2.000000
75%         4.000000
max       391.000000
Number of repeat visitors (filtered): 845

New visits: 1698
Returning visits: 48535
Average depth (new): 2.38
Average depth (returning): 2.21

--- Comparison: New vs. Returning Visitors ---



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=['New Visitors', 'Returning Visitors'], y=[new_depth, returning_depth], palette='coolwarm')


Average Session Depth - New: 2.38, Returning: 2.21

--- New Visitors Bounce Rate ---

--- Bounce Rate Analysis ---
Overall Bounce Rate: 51.00%

Bounce Rate per Section (based on visits starting in section):
                  section  bounce_rate
3           News & Press     0.726744
4                Support     0.725806
2           Our Business     0.526316
1               About Us     0.501689
5         Sustainability     0.473684
7     Legal & Compliance     0.333333
0     Investor Relations     0.316733
6              Resources     0.000000
8  Products & Innovation     0.000000
9                   Blog     0.000000



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=bounce_df['section'], y=bounce_df['bounce_rate'], palette='Reds_d')



--- Returning Visitors Bounce Rate ---

--- Bounce Rate Analysis ---
Overall Bounce Rate: 63.40%

Bounce Rate per Section (based on visits starting in section):
                  section  bounce_rate
6     Legal & Compliance     0.851190
8  Products & Innovation     0.818182
3                Support     0.727185
2           News & Press     0.709100
4         Sustainability     0.705448
9                   Blog     0.666667
0     Investor Relations     0.631377
5           Our Business     0.622665
1               About Us     0.576904
7              Resources     0.571429
Overall Bounce Rate - New: 51.00%, Returning: 63.40%
The following section contains the top 3 visited sections for New and Returning Visitors.
This data is useful for understanding initial content interests and how they evolve.

Top 3 Visited Sections (Proportion) - New Visitors:
 section
Investor Relations    0.429138
About Us              0.266353
News & Press          0.137760

Top 3 Visited Sections (Proportion)


Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=bounce_df['section'], y=bounce_df['bounce_rate'], palette='Reds_d')



Top 5 Countries by Pageviews: ['us', 'gb', 'ie', 'de', 'no']

--- Bounce Rate Analysis ---
Overall Bounce Rate: 73.12%

Bounce Rate per Section (based on visits starting in section):
                  section  bounce_rate
8  Products & Innovation     1.000000
5     Legal & Compliance     0.901786
3         Sustainability     0.868976
7              Resources     0.833333
4           Our Business     0.830409
6                Support     0.786280
2           News & Press     0.764828
0     Investor Relations     0.722024
1               About Us     0.707020
9                   Blog     0.000000



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=bounce_df['section'], y=bounce_df['bounce_rate'], palette='Reds_d')



--- Bounce Rate Analysis ---
Overall Bounce Rate: 50.37%

Bounce Rate per Section (based on visits starting in section):
                  section  bounce_rate
9                   Blog     1.000000
4                Support     0.680357
3           News & Press     0.660074
2           Our Business     0.587074
8     Legal & Compliance     0.526316
7  Products & Innovation     0.500000
6              Resources     0.500000
1               About Us     0.465728
0     Investor Relations     0.464221
5         Sustainability     0.392749



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=bounce_df['section'], y=bounce_df['bounce_rate'], palette='Reds_d')

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=bounce_df['section'], y=bounce_df['bounce_rate'], palette='Reds_d')



--- Bounce Rate Analysis ---
Overall Bounce Rate: 73.22%

Bounce Rate per Section (based on visits starting in section):
                  section  bounce_rate
6     Legal & Compliance     1.000000
4                Support     0.888889
0     Investor Relations     0.754731
3         Sustainability     0.666667
5           News & Press     0.630137
1               About Us     0.563025
2           Our Business     0.458333
7              Resources     0.000000
8  Products & Innovation     0.000000
9                   Blog     0.000000

--- Bounce Rate Analysis ---
Overall Bounce Rate: 49.39%

Bounce Rate per Section (based on visits starting in section):
                  section  bounce_rate
6         Sustainability     0.875000
1           News & Press     0.699029
5                Support     0.538462
7     Legal & Compliance     0.500000
4              Resources     0.500000
2           Our Business     0.488479
0               About Us     0.475336
3     Investor Relations     0.4


Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=bounce_df['section'], y=bounce_df['bounce_rate'], palette='Reds_d')

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=bounce_df['section'], y=bounce_df['bounce_rate'], palette='Reds_d')



Bounce Rate per Section (based on visits starting in section):
               section  bounce_rate
7      Sustainability     0.812500
4             Support     0.800000
2        News & Press     0.606557
1        Our Business     0.513410
3  Investor Relations     0.478261
0            About Us     0.395735
5           Resources     0.000000
6  Legal & Compliance     0.000000

Bounce Rate by Country:
   Country  Bounce_Rate
2      ie     0.732176
0      us     0.731200
1      gb     0.503721
3      de     0.493933
4      no     0.466112

Average Session Depth by Country:
   Country  Avg_Depth
2      ie   5.319281
4      no   2.694411
1      gb   2.519394
3      de   2.439686
0      us   1.781947

Top Visited Section by Country:
     Investor Relations  About Us
us            0.626100       NaN
gb            0.570484       NaN
ie            0.468765       NaN
de            0.464307       NaN
no                 NaN   0.38526



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x='Country', y='Bounce_Rate', data=bounce_df, palette='viridis', ax=axes[0])
  axes[0].set_xticklabels(axes[0].get_xticklabels(), rotation=45, ha='right') # Rotate x-axis labels for readability.

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x='Country', y='Avg_Depth', data=depth_df, palette='plasma', ax=axes[1])
  axes[1].set_xticklabels(axes[1].get_xticklabels(), rotation=45, ha='right')



--- Most Common Session Paths ---
Top 10 Most Common Session Paths:
 path
Investor Relations                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           