In [20]:
import pandas as pd
import numpy as np
import os
import glob
from datetime import datetime, timedelta
from IPython.display import display, HTML, clear_output
import ipywidgets as widgets
from ipywidgets import interact, interactive, fixed, interact_manual
import warnings
warnings.filterwarnings('ignore')

# Configuration
class Config:
    """Display and data-location settings shared by the explorer cells."""

    def __init__(self):
        # Folder that holds the session detection result workbooks.
        self.data_path = "../../data/"

        # Context window around a session, counted in messages.
        # `buffer_size` is the legacy single value, kept as a fallback.
        self.buffer_size = 4
        self.buffer_before, self.buffer_after = 4, 1

        # Rendering options.
        self.max_message_length = 500  # characters shown per message
        self.show_metadata = True      # timestamps, IDs, etc.
        
config = Config()

# Start-up banner: one positional argument per output line.
print(
    "🔍 Session Explorer loaded!",
    f"📁 Data path: {config.data_path}",
    f"⬅️  Buffer before: {config.buffer_before} messages",
    f"➡️  Buffer after: {config.buffer_after} messages",
    f"📏 Max message length: {config.max_message_length} characters",
    sep="\n",
)


🔍 Session Explorer loaded!
📁 Data path: ../../data/
⬅️  Buffer before: 4 messages
➡️  Buffer after: 1 messages
📏 Max message length: 500 characters


In [21]:
def get_available_files(data_path=None):
    """Get list of available session detection results files.

    Args:
        data_path: Directory to search. When omitted, ``config.data_path`` is
            used, which matches the original zero-argument behaviour.

    Returns:
        ``(files, file_names)``: the matching paths sorted by modification
        time (newest first) and their base names in the same order. Both
        lists are empty when nothing matches.
    """
    if data_path is None:
        data_path = config.data_path

    pattern = os.path.join(data_path, "session_detection_results_*.xlsx")

    # Newest first, so index 0 is always the most recent export.
    files = sorted(glob.glob(pattern), key=os.path.getmtime, reverse=True)

    # Base names only, for display.
    file_names = [os.path.basename(path) for path in files]

    return files, file_names

files, file_names = get_available_files()

if not files:
    print("❌ No session detection files found!")
    default_file = None
else:
    print(f"📁 Found {len(files)} session detection files:")
    for position, file_name in enumerate(file_names, start=1):
        print(f"   {position}. {file_name}")

    # The list is sorted newest-first, so its head is the latest export.
    default_file = files[0]
    print(f"\n✅ Defaulting to most recent: {file_names[0]}")


📁 Found 29 session detection files:
   1. session_detection_results_20250707_190642.xlsx
   2. session_detection_results_20250707_161113.xlsx
   3. session_detection_results_20250707_153015.xlsx
   4. session_detection_results_20250707_151325.xlsx
   5. session_detection_results_20250707_132808.xlsx
   6. session_detection_results_20250707_130407.xlsx
   7. session_detection_results_20250707_123443.xlsx
   8. session_detection_results_20250707_115421.xlsx
   9. session_detection_results_20250707_115241.xlsx
   10. session_detection_results_20250707_114449.xlsx
   11. session_detection_results_20250707_114429.xlsx
   12. session_detection_results_20250707_113838.xlsx
   13. session_detection_results_20250707_113456.xlsx
   14. session_detection_results_20250707_112800.xlsx
   15. session_detection_results_20250707_110147.xlsx
   16. session_detection_results_20250707_105149.xlsx
   17. session_detection_results_20250703_185228.xlsx
   18. session_detection_results_20250703_175741.xlsx
 

In [22]:
def load_session_data(file_path):
    """Read one session-detection workbook and print a short summary.

    Returns ``(df, sessions, has_ground_truth)``: the messages ordered by
    channel and then timestamp, the distinct ``session_id`` values, and
    whether an ``is_session_start`` column is present. Any exception raised
    while reading or preparing the data is printed and turned into
    ``(None, None, False)`` instead of propagating.
    """
    print(f"📂 Loading data from: {os.path.basename(file_path)}")

    try:
        frame = pd.read_excel(file_path)

        # Timestamps must be real datetimes before they are used as a sort key.
        frame['created_at'] = pd.to_datetime(frame['created_at'])
        frame = frame.sort_values(['gpt_channel_id', 'created_at'])

        session_ids = frame['session_id'].unique()

        print(f"📊 Loaded {len(frame)} messages")
        print(f"🎯 Found {len(session_ids)} unique sessions")
        print(f"📞 Channels: {frame['gpt_channel_id'].nunique()}")

        # Ground truth is optional; its presence is detected by column name.
        ground_truth_present = 'is_session_start' in frame.columns
        label_state = 'Yes' if ground_truth_present else 'No'
        print(f"✅ Ground truth labels: {label_state}")

        return frame, session_ids, ground_truth_present

    except Exception as e:
        print(f"❌ Error loading file: {e}")
        return None, None, False

# Load the newest file when one was found; otherwise start with empty state
df, sessions, has_ground_truth = (
    load_session_data(default_file) if default_file else (None, None, False)
)
current_session_idx = 0


📂 Loading data from: session_detection_results_20250707_190642.xlsx
📊 Loaded 2917 messages
🎯 Found 657 unique sessions
📞 Channels: 50
✅ Ground truth labels: Yes


In [23]:
# Interactive configuration with separate before/after buffers

# Back-fill the newer buffer attributes on a legacy config object BEFORE the
# widgets read them. These guards used to sit after the sliders were built,
# where they could no longer prevent the AttributeError they exist for.
if not hasattr(config, 'buffer_before'):
    config.buffer_before = config.buffer_size
if not hasattr(config, 'buffer_after'):
    config.buffer_after = 1


def _config_slider(description, value, minimum, maximum, step):
    """Build one of the integer sliders used by the configuration panel.

    All three sliders share the same presentation options; only the label,
    initial value and range differ.
    """
    return widgets.IntSlider(
        value=value,
        min=minimum,
        max=maximum,
        step=step,
        description=description,
        disabled=False,
        continuous_update=False,
        orientation='horizontal',
        readout=True,
        readout_format='d'
    )


buffer_before_widget = _config_slider('Buffer before:', config.buffer_before, 0, 10, 1)
buffer_after_widget = _config_slider('Buffer after:', config.buffer_after, 0, 10, 1)
max_length_widget = _config_slider('Max msg length:', config.max_message_length, 100, 1000, 50)

show_metadata_widget = widgets.Checkbox(
    value=config.show_metadata,
    description='Show metadata',
    disabled=False
)

def update_config(buffer_before, buffer_after, max_length, show_metadata):
    """Copy the widget values onto the shared ``config`` object and echo them.

    Args:
        buffer_before: Number of context messages to show before a session.
        buffer_after: Number of context messages to show after a session.
        max_length: Maximum characters shown per message.
        show_metadata: Whether the per-message metadata line is printed.
    """
    config.buffer_before = buffer_before
    config.buffer_after = buffer_after
    config.max_message_length = max_length
    config.show_metadata = show_metadata
    print(f"Updated config: before={buffer_before}, after={buffer_after}, max_length={max_length}, metadata={show_metadata}")

# Each keyword binds one widget to the update_config parameter of that name.
interactive_config = interactive(update_config, 
                                buffer_before=buffer_before_widget,
                                buffer_after=buffer_after_widget,
                                max_length=max_length_widget,
                                show_metadata=show_metadata_widget)

display(interactive_config)


interactive(children=(IntSlider(value=4, continuous_update=False, description='Buffer before:', max=10), IntSl…

In [24]:
def format_time_gap(current_time, prev_time):
    """Format time gap between messages.

    Args:
        current_time: Timestamp of the message being rendered.
        prev_time: Timestamp of the preceding message.

    Returns:
        ``"[START]"`` when either timestamp is missing; otherwise a tag such
        as ``"[+30s]"``, ``"[+5m]"``, ``"[+2h]"`` or ``"[+3d]"`` using the
        largest unit that fits, rounded down. A gap that runs backwards is
        shown with a minus sign and the same unit rules (``"[-2h]"``) rather
        than always in seconds with a ``+-`` prefix.
    """
    if pd.isna(prev_time) or pd.isna(current_time):
        return "[START]"

    total_seconds = (current_time - prev_time).total_seconds()

    # Sub-second negative gaps still display as "+0s", as they always have.
    sign = "-" if total_seconds <= -1 else "+"
    magnitude = abs(total_seconds)

    for unit_seconds, suffix in ((86400, "d"), (3600, "h"), (60, "m")):
        if magnitude >= unit_seconds:
            return f"[{sign}{int(magnitude // unit_seconds)}{suffix}]"
    return f"[{sign}{int(magnitude)}s]"

def truncate_message(message, max_length):
    """Return the message as text, clipped to ``max_length`` characters.

    Missing values become ``"[No message]"``. Text longer than the limit is
    cut at the limit and suffixed with ``"..."``.
    """
    if pd.isna(message):
        return "[No message]"

    text = str(message)
    return text if len(text) <= max_length else f"{text[:max_length]}..."

def get_content_field(df):
    """Pick the message-text column by trying common names in priority order.

    Returns the first candidate present in ``df.columns``, or ``None`` when
    none of them is.
    """
    candidates = ('content', 'message', 'text', 'body', 'message_text', 'msg')
    return next((name for name in candidates if name in df.columns), None)

def get_role_field(df):
    """Pick the sender/role column by trying common names in priority order.

    Returns the first candidate present in ``df.columns``, or ``None`` when
    none of them is.
    """
    candidates = ('role', 'sender', 'author', 'user_type', 'message_type')
    return next((name for name in candidates if name in df.columns), None)

def get_session_messages(session_id, buffer_before=3, buffer_after=3):
    """Get messages for a session with separate before/after buffers.

    Reads the module-level ``df``. The session's channel is taken from its
    first message, and the window is cut from that channel's messages in
    their existing row order.

    Args:
        session_id: Value to match against the ``session_id`` column.
        buffer_before: Channel messages to include before the session's first row.
        buffer_after: Channel messages to include after the session's last row.

    Returns:
        ``(messages, content_field, role_field)``. ``messages`` is the
        windowed frame with three added columns:

        * ``in_current_session`` - True for rows belonging to ``session_id``.
        * ``is_current_session_start`` - True only on the session's first row.
        * ``prev_created_at`` - ``created_at`` of the preceding message in
          the channel (NaT for the channel's first message).

        An empty frame is returned when no data is loaded or the session has
        no rows; the field names are ``None`` when no data is loaded.
    """
    if df is None:
        return pd.DataFrame(), None, None

    # Auto-detect content and role fields
    content_field = get_content_field(df)
    role_field = get_role_field(df)

    session_messages = df[df['session_id'] == session_id].copy()
    if session_messages.empty:
        return pd.DataFrame(), content_field, role_field

    channel = session_messages['gpt_channel_id'].iloc[0]
    channel_messages = df[df['gpt_channel_id'] == channel].copy()

    # Take the previous timestamp over the WHOLE channel, before windowing.
    # Shifting after the slice made the first displayed row read "[START]"
    # even when earlier channel messages existed, which hid the gap before
    # the session start whenever buffer_before was 0.
    prev_created_at = channel_messages['created_at'].shift(1)

    # Positions (not labels) of the session's rows within the channel.
    session_positions = np.where(channel_messages.index.isin(session_messages.index))[0]
    session_start_idx = session_positions[0]
    session_end_idx = session_positions[-1]

    # Clamp the window to the channel bounds; the end is exclusive, hence +1.
    buffer_start = max(0, session_start_idx - buffer_before)
    buffer_end = min(len(channel_messages), session_end_idx + buffer_after + 1)

    buffered_messages = channel_messages.iloc[buffer_start:buffer_end].copy()

    # Mark which messages are in the current session
    buffered_messages['in_current_session'] = buffered_messages.index.isin(session_messages.index)

    # Mark the actual session start (first message in current session)
    buffered_messages['is_current_session_start'] = False
    first_session_msg_idx = session_messages.index[0]
    if first_session_msg_idx in buffered_messages.index:
        buffered_messages.loc[first_session_msg_idx, 'is_current_session_start'] = True

    # Same positional slice as the window, so the rows line up one-to-one.
    buffered_messages['prev_created_at'] = prev_created_at.iloc[buffer_start:buffer_end]

    return buffered_messages, content_field, role_field


In [25]:
def _is_ground_truth_start(value):
    """Return True when a ground-truth cell marks a session start (1, 1.0 or '[START]')."""
    return bool(pd.notna(value) and value in [1, 1.0, '[START]'])


def _role_emoji(role):
    """Map a role string to the icon shown in the message header."""
    if role == "patient":
        return "👤"
    if role == "joy":
        return "🤖"
    return "👨‍💼"


def _print_wrapped(text, width=70):
    """Print text inside the chat frame, wrapping on spaces at ``width`` characters."""
    for line in text.split('\n'):
        if len(line) <= width:
            print(f"│  {line}")
            continue

        current_line = ""
        for word in line.split(' '):
            # Flush only a non-empty line: a leading word longer than the
            # width used to emit a blank framed row before itself.
            if current_line and len(current_line + word) > width:
                print(f"│  {current_line}")
                current_line = word + " "
            else:
                current_line += word + " "
        if current_line.strip():
            print(f"│  {current_line}")


def display_session(session_id, buffer_before=None, buffer_after=None):
    """Display a session with context in modern chat format.

    Prints a header (channel, message count, timespan, prediction and
    ground-truth summary), then every message in the buffered window.
    Session messages are shown in full and word-wrapped; buffer messages are
    shown as a single line clipped to 100 characters. When
    ``config.show_metadata`` is set, a prediction / ground-truth line follows
    each message.

    Args:
        session_id: Session to render.
        buffer_before: Context messages before the session; falls back to
            ``config.buffer_before`` when None.
        buffer_after: Context messages after the session; falls back to
            ``config.buffer_after`` when None.

    Returns:
        None. All output goes to stdout.
    """
    if df is None:
        print("❌ No data loaded!")
        return

    if buffer_before is None:
        buffer_before = getattr(config, 'buffer_before', config.buffer_size)
    if buffer_after is None:
        buffer_after = getattr(config, 'buffer_after', config.buffer_size)

    # Get messages with buffer and field info
    messages, content_field, role_field = get_session_messages(session_id, buffer_before, buffer_after)

    if messages.empty:
        print(f"❌ No messages found for session {session_id}")
        return

    # Session info
    session_messages = messages[messages['in_current_session']]
    channel = session_messages['gpt_channel_id'].iloc[0]
    has_predictions = 'is_session_start_pred' in session_messages.columns
    show_ground_truth = has_ground_truth and 'is_session_start' in session_messages.columns

    print("\n" + "█" * 90)
    print(f"🎯 SESSION: {session_id}")
    print(f"📞 Channel: {channel}")
    print(f"📊 Messages in session: {len(session_messages)}")
    print(f"⏰ Timespan: {session_messages['created_at'].min().strftime('%Y-%m-%d %H:%M')} → {session_messages['created_at'].max().strftime('%Y-%m-%d %H:%M')}")

    # Show predictions summary if available
    if has_predictions:
        pred_starts = session_messages['is_session_start_pred'].sum()
        print(f"🤖 Predicted session starts: {pred_starts}")

    # Show ground truth comparison if available
    if show_ground_truth:
        gt_numeric = session_messages['is_session_start'].apply(
            lambda x: 1 if _is_ground_truth_start(x) else 0
        )
        print(f"✅ Ground truth starts: {gt_numeric.sum()}")

        # Accuracy needs predictions too; a labelled file without a
        # prediction column used to raise KeyError here.
        if has_predictions:
            pred_numeric = session_messages['is_session_start_pred'].fillna(0).astype(int)
            matches = (gt_numeric == pred_numeric).sum()
            accuracy = matches / len(session_messages) if len(session_messages) > 0 else 0
            print(f"🎯 Accuracy: {accuracy:.1%} ({matches}/{len(session_messages)} correct)")

    print("█" * 90)

    # Display messages in chat format
    session_message_count = 0

    for _, row in messages.iterrows():
        time_gap = format_time_gap(row['created_at'], row['prev_created_at'])

        # Determine role using detected field, else fall back to 'unknown'
        if role_field and role_field in row and pd.notna(row[role_field]):
            role = str(row[role_field]).strip()
        else:
            role = 'unknown'

        # Get message content using detected field
        if content_field and content_field in row:
            content = row[content_field]
        else:
            content = row.get('content', '[No content found]')

        message = truncate_message(content, config.max_message_length)

        in_session = row['in_current_session']
        role_emoji = _role_emoji(role)

        if in_session:
            session_message_count += 1

            if row.get('is_current_session_start', False):
                print(f"\n{'🔥' * 20} SESSION START {'🔥' * 20}")
                print(f"{'─' * 60}")

            # Current session message - highlighted
            print(f"\n┌─ {role_emoji} {role.upper()} • {time_gap} • Message #{session_message_count}")
            print("│")
            if message.strip():
                _print_wrapped(message)
            else:
                print("│  [Empty message]")
            print(f"└{'─' * 50}")
        else:
            # Buffer message - more subdued, clipped harder than session messages
            print(f"\n╭─ {role_emoji} {role} • {time_gap} (buffer)")
            print("│")
            if message.strip():
                truncated = message[:100] + "..." if len(message) > 100 else message
                print(f"│  {truncated}")
            else:
                print("│  [Empty message]")
            print(f"╰{'─' * 30}")

        # Show metadata if enabled
        if config.show_metadata:
            metadata = []
            if 'is_session_start_pred' in row:
                pred_val = row['is_session_start_pred']
                pred_icon = "🟢" if pred_val == 1 else "🔴"
                metadata.append(f"Pred:{pred_icon}{pred_val}")
            if has_ground_truth and 'is_session_start' in row:
                gt_display = "1" if _is_ground_truth_start(row['is_session_start']) else "0"
                gt_icon = "🟢" if gt_display == "1" else "🔴"
                metadata.append(f"GT:{gt_icon}{gt_display}")

            if metadata:
                print(f"     📋 {' | '.join(metadata)}")

    print(f"\n{'█' * 90}")
    print(f"📊 Displayed: {len(session_messages)} session messages + {len(messages) - len(session_messages)} buffer messages")
    print(f"{'█' * 90}\n")

# Smoke-test the renderer on the first session when data is available
if df is None or len(sessions) == 0:
    print("⚠️  No data loaded yet!")
else:
    print("🔍 Sample session display:")
    display_session(sessions[0])


🔍 Sample session display:

██████████████████████████████████████████████████████████████████████████████████████████
🎯 SESSION: session_1
📞 Channel: 658035
📊 Messages in session: 1
⏰ Timespan: 2025-06-15 05:54 → 2025-06-15 05:54
🤖 Predicted session starts: 1
✅ Ground truth starts: 1
🎯 Accuracy: 100.0% (1/1 correct)
██████████████████████████████████████████████████████████████████████████████████████████

🔥🔥🔥🔥🔥🔥🔥🔥🔥🔥🔥🔥🔥🔥🔥🔥🔥🔥🔥🔥 SESSION START 🔥🔥🔥🔥🔥🔥🔥🔥🔥🔥🔥🔥🔥🔥🔥🔥🔥🔥🔥🔥
────────────────────────────────────────────────────────────

┌─ 👨‍💼 UNKNOWN • [START] • Message #1
│
│  Hi Damian, I'm **Coach Joy**, your AI-powered coach—**here 24/7** to 
│  make your weight loss journey **simpler, smoother, and fully 
│  supported**. 
└──────────────────────────────────────────────────
     📋 Pred:🟢1 | GT:🟢1

╭─ 👨‍💼 unknown • [+0s] (buffer)
│
│  **Designed by experts** in behaviour change, nutrition, and lifestyle science, I bring **on-demand s...
╰──────────────────────────────
     📋 Pred:🟢1 | GT:🟢1

████

In [26]:
import random

class SessionNavigator:
    """Cursor over a sequence of session ids with wrap-around navigation.

    Attributes:
        sessions: Indexable sequence of session ids (may be None or empty).
        df: The message frame the sessions came from; stored but not read here.
        current_idx: Position of the current session in ``sessions``.
        history: Positions visited before each move, oldest first.
    """

    def __init__(self, sessions, df):
        self.sessions = sessions
        self.df = df
        self.current_idx = 0
        self.history = []

    def _has_sessions(self):
        """Return True when there is at least one session to navigate."""
        # len() rather than truthiness: `sessions` may be a NumPy array.
        return self.sessions is not None and len(self.sessions) > 0

    def _move_to(self, new_idx):
        """Record the current position in history, jump, and return the new session."""
        self.history.append(self.current_idx)
        self.current_idx = new_idx
        return self.get_current_session()

    def get_current_session(self):
        """Return the session id under the cursor, or None without sessions."""
        if not self._has_sessions():
            return None
        return self.sessions[self.current_idx]

    def next_session(self):
        """Advance one session, wrapping from the last back to the first."""
        if not self._has_sessions():
            return None
        return self._move_to((self.current_idx + 1) % len(self.sessions))

    def prev_session(self):
        """Step back one session, wrapping from the first to the last."""
        if not self._has_sessions():
            return None
        return self._move_to((self.current_idx - 1) % len(self.sessions))

    def random_session(self):
        """Jump to a random session.

        When more than one session exists the pick always differs from the
        current one, so the action never looks like a no-op.
        """
        if not self._has_sessions():
            return None

        total = len(self.sessions)
        if total == 1:
            return self._move_to(self.current_idx)

        # A non-zero offset modulo the length can never land on the current index.
        offset = random.randint(1, total - 1)
        return self._move_to((self.current_idx + offset) % total)

    def goto_session(self, session_idx):
        """Jump to ``session_idx``; return None and stay put if it is out of range."""
        if not self._has_sessions():
            return None

        if 0 <= session_idx < len(self.sessions):
            return self._move_to(session_idx)
        return None

    def get_session_info(self):
        """Return a one-line, 1-based description of the current position."""
        if not self._has_sessions():
            return "❌ No sessions available"

        current_session = self.get_current_session()
        return f"🎯 Session {self.current_idx + 1} of {len(self.sessions)}: {current_session}"

# Build the navigator only when a dataset was loaded successfully
navigator = SessionNavigator(sessions, df) if (df is not None and sessions is not None) else None

if navigator is None:
    print("⚠️  No data loaded - navigator not available")
else:
    print(f"🚀 Navigator initialised with {len(sessions)} sessions")


🚀 Navigator initialised with 657 sessions


In [27]:
# Create interactive navigation controls
def create_navigation_controls():
    """Build the session browser widget.

    The widget consists of a "Jump to" dropdown, Previous / Next / Random /
    Refresh buttons, and an output pane that shows the current session via
    ``display_session``. Handlers read the module-level ``navigator`` and
    ``config`` each time they run.

    Returns:
        The ``VBox`` holding the controls, or None (after printing a notice)
        when no navigator is available.
    """
    if navigator is None:
        print("⚠️  No navigator available - please load data first")
        return

    # Create buttons
    prev_btn = widgets.Button(description="← Previous", button_style='info')
    next_btn = widgets.Button(description="Next →", button_style='info')
    random_btn = widgets.Button(description="🎲 Random", button_style='warning')
    refresh_btn = widgets.Button(description="🔄 Refresh", button_style='success')

    # Session selector
    session_selector = widgets.Dropdown(
        options=[(f"Session {i+1}: {sess}", i) for i, sess in enumerate(navigator.sessions)],
        value=navigator.current_idx,
        description='Jump to:',
        style={'description_width': 'initial'}
    )

    # Output area
    output = widgets.Output()

    # True only while a button handler is writing to the dropdown.
    syncing_selector = False

    def update_display():
        with output:
            clear_output(wait=True)
            print(navigator.get_session_info())
            print()
            current_session = navigator.get_current_session()
            # `is not None` rather than truthiness, so a falsy id (0, '') still renders.
            if current_session is not None:
                buffer_before = getattr(config, 'buffer_before', config.buffer_size)
                buffer_after = getattr(config, 'buffer_after', config.buffer_size)
                display_session(current_session, buffer_before, buffer_after)

    def sync_selector_and_render():
        """Mirror the navigator position into the dropdown, then render once."""
        nonlocal syncing_selector
        # Assigning .value fires the dropdown observer; without the flag each
        # click navigated and rendered twice and logged history twice.
        syncing_selector = True
        try:
            session_selector.value = navigator.current_idx
        finally:
            syncing_selector = False
        update_display()

    def on_prev_clicked(b):
        navigator.prev_session()
        sync_selector_and_render()

    def on_next_clicked(b):
        navigator.next_session()
        sync_selector_and_render()

    def on_random_clicked(b):
        navigator.random_session()
        sync_selector_and_render()

    def on_refresh_clicked(b):
        update_display()

    def on_session_selected(change):
        # Ignore changes made by the button handlers; they render themselves.
        if syncing_selector:
            return
        navigator.goto_session(change.new)
        update_display()

    # Connect button callbacks
    prev_btn.on_click(on_prev_clicked)
    next_btn.on_click(on_next_clicked)
    random_btn.on_click(on_random_clicked)
    refresh_btn.on_click(on_refresh_clicked)
    session_selector.observe(on_session_selected, names='value')

    # Layout
    button_row = widgets.HBox([prev_btn, next_btn, random_btn, refresh_btn])
    controls = widgets.VBox([session_selector, button_row, output])

    # Initial display
    update_display()

    return controls

# Build the controls once and show them (skipped when no data was loaded)
if navigator is None:
    print("⚠️  Load data first to enable navigation controls")
else:
    controls = create_navigation_controls()
    display(controls)


VBox(children=(Dropdown(description='Jump to:', options=(('Session 1: session_1', 0), ('Session 2: session_2',…

In [28]:
def load_different_file():
    """Show a file picker that swaps the loaded dataset.

    Lists the files returned by ``get_available_files`` in a dropdown. On
    "Load File" the selection is read with ``load_session_data``; on success
    the module-level ``df``, ``sessions``, ``has_ground_truth`` and
    ``navigator`` are replaced. On failure they are left untouched, so the
    previously loaded data stays usable.
    """
    files, file_names = get_available_files()

    if not files:
        print("No files available!")
        return

    file_selector = widgets.Dropdown(
        options=[(name, path) for name, path in zip(file_names, files)],
        description='Select file:',
        style={'description_width': 'initial'}
    )

    load_btn = widgets.Button(description="Load File", button_style='primary')
    output = widgets.Output()

    def on_load_clicked(b):
        global df, sessions, has_ground_truth, navigator

        with output:
            clear_output(wait=True)
            selected_file = file_selector.value

            # Load into temporaries first: assigning straight to the globals
            # replaced a working dataset with None when the load failed,
            # while `navigator` still pointed at the old sessions.
            new_df, new_sessions, new_has_ground_truth = load_session_data(selected_file)

            if new_df is None:
                print("Failed to load file")
                return

            df, sessions, has_ground_truth = new_df, new_sessions, new_has_ground_truth
            # NOTE(review): widgets built earlier from the previous navigator
            # (e.g. a session dropdown's option list) are not rebuilt here.
            navigator = SessionNavigator(sessions, df)
            print(f"Successfully loaded {os.path.basename(selected_file)}")
            print("You can now use the navigation controls above!")

    load_btn.on_click(on_load_clicked)

    controls = widgets.VBox([file_selector, load_btn, output])
    display(controls)

# Show the file picker as soon as this cell runs
load_different_file()


VBox(children=(Dropdown(description='Select file:', options=(('session_detection_results_20250707_190642.xlsx'…

In [29]:
def show_session_stats():
    """Print summary statistics for the currently loaded dataset.

    Reads the module-level ``df``, ``sessions`` and ``has_ground_truth``.
    Reports message / session / channel counts, the session-length
    distribution and bins, role and content coverage when those columns are
    detected, and prediction / ground-truth start counts with overall
    accuracy when the corresponding columns exist. Prints a notice and
    returns early when no data is loaded or the dataset is empty.
    """
    if df is None:
        print("No data loaded!")
        return

    # An empty dataset would otherwise divide by zero in the averages below.
    total_messages = len(df)
    if total_messages == 0 or len(sessions) == 0:
        print("No messages to summarise!")
        return

    # Auto-detect fields
    content_field = get_content_field(df)
    role_field = get_role_field(df)

    print("SESSION STATISTICS")
    print("=" * 50)

    # Basic stats
    print(f"Total messages: {total_messages}")
    print(f"Total sessions: {len(sessions)}")
    print(f"Unique channels: {df['gpt_channel_id'].nunique()}")
    print(f"Average messages per session: {total_messages / len(sessions):.1f}")
    print(f"Content field: {content_field}")
    print(f"Role field: {role_field}")
    print()

    # Session length distribution
    session_lengths = df.groupby('session_id').size()
    print("Session length distribution:")
    print(session_lengths.describe())
    print()

    # Session length bins: each bin is [lower, upper), the last is open-ended
    print("Session length bins:")
    bins = [1, 2, 3, 5, 10, 20, 50, 100]
    for lower, upper in zip(bins[:-1], bins[1:]):
        count = ((session_lengths >= lower) & (session_lengths < upper)).sum()
        print(f"  {lower}-{upper-1} messages: {count} sessions")
    count = (session_lengths >= bins[-1]).sum()
    print(f"  {bins[-1]}+ messages: {count} sessions")
    print()

    # Role distribution using detected field
    if role_field and role_field in df.columns:
        print("Role distribution:")
        for role, count in df[role_field].value_counts().items():
            print(f"  {role}: {count} messages ({count/total_messages*100:.1f}%)")
        print()

    # Check for content availability
    if content_field and content_field in df.columns:
        non_empty_content = df[content_field].notna().sum()
        print(f"Messages with content: {non_empty_content} ({non_empty_content/total_messages*100:.1f}%)")
        print()

    # Prediction stats
    has_predictions = 'is_session_start_pred' in df.columns
    if has_predictions:
        pred_starts = df['is_session_start_pred'].sum()
        print(f"Predicted session starts: {pred_starts} ({pred_starts/total_messages*100:.1f}%)")

    # Ground truth comparison
    if has_ground_truth and 'is_session_start' in df.columns:
        # Labels are accepted as 1, 1.0 or the literal '[START]'.
        gt_numeric = df['is_session_start'].apply(
            lambda x: 1 if (pd.notna(x) and x in [1, 1.0, '[START]']) else 0
        )
        gt_starts = gt_numeric.sum()
        print(f"Ground truth session starts: {gt_starts} ({gt_starts/total_messages*100:.1f}%)")

        if has_predictions:
            pred_numeric = df['is_session_start_pred'].fillna(0).astype(int)
            matches = (gt_numeric == pred_numeric).sum()
            accuracy = matches / total_messages
            print(f"Overall accuracy: {accuracy:.2%} ({matches}/{total_messages} correct)")

    print("=" * 50)

# Print the summary for the loaded data (the function itself reports when nothing is loaded)
show_session_stats()


SESSION STATISTICS
Total messages: 2917
Total sessions: 657
Unique channels: 50
Average messages per session: 4.4
Content field: text
Role field: None

Session length distribution:
count    657.000000
mean       4.439878
std       10.491277
min        1.000000
25%        1.000000
50%        1.000000
75%        2.000000
max      147.000000
dtype: float64

Session length bins:
  1-1 messages: 476 sessions
  2-2 messages: 39 sessions
  3-4 messages: 29 sessions
  5-9 messages: 30 sessions
  10-19 messages: 40 sessions
  20-49 messages: 36 sessions
  50-99 messages: 6 sessions
  100+ messages: 1 sessions

Messages with content: 2900 (99.4%)

Predicted session starts: 657 (22.5%)
Ground truth session starts: 635 (21.8%)
Overall accuracy: 95.13% (2775/2917 correct)


In [30]:
def search_sessions():
    """Show a search form for finding sessions by text, length and role.

    The form filters sessions by message count (inclusive min/max), by a
    role value when a role column is detected, and by a case-insensitive
    literal substring of the detected content column. The first ten matches
    are listed; with a search term, one matching message per session is
    shown as a sample. Reads the module-level ``df``; prints a notice and
    returns when no data is loaded.
    """
    if df is None:
        print("No data loaded!")
        return

    # Auto-detect fields
    content_field = get_content_field(df)
    role_field = get_role_field(df)

    # Size the length sliders from the data so the longest session stays
    # reachable; the previous fixed caps (50 / 100) remain the minimum range.
    lengths_at_build = df.groupby('session_id').size()
    longest_session = int(lengths_at_build.max()) if len(lengths_at_build) else 1
    min_slider_cap = max(50, longest_session)
    max_slider_cap = max(100, longest_session)

    # Search controls
    search_text = widgets.Text(
        value='',
        placeholder='Search in messages...',
        description='Search:',
        style={'description_width': 'initial'}
    )

    min_length = widgets.IntSlider(
        value=1,
        min=1,
        max=min_slider_cap,
        description='Min length:',
        style={'description_width': 'initial'}
    )

    max_length = widgets.IntSlider(
        value=max_slider_cap,
        min=1,
        max=max_slider_cap,
        description='Max length:',
        style={'description_width': 'initial'}
    )

    # Use detected role field for options
    role_options = ['All']
    if role_field and role_field in df.columns:
        role_options.extend(list(df[role_field].unique()))

    role_filter = widgets.Dropdown(
        options=role_options,
        value='All',
        description='Role:',
        style={'description_width': 'initial'}
    )

    search_btn = widgets.Button(description="Search", button_style='primary')
    output = widgets.Output()

    def on_search_clicked(b):
        with output:
            clear_output(wait=True)

            print(f"Searching using content field: {content_field}")
            print(f"Searching using role field: {role_field}")
            print()

            # Get session lengths
            session_lengths = df.groupby('session_id').size()

            # Filter by length
            length_filter = ((session_lengths >= min_length.value) &
                             (session_lengths <= max_length.value))
            valid_sessions = session_lengths[length_filter].index.tolist()

            # Filter by role if specified
            if role_filter.value != 'All' and role_field:
                role_sessions = set(df.loc[df[role_field] == role_filter.value, 'session_id'])
                valid_sessions = [s for s in valid_sessions if s in role_sessions]

            # Filter by search text if provided
            search_term = search_text.value.strip().lower()
            text_matches = None
            if search_term and content_field:
                # regex=False: the box takes literal text, so characters such
                # as "(" or "?" must neither raise nor act as wildcards.
                text_matches = df[content_field].str.lower().str.contains(
                    search_term, regex=False, na=False
                )
                text_sessions = set(df.loc[text_matches, 'session_id'])
                valid_sessions = [s for s in valid_sessions if s in text_sessions]

            print(f"Found {len(valid_sessions)} sessions matching criteria:")
            print()

            # Show first few results
            for i, session_id in enumerate(valid_sessions[:10]):
                in_this_session = df['session_id'] == session_id
                session_data = df[in_this_session]
                length = len(session_data)
                channel = session_data['gpt_channel_id'].iloc[0]

                print(f"{i+1}. {session_id} (length: {length}, channel: {channel})")

                # Show sample message if search text was provided
                if text_matches is not None:
                    matching_rows = df[in_this_session & text_matches]
                    if not matching_rows.empty:
                        matching_msg = matching_rows[content_field].iloc[0]
                        print(f"   Sample: {truncate_message(matching_msg, 100)}")
                print()

            if len(valid_sessions) > 10:
                print(f"... and {len(valid_sessions) - 10} more sessions")

    search_btn.on_click(on_search_clicked)

    controls = widgets.VBox([
        search_text,
        widgets.HBox([min_length, max_length]),
        role_filter,
        search_btn,
        output
    ])

    display(controls)

# Render the search form as soon as this cell runs
search_sessions()


VBox(children=(Text(value='', description='Search:', placeholder='Search in messages...', style=TextStyle(desc…