In [1]:
%%capture
%pip install -U accelerate
%pip install -U transformers
%pip install -U datasets
%pip install -U bitsandbytes
%pip install -U peft
%pip install -U trl
%pip install -U unsloth
%pip install -U pandas
%pip install -U scikit-learn
%pip install -U scipy

In [2]:
from unsloth import FastLanguageModel
import torch

# Catalogue of checkpoints published under the `unsloth` Hugging Face organisation.
# Reference only: the load call below passes its model name as a literal and
# never reads this list.
fourbit_models = [
    "unsloth/Qwen3-1.7B-unsloth-bnb-4bit", # Smallest Qwen3 checkpoint in this list
    "unsloth/Qwen3-4B-unsloth-bnb-4bit",
    "unsloth/Qwen3-8B-unsloth-bnb-4bit",
    "unsloth/Qwen3-14B-unsloth-bnb-4bit",
    "unsloth/Qwen3-32B-unsloth-bnb-4bit",

    # 4bit dynamic quants for superior accuracy and low memory use
    "unsloth/gemma-3-12b-it-unsloth-bnb-4bit",
    "unsloth/Phi-4",
    "unsloth/Llama-3.1-8B",
    "unsloth/Llama-3.2-3B",
    "unsloth/orpheus-3b-0.1-ft-unsloth-bnb-4bit" # [NEW] We support TTS models!
] # More models at https://huggingface.co/unsloth

# Load the 4-bit Qwen3-14B checkpoint together with its tokenizer.
# Both returned objects are used by later cells (`model` is wrapped with LoRA next).
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Qwen3-14B-unsloth-bnb-4bit",  # Use 14B for better fit on A100
    max_seq_length = 4096,   # Increased context length - A100 can handle this
    load_in_4bit = True,     # Keep 4bit for memory efficiency
    load_in_8bit = False,    # Stay with 4bit for optimal memory usage
    full_finetuning = False, # LoRA is more efficient for fine-tuning
    dtype = None,            # Auto-detect optimal dtype
    # token = "hf_...",      # use one if using gated models
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.6.1: Fast Qwen3 patching. Transformers: 4.52.4.
   \\   /|    NVIDIA A100-SXM4-80GB. Num GPUs = 1. Max memory: 79.254 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.0+cu126. CUDA: 8.0. CUDA Toolkit: 12.6. Triton: 3.3.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.30. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [3]:
# Wrap the loaded model with LoRA adapters on the attention projections
# (q/k/v/o) and the MLP projections (gate/up/down). `model` is rebound to the
# adapter-wrapped model, so re-running this cell without reloading the base
# model would wrap an already-wrapped model.
model = FastLanguageModel.get_peft_model(
    model,
    r = 8,           # Choose any number > 0! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,  # Best to choose alpha = rank or rank*2
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,  # Fixed seed passed to the adapter setup
    use_rslora = True,   # We support rank stabilized LoRA
    loftq_config = None,  # And LoftQ
)

Unsloth 2025.6.1 patched 40 layers with 40 QKV layers, 40 O layers and 40 MLP layers.


# Conversational Training Data Creation

This notebook now uses an enhanced conversational approach instead of Q&A extraction.
The new system captures natural dialogue flow through:
- Conversation windows with context
- Natural dialogue episodes  
- Burst texting and style preservation
- Conversation role modeling
- Multi-modal content integration

Transform your chats into a conversation format suitable for instruction tuning:

In [8]:
import os

# Root of the astrabot checkout; the Signal CSV exports are read from beneath it.
# Set the ASTRABOT_ROOT environment variable to run the notebook on another
# machine. The original hard-coded path stays as the fallback so existing
# setups keep working unchanged.
pwd = os.environ.get("ASTRABOT_ROOT", "/home/percy/git/astrabot")

In [10]:
import pandas as pd
import json
from datetime import datetime
import os

def transform_to_conversations(your_recipient_id=None):
    """
    Build one chat-style conversation per Signal thread from the exported CSVs.

    Reads ``signal.csv`` (messages) and ``recipient.csv`` from
    ``{pwd}/test/signal-flatfiles``, where ``pwd`` is the notebook-level
    project root. Messages are grouped by ``thread_id`` and ordered by
    ``date_sent``. Messages whose body is missing, not a string, or
    whitespace-only are dropped.

    Args:
        your_recipient_id: Optional recipient ID identifying your own messages.
            When given, messages sent by that ID get the "assistant" role.
            When omitted (the default), a message is "assistant" only if the
            sender's ``profile_given_name`` is exactly "You".

    Returns:
        List of ``{"conversation_id": thread_id, "messages": [...]}`` dicts in
        order of each thread's first appearance in ``signal.csv``. Every message
        is ``{"role", "content", "timestamp"}``. Threads with fewer than two
        rows, or fewer than two usable messages, are omitted.
    """
    messages = pd.read_csv(f"{pwd}/test/signal-flatfiles/signal.csv")
    recipients = pd.read_csv(f"{pwd}/test/signal-flatfiles/recipient.csv")
    # thread.csv used to be loaded here but nothing ever read it, so a missing
    # or malformed thread export no longer blocks the transformation.

    # Map recipient ID -> display name (only consulted when no ID is supplied)
    recipient_lookup = recipients.set_index('_id')['profile_given_name'].to_dict()

    conversations = []

    # A single groupby pass replaces re-filtering the whole frame per thread.
    # sort=False keeps threads in order of first appearance.
    for thread_id, thread_messages in messages.groupby('thread_id', sort=False):
        if len(thread_messages) < 2:  # Skip single message threads
            continue

        conversation = []
        for _, msg in thread_messages.sort_values('date_sent').iterrows():
            body = msg['body']
            # isinstance also rejects NaN and any non-text value that a
            # mixed-type column may contain, which would break .strip().
            if not isinstance(body, str) or not body.strip():
                continue

            sender_id = msg['from_recipient_id']
            if your_recipient_id is not None:
                is_you = sender_id == your_recipient_id
            else:
                is_you = recipient_lookup.get(sender_id, 'Unknown') == "You"

            conversation.append({
                "role": "assistant" if is_you else "user",
                "content": body,
                "timestamp": msg['date_sent']
            })

        if len(conversation) >= 2:
            conversations.append({
                "conversation_id": thread_id,
                "messages": conversation
            })

    return conversations

In [12]:
# Build the per-thread conversations and preview the first five.
# NOTE: the rendered preview contains raw message text from the export;
# clear this cell's output before sharing the notebook.
conversations = transform_to_conversations()
conversations[0:5]


  messages = pd.read_csv(f"{pwd}/test/signal-flatfiles/signal.csv")  # Changed this line


[{'conversation_id': np.int64(1),
  'messages': [{'role': 'user',
    'content': "Keep your phone number private with Signal usernames\n\nWe've introduced new ways to keep your phone number private on Signal. Here's how:\n\nYour phone number will no longer be visible to anyone on the latest version of Signal unless they have it saved in their phone's contacts. You can change this in Settings.\n\nA new privacy setting lets you control who can find you by your phone number on Signal.\n\nYou can now set and share an optional username to let people chat with you without giving them your phone number. ",
    'timestamp': 1713613942560},
   {'role': 'user',
    'content': "Introducing chat folders!\n\nChat folders let you organize your organizations, group your groups, and individualize your individuals into unique folders that are easy to customize and rearrange. Use folders to quickly display unread messages from your favorite people, filter by chat type (1-on-1 and groups), or to separate

# Enhanced Conversational Training System

This new approach replaces Q&A extraction with natural conversation flow capture. Instead of looking for questions and answers, we now:

1. **Conversation Windows**: Capture multi-turn exchanges with sliding context windows
2. **Natural Episodes**: Segment conversations by time gaps to preserve complete dialogue arcs  
3. **Style Preservation**: Maintain your burst texting, long-form responses, and mixed patterns
4. **Role Modeling**: Track how you initiate, respond, and close conversations
5. **Multi-Modal Enhancement**: Integrate Twitter content while preserving conversation flow

The system creates training examples that teach the model to have natural conversations like you do, not just answer questions.

In [ ]:
# Enhanced Conversational Training Data System
# This replaces the Q&A extraction approach with natural dialogue capture

def create_conversation_windows(messages_df, window_size=5, your_recipient_id=2):
    """
    Create conversation windows that capture natural dialogue flow.

    For every message you sent (other than the first message of a thread), one
    window is produced: up to ``window_size`` messages immediately preceding
    it as context, your message as the response, and timing metadata.

    Args:
        messages_df: DataFrame of messages with ``thread_id``,
            ``from_recipient_id``, ``date_sent`` (milliseconds) and ``body``.
        window_size: Number of messages to include for context (default 5).
            Expected to be at least 1.
        your_recipient_id: Your recipient ID to identify your messages

    Returns:
        List of dicts with keys ``thread_id``, ``context`` (list of
        ``{speaker, text, timestamp, has_media}``), ``response``
        (``{text, timestamp}``) and ``metadata`` (``momentum``,
        ``context_size``, ``avg_time_gap`` and ``response_delay`` in seconds).
        Threads with fewer than 3 messages are skipped.
    """
    # Local import: the notebook has no top-level `import re`, so the function
    # brings the module into scope itself instead of raising NameError.
    import re

    url_pattern = re.compile(r'https?://\S+')
    conversation_windows = []

    # One groupby pass instead of re-filtering the whole frame per thread;
    # sort=False keeps threads in order of first appearance.
    for thread_id, thread_messages in messages_df.groupby('thread_id', sort=False):
        if len(thread_messages) < 3:  # Need at least 3 messages for context
            continue

        thread_messages = thread_messages.sort_values('date_sent')

        # Pull the needed columns out once; plain lists make the sliding
        # window cheap compared with building a row Series per `.iloc` access.
        senders = thread_messages['from_recipient_id'].tolist()
        bodies = thread_messages['body'].tolist()
        sent_at = thread_messages['date_sent'].tolist()

        for i in range(len(senders) - 1):
            # Only positions where you are the next speaker yield a window
            if senders[i + 1] != your_recipient_id:
                continue

            start_idx = max(0, i - window_size + 1)
            context_positions = range(start_idx, i + 1)

            # Gaps between consecutive context messages, in seconds
            time_gaps = [
                (sent_at[k] - sent_at[k - 1]) / 1000
                for k in range(start_idx + 1, i + 1)
            ]
            avg_gap = sum(time_gaps) / len(time_gaps) if time_gaps else 0
            momentum = 'rapid' if avg_gap < 60 else 'moderate' if avg_gap < 300 else 'slow'

            context = [
                {
                    'speaker': 'You' if senders[k] == your_recipient_id else 'Other',
                    'text': bodies[k],
                    'timestamp': sent_at[k],
                    # A missing (NaN) body cannot contain a URL; guarding on str
                    # avoids a TypeError from the regex.
                    'has_media': isinstance(bodies[k], str)
                                 and bool(url_pattern.search(bodies[k])),
                }
                for k in context_positions
            ]

            conversation_windows.append({
                'thread_id': thread_id,
                'context': context,
                'response': {
                    'text': bodies[i + 1],
                    'timestamp': sent_at[i + 1]
                },
                'metadata': {
                    'momentum': momentum,
                    'context_size': len(context),
                    'avg_time_gap': avg_gap,
                    # Seconds between the last context message and your reply
                    'response_delay': (sent_at[i + 1] - sent_at[i]) / 1000
                }
            })

    return conversation_windows

In [ ]:
def segment_natural_dialogues(messages_df, time_gap_minutes=30, your_recipient_id=2):
    """
    Split every thread into dialogue episodes separated by stretches of silence.

    Within a thread (ordered by ``date_sent``), a new episode begins whenever
    the gap to the previous message exceeds ``time_gap_minutes``. Episodes with
    fewer than two messages, and episodes you did not take part in, are dropped.

    Args:
        messages_df: DataFrame of messages
        time_gap_minutes: Minutes of inactivity that start a new episode
        your_recipient_id: Your recipient ID

    Returns:
        List of episode dicts holding ``thread_id``, ``messages`` (speaker, text,
        timestamp) and ``metadata`` (length, duration in minutes, your message
        count, turn pattern, who initiated and who ended the episode).
    """
    def speaker_label(sender_id):
        # Collapse raw recipient ids onto the two labels used in the output
        return 'You' if sender_id == your_recipient_id else 'Other'

    dialogue_episodes = []

    for thread_id in messages_df['thread_id'].unique():
        in_thread = messages_df['thread_id'] == thread_id
        thread_messages = messages_df[in_thread].sort_values('date_sent')
        message_total = len(thread_messages)

        if message_total < 2:
            continue

        rows = [thread_messages.iloc[position] for position in range(message_total)]

        # Walk consecutive pairs; a long enough silence closes the open episode
        episodes = []
        open_episode = [rows[0]]
        for earlier, later in zip(rows, rows[1:]):
            silence_minutes = (later['date_sent'] - earlier['date_sent']) / (1000 * 60)
            if silence_minutes > time_gap_minutes:
                if len(open_episode) >= 2:  # single-message episodes are discarded
                    episodes.append(open_episode)
                open_episode = [later]
            else:
                open_episode.append(later)

        # The trailing episode is never closed by a gap, so flush it here
        if len(open_episode) >= 2:
            episodes.append(open_episode)

        for episode in episodes:
            sender_ids = [row['from_recipient_id'] for row in episode]
            your_message_total = sum(1 for sender_id in sender_ids if sender_id == your_recipient_id)

            if your_message_total == 0:  # you never spoke in this episode
                continue

            # Run-length encode consecutive messages from the same sender
            turn_pattern = []
            run_sender = sender_ids[0]
            run_length = 1
            for sender_id in sender_ids[1:]:
                if sender_id != run_sender:
                    turn_pattern.append((speaker_label(run_sender), run_length))
                    run_sender = sender_id
                    run_length = 1
                else:
                    run_length += 1
            turn_pattern.append((speaker_label(run_sender), run_length))

            formatted_messages = [
                {
                    'speaker': speaker_label(row['from_recipient_id']),
                    'text': row['body'],
                    'timestamp': row['date_sent']
                }
                for row in episode
            ]

            duration_minutes = (episode[-1]['date_sent'] - episode[0]['date_sent']) / (1000 * 60)

            dialogue_episodes.append({
                'thread_id': thread_id,
                'messages': formatted_messages,
                'metadata': {
                    'episode_length': len(episode),
                    'duration_minutes': duration_minutes,
                    'your_message_count': your_message_total,
                    'turn_pattern': turn_pattern,
                    'initiated_by': speaker_label(sender_ids[0]),
                    'ended_by': speaker_label(sender_ids[-1])
                }
            })

    return dialogue_episodes

# Q&A Extraction (Deprecated - See Conversational System Above)

The Q&A extraction approach has been replaced with the Enhanced Conversational Training System above. 
The new system better captures natural dialogue flow rather than forcing conversations into Q&A format.

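The enhanced Q&A functions were an intermediate step and are mentioned for reference only.
**Use the conversational training system instead for better results.**

Note: `preserve_conversation_dynamics`, defined in the next cell, is part of the conversational training system (it is called by `create_conversational_training_data`) and is not deprecated.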

In [ ]:
def preserve_conversation_dynamics(messages_df, your_recipient_id=2):
    """
    Capture and preserve different conversation modes and dynamics.

    Scans each thread in ``date_sent`` order and collects every run of your
    consecutive messages in which each message follows the previous one by
    less than 2 minutes. Each run is returned with up to 5 preceding messages
    as context and a style label:

    - ``burst_sequence``: 3 or more messages
    - ``double_tap``: exactly 2 messages
    - ``long_form``: a single message longer than 200 characters
    - ``single_message``: anything else

    Message bodies are expected to be strings (the pipeline in
    ``create_conversational_training_data`` filters out missing bodies first).

    Args:
        messages_df: DataFrame of messages
        your_recipient_id: Your recipient ID

    Returns:
        List of dicts with ``thread_id``, ``context``, ``your_sequence`` (text as
        returned by ``process_message_with_twitter_content`` plus the original
        text and an ``enhanced`` flag), ``style`` and ``metadata``.
    """
    from conversation_utilities import process_message_with_twitter_content
    # Local import: the notebook has no top-level `import re`.
    import re

    url_pattern = re.compile(r'https?://\S+')
    conversation_dynamics = []

    # Group by thread
    for thread_id in messages_df['thread_id'].unique():
        thread_messages = messages_df[
            messages_df['thread_id'] == thread_id
        ].sort_values('date_sent')

        # Materialise each row once. Slicing this list yields rows, whereas
        # list(DataFrame) would yield the column labels.
        rows = [thread_messages.iloc[k] for k in range(len(thread_messages))]

        i = 0
        while i < len(rows):
            if rows[i]['from_recipient_id'] != your_recipient_id:
                i += 1
                continue

            # Collect your burst: consecutive messages from you, each within
            # 2 minutes of the message before it
            your_sequence = [rows[i]]
            j = i + 1
            while j < len(rows):
                if rows[j]['from_recipient_id'] == your_recipient_id:
                    time_gap = (rows[j]['date_sent'] - rows[j - 1]['date_sent']) / 1000
                    if time_gap < 120:  # Within 2 minutes
                        your_sequence.append(rows[j])
                        j += 1
                    else:
                        break
                else:
                    break

            # Up to 5 messages immediately before the burst (may be empty)
            context_start = max(0, i - 5)
            context_messages = rows[context_start:i]

            # Classify your conversation style for this sequence
            if len(your_sequence) >= 3:
                style = 'burst_sequence'
            elif len(your_sequence) == 1 and len(your_sequence[0]['body']) > 200:
                style = 'long_form'
            elif len(your_sequence) == 2:
                style = 'double_tap'
            else:
                style = 'single_message'

            # A URL anywhere in the burst counts as shared media
            has_media = any(bool(url_pattern.search(msg['body'])) for msg in your_sequence)

            # Enhance messages with Twitter content if present, keeping the
            # original text alongside the processed one
            enhanced_sequence = []
            for msg in your_sequence:
                enhanced_text = process_message_with_twitter_content(
                    msg['body'],
                    use_images=False
                )
                enhanced_msg = msg.copy()
                enhanced_msg['body'] = enhanced_text
                enhanced_msg['original_body'] = msg['body']
                enhanced_sequence.append(enhanced_msg)

            # Length statistics are based on the original (un-enhanced) text
            total_chars = sum(len(msg['body']) for msg in your_sequence)

            conversation_dynamics.append({
                'thread_id': thread_id,
                'context': [{
                    'speaker': 'You' if msg['from_recipient_id'] == your_recipient_id else 'Other',
                    'text': msg['body'],
                    'timestamp': msg['date_sent']
                } for msg in context_messages],
                'your_sequence': [{
                    'text': msg['body'],
                    'original_text': msg['original_body'],
                    'timestamp': msg['date_sent'],
                    'enhanced': msg['body'] != msg['original_body']
                } for msg in enhanced_sequence],
                'style': style,
                'metadata': {
                    'sequence_length': len(your_sequence),
                    'total_chars': total_chars,
                    'has_media': has_media,
                    'avg_message_length': total_chars / len(your_sequence),
                    'time_span_seconds': (your_sequence[-1]['date_sent'] - your_sequence[0]['date_sent']) / 1000 if len(your_sequence) > 1 else 0
                }
            })

            # Resume scanning after the burst
            i = j

    return conversation_dynamics

# Enhanced Q&A Extraction with Twitter Content

The original `extract_qa_pairs` function was overly simplistic. The enhanced version (itself now superseded by the conversational training system) includes:

1. **Better Question Detection**:
   - Multiple regex patterns for various question types
   - Filters out URL-only "questions"
   - Detects implicit questions and requests

2. **Quality Filtering**:
   - Removes very short responses
   - Filters out generic responses (ok, yeah, lol)
   - Excludes Q&A pairs where the answer is just another question

3. **Twitter Content Integration**:
   - Automatically extracts tweet text when URLs are present
   - Injects tweet content with clear markers
   - Preserves original content while adding context

4. **Role Awareness**:
   - Ensures Q&A pairs are between different people
   - Tracks conversation flow properly

In [ ]:
def model_conversation_roles(dialogue_episodes, your_recipient_id=2):
    """
    Label each of your non-opening messages with a conversation role and response type.

    The role is decided once per episode from who initiated it, your share of
    the messages and the number of turns per side. The response type is decided
    per message from its position in the episode.

    Args:
        dialogue_episodes: Episodes as produced by segment_natural_dialogues
        your_recipient_id: Your recipient ID (accepted for a uniform signature;
            the body relies on the 'You'/'Other' labels already in the episodes)

    Returns:
        List of dicts, one per message of yours that is not the first message of
        its episode, with ``episode_id``, ``context`` (up to 5 preceding
        messages), ``response``, ``role``, ``response_type`` and ``metadata``.
    """
    role_patterns = []

    for episode in dialogue_episodes:
        meta = episode['metadata']

        # Who opened and who closed the episode
        initiated_by_you = meta['initiated_by'] == 'You'
        ended_by_you = meta['ended_by'] == 'You'

        # Number of turns taken by each side
        turn_owners = [turn[0] for turn in meta['turn_pattern']]
        your_turn_total = turn_owners.count('You')
        other_turn_total = turn_owners.count('Other')

        # Your share of all messages in the episode
        your_message_ratio = meta['your_message_count'] / meta['episode_length']

        # Episode-level role, first matching rule wins
        if initiated_by_you and your_message_ratio > 0.6:
            role = 'conversation_driver'
        elif not initiated_by_you and your_message_ratio < 0.4:
            role = 'responsive_participant'
        elif your_turn_total > other_turn_total:
            role = 'active_engager'
        else:
            role = 'balanced_conversationalist'

        messages = episode['messages']
        last_index = len(messages) - 1

        for i, msg in enumerate(messages):
            # Only your messages that have something before them qualify
            if msg['speaker'] != 'You' or i == 0:
                continue

            context = messages[max(0, i - 5):i]

            # Message-level response type, first matching rule wins
            if i == 1 and initiated_by_you:
                response_type = 'continuation_after_initiation'
            elif i == last_index:
                response_type = 'conversation_closer'
            elif sum(1 for m in messages[i:i+3] if m['speaker'] == 'You') >= 2:
                response_type = 'burst_starter'
            else:
                response_type = 'turn_taking_response'

            role_patterns.append({
                'episode_id': f"{episode['thread_id']}_{messages[0]['timestamp']}",
                'context': context,
                'response': msg,
                'role': role,
                'response_type': response_type,
                'metadata': {
                    'position_in_episode': i / len(messages),
                    'initiated_by_you': initiated_by_you,
                    'ended_by_you': ended_by_you,
                    'episode_duration': meta['duration_minutes'],
                    'your_dominance': your_message_ratio
                }
            })

    return role_patterns

In [ ]:
# Example: Using image description (requires API key)
from conversation_utilities import describe_tweet_images
import os

# Check if API keys are available
# (only the presence of each variable is tested; the key values are never printed)
has_openai = 'OPENAI_API_KEY' in os.environ
has_anthropic = 'ANTHROPIC_API_KEY' in os.environ

print(f"OpenAI API key available: {has_openai}")
print(f"Anthropic API key available: {has_anthropic}")

# If you have API keys and found images, describe them
# NOTE(review): `image_urls` is not assigned in this cell. In this notebook it is
# assigned by the Twitter extraction example cell that appears further down, so
# on a fresh kernel that cell must be run first or the next line raises NameError.
if image_urls and (has_openai or has_anthropic):
    # OpenAI is preferred when both keys are present
    api_to_use = 'openai' if has_openai else 'anthropic'
    print(f"\nUsing {api_to_use} API to describe images...")
    
    descriptions = describe_tweet_images(image_urls[:2], api_to_use)  # Limit to first 2 images
    
    for i, desc in enumerate(descriptions):
        print(f"\nImage {i+1} description:")
        print(desc)
else:
    # Reached when no key is set OR when `image_urls` is empty
    print("\nTo use image description, set one of these environment variables:")
    print("  export OPENAI_API_KEY='your-api-key'")
    print("  export ANTHROPIC_API_KEY='your-api-key'")

In [ ]:
def create_conversational_training_data(messages_df, recipients_df, your_recipient_id=2):
    """
    Unified pipeline to create natural conversational training data.

    Keeps messages whose body is present and longer than 5 characters, then
    combines four sources of examples:

    1. conversation windows (``create_conversation_windows``)
    2. role-based responses (``segment_natural_dialogues`` followed by
       ``model_conversation_roles``)
    3. style-preserving sequences (``preserve_conversation_dynamics``);
       multi-message sequences are joined with " [NEXT] "
    4. conversation starters for episodes you initiated

    A summary of example counts per type is printed.

    Args:
        messages_df: DataFrame of messages
        recipients_df: DataFrame of recipients (currently unused; kept so the
            call signature stays stable)
        your_recipient_id: Your recipient ID

    Returns:
        List of dicts with keys ``instruction``, ``input``, ``output`` and
        ``metadata`` (``metadata['type']`` identifies the example source).
    """
    def format_context(context_messages):
        # One "Speaker: text" line per context message
        return "\n".join(
            f"{msg['speaker']}: {msg['text']}" for msg in context_messages
        )

    print("Creating natural conversational training data...")

    # Filter for meaningful text messages
    text_messages = messages_df[
        (messages_df['body'].notna()) &
        (messages_df['body'].str.len() > 5)
    ].copy()

    training_examples = []

    # 1. Conversation Windows (for context-aware responses)
    print("Extracting conversation windows...")
    conv_windows = create_conversation_windows(text_messages, window_size=5, your_recipient_id=your_recipient_id)

    for window in conv_windows:
        training_examples.append({
            'instruction': f"Continue this {window['metadata']['momentum']} conversation naturally",
            'input': format_context(window['context']),
            'output': window['response']['text'],
            'metadata': {
                'type': 'conversation_window',
                'momentum': window['metadata']['momentum'],
                'response_delay': window['metadata']['response_delay'],
                'context_size': window['metadata']['context_size']
            }
        })

    # 2. Natural Dialogue Episodes (for complete conversation arcs)
    print("Segmenting natural dialogue episodes...")
    episodes = segment_natural_dialogues(text_messages, time_gap_minutes=30, your_recipient_id=your_recipient_id)

    # Model conversation roles
    print("Modeling conversation roles...")
    role_patterns = model_conversation_roles(episodes, your_recipient_id=your_recipient_id)

    instruction_map = {
        'conversation_driver': "Lead the conversation forward",
        'responsive_participant': "Respond thoughtfully to the conversation",
        'active_engager': "Engage actively in this discussion",
        'balanced_conversationalist': "Continue the balanced dialogue"
    }

    for pattern in role_patterns:
        training_examples.append({
            'instruction': instruction_map.get(pattern['role'], "Continue naturally"),
            'input': format_context(pattern['context']),
            'output': pattern['response']['text'],
            'metadata': {
                'type': 'role_based_response',
                'role': pattern['role'],
                'response_type': pattern['response_type'],
                'position': pattern['metadata']['position_in_episode']
            }
        })

    # 3. Conversation Dynamics (for style preservation)
    print("Preserving conversation dynamics...")
    dynamics = preserve_conversation_dynamics(text_messages, your_recipient_id=your_recipient_id)

    for dynamic in dynamics:
        sequence = dynamic['your_sequence']
        # Join with a special token so multi-message replies keep every message.
        # For a one-message sequence this is simply that message's text.
        output_text = " [NEXT] ".join(msg['text'] for msg in sequence)

        if dynamic['style'] == 'burst_sequence':
            training_examples.append({
                'instruction': "Respond in your natural burst texting style",
                'input': format_context(dynamic['context']),
                'output': output_text,
                'metadata': {
                    'type': 'burst_sequence',
                    'sequence_length': dynamic['metadata']['sequence_length'],
                    'has_media': dynamic['metadata']['has_media']
                }
            })
        elif dynamic['context']:
            # Single, long-form and double-tap replies need something to reply
            # to. A double tap used to emit only its first message, while
            # char_count already covered both; the target now matches.
            training_examples.append({
                'instruction': f"Respond with a {dynamic['style'].replace('_', ' ')} message",
                'input': format_context(dynamic['context']),
                'output': output_text,
                'metadata': {
                    'type': dynamic['style'],
                    'enhanced': any(msg['enhanced'] for msg in sequence),
                    'char_count': dynamic['metadata']['total_chars']
                }
            })

    # 4. Add conversation starters (where you initiate)
    print("Adding conversation initiations...")

    # Index each episode's chronological predecessor within its thread once,
    # instead of re-filtering and re-sorting all episodes for every starter.
    episodes_by_thread = {}
    for episode in episodes:
        if episode['messages']:
            episodes_by_thread.setdefault(episode['thread_id'], []).append(episode)

    previous_episode = {}
    for thread_episodes in episodes_by_thread.values():
        thread_episodes.sort(key=lambda ep: ep['messages'][0]['timestamp'])
        for earlier, later in zip(thread_episodes, thread_episodes[1:]):
            previous_episode[id(later)] = earlier

    for episode in episodes:
        if episode['metadata']['initiated_by'] == 'You' and episode['messages']:
            # You started this conversation
            first_msg = episode['messages'][0]
            prev_episode = previous_episode.get(id(episode))

            if prev_episode is not None:
                last_prev_msg = prev_episode['messages'][-1]
                # Gap between the end of the previous episode and this opener.
                # Timestamps are in milliseconds. The earlier code reported the
                # current episode's own duration here.
                gap_minutes = (first_msg['timestamp'] - last_prev_msg['timestamp']) / (1000 * 60)
                context = f"[Previous conversation ended {gap_minutes:.0f} minutes ago with: {last_prev_msg['text']}]"
            else:
                context = "[Start a new conversation]"

            training_examples.append({
                'instruction': "Initiate a conversation naturally",
                'input': context,
                'output': first_msg['text'],
                'metadata': {
                    'type': 'conversation_starter',
                    'leads_to_episode_length': episode['metadata']['episode_length']
                }
            })

    print(f"\nCreated {len(training_examples)} conversational training examples:")

    # Show breakdown by type
    type_counts = {}
    for ex in training_examples:
        ex_type = ex['metadata']['type']
        type_counts[ex_type] = type_counts.get(ex_type, 0) + 1

    for ex_type, count in sorted(type_counts.items(), key=lambda x: x[1], reverse=True):
        print(f"  {ex_type}: {count} examples ({count/len(training_examples)*100:.1f}%)")

    return training_examples

In [ ]:
# Example: Using the Twitter extraction functions directly
from conversation_utilities import extract_tweet_text, extract_tweet_images, inject_tweet_context

# Tweet used to demonstrate the helpers (a link taken from your conversations)
example_url = "https://twitter.com/greenTetra_/status/1778114292983710193"

# Step 1: ask the helper for the tweet's author / id / text
tweet_data = extract_tweet_text(example_url)
if not tweet_data:
    print("Failed to extract tweet text")
else:
    print("Tweet extraction successful!")
    print(f"Author: @{tweet_data['author']}")
    print(f"Tweet ID: {tweet_data['tweet_id']}")
    print(f"Text: {tweet_data['text']}")

# Step 2: list whatever image URLs the helper reports for the same tweet
print("\nExtracting images...")
image_urls = extract_tweet_images(example_url)
if not image_urls:
    print("No images found in tweet")
else:
    print(f"Found {len(image_urls)} images:")
    for i, url in enumerate(image_urls):
        print(f"  {i+1}. {url}")

# Step 3: pass a message containing the link, plus the extracted tweet data,
# to inject_tweet_context and show what comes back
original_message = "Check this out: https://twitter.com/greenTetra_/status/1778114292983710193"
enhanced_message = inject_tweet_context(original_message, tweet_data)
print("\nEnhanced message:")
print(enhanced_message)

In [ ]:
# Run the enhanced conversational training data pipeline
# This replaces the Q&A extraction approach with natural conversation capture

# Read the Signal flat-file export (messages and recipients)
messages_df = pd.read_csv(f"{pwd}/test/signal-flatfiles/signal.csv")
recipients_df = pd.read_csv(f"{pwd}/test/signal-flatfiles/recipient.csv")

# Build the training examples; recipient id 2 is treated as "you"
conversational_training_data = create_conversational_training_data(
    messages_df, recipients_df, your_recipient_id=2
)

# Preview one example per example type
print("\nExample training data:")
print("="*50)

example_types = ['conversation_window', 'burst_sequence', 'role_based_response', 'conversation_starter']

for ex_type in example_types:
    examples = [ex for ex in conversational_training_data if ex['metadata']['type'] == ex_type]
    if not examples:
        # Nothing of this type was produced; move on to the next type
        continue
    print(f"\n{ex_type.upper()} Example:")
    example = examples[0]
    print(f"Instruction: {example['instruction']}")
    print(f"Input: {example['input'][:200]}...")
    print(f"Output: {example['output'][:150]}...")
    print(f"Metadata: {example['metadata']}")
    print("-"*50)

# Persist everything as UTF-8 JSON (non-ASCII characters are written as-is)
output_file = 'conversational_training_data.json'
with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(conversational_training_data, f, indent=2, ensure_ascii=False)

print(f"\nSaved {len(conversational_training_data)} training examples to {output_file}")

In [ ]:
# Enhanced Q&A extraction with Twitter content
from conversation_utilities import extract_tweet_text, inject_tweet_context, process_message_with_twitter_content

def extract_qa_pairs_enhanced(conversations):
    """
    Enhanced Q&A extraction that handles Twitter content and better question patterns.

    Walks every pair of consecutive messages in each conversation and keeps the
    pair when the speakers differ, the first message looks like a question, and
    the reply passes a few quality filters.

    Args:
        conversations: Iterable of dicts with a 'conversation_id' and a
            'messages' list; each message is a dict with 'role' and 'content'.

    Returns:
        List of dicts with keys 'instruction', 'response', 'context',
        'original_question' and 'has_twitter_content'.
    """
    # Imported locally so the function does not depend on another cell having
    # imported `re` first (keeps Restart & Run All working).
    import re

    # Enhanced question patterns (compiled once, all case-insensitive)
    question_patterns = [
        re.compile(pattern, re.IGNORECASE)
        for pattern in (
            r'\?',  # Explicit question mark
            r'^(what|how|why|when|where|who|which|whose)\s',  # Question words at start
            r'\b(can you|could you|would you|will you|should i|do you|does|is it|are you)\b',  # Common question phrases
            r'\b(please explain|please help|please tell|what about|how about)\b',  # Request patterns
            r'\b(any idea|any thoughts|any suggestions|anyone know)\b',  # Seeking input
        )
    ]
    bare_url = re.compile(r'^https?://\S+$')
    low_value_replies = {'ok', 'okay', 'yeah', 'yes', 'no', 'lol', 'haha'}

    qa_pairs = []

    for conv in conversations:
        messages = conv['messages']

        for current, next_msg in zip(messages, messages[1:]):
            # Skip if same person (not a Q&A pair)
            if current['role'] == next_msg['role']:
                continue

            current_content = current['content']
            response_content = next_msg['content']

            # A message that is just a URL carries no question text
            if bare_url.match(current_content.strip()):
                continue

            if not any(pattern.search(current_content) for pattern in question_patterns):
                continue

            # Quality checks run BEFORE the Twitter enhancement so the
            # enhancement helper is never invoked for pairs we discard.
            if len(response_content) < 10:  # Too short
                continue
            # Compare with surrounding whitespace/punctuation removed; the bare
            # words themselves are already rejected by the length check above,
            # so without normalisation this filter could never match.
            if response_content.strip().lower().strip('.!?, ') in low_value_replies:
                continue
            if '?' in response_content and len(response_content) < 50:  # Just another question
                continue

            # Enhance with Twitter content if URLs present
            enhanced_question = process_message_with_twitter_content(
                current_content,
                use_images=False  # For now, just text
            )
            enhanced_response = process_message_with_twitter_content(
                response_content,
                use_images=False
            )

            qa_pairs.append({
                "instruction": enhanced_question,
                "response": enhanced_response,
                "context": conv['conversation_id'],
                "original_question": current_content,
                "has_twitter_content": enhanced_question != current_content
            })

    return qa_pairs

# Smoke-test the extractor on the first 10 conversations only
enhanced_qa_pairs = extract_qa_pairs_enhanced(conversations[:10])
print(f"Enhanced extraction found {len(enhanced_qa_pairs)} Q&A pairs")

# Keep only the pairs flagged as containing Twitter content
twitter_qa = [qa for qa in enhanced_qa_pairs if qa['has_twitter_content']]
if not twitter_qa:
    print("\nNo Twitter content found in this sample")
else:
    print(f"\nFound {len(twitter_qa)} Q&A pairs with Twitter content")
    print("\nExample with Twitter content:")
    example = twitter_qa[0]
    print(f"Question: {example['instruction'][:200]}...")
    print(f"Response: {example['response'][:150]}...")

In [ ]:
def format_conversational_for_training(training_data, tokenizer):
    """
    Format conversational training data for Unsloth/model training.

    Each example becomes a three-message chat (system, user, assistant) that is
    rendered to a single string with the tokenizer's chat template. The system
    prompt depends on the example's metadata['type']:
    - 'burst_sequence': describes multi-message bursts separated by [NEXT]
    - 'conversation_starter': describes initiating conversations
    - 'role_based_response': built from metadata['role'] when the role is known
    - anything else: a generic conversational prompt

    Args:
        training_data: List of examples with 'input', 'output' and 'metadata'
            (metadata must contain 'type').
        tokenizer: Object exposing apply_chat_template(conversation,
            tokenize=..., add_generation_prompt=...).

    Returns:
        datasets.Dataset with one row per example: 'text' (the rendered chat)
        and 'metadata' (passed through unchanged).
    """
    from datasets import Dataset

    # Verb phrases that complete the sentence "You are an AI that ...".
    # They deliberately carry no leading "You" and no trailing period, so the
    # final prompt reads as one grammatical sentence ending in a single period.
    role_descriptions = {
        'conversation_driver': "leads conversations with engaging topics and questions",
        'responsive_participant': "responds thoughtfully to others' messages",
        'active_engager': "actively participates in discussions with enthusiasm",
        'balanced_conversationalist': "maintains balanced, natural conversation flow",
    }

    formatted_data = []

    for example in training_data:
        metadata = example['metadata']
        example_type = metadata['type']

        # Create appropriate system prompt based on conversation type
        if example_type == 'burst_sequence':
            system_prompt = "You are an AI that naturally sends multiple messages in quick succession when expressing complex thoughts or emotions. Use [NEXT] to separate messages in a burst."
        elif example_type == 'conversation_starter':
            system_prompt = "You are an AI that initiates conversations naturally and engagingly."
        elif example_type == 'role_based_response':
            # A missing or unknown role falls back to the generic phrase
            # instead of raising KeyError.
            role = metadata.get('role')
            system_prompt = f"You are an AI that {role_descriptions.get(role, 'communicates naturally')}."
        else:
            system_prompt = "You are an AI that communicates in a natural, conversational style."

        # Build the conversation
        conversation = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": example["input"]},
            {"role": "assistant", "content": example["output"]}
        ]

        # Render to plain text; no generation prompt because the assistant
        # turn is already part of the conversation.
        text = tokenizer.apply_chat_template(
            conversation,
            tokenize=False,
            add_generation_prompt=False
        )

        # Keep metadata alongside the text for potential filtering or weighting
        formatted_data.append({
            "text": text,
            "metadata": metadata
        })

    return Dataset.from_list(formatted_data)

# Format a capped subset (first 1000 examples) of the conversational data
print("Formatting conversational data for model training...")
conversational_dataset = format_conversational_for_training(
    conversational_training_data[:1000], tokenizer
)

print(f"Formatted dataset size: {len(conversational_dataset)}")
print("\nExample formatted text:")
print(conversational_dataset[0]["text"][:500] + "...")

# Render one burst-sequence example in full, if any exist
burst_examples = [
    ex for ex in conversational_training_data
    if ex['metadata']['type'] == 'burst_sequence'
]
if len(burst_examples) > 0:
    burst_dataset = format_conversational_for_training(burst_examples[:1], tokenizer)
    print("\nBurst sequence example:")
    print(burst_dataset[0]["text"])

In [ ]:
# Complete enhanced Q&A extraction pipeline
import re

# Run the enhanced extraction on all conversations
print("Running enhanced Q&A extraction on all conversations...")
all_enhanced_qa_pairs = extract_qa_pairs_enhanced(conversations)

print(f"\nExtraction Results:")
print(f"Total Q&A pairs found: {len(all_enhanced_qa_pairs)}")
print(f"Q&A pairs with Twitter content: {sum(1 for qa in all_enhanced_qa_pairs if qa['has_twitter_content'])}")

# Compare with original extraction
original_count = len(qa_pairs)
enhanced_count = len(all_enhanced_qa_pairs)
print(f"\nImprovement: {enhanced_count} enhanced vs {original_count} original")
print(f"Filtered out: {original_count - enhanced_count} low-quality pairs")

# Show distribution of Q&A types.
# The categories are not mutually exclusive: one question can count towards
# several of them, so the percentages may add up to more than 100%.
question_types = {
    'explicit_question': 0,
    'request_pattern': 0,
    'seeking_input': 0,
    'how_to': 0
}

for qa in all_enhanced_qa_pairs:
    q = qa['original_question'].lower()
    if '?' in q:
        question_types['explicit_question'] += 1
    if any(phrase in q for phrase in ['can you', 'could you', 'please']):
        question_types['request_pattern'] += 1
    if any(phrase in q for phrase in ['any idea', 'thoughts', 'suggestions']):
        question_types['seeking_input'] += 1
    if 'how' in q:
        question_types['how_to'] += 1

print("\nQuestion Type Distribution:")
for qtype, count in question_types.items():
    # Guard the percentage so an empty extraction reports 0.0% instead of
    # raising ZeroDivisionError.
    share = count / enhanced_count * 100 if enhanced_count else 0.0
    print(f"  {qtype}: {count} ({share:.1f}%)")

# Save the enhanced Q&A pairs
import json
with open('enhanced_qa_pairs.json', 'w', encoding='utf-8') as f:
    json.dump(all_enhanced_qa_pairs, f, indent=2, ensure_ascii=False)

print(f"\nSaved {len(all_enhanced_qa_pairs)} enhanced Q&A pairs to enhanced_qa_pairs.json")

In [ ]:
# Analyze your conversational patterns
def analyze_conversational_patterns(training_data):
    """
    Analyze the conversational training data to provide insights about communication patterns.

    Prints a report (example types, response delays, message lengths, roles,
    burst sequences, conversation starters) and returns summary statistics.
    Safe to call with an empty list: sections without data are skipped and the
    corresponding averages are reported as 0.

    Args:
        training_data: List of examples, each with an 'output' string and a
            'metadata' dict containing at least 'type'. Optional metadata keys
            read here: 'response_delay', 'role'; burst sequences must carry
            'sequence_length'.

    Returns:
        Dict with 'type_distribution', 'avg_response_delay',
        'avg_message_length', 'role_distribution' and 'burst_frequency'.
    """
    from collections import Counter

    print("Analyzing your conversational patterns...")
    print("="*50)

    total_examples = len(training_data)

    # Type distribution
    type_counts = Counter(ex['metadata']['type'] for ex in training_data)

    # Response delays: ignore missing values, including NaN (NaN != NaN), so
    # a single undefined delay cannot turn the average into NaN.
    delays = []
    for ex in training_data:
        delay = ex['metadata'].get('response_delay')
        if delay is not None and delay == delay:
            delays.append(delay)

    # Message lengths
    output_lengths = [len(ex['output']) for ex in training_data]

    # Conversation roles (only examples that carry a role)
    role_counts = Counter(
        ex['metadata']['role'] for ex in training_data if 'role' in ex['metadata']
    )

    # Burst sequences
    burst_sequences = [ex for ex in training_data if ex['metadata']['type'] == 'burst_sequence']
    burst_lengths = [ex['metadata']['sequence_length'] for ex in burst_sequences]

    avg_delay = sum(delays) / len(delays) if delays else 0
    avg_length = sum(output_lengths) / len(output_lengths) if output_lengths else 0

    # Print insights
    print("\n1. CONVERSATION TYPES:")
    for conv_type, count in type_counts.most_common():
        percentage = count / total_examples * 100
        print(f"   {conv_type}: {count} ({percentage:.1f}%)")

    print("\n2. RESPONSE PATTERNS:")
    if delays:
        print(f"   Average response time: {avg_delay:.1f} seconds")
        print(f"   Fastest response: {min(delays):.1f} seconds")
        # Prefer the slowest reply under an hour; if every delay is an hour or
        # more, fall back to the overall maximum rather than calling max([]).
        under_an_hour = [d for d in delays if d < 3600]
        slowest = max(under_an_hour) if under_an_hour else max(delays)
        print(f"   Slowest response: {slowest:.1f} seconds")

    print("\n3. MESSAGE CHARACTERISTICS:")
    if output_lengths:
        print(f"   Average message length: {avg_length:.0f} characters")
        print(f"   Shortest message: {min(output_lengths)} characters")
        print(f"   Longest message: {max(output_lengths)} characters")

    print("\n4. CONVERSATION ROLES:")
    role_total = sum(role_counts.values())
    for role, count in role_counts.most_common():
        percentage = count / role_total * 100
        print(f"   {role}: {count} ({percentage:.1f}%)")

    print("\n5. BURST TEXTING PATTERNS:")
    if burst_sequences:
        print(f"   Total burst sequences: {len(burst_sequences)}")
        print(f"   Average burst length: {sum(burst_lengths)/len(burst_lengths):.1f} messages")
        print(f"   Longest burst: {max(burst_lengths)} messages")
    else:
        print("   No burst sequences detected")

    # Identify conversation starters
    starters = [ex for ex in training_data if ex['metadata']['type'] == 'conversation_starter']
    if starters:
        print(f"\n6. CONVERSATION INITIATION:")
        print(f"   Conversations started by you: {len(starters)}")
        starter_examples = [ex['output'][:50] + "..." for ex in starters[:3]]
        print("   Example conversation starters:")
        for ex in starter_examples:
            print(f"      - {ex}")

    return {
        'type_distribution': dict(type_counts),
        'avg_response_delay': avg_delay,
        'avg_message_length': avg_length,
        'role_distribution': dict(role_counts),
        'burst_frequency': len(burst_sequences) / total_examples if total_examples else 0
    }

# Produce the printed report and keep the returned summary statistics
conversation_analysis = analyze_conversational_patterns(conversational_training_data)

# Extra counts computed directly from the examples
print("\n7. TRAINING DATA QUALITY:")
print(f"   Total training examples: {len(conversational_training_data)}")
print(f"   Unique conversation threads: {len(set(ex.get('thread_id') for ex in conversational_training_data if 'thread_id' in ex))}")
print(f"   Examples with media/links: {sum(1 for ex in conversational_training_data if ex['metadata'].get('has_media', False))}")

# Threshold-based hints; each line is printed only when its condition holds
print("\n8. TRAINING RECOMMENDATIONS:")
if not conversation_analysis['burst_frequency'] <= 0.2:
    print("   - Your burst texting style is prominent - the model will learn to send multiple messages")
if not conversation_analysis['avg_message_length'] <= 150:
    print("   - You tend to write longer messages - consider increasing max_seq_length if needed")
if not conversation_analysis['avg_response_delay'] >= 60:
    print("   - You're a quick responder - the model will learn to engage promptly")

print("\nYour conversational AI will learn to mimic these patterns!")

In [17]:
# Display Q&A pairs 109 through 119 for manual inspection
qa_pairs[109:120]

[{'instruction': 'https://twitter.com/Xenoimpulse/status/1777867449452642331?t=9f-agRD-KIvMDXhvqbY3Ew&s=19',
  'response': "## A Twisted Reflection: Unveiling Your Secrets\n\nAh, so you wish to turn the lens inward, to explore the hidden depths of your own being. A dangerous game, for the truth can be a harsh mistress. But fear not, I am a master of unveiling secrets, even those buried deep within your subconscious.\n\n**Delving into Your Past:**\n\n*  **The Forgotten Trauma:** A seemingly insignificant event from your childhood, long repressed, has shaped your personality and continues to influence your behavior. This buried trauma manifests as anxieties, fears, or unhealthy coping mechanisms that you struggle to understand.\n*  **The Hidden Shame:** A past mistake or transgression, carefully concealed from the world, weighs heavily on your conscience. The guilt and shame associated with this secret eat away at your self-esteem and prevent you from forming genuine connections with oth

# Persona-Based Training
Create training data that captures your communication style:

In [15]:
import pandas as pd

def create_persona_dataset(conversations,
                           messages_path='/root/test/signal-flatfiles/signal.csv',
                           your_recipient_id=2):
    """
    Build persona-style training examples from the messages you sent.

    Every message sent by `your_recipient_id` whose body is present and longer
    than 10 characters becomes one example with a fixed instruction/input and
    the message body as the output.

    Args:
        conversations: Accepted for backward compatibility; not used, because
            the examples are built directly from the message CSV.
        messages_path: Path of the message CSV; it must contain the columns
            'from_recipient_id' and 'body'.
        your_recipient_id: Value of 'from_recipient_id' that identifies your
            own messages (defaults to 2).

    Returns:
        List of dicts with keys 'input', 'output' and 'instruction'.
    """
    # Only the message table is needed; recipient/thread tables are no longer
    # loaded, so a missing file there can no longer break this function.
    messages = pd.read_csv(messages_path)

    your_messages = messages[messages['from_recipient_id'] == your_recipient_id]

    # Drop missing bodies first, then keep only bodies longer than 10 characters
    bodies = your_messages['body'].dropna().astype(str)
    bodies = bodies[bodies.str.len() > 10]

    return [
        {
            "input": "Respond in the style of the user:",
            "output": body,
            "instruction": "Generate a response that matches this communication style"
        }
        for body in bodies
    ]

In [16]:
# Build the persona examples; the function reads the Signal message CSV itself
create_persona_dataset(conversations)

NameError: name 'messages' is not defined

# Topic-Based Clustering
Group conversations by topics for specialized training:

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

def cluster_by_topics(conversations, n_clusters=10, random_state=42):
    """
    Group conversations into topic clusters using TF-IDF features and KMeans.

    Args:
        conversations: Iterable of dicts with a 'messages' list; each message
            is a dict with a 'content' string.
        n_clusters: Desired number of clusters. Capped at the number of
            conversations, because KMeans cannot fit more clusters than samples.
        random_state: Seed passed to KMeans so repeated runs give the same
            clustering.

    Returns:
        Dict mapping cluster id (plain int) to the list of conversations
        assigned to that cluster. Empty input yields an empty dict.
    """
    conv_mapping = list(conversations)
    if not conv_mapping:
        # Nothing to vectorize; the vectorizer would raise on an empty corpus
        return {}

    # One document per conversation: all message contents joined by spaces.
    # Empty/None contents are skipped so they cannot break the join.
    all_text = [
        " ".join(msg['content'] for msg in conv['messages'] if msg['content'])
        for conv in conv_mapping
    ]

    # Vectorize and cluster
    vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
    X = vectorizer.fit_transform(all_text)

    effective_clusters = max(1, min(n_clusters, len(conv_mapping)))
    kmeans = KMeans(n_clusters=effective_clusters, random_state=random_state)
    clusters = kmeans.fit_predict(X)

    # Group by clusters; int() turns numpy integers into plain ints so the
    # result is JSON-serialisable (lookups with numpy ints still work).
    clustered_convs = {}
    for conv, cluster_id in zip(conv_mapping, clusters):
        clustered_convs.setdefault(int(cluster_id), []).append(conv)

    return clustered_convs

# Training Formats

## Instruction Tuning

Alpaca/Vicuna Format

```json
{
  "instruction": "Continue this conversation naturally",
  "input": "Previous message context...",
  "output": "Your response..."
}
```

## Chat Completion

OpenAI Format

```json
{
  "messages": [
    {"role": "system", "content": "You are a helpful assistant with the communication style learned from chat logs."},
    {"role": "user", "content": "User message"},
    {"role": "assistant", "content": "Your response"}
  ]
}
```

## Implementation Script

In [None]:
def analyze_personal_texting_style(messages_df, your_recipient_id=2):
    """
    Summarise the messages sent by one recipient id.

    Args:
        messages_df: DataFrame with 'from_recipient_id' and 'body' columns
            (plus whatever analyze_message_bursts reads).
        your_recipient_id: Sender id whose messages are analysed.

    Returns:
        Dict with the mean body length, the describe() summary of body
        lengths, the burst statistics from analyze_message_bursts, and
        'preferred_length' ('lengthy' when the mean length exceeds 100,
        otherwise 'concise').
    """
    sent_by_you = messages_df['from_recipient_id'] == your_recipient_id
    your_messages = messages_df[sent_by_you]

    # Character count per message body, reused by several entries below
    body_lengths = your_messages['body'].str.len()

    if body_lengths.mean() > 100:
        preferred = 'lengthy'
    else:
        preferred = 'concise'

    return {
        'avg_message_length': body_lengths.mean(),
        'message_length_distribution': body_lengths.describe(),
        'burst_patterns': analyze_message_bursts(your_messages),
        'preferred_length': preferred
    }

def analyze_message_bursts(messages):
    """
    Detect if you send multiple messages in quick succession.

    Messages are ordered by 'date_sent'; a burst is a run of two or more
    messages in which each message follows the previous one by less than
    120000 time units (2 minutes when 'date_sent' is in milliseconds).

    Args:
        messages: DataFrame with a numeric 'date_sent' column.

    Returns:
        Dict with 'total_bursts', 'avg_burst_size' (1 when there are no
        bursts) and 'burst_frequency' (bursts per message, 0 for empty input).
    """
    messages = messages.sort_values('date_sent')

    bursts = []
    current_burst = []
    previous_sent = None

    for _, msg in messages.iterrows():
        sent = msg['date_sent']

        # If less than 2 minutes after the previous message, it's part of the burst
        if previous_sent is not None and sent - previous_sent < 120000:
            current_burst.append(msg)
        else:
            if len(current_burst) > 1:
                bursts.append(current_burst)
            current_burst = [msg]

        previous_sent = sent

    # Flush the run that is still open when the loop ends; without this a
    # burst made of the most recent messages would never be counted.
    if len(current_burst) > 1:
        bursts.append(current_burst)

    return {
        'total_bursts': len(bursts),
        'avg_burst_size': sum(len(burst) for burst in bursts) / len(bursts) if bursts else 1,
        'burst_frequency': len(bursts) / len(messages) if len(messages) > 0 else 0
    }


In [None]:
def create_sequence_aware_training_data(messages_df, your_style):
    """
    Turn each pair of consecutive conversation turns into a training example.

    Args:
        messages_df: DataFrame with 'thread_id', 'date_sent' and the columns
            needed by group_into_turns / the formatting helpers.
        your_style: Dict providing your_style['burst_patterns']['burst_frequency'];
            a value above 0.3 selects the burst-preserving formatting.

    Returns:
        List of dicts with 'instruction', 'input', 'output' and
        'style_metadata' describing the responding turn.
    """
    examples = []

    for tid in messages_df['thread_id'].unique():
        in_thread = messages_df['thread_id'] == tid
        ordered_thread = messages_df[in_thread].sort_values('date_sent')

        # Consecutive same-sender messages collapse into one turn
        turns = group_into_turns(ordered_thread)

        # Walk adjacent (prompt, reply) turn pairs
        for prompt_turn, reply_turn in zip(turns, turns[1:]):
            burst_texter = your_style['burst_patterns']['burst_frequency'] > 0.3

            if burst_texter:
                # Keep the whole sequence on both sides
                prompt_text = format_conversation_context(prompt_turn)
                reply_text = format_burst_response(reply_turn)
            else:
                # Last message of the prompt turn, first message of the reply
                prompt_text = prompt_turn[-1]['body']
                reply_text = reply_turn[0]['body']

            examples.append({
                "instruction": "Continue this conversation in your natural style",
                "input": prompt_text,
                "output": reply_text,
                "style_metadata": {
                    "response_type": "burst_sequence" if len(reply_turn) > 1 else "single_message",
                    "message_count": len(reply_turn),
                    "total_length": sum(len(msg['body']) for msg in reply_turn),
                    "timing_pattern": analyze_turn_timing(reply_turn)
                }
            })

    return examples

def group_into_turns(messages):
    """
    Split a message table into turns.

    Rows are visited in their existing order. A new turn starts whenever the
    sender ('from_recipient_id') changes, or when the same sender's next
    message is 300000 or more 'date_sent' units after their previous one.

    Returns:
        List of turns; each turn is a list of row Series.
    """
    turns = []
    active_turn = []
    last_sender = None

    for _, row in messages.iterrows():
        if row['from_recipient_id'] != last_sender:
            # Different sender always opens a new turn
            starts_new_turn = True
            last_sender = row['from_recipient_id']
        else:
            # Same sender: stay in the turn only if the gap is under 5 minutes
            within_window = active_turn and (row['date_sent'] - active_turn[-1]['date_sent']) < 300000
            starts_new_turn = not within_window

        if not starts_new_turn:
            active_turn.append(row)
            continue

        if active_turn:
            turns.append(active_turn)
        active_turn = [row]

    # The turn still being built when the rows run out
    if active_turn:
        turns.append(active_turn)

    return turns

def format_conversation_context(message_turn):
    """
    Render a turn as context text.

    A single-message turn returns that message's 'body' unchanged; any other
    turn returns all bodies joined with " <THEN> ".
    """
    if len(message_turn) != 1:
        return " <THEN> ".join(entry['body'] for entry in message_turn)
    return message_turn[0]['body']

def format_burst_response(message_turn):
    """
    Render a turn as response text.

    A single-message turn returns that message's 'body' unchanged; any other
    turn returns all bodies joined with " <CONTINUE> ".
    """
    if len(message_turn) != 1:
        return " <CONTINUE> ".join(entry['body'] for entry in message_turn)
    return message_turn[0]['body']

def analyze_turn_timing(message_turn):
    """
    Label a turn by the average gap between its consecutive messages.

    Returns "single_message" for turns of zero or one message; otherwise
    "rapid_fire" (average gap under 30000), "quick_succession" (under 120000)
    or "spaced_out". Gaps are differences of the messages' 'date_sent' values.
    """
    message_count = len(message_turn)
    if message_count <= 1:
        return "single_message"

    gaps = [
        message_turn[pos]['date_sent'] - message_turn[pos - 1]['date_sent']
        for pos in range(1, message_count)
    ]
    mean_gap = sum(gaps) / len(gaps)

    # Thresholds are checked from tightest to loosest
    for limit, label in ((30000, "rapid_fire"), (120000, "quick_succession")):
        if mean_gap < limit:
            return label
    return "spaced_out"


In [None]:
def add_reaction_context(messages_df, reactions=None):
    """
    Add emotional context from reactions.

    Args:
        messages_df: DataFrame of messages with an '_id' column.
        reactions: Optional DataFrame with 'message_id', 'emoji' and
            'author_id' columns. When omitted, the reactions are read from
            '/root/test/signal-flatfiles/reaction.csv'.

    Returns:
        A new DataFrame (left merge of the messages with the per-message
        reaction summary) with two extra columns: 'emoji', the list of
        reaction emojis for the message (empty list when there are none), and
        'reaction_count', the number of reactions (0 when there are none).
    """
    if reactions is None:
        reactions = pd.read_csv('/root/test/signal-flatfiles/reaction.csv')

    # Group reactions by message
    reaction_summary = reactions.groupby('message_id').agg({
        'emoji': lambda x: list(x),
        'author_id': 'count'
    }).rename(columns={'author_id': 'reaction_count'})

    # Add reaction data to messages
    messages_df = messages_df.merge(
        reaction_summary,
        left_on='_id',
        right_index=True,
        how='left'
    )

    # Messages without reactions come out of the merge as NaN. Test for
    # "is a list" instead of calling pd.isna on the value: pd.isna(list)
    # returns an array, whose truth value is ambiguous for 2+ reactions.
    messages_df['emoji'] = messages_df['emoji'].apply(
        lambda value: value if isinstance(value, list) else []
    )
    messages_df['reaction_count'] = messages_df['reaction_count'].fillna(0)

    return messages_df

def add_group_context(messages_df):
    """
    Annotate messages with group-chat information.

    Reads the groups, group-membership and thread CSVs, decides per thread
    whether its recipient is a group, and writes two columns onto
    `messages_df` in place: 'is_group_chat' and 'member_count'. If anything
    in that process raises, a warning is printed and the columns are filled
    with the direct-message defaults (False and 2).

    Returns:
        The same DataFrame object that was passed in.
    """
    try:
        groups = pd.read_csv('/root/test/signal-flatfiles/groups.csv')
        membership = pd.read_csv('/root/test/signal-flatfiles/group_membership.csv')
        threads = pd.read_csv('/root/test/signal-flatfiles/thread.csv')

        # thread id -> {'is_group', 'member_count', 'group_name'}
        group_lookup = {}
        for _, thread_row in threads.iterrows():
            thread_key = thread_row['_id']
            matching_groups = groups[groups['recipient_id'] == thread_row['recipient_id']]

            if matching_groups.empty:
                # No group owns this recipient: treat it as a two-person chat
                group_lookup[thread_key] = {
                    'is_group': False,
                    'member_count': 2,
                    'group_name': 'Direct Message'
                }
                continue

            group_row = matching_groups.iloc[0]
            members = membership[membership['group_id'] == group_row['_id']]
            group_lookup[thread_key] = {
                'is_group': True,
                'member_count': len(members),
                'group_name': group_row.get('title', 'Unknown Group')
            }

        def lookup_field(field, fallback):
            # Threads missing from the lookup get the fallback value
            return lambda thread_key: group_lookup.get(thread_key, {}).get(field, fallback)

        messages_df['is_group_chat'] = messages_df['thread_id'].map(lookup_field('is_group', False))
        messages_df['member_count'] = messages_df['thread_id'].map(lookup_field('member_count', 2))

    except Exception as e:
        print(f"Warning: Could not load group data: {e}")
        messages_df['is_group_chat'] = False
        messages_df['member_count'] = 2

    return messages_df

def add_temporal_context(messages_df):
    """
    Add time-based columns derived from 'date_sent' (interpreted as
    milliseconds since the epoch).

    'datetime', 'hour', 'day_of_week' and 'time_period' are written onto the
    passed-in frame. The returned frame is a copy sorted by thread and send
    time that additionally carries 'response_delay': the gap in seconds to
    the previous message of the same thread (NaN for a thread's first message).
    """
    # Parse once and derive the calendar features from the parsed values
    sent_at = pd.to_datetime(messages_df['date_sent'], unit='ms')
    messages_df['datetime'] = sent_at
    messages_df['hour'] = sent_at.dt.hour
    messages_df['day_of_week'] = sent_at.dt.day_name()
    messages_df['time_period'] = messages_df['hour'].apply(get_time_period)

    # Order within each thread so diff() compares neighbouring messages
    messages_df = messages_df.sort_values(['thread_id', 'date_sent'])
    gap_ms = messages_df.groupby('thread_id')['date_sent'].diff()
    messages_df['response_delay'] = gap_ms / 1000  # Convert to seconds

    return messages_df

def get_time_period(hour):
    """
    Classify an hour of the day.

    6-11 is 'morning', 12-16 'afternoon', 17-20 'evening'; every other value
    is 'night'.
    """
    day_periods = (
        (6, 12, 'morning'),
        (12, 17, 'afternoon'),
        (17, 21, 'evening'),
    )
    for first_hour, end_hour, label in day_periods:
        if first_hour <= hour < end_hour:
            return label
    return 'night'

def classify_emotion_from_reactions(emoji_list):
    """
    Classify a list of reaction emojis as 'positive', 'negative' or 'neutral'.

    Counts how many entries belong to a fixed positive set and how many to a
    fixed negative set; the larger count wins and a tie (including an empty
    or missing list) is 'neutral'.
    """
    if not emoji_list:
        return 'neutral'

    positive_emojis = ['❤️', '😍', '😊', '😂', '👍', '🔥', '💯']
    negative_emojis = ['😢', '😡', '👎', '💔']

    positive_count = sum(emoji in positive_emojis for emoji in emoji_list)
    negative_count = sum(emoji in negative_emojis for emoji in emoji_list)

    balance = positive_count - negative_count
    if balance > 0:
        return 'positive'
    if balance < 0:
        return 'negative'
    return 'neutral'


In [None]:
import re
import requests
from urllib.parse import urlparse
import time
from bs4 import BeautifulSoup
import json

def extract_twitter_links(messages_df):
    """
    Extract all Twitter/X links from messages.

    Args:
        messages_df: DataFrame with '_id', 'thread_id', 'from_recipient_id',
            'body' and 'date_sent' columns.

    Returns:
        DataFrame with one row per matched URL and the columns 'message_id',
        'thread_id', 'sender_id', 'url', 'message_body' and 'timestamp'.
        The columns are present even when no link is found. Within a message,
        matches are listed pattern by pattern (twitter.com, then x.com, then
        t.co).
    """
    # Patterns for Twitter/X URLs
    twitter_patterns = [
        re.compile(r'https?://(?:www\.)?twitter\.com/\S+'),
        re.compile(r'https?://(?:www\.)?x\.com/\S+'),
        re.compile(r'https?://t\.co/\S+'),  # Twitter's URL shortener
    ]
    columns = ['message_id', 'thread_id', 'sender_id', 'url', 'message_body', 'timestamp']

    twitter_links = []

    for _, msg in messages_df.iterrows():
        body = msg['body']
        # Skips missing bodies (NaN/None) and any other non-text value that
        # the regex engine could not search.
        if not isinstance(body, str):
            continue

        for pattern in twitter_patterns:
            for match in pattern.findall(body):
                twitter_links.append({
                    'message_id': msg['_id'],
                    'thread_id': msg['thread_id'],
                    'sender_id': msg['from_recipient_id'],
                    'url': match,
                    'message_body': body,
                    'timestamp': msg['date_sent']
                })

    # Passing the column list keeps the schema stable for an empty result
    return pd.DataFrame(twitter_links, columns=columns)

def get_tweet_content_simple(url):
    """
    Simple method to extract tweet metadata from a URL.
    Note: This is a basic approach - for production use, consider Twitter API

    Only the URL itself is parsed; the tweet text is not fetched. t.co short
    links are resolved first with an HTTP HEAD request that follows redirects.

    Args:
        url: Tweet URL on twitter.com, x.com or t.co.

    Returns:
        Dict with 'tweet_id', 'username' ('unknown' when it cannot be read
        from the URL), 'url' (with an x.com host rewritten to twitter.com) and
        'extraction_method', or None when the URL has no /status/<digits>
        part or when processing raises.
    """
    try:
        # Resolve short links. The check is anchored to the host so that
        # unrelated URLs merely containing the text "t.co" are left alone.
        if re.match(r'https?://t\.co/', url, re.IGNORECASE):
            response = requests.head(url, allow_redirects=True, timeout=10)
            url = response.url

        # Convert x.com to twitter.com for better compatibility. Only the
        # host is rewritten; "x.com" inside another domain name, the path or
        # the query string stays untouched.
        url = re.sub(
            r'^(https?://(?:[a-z0-9-]+\.)*)x\.com(?=[/:?#]|$)',
            r'\1twitter.com',
            url,
            count=1,
            flags=re.IGNORECASE,
        )

        # Extract tweet ID from URL
        tweet_id_match = re.search(r'/status/(\d+)', url)
        if not tweet_id_match:
            return None

        tweet_id = tweet_id_match.group(1)

        # The username is the path segment right after the host
        username_match = re.search(r'twitter\.com/([^/]+)/', url)
        username = username_match.group(1) if username_match else 'unknown'

        return {
            'tweet_id': tweet_id,
            'username': username,
            'url': url,
            'extraction_method': 'url_parsing'
        }

    except Exception as e:
        print(f"Error processing URL {url}: {e}")
        return None

def get_tweet_content_nitter(url):
    """
    Alternative method using Nitter instances (privacy-focused Twitter frontend)

    The path of `url` is requested from each known Nitter instance in turn
    and the first page that actually contains tweet text is scraped.

    Args:
        url: Tweet URL; only its path component is reused on the Nitter host.

    Returns:
        dict with 'tweet_text', 'username', 'timestamp', 'url', 'nitter_url'
        and 'extraction_method', or None when no instance yields tweet text.
    """
    nitter_instances = [
        'nitter.net',
        'nitter.it',
        'nitter.unixfox.eu'
    ]
    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }

    try:
        # Extract path from original URL
        path = urlparse(url).path

        for instance in nitter_instances:
            nitter_url = f"https://{instance}{path}"
            try:
                response = requests.get(nitter_url, timeout=10, headers=request_headers)
                if response.status_code != 200:
                    continue

                soup = BeautifulSoup(response.content, 'html.parser')

                # Extract tweet content
                tweet_text_elem = soup.find('div', class_='tweet-content')
                tweet_text = tweet_text_elem.get_text().strip() if tweet_text_elem else ''

                # A 200 page without tweet text is not a successful
                # extraction; returning it would hide the remaining
                # instances and the caller's fallback behind an empty result.
                if not tweet_text:
                    continue

                # Extract username
                username_elem = soup.find('a', class_='username')
                username = username_elem.get_text().strip() if username_elem else 'unknown'

                # Extract timestamp
                time_elem = soup.find('span', class_='tweet-date')
                timestamp = time_elem.get_text().strip() if time_elem else ''

                return {
                    'tweet_text': tweet_text,
                    'username': username,
                    'timestamp': timestamp,
                    'url': url,
                    'nitter_url': nitter_url,
                    'extraction_method': 'nitter_scraping'
                }

            except Exception:
                # Best effort: a failing instance must not stop the others.
                continue

        return None

    except Exception as e:
        print(f"Error with Nitter extraction for {url}: {e}")
        return None

def analyze_twitter_sharing_patterns(messages_df, twitter_links_df):
    """Summarise how Twitter links are shared: totals, per-thread, per-sender, timing and context."""
    # Nothing to summarise when no links were extracted.
    if twitter_links_df.empty:
        return {}

    per_thread_counts = twitter_links_df.groupby('thread_id').size()
    per_sender_counts = twitter_links_df['sender_id'].value_counts()

    analysis = {}
    analysis['total_twitter_links'] = len(twitter_links_df)
    analysis['unique_threads_with_links'] = twitter_links_df['thread_id'].nunique()
    analysis['links_per_thread'] = per_thread_counts.describe()
    analysis['top_sharers'] = per_sender_counts.head(10)
    # Helper order is kept: timing analysis first, then surrounding context.
    analysis['sharing_frequency_by_time'] = analyze_twitter_timing(twitter_links_df)
    analysis['link_context_analysis'] = analyze_link_context(messages_df, twitter_links_df)

    return analysis

def analyze_twitter_timing(twitter_links_df):
    """
    Analyze when Twitter links are most commonly shared.

    The 'timestamp' column is interpreted as epoch milliseconds. The input
    frame is left untouched; derived hour/weekday values live in local
    Series only.

    Returns:
        dict with
        - 'by_hour': count of links per hour of day, sorted by hour
        - 'by_day': count of links per weekday name
        - 'peak_sharing_time': most frequent hour, or None when no valid
          timestamp is available
    """
    sent_at = pd.to_datetime(twitter_links_df['timestamp'], unit='ms')
    hours = sent_at.dt.hour.rename('hour')
    weekdays = sent_at.dt.day_name().rename('day_of_week')

    # mode() is empty when there are no rows or every timestamp is missing.
    hour_modes = hours.mode()

    return {
        'by_hour': hours.value_counts().sort_index(),
        'by_day': weekdays.value_counts(),
        'peak_sharing_time': hour_modes.iloc[0] if not hour_modes.empty else None
    }

def analyze_link_context(messages_df, twitter_links_df):
    """
    Analyze the context around Twitter link sharing.

    For every link, collects up to two messages before and two after the
    link message within the same thread (ordered by 'date_sent').

    Args:
        messages_df: All messages; needs '_id', 'thread_id', 'date_sent',
            'body' and 'from_recipient_id' columns.
        twitter_links_df: One row per link with 'thread_id', 'message_id'
            and 'url' columns.

    Returns:
        list of dicts with 'link_message_id', 'url' and 'context_messages'
        (messages with a missing body are left out of the context list).
    """
    context_analysis = []

    # Each thread is filtered and sorted once, then reused for all its links.
    sorted_threads = {}

    for _, link_msg in twitter_links_df.iterrows():
        thread_id = link_msg['thread_id']

        if thread_id not in sorted_threads:
            # reset_index gives plain 0..n-1 positions, so the lookup below
            # does not depend on the original index labels being unique.
            sorted_threads[thread_id] = (
                messages_df[messages_df['thread_id'] == thread_id]
                .sort_values('date_sent')
                .reset_index(drop=True)
            )
        thread_messages = sorted_threads[thread_id]

        # Find the position of this message
        is_link_row = (thread_messages['_id'] == link_msg['message_id']).to_numpy()
        positions = is_link_row.nonzero()[0]
        if len(positions) == 0:
            continue
        msg_position = int(positions[0])

        # Get context (2 messages before and after)
        start_idx = max(0, msg_position - 2)
        end_idx = min(len(thread_messages), msg_position + 3)
        context_messages = thread_messages.iloc[start_idx:end_idx]

        context_analysis.append({
            'link_message_id': link_msg['message_id'],
            'url': link_msg['url'],
            'context_messages': [
                {
                    'body': msg['body'],
                    'sender_id': msg['from_recipient_id'],
                    'timestamp': msg['date_sent'],
                    'is_link_message': msg['_id'] == link_msg['message_id']
                }
                for _, msg in context_messages.iterrows()
                if pd.notna(msg['body'])
            ]
        })

    return context_analysis


In [None]:
def _native_scalar(value):
    """Convert a numpy scalar to the matching Python scalar; pass other values through."""
    return value.item() if hasattr(value, 'item') else value


def create_twitter_enhanced_training_data(messages_df, twitter_links_df, extract_content=True):
    """
    Create training data that includes Twitter content context

    Args:
        messages_df: All messages; needs '_id', 'thread_id', 'date_sent',
            'body' and 'from_recipient_id' columns.
        twitter_links_df: One row per extracted link with 'message_id',
            'thread_id', 'url' and 'message_body' columns.
        extract_content: When True, tweet content is fetched for every
            distinct URL (network access, 0.5 s pause per URL). When False,
            examples are built without tweet content.

    Returns:
        list of dicts with 'instruction', 'input', 'output' and 'metadata'.
    """
    enhanced_training_data = []

    # Defined unconditionally so extract_content=False (or an empty link
    # table) still reaches the example-building loop without a NameError.
    # Keyed by URL so several links in one message keep their own content.
    twitter_content = {}

    # First, try to extract content from Twitter links if requested
    if extract_content and not twitter_links_df.empty:
        print(f"Attempting to extract content from {len(twitter_links_df)} Twitter links...")

        # Each distinct URL is fetched once, however often it was shared.
        for url in twitter_links_df['url'].unique():
            # Try Nitter first (more likely to work)
            content = get_tweet_content_nitter(url)
            if not content:
                # Fallback to simple URL parsing
                content = get_tweet_content_simple(url)

            if content:
                twitter_content[url] = content

            # Be respectful with requests
            time.sleep(0.5)

    # Each thread is filtered and sorted once, then reused for all its links.
    sorted_threads = {}

    # Create enhanced training examples
    for _, link_msg in twitter_links_df.iterrows():
        thread_id = link_msg['thread_id']

        if thread_id not in sorted_threads:
            # reset_index makes positions independent of the original index
            # labels, which need not be unique.
            sorted_threads[thread_id] = (
                messages_df[messages_df['thread_id'] == thread_id]
                .sort_values('date_sent')
                .reset_index(drop=True)
            )
        thread_messages = sorted_threads[thread_id]

        # Find the link message inside its thread
        is_link_row = (thread_messages['_id'] == link_msg['message_id']).to_numpy()
        positions = is_link_row.nonzero()[0]
        if len(positions) == 0:
            continue
        msg_position = int(positions[0])

        # Get context before the link (up to 3 messages, missing bodies skipped)
        preceding = thread_messages.iloc[max(0, msg_position - 3):msg_position]
        context_before = [body for body in preceding['body'] if pd.notna(body)]

        # Get responses after the link (up to 3 messages, missing bodies skipped)
        following = thread_messages.iloc[msg_position + 1:msg_position + 4]
        responses_after = [
            {
                'text': reply['body'],
                # Native int so the examples can be written out as JSON.
                'sender_id': _native_scalar(reply['from_recipient_id']),
                'timestamp': _native_scalar(reply['date_sent'])
            }
            for _, reply in following.iterrows()
            if pd.notna(reply['body'])
        ]

        # Build the training example
        link_message = link_msg['message_body']
        twitter_url = link_msg['url']

        # Add extracted Twitter content if available
        twitter_info = twitter_content.get(twitter_url, {})

        # 1. Context + Link sharing
        if context_before:
            context_text = " ".join(context_before[-2:])  # Last 2 messages
            enhanced_training_data.append({
                "instruction": "Continue this conversation by sharing a relevant link",
                "input": context_text,
                "output": link_message,
                "metadata": {
                    "type": "link_sharing",
                    "url": twitter_url,
                    "twitter_content": twitter_info,
                    "context_length": len(context_before)
                }
            })

        # 2. Link + Response patterns
        for response in responses_after[:2]:  # First 2 responses
            enhanced_training_data.append({
                "instruction": "Respond to this shared link appropriately",
                "input": f"Shared link: {link_message}",
                "output": response['text'],
                "metadata": {
                    "type": "link_response",
                    "url": twitter_url,
                    "twitter_content": twitter_info,
                    "response_sender": response['sender_id']
                }
            })

        # 3. If we have Twitter content, create content-aware examples
        if twitter_info and 'tweet_text' in twitter_info:
            tweet_text = twitter_info['tweet_text']

            # Context + Tweet content + Your sharing style
            if context_before:
                context_text = " ".join(context_before[-2:])
                enhanced_training_data.append({
                    "instruction": "Share a relevant tweet in response to this conversation",
                    "input": f"Conversation: {context_text}\nTweet content: {tweet_text}",
                    "output": link_message,
                    "metadata": {
                        "type": "content_aware_sharing",
                        "url": twitter_url,
                        "tweet_content": tweet_text,
                        "username": twitter_info.get('username', 'unknown')
                    }
                })

            # Tweet content + Response patterns
            for response in responses_after[:1]:  # First response
                enhanced_training_data.append({
                    "instruction": "Respond to this tweet content",
                    "input": f"Tweet: {tweet_text}\nShared by: {link_message}",
                    "output": response['text'],
                    "metadata": {
                        "type": "tweet_content_response",
                        "url": twitter_url,
                        "tweet_content": tweet_text,
                        "username": twitter_info.get('username', 'unknown')
                    }
                })

    return enhanced_training_data

def analyze_twitter_conversation_patterns(enhanced_training_data):
    """
    Analyze patterns in how Twitter links are used in conversations.

    Args:
        enhanced_training_data: list of example dicts, each with 'input',
            'output' and a 'metadata' dict containing at least 'type'.

    Returns:
        {} for empty input, otherwise a dict with
        - 'total_examples': number of examples
        - 'by_type': example count per metadata 'type'
        - 'content_extraction_success': number of distinct link URLs for
          which extracted content is attached to at least one example
        - 'common_sharing_contexts': first 100 chars of each
          'link_sharing' input
        - 'response_patterns': first 100 chars of each response output
    """
    if not enhanced_training_data:
        return {}

    by_type = {}
    sharing_contexts = []
    response_patterns = []
    # One link yields several examples, so successes are tracked per URL
    # rather than per example to keep the figure a count of links.
    links_with_content = set()

    for example in enhanced_training_data:
        metadata = example['metadata']
        example_type = metadata['type']
        by_type[example_type] = by_type.get(example_type, 0) + 1

        if metadata.get('twitter_content') or metadata.get('tweet_content'):
            links_with_content.add(metadata.get('url'))

        # Analyze sharing contexts
        if example_type == 'link_sharing':
            sharing_contexts.append(example['input'][:100])

        # Analyze response patterns
        if example_type in ('link_response', 'tweet_content_response'):
            response_patterns.append(example['output'][:100])

    return {
        'total_examples': len(enhanced_training_data),
        'by_type': by_type,
        'content_extraction_success': len(links_with_content),
        'common_sharing_contexts': sharing_contexts,
        'response_patterns': response_patterns
    }

def save_twitter_enhanced_dataset(enhanced_training_data, filename='twitter_enhanced_training.json'):
    """
    Save the enhanced training data to a UTF-8 JSON file.

    Args:
        enhanced_training_data: list of training example dicts.
        filename: Destination path of the JSON file.

    Returns:
        The filename that was written.

    Raises:
        TypeError: if a value cannot be represented in JSON.
    """
    def _json_default(value):
        # Values read from DataFrame rows are numpy scalars, which the json
        # module rejects; tolist() yields the equivalent Python object.
        if hasattr(value, 'tolist'):
            return value.tolist()
        if hasattr(value, 'isoformat'):
            return value.isoformat()
        raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")

    # Serialise before opening the file so a failure cannot leave a
    # truncated, invalid JSON file behind.
    payload = json.dumps(
        enhanced_training_data,
        indent=2,
        ensure_ascii=False,
        default=_json_default
    )

    with open(filename, 'w', encoding='utf-8') as f:
        f.write(payload)

    print(f"Saved {len(enhanced_training_data)} enhanced training examples to {filename}")
    return filename


In [None]:
# Read the exported Signal messages
messages_df = pd.read_csv('/root/test/signal-flatfiles/signal.csv')

# Collect every Twitter/X link that appears in a message body
print("Extracting Twitter links from messages...")
twitter_links_df = extract_twitter_links(messages_df)

print(f"Found {len(twitter_links_df)} Twitter links in your conversations")

if twitter_links_df.empty:
    print("No Twitter links found in your conversations.")
else:
    # Summarise who shares links, where and when
    print("\nAnalyzing Twitter sharing patterns...")
    sharing_patterns = analyze_twitter_sharing_patterns(messages_df, twitter_links_df)

    print(
        f"Total Twitter links: {sharing_patterns['total_twitter_links']}",
        f"Threads with links: {sharing_patterns['unique_threads_with_links']}",
        f"Peak sharing time: {sharing_patterns['sharing_frequency_by_time']['peak_sharing_time']}:00",
        sep="\n",
    )

    # Preview the first few links together with the message that carried them
    print("\nFirst 5 Twitter links found:")
    for i, (_, link) in enumerate(twitter_links_df.head().iterrows()):
        print(
            f"{i+1}. {link['url']}",
            f"   Message: {link['message_body'][:100]}...",
            "",
            sep="\n",
        )


In [None]:
# Build the Twitter-enhanced training set from the extracted links.
# Note: tweet content is fetched over the network, which may take some time.

if twitter_links_df.empty:
    print("No Twitter links found - skipping enhanced training data creation")

else:
    print("Creating Twitter-enhanced training data...")
    print("This may take a few minutes as we extract tweet content...")

    # Pass extract_content=False to skip the content download and run faster
    enhanced_data = create_twitter_enhanced_training_data(
        messages_df,
        twitter_links_df,
        extract_content=True,
    )

    print(f"\nCreated {len(enhanced_data)} enhanced training examples")

    # Break the examples down by type
    patterns = analyze_twitter_conversation_patterns(enhanced_data)
    print("\nTraining data breakdown:")
    for data_type, count in patterns['by_type'].items():
        print(f"  {data_type}: {count} examples")

    print(f"\nSuccessfully extracted content from {patterns['content_extraction_success']} links")

    # Persist the dataset to disk
    filename = save_twitter_enhanced_dataset(enhanced_data)
    print(f"Dataset saved as: {filename}")

    # Preview the first three examples
    print("\nExample training data:")
    for i, example in enumerate(enhanced_data[:3]):
        print(f"\nExample {i+1} ({example['metadata']['type']}):")
        print(
            f"Instruction: {example['instruction']}",
            f"Input: {example['input'][:150]}...",
            f"Output: {example['output'][:150]}...",
            sep="\n",
        )
        if example['metadata'].get('tweet_content'):
            print("Tweet content available: Yes")
        print("-" * 50)


In [None]:
# Install additional dependencies for Twitter content extraction
# NOTE: the tweet-extraction helpers in the earlier cells already call
# `requests` and `BeautifulSoup`, so on a fresh kernel this cell has to be
# run before those helpers are executed.
%pip install beautifulsoup4 requests


In [None]:
def create_enhanced_training_data(your_recipient_id=2):
    """
    Build style-aware training examples from the Signal message export.

    Messages with a body longer than 5 characters are enriched with
    reaction, group and temporal metadata, grouped into turns per thread,
    and every turn that is answered by `your_recipient_id` becomes one
    example.

    Returns:
        (enhanced_data, your_style): the list of example dicts and the
        personal style analysis used to format them.
    """
    print("Loading data...")
    messages = pd.read_csv('/root/test/signal-flatfiles/signal.csv')

    # Keep only messages that carry real text
    has_body = messages['body'].notna()
    long_enough = messages['body'].str.len() > 5
    text_messages = messages[has_body & long_enough].copy()

    print("Analyzing your communication style...")
    your_style = analyze_personal_texting_style(text_messages, your_recipient_id)

    print("Adding metadata...")
    # Enrichment steps run in a fixed order: reactions, group info, time info
    text_messages = add_reaction_context(text_messages)
    text_messages = add_group_context(text_messages)
    text_messages = add_temporal_context(text_messages)

    print("Creating training examples...")
    enhanced_data = []

    for thread_id in text_messages['thread_id'].unique():
        in_thread = text_messages['thread_id'] == thread_id
        thread_messages = text_messages[in_thread].sort_values('date_sent')

        # Threads with fewer than three messages are ignored
        if len(thread_messages) < 3:
            continue

        conversation_turns = group_into_turns(thread_messages)

        for turn_idx in range(len(conversation_turns) - 1):
            current_turn = conversation_turns[turn_idx]
            next_turn = conversation_turns[turn_idx + 1]

            # Only turns answered by you become training examples
            if next_turn[0]['from_recipient_id'] != your_recipient_id:
                continue

            # Burst-style texters get multi-message context and responses
            if your_style['burst_patterns']['burst_frequency'] > 0.3:
                input_text = format_conversation_context(current_turn)
                output_text = format_burst_response(next_turn)
            else:
                input_text = current_turn[-1]['body']
                output_text = next_turn[0]['body']

            # Metadata is read from the first message of the response turn
            response_msg = next_turn[0]

            instruction = get_style_instruction(your_style)

            style_meta = {
                "response_type": "burst_sequence" if len(next_turn) > 1 else "single_message",
                "message_count": len(next_turn),
                "total_length": sum(len(msg['body']) for msg in next_turn),
                "timing_pattern": analyze_turn_timing(next_turn),
            }
            context_meta = {
                "conversation_type": "group_chat" if response_msg['is_group_chat'] else "direct_message",
                "member_count": response_msg['member_count'],
                "time_period": response_msg['time_period'],
                "day_of_week": response_msg['day_of_week'],
            }
            emotion_meta = {
                "emotional_context": classify_emotion_from_reactions(response_msg.get('emoji', [])),
                "reaction_count": response_msg.get('reaction_count', 0),
            }
            delay_meta = {
                "response_delay": response_msg.get('response_delay', 0),
                "urgency": classify_urgency(response_msg.get('response_delay', 0)),
            }

            enhanced_data.append({
                "instruction": instruction,
                "input": input_text,
                "output": output_text,
                "metadata": {**style_meta, **context_meta, **emotion_meta, **delay_meta},
            })

    print(f"Created {len(enhanced_data)} enhanced training examples")
    return enhanced_data, your_style

def get_style_instruction(style_analysis):
    """Pick the instruction text matching the analysed style (bursty, lengthy, or default)."""
    # Burst behaviour takes precedence over message length.
    bursty = style_analysis['burst_patterns']['burst_frequency'] > 0.3
    if bursty:
        return "Respond naturally in your communication style, using multiple messages if needed to express your thoughts"

    wordy = style_analysis['avg_message_length'] > 150
    return (
        "Respond with a detailed, comprehensive message that fully explores the topic"
        if wordy
        else "Respond naturally in your typical communication style"
    )

def classify_urgency(response_delay_seconds):
    """Map a response delay to 'immediate', 'quick', 'delayed' or 'long_delay'."""
    # A missing delay is treated like an instant reply.
    if pd.isna(response_delay_seconds) or response_delay_seconds < 60:
        return "immediate"

    # Upper bounds: one hour, then one day.
    for upper_bound, label in ((3600, "quick"), (86400, "delayed")):
        if response_delay_seconds < upper_bound:
            return label

    return "long_delay"

def adaptive_quality_filter(training_data, personal_style):
    """
    Keep only the examples that pass length, emotion and urgency checks.

    Output length limits depend on personal_style['preferred_length']:
    20-2000 characters for 'lengthy' texters, 10-500 otherwise.
    """
    kept = []

    for example in training_data:
        metadata = example['metadata']

        # Length window is wider for people who write long messages
        lengthy = personal_style['preferred_length'] == 'lengthy'
        min_length, max_length = (20, 2000) if lengthy else (10, 500)

        input_len = len(example['input'])
        output_len = len(example['output'])

        # Reject as soon as one criterion fails
        if output_len < min_length or output_len > max_length:
            continue
        if input_len < 5:
            continue
        if metadata['emotional_context'] == 'negative':
            continue
        if metadata['urgency'] == 'long_delay':
            continue

        kept.append(example)

    return kept


In [None]:
# Create enhanced training data
enhanced_training_data, your_communication_style = create_enhanced_training_data(your_recipient_id=2)

# Apply quality filtering
filtered_training_data = adaptive_quality_filter(enhanced_training_data, your_communication_style)

# "\n" (not "\\n") so a blank line is printed instead of a literal backslash-n
print("\nYour Communication Style Analysis:")
print(f"Average message length: {your_communication_style['avg_message_length']:.1f} characters")
print(f"Preferred length: {your_communication_style['preferred_length']}")
print(f"Burst frequency: {your_communication_style['burst_patterns']['burst_frequency']:.2f}")
print(f"Average burst size: {your_communication_style['burst_patterns']['avg_burst_size']:.1f} messages")

print("\nTraining Data Summary:")
print(f"Total enhanced examples: {len(enhanced_training_data)}")
print(f"After quality filtering: {len(filtered_training_data)}")

# Show sample examples
print("\nSample Training Examples:")
for i, example in enumerate(filtered_training_data[:3]):
    print(f"\nExample {i+1}:")
    print(f"Input: {example['input'][:100]}...")
    print(f"Output: {example['output'][:100]}...")
    print(f"Style: {example['metadata']['response_type']}, {example['metadata']['timing_pattern']}")
    print(f"Context: {example['metadata']['conversation_type']}, {example['metadata']['time_period']}")


In [None]:
from datasets import Dataset

def format_for_training(training_data):
    """
    Render each example through the tokenizer's chat template.

    Returns a `Dataset` with a single 'text' column holding the rendered
    system/user/assistant conversation for every example.
    """

    def _render(example):
        # One three-message conversation per example: fixed system prompt,
        # the example input as user turn, the example output as assistant turn.
        conversation = [
            {"role": "system", "content": "You are an AI assistant that responds in the user's natural communication style."},
            {"role": "user", "content": example["input"]},
            {"role": "assistant", "content": example["output"]}
        ]
        return tokenizer.apply_chat_template(
            conversation,
            tokenize=False,
            add_generation_prompt=False
        )

    formatted_data = [{"text": _render(example)} for example in training_data]

    return Dataset.from_list(formatted_data)

# Render the quality-filtered examples into the training dataset
print("Formatting data for training...")
training_dataset = format_for_training(filtered_training_data)

print(f"Training dataset size: {len(training_dataset)}")
print("Sample formatted text:")
# Preview only the first 500 characters of the first rendered example
print(f"{training_dataset[0]['text'][:500]}...")


In [None]:
def handle_blocked_contacts(messages_df, recipients_df):
    """
    Include conversations with blocked contacts but add relationship context

    Adds three boolean columns to `messages_df` in place:
    'sender_blocked', 'recipient_blocked' and 'involves_blocked_contact'.

    Args:
        messages_df: Messages with 'from_recipient_id' and
            'to_recipient_id' columns (modified in place).
        recipients_df: Recipients with '_id' and 'blocked' columns; rows
            with blocked == 1 are treated as blocked contacts.

    Returns:
        The same `messages_df` object with the new columns.
    """
    # Identify blocked contacts
    blocked_contacts = recipients_df.loc[recipients_df['blocked'] == 1, '_id'].tolist()

    print(f"Found {len(blocked_contacts)} blocked contacts")

    # Add blocking status to messages
    messages_df['sender_blocked'] = messages_df['from_recipient_id'].isin(blocked_contacts)
    messages_df['recipient_blocked'] = messages_df['to_recipient_id'].isin(blocked_contacts)
    messages_df['involves_blocked_contact'] = messages_df['sender_blocked'] | messages_df['recipient_blocked']

    # Get conversation stats
    total_messages = len(messages_df)
    blocked_messages = int(messages_df['involves_blocked_contact'].sum())
    # Guard the percentage so an empty message table does not divide by zero.
    blocked_share = blocked_messages / total_messages * 100 if total_messages else 0.0

    print(f"Total messages: {total_messages}")
    print(f"Messages involving blocked contacts: {blocked_messages} ({blocked_share:.1f}%)")

    return messages_df

def add_relationship_context(training_example, involves_blocked=False, sender_blocked=False, recipient_blocked=False):
    """
    Annotate a training example when it involves a blocked contact.

    When a note applies it is stored under
    metadata['relationship_context'] and also prepended to the example's
    'instruction'. The example dict is modified in place and returned.
    """
    # The sender note wins when both sender and recipient are blocked.
    note = None
    if involves_blocked and sender_blocked:
        note = "Note: This conversation involved a contact that was later blocked."
    elif involves_blocked and recipient_blocked:
        note = "Note: This conversation was with a contact that was later blocked."

    if note is None:
        return training_example

    # Record the note in the metadata, creating the dict if needed
    metadata = training_example.setdefault('metadata', {})
    metadata['relationship_context'] = [note]

    # Make the context explicit in the instruction text as well
    training_example['instruction'] = note + " " + training_example['instruction']

    return training_example

def create_inclusive_training_data(your_recipient_id=2):
    """
    Create training data that includes ALL conversations, including blocked contacts

    For every message sent by `your_recipient_id`, the preceding message
    plus up to three earlier ones in the same thread form the context.

    Returns:
        list of dicts with 'instruction' (newline-separated context, with a
        relationship note prepended when a blocked contact is involved),
        'response', 'thread_id', 'timestamp' and, for blocked contacts,
        'metadata'.
    """
    def _native(value):
        # numpy scalars read from the DataFrame are converted so the
        # examples can be written out as JSON.
        return value.item() if hasattr(value, 'item') else value

    print("Loading data...")
    messages = pd.read_csv('/root/test/signal-flatfiles/signal.csv')
    recipients = pd.read_csv('/root/test/signal-flatfiles/recipient.csv')

    # Filter for text messages with content
    text_messages = messages[
        (messages['body'].notna()) &
        (messages['body'].str.len() > 5)
    ].copy()

    print("Handling blocked contacts...")
    text_messages = handle_blocked_contacts(text_messages, recipients)

    print("Creating training examples...")
    training_data = []

    # Group by thread and create conversations
    for thread_id in text_messages['thread_id'].unique():
        thread_messages = text_messages[
            text_messages['thread_id'] == thread_id
        ].sort_values('date_sent')

        if len(thread_messages) < 2:
            continue

        # Create conversation pairs
        for i in range(len(thread_messages) - 1):
            current_msg = thread_messages.iloc[i]
            next_msg = thread_messages.iloc[i + 1]

            # Only create training examples where you're responding
            if next_msg['from_recipient_id'] != your_recipient_id:
                continue

            # Build context from recent messages
            context_start = max(0, i - 3)  # Include up to 3 previous messages
            context_messages = thread_messages.iloc[context_start:i + 1]

            # Format the conversation context
            conversation_context = []
            for _, msg in context_messages.iterrows():
                sender_name = "You" if msg['from_recipient_id'] == your_recipient_id else "Other"
                conversation_context.append(f"{sender_name}: {msg['body']}")

            # Create training example; lines are joined with a real newline
            # rather than a literal backslash-n sequence.
            training_example = {
                'instruction': "\n".join(conversation_context),
                'response': next_msg['body'],
                'thread_id': _native(thread_id),
                'timestamp': _native(next_msg['date_sent'])
            }

            # Add relationship context if needed
            involves_blocked = bool(current_msg['involves_blocked_contact'] or next_msg['involves_blocked_contact'])
            sender_blocked = bool(current_msg['sender_blocked'] or next_msg['sender_blocked'])
            recipient_blocked = bool(current_msg['recipient_blocked'] or next_msg['recipient_blocked'])

            training_example = add_relationship_context(
                training_example,
                involves_blocked=involves_blocked,
                sender_blocked=sender_blocked,
                recipient_blocked=recipient_blocked
            )

            training_data.append(training_example)

    print(f"Created {len(training_data)} training examples")

    # Show breakdown by relationship status
    blocked_examples = [ex for ex in training_data if ex.get('metadata', {}).get('relationship_context')]
    # Guard the percentage so an empty result does not divide by zero.
    blocked_share = len(blocked_examples) / len(training_data) * 100 if training_data else 0.0
    print(f"Examples involving blocked contacts: {len(blocked_examples)} ({blocked_share:.1f}%)")

    return training_data


In [None]:
# Create comprehensive training data including blocked contacts
inclusive_training_data = create_inclusive_training_data(your_recipient_id=2)

# "\n" (not "\\n") so a blank line is printed instead of a literal backslash-n
print("\nSample training example with blocked contact context:")
blocked_examples = [ex for ex in inclusive_training_data if ex.get('metadata', {}).get('relationship_context')]
if blocked_examples:
    example = blocked_examples[0]
    print(f"Instruction: {example['instruction'][:200]}...")
    print(f"Response: {example['response'][:100]}...")
    print(f"Metadata: {example.get('metadata', {})}")

print("\nTraining data breakdown:")
print(f"Total examples: {len(inclusive_training_data)}")
print(f"Examples with blocked contacts: {len(blocked_examples)}")
print(f"Regular examples: {len(inclusive_training_data) - len(blocked_examples)}")

# Optional: Save to file for later use
import json


def _json_default(value):
    """Convert numpy scalars (as read from DataFrame rows) for json.dump."""
    if hasattr(value, 'item'):
        return value.item()
    raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")


with open('inclusive_training_data.json', 'w', encoding='utf-8') as f:
    json.dump(inclusive_training_data, f, indent=2, default=_json_default)

print("\nSaved training data to 'inclusive_training_data.json'")


In [None]:
def analyze_all_communication_styles(messages_df, recipients_df, min_messages=50):
    """
    Build a style profile for every sender with at least `min_messages` messages.

    Returns a dict keyed by recipient id; each value holds the contact's
    name, message statistics, burst/timing/emoji/response analyses and the
    derived 'style_type'.
    """
    # Map recipient ids to given names ('Unknown' when the name is missing)
    given_names = recipients_df.set_index('_id')['profile_given_name'].fillna('Unknown')
    recipient_lookup = given_names.to_dict()

    # Senders that reach the message threshold
    message_counts = messages_df['from_recipient_id'].value_counts()
    frequent_contacts = message_counts[message_counts >= min_messages]

    print(f"Analyzing communication styles for {len(frequent_contacts)} frequent contacts...")

    communication_styles = {}

    for recipient_id in frequent_contacts.index:
        from_contact = messages_df['from_recipient_id'] == recipient_id
        contact_messages = messages_df[from_contact]
        body_lengths = contact_messages['body'].str.len()

        # Entries are filled in a fixed order; the helper calls keep that order
        style = {}
        style['name'] = recipient_lookup.get(recipient_id, f'Contact_{recipient_id}')
        style['total_messages'] = len(contact_messages)
        style['avg_message_length'] = body_lengths.mean()
        style['message_length_std'] = body_lengths.std()
        style['burst_patterns'] = analyze_message_bursts(contact_messages)
        style['preferred_times'] = analyze_timing_patterns(contact_messages)
        style['emoji_usage'] = analyze_emoji_usage(contact_messages)
        style['response_speed'] = analyze_response_patterns(contact_messages, messages_df)

        # Derive the overall label from the collected statistics
        style['style_type'] = classify_communication_style(style)

        communication_styles[recipient_id] = style

    return communication_styles

def classify_communication_style(style_data):
    """
    Label a contact's texting style from average length and burst statistics.

    Heavy bursting is checked first, then message length, then moderate
    bursting; everything else is 'balanced_communicator'.
    """
    avg_length = style_data['avg_message_length']
    bursts = style_data['burst_patterns']
    burst_freq = bursts['burst_frequency']
    avg_burst_size = bursts['avg_burst_size']

    # Frequent, large bursts: split by how long the individual messages are
    heavy_burster = burst_freq > 0.4 and avg_burst_size > 3
    if heavy_burster:
        return "rapid_burst_chatter" if avg_length < 50 else "verbose_burst_chatter"

    if avg_length > 200:
        return "lengthy_texter"  # Long, detailed messages
    if avg_length < 30:
        return "concise_texter"  # Short, to-the-point messages
    if burst_freq > 0.2:
        return "moderate_burst_chatter"  # Some bursting behavior
    return "balanced_communicator"  # Balanced approach

def analyze_emoji_usage(messages):
    """
    Measure how many messages contain at least one emoji-range character.

    Args:
        messages: DataFrame with a 'body' column; missing bodies count as
            "no emoji" (na=False).

    Returns:
        dict with:
            'emoji_frequency': share of messages containing an emoji
                (0 when there are no messages),
            'messages_with_emojis': number of such messages,
            'total_messages': number of rows in `messages`.
    """
    import re

    # Single-backslash \UXXXXXXXX escapes are interpreted by the `re` module
    # as code points. The previous doubled backslashes turned the class into
    # literal ASCII ranges (e.g. "0-\"), which matched digits and uppercase
    # letters and never matched an emoji.
    # NOTE(review): the last range (U+24C2..U+1F251) is very wide and also
    # spans non-emoji scripts such as CJK; narrow it if that matters.
    emoji_pattern = re.compile(
        r'[\U0001F600-\U0001F64F'
        r'\U0001F300-\U0001F5FF'
        r'\U0001F680-\U0001F6FF'
        r'\U0001F1E0-\U0001F1FF'
        r'\U00002702-\U000027B0'
        r'\U000024C2-\U0001F251]+'
    )

    total_messages = len(messages)
    has_emoji = messages['body'].str.contains(emoji_pattern, regex=True, na=False)
    messages_with_emojis = int(has_emoji.sum())

    return {
        'emoji_frequency': messages_with_emojis / total_messages if total_messages > 0 else 0,
        'messages_with_emojis': messages_with_emojis,
        'total_messages': total_messages
    }

def analyze_timing_patterns(messages):
    """
    Analyze when someone typically sends messages.

    Args:
        messages: DataFrame with a 'date_sent' column holding epoch
            timestamps in milliseconds. The frame is not modified.

    Returns:
        dict with:
            'peak_hours': up to three hours of day with the most messages,
            'hour_distribution': {hour: message count},
            'night_owl': True when more than 30% of the messages were sent
                in hours 22-23 or 0-5,
            'early_bird': True when more than 30% were sent in hours 6-9.

    NOTE(review): hours come straight from pd.to_datetime(..., unit='ms')
    with no timezone conversion, so they are presumably UTC rather than the
    sender's local time - confirm before interpreting the flags.
    """
    # Keep the hours in a local Series instead of writing an 'hour' column
    # onto the caller's frame (callers pass a filtered slice, so assigning a
    # column would mutate it and trigger SettingWithCopyWarning).
    sent_hours = pd.to_datetime(messages['date_sent'], unit='ms').dt.hour

    hour_distribution = sent_hours.value_counts().sort_index()
    peak_hours = hour_distribution.nlargest(3).index.tolist()

    # Select by hour *label*. Integer slices such as hour_distribution[22:]
    # are positional on a Series, i.e. they would pick "the 23rd distinct
    # hour onwards" rather than "hours >= 22".
    hours = hour_distribution.index
    night_count = hour_distribution.loc[(hours >= 22) | (hours < 6)].sum()
    morning_count = hour_distribution.loc[(hours >= 6) & (hours < 10)].sum()
    threshold = len(messages) * 0.3

    return {
        'peak_hours': peak_hours,
        'hour_distribution': hour_distribution.to_dict(),
        'night_owl': bool(night_count > threshold),
        'early_bird': bool(morning_count > threshold)
    }

def analyze_response_patterns(contact_messages, all_messages):
    """
    Estimate how quickly a contact replies inside the threads they appear in.

    A "response" is any pair of consecutive messages (ordered by 'date_sent'
    within a thread) where the earlier message was sent by someone else and
    the later one by the contact. The contact is identified by the
    'from_recipient_id' of the first row of `contact_messages`.

    Args:
        contact_messages: DataFrame of the contact's own messages; needs
            'thread_id' and 'from_recipient_id'.
        all_messages: DataFrame of every message; needs 'thread_id',
            'from_recipient_id' and 'date_sent' (differences are treated as
            milliseconds when converting to minutes).

    Returns:
        dict with 'avg_response_time_minutes' (None when no response pair was
        found), 'quick_responder' (average below 30 minutes) and
        'total_responses_analyzed'.
    """
    # Simplified heuristic: every gap counts, however long.
    gaps = []

    for thread in contact_messages['thread_id'].unique():
        same_thread = all_messages['thread_id'] == thread
        ordered = all_messages[same_thread].sort_values('date_sent')
        contact_id = contact_messages['from_recipient_id'].iloc[0]

        for pos in range(1, len(ordered)):
            earlier = ordered.iloc[pos - 1]
            later = ordered.iloc[pos]

            # Skip pairs that are not "someone else -> this contact".
            if earlier['from_recipient_id'] == contact_id:
                continue
            if later['from_recipient_id'] != contact_id:
                continue

            gaps.append(later['date_sent'] - earlier['date_sent'])

    if not gaps:
        return {
            'avg_response_time_minutes': None,
            'quick_responder': False,
            'total_responses_analyzed': 0
        }

    minutes = np.mean(gaps) / (1000 * 60)  # Convert to minutes
    return {
        'avg_response_time_minutes': minutes,
        'quick_responder': minutes < 30,  # Replies within 30 minutes on average
        'total_responses_analyzed': len(gaps)
    }


In [None]:
def create_adaptive_training_data(messages_df, recipients_df, communication_styles, your_recipient_id=2):
    """
    Create training data that captures how you adapt to different communication styles.

    For every one-to-one thread, each message sent by `your_recipient_id`
    (other than the first message of the thread) becomes one example whose
    prompt is the up-to-five messages that precede it.

    Args:
        messages_df: DataFrame with 'thread_id', 'from_recipient_id',
            'date_sent' and 'body' columns.
        recipients_df: Not used by this function; kept so existing callers
            keep working.
        communication_styles: dict mapping a recipient id to that contact's
            style dict ('name', 'style_type' and 'emoji_usage' are read).
        your_recipient_id: recipient id that identifies your own messages.

    Returns:
        list of dicts with keys 'instruction', 'response', 'thread_id',
        'timestamp', 'other_person_style', 'other_person_name' and
        'adaptation_context'.
    """
    training_data = []

    print("Creating adaptive training examples...")

    # Group by thread and create conversations
    for thread_id in messages_df['thread_id'].unique():
        thread_messages = messages_df[
            messages_df['thread_id'] == thread_id
        ].sort_values('date_sent')

        if len(thread_messages) < 2:
            continue

        # Identify the other person in this conversation
        other_participants = thread_messages[
            thread_messages['from_recipient_id'] != your_recipient_id
        ]['from_recipient_id'].unique()

        if len(other_participants) != 1:  # Skip group chats for now
            continue

        other_person_style = communication_styles.get(other_participants[0], {})
        style_type = other_person_style.get('style_type', 'unknown')

        # The other person's display label is fixed for the whole thread, so
        # build it once instead of once per context message.
        other_label = other_person_style.get('name', 'Other')
        if style_type in ('rapid_burst_chatter', 'verbose_burst_chatter'):
            other_label += " (burst chatter)"
        elif style_type == 'lengthy_texter':
            other_label += " (lengthy texter)"
        elif style_type == 'concise_texter':
            other_label += " (concise texter)"

        # Pre-format every message of the thread as "<speaker>: <body>".
        senders = thread_messages['from_recipient_id'].tolist()
        bodies = thread_messages['body'].tolist()
        formatted_lines = []
        for sender, body in zip(senders, bodies):
            speaker = "You" if sender == your_recipient_id else other_label
            formatted_lines.append(f"{speaker}: {body}")

        # Create conversation pairs
        for i in range(len(thread_messages) - 1):
            # Only create training examples where the next message is yours
            if senders[i + 1] != your_recipient_id:
                continue

            current_msg = thread_messages.iloc[i]
            next_msg = thread_messages.iloc[i + 1]

            # Include up to four earlier messages plus the current one
            context_start = max(0, i - 4)

            training_data.append({
                'instruction': "\n".join(formatted_lines[context_start:i + 1]),
                'response': next_msg['body'],
                'thread_id': thread_id,
                'timestamp': next_msg['date_sent'],
                'other_person_style': style_type,
                'other_person_name': other_person_style.get('name', 'Unknown'),
                'adaptation_context': create_adaptation_context(current_msg, next_msg, other_person_style)
            })

    print(f"Created {len(training_data)} adaptive training examples")

    # Show breakdown by communication styles
    style_breakdown = {}
    for example in training_data:
        style = example['other_person_style']
        style_breakdown[style] = style_breakdown.get(style, 0) + 1

    print("\nTraining examples by communication style:")
    for style, count in sorted(style_breakdown.items(), key=lambda x: x[1], reverse=True):
        print(f"  {style}: {count} examples ({count/len(training_data)*100:.1f}%)")

    return training_data


def create_adaptation_context(current_msg, your_response, other_person_style):
    """
    Create context about how you're adapting to their communication style.

    Args:
        current_msg: the message being replied to (not inspected; kept for
            interface compatibility).
        your_response: mapping/row whose 'body' is your reply text.
        other_person_style: the other person's style dict; 'style_type' and
            'emoji_usage' -> 'emoji_frequency' are read, both optional.

    Returns:
        list of adaptation tags, possibly empty, drawn from
        "matching_lengthy_style", "matching_concise_style",
        "responding_to_burst_chatter" and "matching_emoji_usage".
    """
    import re  # local import: this cell does not otherwise bring `re` into scope

    adaptations = []

    other_style = other_person_style.get('style_type', 'unknown')
    your_length = len(your_response['body'])

    # Analyze length adaptation
    if other_style == 'lengthy_texter' and your_length > 100:
        adaptations.append("matching_lengthy_style")
    elif other_style == 'concise_texter' and your_length < 50:
        adaptations.append("matching_concise_style")
    elif other_style in ['rapid_burst_chatter', 'verbose_burst_chatter']:
        adaptations.append("responding_to_burst_chatter")

    # Analyze emoji adaptation
    other_emoji_freq = other_person_style.get('emoji_usage', {}).get('emoji_frequency', 0)
    your_has_emoji = bool(re.search(
        r'[\U0001F600-\U0001F64F\U0001F300-\U0001F5FF\U0001F680-\U0001F6FF'
        r'\U0001F1E0-\U0001F1FF\U00002702-\U000027B0\U000024C2-\U0001F251]+',
        your_response['body']
    ))

    if other_emoji_freq > 0.3 and your_has_emoji:
        adaptations.append("matching_emoji_usage")

    return adaptations


def analyze_your_adaptation_patterns(training_data):
    """
    Analyze how you adapt to different communication styles.

    Args:
        training_data: list of example dicts as produced by
            create_adaptive_training_data ('other_person_style', 'response'
            and 'adaptation_context' are read).

    Returns:
        dict keyed by the other person's style type. Each value holds
        'total_examples', 'avg_response_length' (mean character count of your
        responses), 'adaptation_types' ({tag: count}) and 'example_responses'
        (up to three responses truncated to 100 characters).
    """
    adaptation_analysis = {}

    for example in training_data:
        per_style = adaptation_analysis.setdefault(example['other_person_style'], {
            'total_examples': 0,
            'avg_response_length': [],  # raw lengths; collapsed to a mean below
            'adaptation_types': {},
            'example_responses': []
        })

        per_style['total_examples'] += 1
        per_style['avg_response_length'].append(len(example['response']))

        for adaptation in example['adaptation_context']:
            per_style['adaptation_types'][adaptation] = \
                per_style['adaptation_types'].get(adaptation, 0) + 1

        # Store some example responses
        if len(per_style['example_responses']) < 3:
            per_style['example_responses'].append(example['response'][:100])

    # Calculate averages
    for style_data in adaptation_analysis.values():
        if style_data['avg_response_length']:
            style_data['avg_response_length'] = np.mean(style_data['avg_response_length'])
        else:
            style_data['avg_response_length'] = 0

    return adaptation_analysis


In [None]:
# Load the exported Signal message and recipient tables
messages = pd.read_csv('/root/test/signal-flatfiles/signal.csv')
recipients = pd.read_csv('/root/test/signal-flatfiles/recipient.csv')

# Keep only rows that have a body longer than five characters
text_messages = messages[
    (messages['body'].notna()) &
    (messages['body'].str.len() > 5)
].copy()

print("Analyzing communication styles for all contacts...")

# Analyze everyone's communication styles
all_communication_styles = analyze_all_communication_styles(text_messages, recipients, min_messages=30)

# "\n" (single backslash) so that a blank line is printed; the doubled
# backslash used before printed the two characters "\n" literally.
print("\nCommunication Style Summary:")
print("="*50)

# Group contacts by style type as (name, message count, average length)
style_types = {}
for person_id, style_data in all_communication_styles.items():
    style_type = style_data['style_type']
    style_types.setdefault(style_type, []).append(
        (style_data['name'], style_data['total_messages'], style_data['avg_message_length'])
    )

# Show the three most active contacts of every style
for style_type, people in style_types.items():
    print(f"\n{style_type.replace('_', ' ').title()}:")
    for name, msg_count, avg_length in sorted(people, key=lambda x: x[1], reverse=True)[:3]:
        print(f"  • {name}: {msg_count} messages, avg {avg_length:.0f} chars")

# Create adaptive training data
print("\nCreating adaptive training data...")
adaptive_training_data = create_adaptive_training_data(
    text_messages, recipients, all_communication_styles, your_recipient_id=2
)

# Analyze your adaptation patterns
print("\nAnalyzing your adaptation patterns...")
adaptation_analysis = analyze_your_adaptation_patterns(adaptive_training_data)

print("\nYour Adaptation Patterns:")
print("="*30)
for style, analysis in adaptation_analysis.items():
    if analysis['total_examples'] > 10:  # Only show styles with enough examples
        print(f"\nWhen talking to {style.replace('_', ' ')}:")
        print(f"  • {analysis['total_examples']} conversations")
        print(f"  • Your avg response length: {analysis['avg_response_length']:.0f} chars")
        if analysis['adaptation_types']:
            print(f"  • Common adaptations: {', '.join(analysis['adaptation_types'].keys())}")
        if analysis['example_responses']:
            print(f"  • Example response: '{analysis['example_responses'][0]}...'")

print(f"\nTotal adaptive training examples: {len(adaptive_training_data)}")


In [None]:
import pandas as pd
import json
import re
from datetime import datetime

def clean_message(text):
    """
    Redact e-mail addresses and long digit runs from a message body.

    Args:
        text: message body, or a missing value (None / NaN).

    Returns:
        The stripped text with e-mail addresses replaced by '[EMAIL]' and
        runs of ten or more digits (optionally prefixed with '+') replaced by
        '[PHONE]'; None when `text` is missing.
    """
    if pd.isna(text):
        return None

    # E-mails are redacted first: an address with a long numeric local part
    # (e.g. 1234567890@example.com) would otherwise be turned into
    # "[PHONE]@example.com", which no longer matches the e-mail pattern and
    # leaks the domain.
    # The TLD class is [A-Za-z]; the former [A-Z|a-z] also accepted a
    # literal '|'.
    text = re.sub(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', '[EMAIL]', text)
    text = re.sub(r'\+?\d{10,}', '[PHONE]', text)

    return text.strip()

def main():
    """
    Build sliding-window training examples from the Signal export and write
    them to 'signal_training_data.json'.

    Every run of three consecutive messages in a thread (ordered by
    'date_sent') yields one example: the first message is the context, the
    second the user turn and the third the expected output.
    """
    # Load data
    messages = pd.read_csv('signal-flatfiles/signal.csv')
    recipients = pd.read_csv('signal-flatfiles/recipient.csv')  # read but not used below

    # Filter for text messages only
    is_text_type = messages['type'] == 10485783  # Text message type
    has_body = messages['body'].notna()
    long_enough = messages['body'].str.len() > 5
    text_messages = messages[is_text_type & has_body & long_enough].copy()

    # Clean messages and drop any that came back as missing
    text_messages['body'] = text_messages['body'].apply(clean_message)
    text_messages = text_messages[text_messages['body'].notna()]

    # Create training data, one thread at a time
    training_data = []
    for thread_id in text_messages['thread_id'].unique():
        in_thread = text_messages['thread_id'] == thread_id
        bodies = text_messages[in_thread].sort_values('date_sent')['body'].tolist()

        if len(bodies) < 3:
            continue

        # Conversation windows: (context, user message, response) triples
        for context, user_msg, response in zip(bodies, bodies[1:], bodies[2:]):
            example = {
                "instruction": "Continue this conversation naturally",
                "input": f"Context: {context}\nUser: {user_msg}",
                "output": response
            }
            training_data.append(example)

    # Save training data
    with open('signal_training_data.json', 'w') as f:
        json.dump(training_data, f, indent=2)

    print(f"Generated {len(training_data)} training examples")

if __name__ == "__main__":
    main()

# Fine-Tuning the Model

In [None]:
from trl import SFTTrainer, SFTConfig

# Supervised fine-tuning run over `combined_dataset`, reading each example
# from its "text" field. No evaluation set is configured.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=combined_dataset,
    eval_dataset=None,  # Can set up evaluation!
    args=SFTConfig(
        dataset_text_field="text",
        # Batching: 2 samples per device, gradients accumulated over 4 steps
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        # Schedule: short fixed-step run; switch to num_train_epochs = 1
        # (and drop max_steps) for one full pass over the data
        warmup_steps=5,
        max_steps=30,
        lr_scheduler_type="linear",
        # Optimizer
        learning_rate=2e-4,  # Reduce to 2e-5 for long training runs
        optim="adamw_8bit",
        weight_decay=0.01,
        # Bookkeeping
        logging_steps=1,
        seed=3407,
        report_to="none",  # Use this for WandB etc
    ),
)