In [None]:
import logging
import os
import re
from collections import defaultdict
from functools import lru_cache
from typing import Dict, List, Optional, Set, Tuple

import pandas as pd

# Set up logging.
# Records at INFO level and above are written to 'chat_processing.log' (a
# relative path, so it is resolved against the current working directory).
# Each record is formatted as "<timestamp> - <level name> - <message>".
# NOTE: logging.basicConfig does nothing if the root logger already has
# handlers configured (e.g. when this code is re-run in the same session).
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    filename='chat_processing.log'
)

def read_file_list(config_path: str) -> List[str]:
    """
    Read the list of filenames from the configuration file.

    The file is expected to hold one filename per line. Surrounding
    whitespace is stripped from every entry and blank lines are skipped, so
    stray empty lines cannot turn into bogus "filenames".

    Args:
        config_path: Path to the configuration file

    Returns:
        List of filenames in file order. An empty list is returned when the
        file is missing or cannot be read; the failure is logged rather than
        raised.
    """
    try:
        with open(config_path, 'r', encoding='utf-8') as config_file:
            entries = (line.strip() for line in config_file.read().splitlines())
            # Drop entries that are empty after stripping.
            return [entry for entry in entries if entry]
    except FileNotFoundError:
        logging.error(f"Configuration file not found: {config_path}")
        return []
    except Exception as e:
        # Best-effort by design: any other read/decoding problem is logged
        # and reported to the caller as "no files".
        logging.error(f"Error reading configuration file: {e}")
        return []

def parse_chat_files(file_names: List[str], base_path: str = "../../data/") -> pd.DataFrame:
    """
    Build a single DataFrame from a set of chat log files.

    Each line is stripped and matched against ``[<date>] <user>: <message>``.
    Matching lines become rows tagged with the position of their file in
    ``file_names``; non-matching lines are logged as warnings and dropped.
    A file that cannot be opened or read is logged as an error and skipped,
    but it still occupies its position number.

    Args:
        file_names: List of files to process
        base_path: Base directory the file names are joined onto

    Returns:
        DataFrame with the columns "date", "user", "message" and "stream"
        (empty, with the same columns, when no line matched)
    """
    columns = ["date", "user", "message", "stream"]
    line_re = re.compile(r'\[(.*?)\] (.*?): (.*)')
    rows = []

    for stream_index, name in enumerate(file_names):
        path = os.path.join(base_path, name)
        try:
            with open(path, 'r', encoding='utf-8') as handle:
                for raw_line in handle.readlines():
                    text = raw_line.strip()
                    parsed = line_re.match(text)
                    if parsed is None:
                        logging.warning(f"Line doesn't match pattern: {text[:50]}...")
                        continue
                    date, user, message = parsed.groups()
                    rows.append([date, user, message, stream_index])
            logging.info(f"Successfully processed file: {name}")
        except FileNotFoundError:
            logging.error(f"File not found: {path}")
        except Exception as e:
            logging.error(f"Error processing file {path}: {e}")

    if rows:
        return pd.DataFrame(rows, columns=columns)

    logging.warning("No data was parsed from the files")
    return pd.DataFrame(columns=columns)

def load_username_mappings(mapping_path: Optional[str] = None) -> Dict[str, str]:
    """
    Load manual username mappings from a file or return default mappings.

    The mapping file holds one ``original: canonical`` pair per line; only
    the first colon separates the two names, and whitespace around each name
    is stripped. Lines without a colon are ignored. Lines whose original or
    canonical name is empty are skipped with a warning, because they would
    otherwise create an empty-key or empty-value mapping.

    When a file is loaded successfully its contents replace the defaults
    entirely (they are not merged).

    Args:
        mapping_path: Path to the username mapping file (optional)

    Returns:
        Dictionary mapping from original usernames to canonical usernames.
        The built-in defaults are returned when no path is given, the file
        does not exist, or it cannot be read.
    """
    default_mappings = {
        "Banties1g": "banties_x",
        "banties1g": "banties_x",
        "fyodor_m_d1821": "fyredoor4",
        "chili_poe": "chili_con_bacon",
        "Wirelesss_": "W1r3lesss",
        "treklul": "trek44_",
        "ttrek_": "trek44_",
        "TriplesingleJ": "TripleSingleJames",
        "uuccugr": "uwu_cougar"
    }

    if not mapping_path:
        return default_mappings

    try:
        mappings = {}
        with open(mapping_path, 'r', encoding='utf-8') as f:
            for line_number, line in enumerate(f, start=1):
                original, separator, canonical = line.partition(':')
                if not separator:
                    # No colon on this line: not a mapping entry.
                    continue
                original = original.strip()
                canonical = canonical.strip()
                if not original or not canonical:
                    logging.warning(
                        f"Skipping malformed username mapping on line {line_number} "
                        f"of {mapping_path}: {line.strip()[:50]}"
                    )
                    continue
                mappings[original] = canonical
        return mappings
    except FileNotFoundError:
        # A path was supplied explicitly, so make the fallback visible
        # instead of silently using the defaults.
        logging.warning(
            f"Username mapping file not found: {mapping_path}; using default mappings"
        )
        return default_mappings
    except Exception as e:
        logging.error(f"Error loading username mappings: {e}")
        return default_mappings

def find_username_variants(users: List[str]) -> Tuple[Dict[str, Set[str]], Dict[str, str]]:
    """
    Group usernames that differ only by letter case and pick one spelling per group.

    Names are bucketed by their lowercase form. Only buckets holding at least
    two distinct spellings are reported. Within such a bucket the canonical
    spelling is the one that sorts first in plain string order.

    Args:
        users: List of usernames

    Returns:
        Tuple containing:
        - Dictionary mapping a lowercase username to the set of its spellings
          (only for names with more than one spelling)
        - Dictionary mapping every spelling in those sets to its canonical
          spelling (the canonical spelling maps to itself)
    """
    spellings_by_key = defaultdict(set)
    for name in users:
        spellings_by_key[name.lower()].add(name)

    # Keep only the buckets where the capitalization actually varies.
    duplicate_users = {
        key: spellings
        for key, spellings in spellings_by_key.items()
        if len(spellings) >= 2
    }

    variant_map = {}
    for spellings in duplicate_users.values():
        # The smallest string is the same element sorted() would put first.
        canonical = min(spellings)
        variant_map.update(dict.fromkeys(spellings, canonical))

    return duplicate_users, variant_map

@lru_cache(maxsize=8)
def _compile_mention_pattern(names: Tuple[str, ...]) -> "re.Pattern":
    """Compile one regex that matches any of *names* as a whole token."""
    # Longest names first, so a name that is a prefix of another one cannot
    # shadow it inside the alternation.
    ordered = sorted(names, key=len, reverse=True)
    alternatives = "|".join(re.escape(name) for name in ordered)
    # Lookarounds instead of \b: they behave like \b for names made of word
    # characters, and also work for names that start or end with a non-word
    # character.
    return re.compile(rf"(?<!\w)(?:{alternatives})(?!\w)")


def replace_mentions(msg: str, variant_map: Dict[str, str]) -> str:
    """
    Replace username mentions in messages with their canonical forms.

    All variants are matched in a single pass with one compiled pattern
    (cached per set of variant names), so each mention is rewritten at most
    once. Matching is case-sensitive and only whole tokens are replaced: a
    variant directly preceded or followed by a letter, digit or underscore
    is left alone.

    Args:
        msg: Message text
        variant_map: Mapping from username variants to canonical forms

    Returns:
        Updated message with standardized username mentions. The message is
        returned unchanged when ``variant_map`` is empty.
    """
    # Empty names would match at every position, so they are never searched for.
    names = tuple(name for name in variant_map if name)
    if not names:
        return msg

    pattern = _compile_mention_pattern(names)
    # A function replacement inserts the canonical name literally; a plain
    # replacement string would have its backslashes interpreted by re.sub.
    return pattern.sub(lambda match: variant_map[match.group(0)], msg)

def standardize_usernames(data: pd.DataFrame, manual_mappings: Dict[str, str] = None) -> pd.DataFrame:
    """
    Return a copy of the chat data with usernames brought to one spelling.

    Two passes are made over the 'user' column: first the manual mappings
    (if any) are applied one entry at a time, then names that differ only by
    capitalization are collapsed onto a single spelling chosen by
    find_username_variants. Mentions inside the 'message' column are
    rewritten for the capitalization variants only; manual mappings are not
    applied to message text.

    Args:
        data: DataFrame containing chat data (needs 'user' and 'message' columns)
        manual_mappings: Dictionary of manual username mappings

    Returns:
        New DataFrame with standardized usernames; the input is not modified
    """
    result = data.copy()

    # Manual aliases are resolved before looking for capitalization variants.
    for alias, target in (manual_mappings or {}).items():
        result.loc[result['user'] == alias, 'user'] = target

    duplicate_users, variant_map = find_username_variants(result['user'].unique())

    # Record every group of spellings that is about to be merged.
    for lowered in duplicate_users:
        spellings = sorted(duplicate_users[lowered])
        logging.info(f"Found username variants for {lowered}: {spellings}")

    def canonical_user(name):
        return variant_map.get(name, name)

    def canonical_mentions(text):
        return replace_mentions(text, variant_map)

    result['user'] = result['user'].apply(canonical_user)
    result['message'] = result['message'].apply(canonical_mentions)

    return result

def process_chat_logs(config_path: str = '../../file_list.txt',
                      mapping_path: Optional[str] = None,
                      output_path: Optional[str] = None) -> pd.DataFrame:
    """
    Main function to process chat logs.

    Reads the list of log files, parses them, standardizes usernames and,
    when ``output_path`` is given, writes the result as CSV (without the
    index column).

    Args:
        config_path: Path to the file listing chat logs
        mapping_path: Path to the username mapping file (optional)
        output_path: Path to save the processed data (optional)

    Returns:
        DataFrame with processed chat data. When there are no files to
        process or nothing could be parsed, an empty DataFrame with the
        columns "date", "user", "message" and "stream" is returned, so
        callers can still index those columns.
    """
    logging.info("Starting chat log processing")

    # Same columns that parse_chat_files produces.
    empty_result = pd.DataFrame(columns=["date", "user", "message", "stream"])

    # Read file list
    file_names = read_file_list(config_path)
    if not file_names:
        logging.error("No files to process. Exiting.")
        return empty_result

    logging.info(f"Found {len(file_names)} files to process")

    # Parse chat files
    data = parse_chat_files(file_names)
    if data.empty:
        logging.error("No data was parsed. Exiting.")
        return empty_result

    logging.info(f"Parsed {len(data)} chat messages")

    # Load username mappings
    manual_mappings = load_username_mappings(mapping_path)

    # Standardize usernames
    processed_data = standardize_usernames(data, manual_mappings)

    # Save processed data if requested
    if output_path:
        try:
            processed_data.to_csv(output_path, index=False)
        except Exception as e:
            # A failed save is logged but does not discard the result.
            logging.error(f"Error saving processed data: {e}")
        else:
            # Only claim success once the file has actually been written.
            logging.info(f"Saved processed data to {output_path}")

    logging.info("Chat log processing completed")
    return processed_data

if __name__ == "__main__":
    # Example usage: process the logs listed in file_list.txt with the
    # default username mappings.
    df = process_chat_logs(
        config_path='../../file_list.txt',
        output_path='processed_chat_data.csv'
    )

    # Print some statistics
    print(f"Total messages: {len(df)}")
    if df.empty:
        # An empty result may lack the 'user' and 'stream' columns, so the
        # per-column statistics are skipped instead of raising KeyError.
        print("No chat data was processed; see chat_processing.log for details.")
    else:
        print(f"Unique users: {df['user'].nunique()}")
        print(f"Streams processed: {df['stream'].nunique()}")

Total messages: 1768831
Unique users: 61957
Streams processed: 319
