# WhatsApp Chat Parser

This notebook parses WhatsApp chat export files and converts them into a structured pandas DataFrame for analysis.

The chat format follows this pattern:
```
[DD/MM/YYYY, HH:MM:SS AM/PM] Username: Message text
```

Messages can span multiple lines, with new messages starting when a line begins with `[`.

## 1. Import Required Libraries

Import pandas, re, datetime, os, and yaml, then load the file paths and parsing settings from `config.yml`.

## Project Structure

This notebook uses a YAML configuration file (`config.yml`) to manage file paths and settings:

- **Data files** are stored in the `data/` folder
- **Input file**: `data/_chat.txt` (your WhatsApp chat export)
- **Output file**: `data/whatsapp_parsed_data.csv` (parsed DataFrame)
- **Configuration**: `config.yml` (file paths and parsing settings in YAML format)

Make sure your WhatsApp chat file is placed in the `data/` folder before running the notebook.

In [None]:
import pandas as pd
import re
from datetime import datetime
import os
import yaml

# Load configuration from the YAML file.
# The encoding is passed explicitly so the load does not depend on the
# platform's default locale encoding (YAML files are normally UTF-8).
with open('config.yml', 'r', encoding='utf-8') as file:
    config = yaml.safe_load(file)

# Convenience constants for the configured file locations
DATA_FOLDER = config['data']['folder']
CHAT_FILE_NAME = config['data']['chat_file_name']
OUTPUT_FILE_NAME = config['data']['output_file_name']

# Both the input chat file and the output CSV live inside the data folder
CHAT_FILE_PATH = os.path.join(DATA_FOLDER, CHAT_FILE_NAME)
OUTPUT_FILE_PATH = os.path.join(DATA_FOLDER, OUTPUT_FILE_NAME)

# Create data folder if it doesn't exist (no error when it is already there)
os.makedirs(DATA_FOLDER, exist_ok=True)

print("Libraries imported successfully!")

## 2. Define Text Parsing Functions

Create helper functions to parse WhatsApp message format, extract timestamps, usernames, and handle multi-line messages.

In [None]:
# Leading characters to discard before looking for the '[' of a message header.
# Besides whitespace this covers invisible characters that str.strip() keeps
# because they are not whitespace: directional marks/embeddings
# (U+200E, U+200F, U+202A-U+202E) and the byte-order mark (U+FEFF).
_LEADING_JUNK = re.compile(r'^[\s\u200e\u200f\u202a-\u202e\ufeff]+')

# Strict header: [D/M/YYYY, H:MM:SS AM|PM] User: text
# Allows optional extra spaces and single- or double-digit day/month/hour.
_MESSAGE_PATTERN = re.compile(
    r'^\[\s*(\d{1,2}/\d{1,2}/\d{4})\s*,\s*(\d{1,2}:\d{2}:\d{2}\s*(?:AM|PM))\s*\]\s*([^:]+?):\s*(.*)$',
    re.IGNORECASE,
)

# Lenient header used when the strict one fails: [anything] User: text
_FALLBACK_PATTERN = re.compile(r'^\[([^\]]+)\]\s*([^:]+):\s*(.*)$')


def _normalize_line(line):
    """Drop leading whitespace/invisible characters and trailing whitespace."""
    return _LEADING_JUNK.sub('', line).rstrip()


def parse_whatsapp_line(line):
    """
    Parse a WhatsApp message line and extract components.

    The line is first normalized: surrounding whitespace is removed, as are
    leading invisible characters (directional marks, byte-order mark) that
    would otherwise hide the opening '['.

    Returns: (date_str, time_str, user, message), each stripped of
    surrounding whitespace, or None if not a valid message line
    """
    line = _normalize_line(line)

    match = _MESSAGE_PATTERN.match(line)
    if match:
        date_str, time_str, user, message = match.groups()
        return date_str.strip(), time_str.strip(), user.strip(), message.strip()

    # The strict pattern failed: accept any bracketed prefix, provided it can
    # be split into a date part and a time part on the first ', '.
    fallback_match = _FALLBACK_PATTERN.match(line)
    if fallback_match:
        datetime_part, user, message = fallback_match.groups()
        if ', ' in datetime_part:
            date_str, time_str = datetime_part.split(', ', 1)
            return date_str.strip(), time_str.strip(), user.strip(), message.strip()

    return None


def is_message_start(line):
    """
    Check if a line is the start of a new message (begins with '[').

    Leading whitespace and invisible characters are ignored, matching the
    normalization done by parse_whatsapp_line.
    """
    return _normalize_line(line).startswith('[')

def parse_datetime_components(date_str, time_str):
    """
    Split a D/M/Y date string into components and derive the day of week.

    date_str is expected as 'DD/MM/YYYY' (single-digit day/month accepted).
    A two-digit year is read as 20YY, so '12/05/23' means 2023 rather than
    the year 23. time_str is not parsed; it is returned unchanged.

    Returns: (day_of_week, day, month, year, time_str), where day_of_week is
    the full day name (e.g. 'Monday'). Returns five Nones when the date does
    not have three '/'-separated parts or is not a valid calendar date; the
    latter case also prints an error message.
    """
    failure = (None, None, None, None, None)

    try:
        date_parts = date_str.split('/')
        if len(date_parts) != 3:
            return failure

        day, month, year = map(int, date_parts)

        # Expand two-digit years; otherwise datetime() would accept them
        # as years 0-99 AD and report the wrong weekday.
        if 0 <= year < 100:
            year += 2000

        # datetime() validates the calendar date and gives the weekday name
        day_of_week = datetime(year, month, day).strftime('%A')
    except (ValueError, TypeError, AttributeError, OverflowError) as e:
        print(f"Error parsing date/time: {date_str}, {time_str} - {e}")
        return failure

    return day_of_week, day, month, year, time_str

# Confirmation message printed once the parsing helpers above have been defined
print("Parsing functions defined successfully!")

## 3. Read and Parse Chat File

Read the chat text file and implement logic to identify message boundaries using the '[' character pattern.

In [None]:
# Use the chat file path from config
chat_file_path = CHAT_FILE_PATH

# Define the results before the existence check so that `messages` exists
# (as an empty list) even when the chat file is missing; otherwise the
# following cells fail with a NameError instead of reporting "0 messages".
messages = []
parse_errors = 0

# Check if file exists
if not os.path.exists(chat_file_path):
    print(f"File not found: {chat_file_path}")
    print(f"Please make sure the file '{CHAT_FILE_NAME}' is in the '{DATA_FOLDER}' directory.")
else:
    print(f"Found chat file: {chat_file_path}")

    # Message currently being assembled; None while no valid header has been
    # seen (continuation lines are then discarded).
    current_message = None

    with open(chat_file_path, 'r', encoding=config['parsing']['encoding']) as file:
        for line_num, line in enumerate(file, 1):
            line = line.rstrip('\n\r')  # Remove newline characters

            if is_message_start(line):
                # A new header closes the message collected so far
                if current_message is not None:
                    messages.append(current_message)

                # Parse new message
                parsed = parse_whatsapp_line(line)

                if parsed:
                    date_str, time_str, user, message = parsed
                    current_message = {
                        'date_str': date_str,
                        'time_str': time_str,
                        'user': user,
                        'message': message,
                        'line_num': line_num
                    }
                else:
                    parse_errors += 1
                    if parse_errors <= 5:  # Show only first 5 errors to avoid spam
                        print(f"Warning: Could not parse line {line_num}: {line[:100]}...")
                    # Drop the unparsable line and any continuation lines after it
                    current_message = None
            else:
                # This is a continuation of the previous message
                if current_message is not None:
                    current_message['message'] += '\n' + line

        # Don't forget the last message
        if current_message is not None:
            messages.append(current_message)

    print(f"Successfully parsed {len(messages)} messages from the chat file.")
    if parse_errors > 0:
        print(f"Total parsing errors: {parse_errors}")

    # Display a few sample messages
    if messages:
        print("\nSample messages:")
        for i, msg in enumerate(messages[:config['parsing']['max_sample_messages']]):
            print(f"Message {i+1}:")
            print(f"  Date: {msg['date_str']}")
            print(f"  Time: {msg['time_str']}")
            print(f"  User: {msg['user']}")
            max_length = config['parsing']['max_display_message_length']
            print(f"  Message: {msg['message'][:max_length]}{'...' if len(msg['message']) > max_length else ''}")
            print()
    else:
        print("No messages were successfully parsed. Please check the file format.")

## 4. Clean and Process Data

Clean the extracted data: trim surrounding whitespace from usernames and from every line of each message, and drop empty lines from multi-line messages.

In [None]:
def clean_message_text(text):
    """
    Normalize the whitespace of a (possibly multi-line) message.

    Each line is stripped of surrounding whitespace, and lines that are empty
    after stripping are dropped. The remaining lines are joined with '\\n'.
    Falsy input (None or an empty string) yields "".
    """
    if not text:
        return ""

    # Strip lazily, then keep only the lines that still have content
    stripped_lines = (piece.strip() for piece in text.split('\n'))
    return '\n'.join(piece for piece in stripped_lines if piece)

def clean_username(username):
    """
    Return the username with leading and trailing whitespace removed.

    No other normalization is applied.
    """
    return username.strip()

# Build a cleaned copy of every parsed message: the user name and message
# text go through the cleaning helpers, the other fields are copied as-is.
cleaned_messages = [
    {
        'date_str': raw['date_str'],
        'time_str': raw['time_str'],
        'user': clean_username(raw['user']),
        'message': clean_message_text(raw['message']),
        'line_num': raw['line_num'],
    }
    for raw in messages
]

print(f"Cleaned {len(cleaned_messages)} messages.")

# Distinct user names across all cleaned messages
unique_users = {entry['user'] for entry in cleaned_messages}
print(f"Unique users found: {list(unique_users)}")

# Character statistics; the average is 0 when there are no messages
message_lengths = [len(entry['message']) for entry in cleaned_messages]
total_chars = sum(message_lengths)
if cleaned_messages:
    avg_message_length = total_chars / len(cleaned_messages)
else:
    avg_message_length = 0

print(f"Total characters in all messages: {total_chars:,}")
print(f"Average message length: {avg_message_length:.1f} characters")

## 5. Extract Date and Time Components

Split each date string into Day, Month, and Year, derive the Day of Week with `datetime`, and carry the time string through unchanged.

In [None]:
# Turn each cleaned message into a flat record with date components
processed_data = []

for msg in cleaned_messages:
    components = parse_datetime_components(msg['date_str'], msg['time_str'])
    day_of_week, day, month, year, time = components

    # A missing day of week signals that the date could not be parsed
    if day_of_week is None:
        print(f"Warning: Could not parse date/time for message at line {msg['line_num']}")
        continue

    record = {
        'DoW': day_of_week,
        'Day': day,
        'Month': month,
        'Year': year,
        'Time': time,
        'User': msg['user'],
        'text_message': msg['message'],
    }
    processed_data.append(record)

print(f"Successfully processed {len(processed_data)} messages with date/time components.")

# Preview the first three records, truncating long message text to 50 characters
if processed_data:
    print("\nSample processed data:")
    for i, data in enumerate(processed_data[:3]):
        print(f"Record {i+1}:")
        for key, value in data.items():
            shown = value
            if key == 'text_message' and len(value) > 50:
                shown = value[:50] + '...'
            print(f"  {key}: {shown}")
        print()

## 6. Create Final DataFrame

Construct the pandas DataFrame with columns: `DoW` (day of week), `Day`, `Month`, `Year`, `Time`, `User`, and `text_message`.

In [None]:
# Create the pandas DataFrame.
# Passing `columns` to the constructor fixes the column order and also yields
# an empty frame with the expected columns when no message was processed;
# selecting the columns afterwards would raise a KeyError in that case.
column_order = ['DoW', 'Day', 'Month', 'Year', 'Time', 'User', 'text_message']
df = pd.DataFrame(processed_data, columns=column_order)

# Display basic information about the DataFrame
print("DataFrame created successfully!")
print(f"Shape: {df.shape}")
print(f"Columns: {list(df.columns)}")

# Show the first few rows
print("\nFirst 5 rows:")
print(df.head())

# Show data types
print("\nData types:")
print(df.dtypes)

## 7. Data Validation and Preview

Validate the parsed data, check for any parsing errors, and display sample rows and basic statistics of the resulting DataFrame.

In [None]:
# Data validation and statistics
print("=== DATA VALIDATION ===")

# Null counts for every column
print("Missing values per column:")
print(df.isnull().sum())
print()

# Number of messages sent by each user
user_counts = df['User'].value_counts()
print("Unique users:")
print(user_counts)
print()

# Distinct years and months present in the data
print("Date range:")
print(f"Years: {sorted(df['Year'].unique())}")
print(f"Months: {sorted(df['Month'].unique())}")
print()

# Message statistics
print("=== MESSAGE STATISTICS ===")
total_messages = len(df)
print(f"Total messages: {total_messages:,}")
print("Messages per user:")
for user, count in user_counts.items():
    share = count / total_messages * 100
    print(f"  {user}: {count:,} ({share:.1f}%)")
print()

# Text length statistics (helper column, removed again at the end of the cell)
df['message_length'] = df['text_message'].str.len()
lengths = df['message_length']
print("Message length statistics:")
print(f"  Average: {lengths.mean():.1f} characters")
print(f"  Median: {lengths.median():.1f} characters")
print(f"  Max: {lengths.max():,} characters")
print(f"  Min: {lengths.min()} characters")
print()

# Day of week distribution, listed Monday through Sunday
print("Messages by day of week:")
day_counts = df['DoW'].value_counts()
day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
for day in day_order:
    # Days without any message have no entry in the counts
    if day not in day_counts:
        continue
    count = day_counts[day]
    print(f"  {day}: {count:,} ({count/len(df)*100:.1f}%)")

# Sample of longest and shortest messages
print("\n=== SAMPLE MESSAGES ===")
print("Longest message:")
longest_idx = lengths.idxmax()
longest_msg = df.loc[longest_idx]
longest_text = longest_msg['text_message']
suffix = '...' if len(longest_text) > 200 else ''
print(f"User: {longest_msg['User']}")
print(f"Date: {longest_msg['Day']}/{longest_msg['Month']}/{longest_msg['Year']}")
print(f"Length: {longest_msg['message_length']} characters")
print(f"Message: {longest_text[:200]}{suffix}")
print()

print("Shortest non-empty message:")
non_empty = df[lengths > 0]
if not non_empty.empty:
    shortest_idx = non_empty['message_length'].idxmin()
    shortest_msg = df.loc[shortest_idx]
    print(f"User: {shortest_msg['User']}")
    print(f"Date: {shortest_msg['Day']}/{shortest_msg['Month']}/{shortest_msg['Year']}")
    print(f"Length: {shortest_msg['message_length']} characters")
    print(f"Message: '{shortest_msg['text_message']}'")

# Remove the helper column again
df = df.drop(columns='message_length')

In [None]:
# Write the DataFrame to the CSV path from the config, without the index
# column and using the encoding configured for parsing
output_filename = OUTPUT_FILE_PATH
csv_encoding = config['parsing']['encoding']
df.to_csv(output_filename, index=False, encoding=csv_encoding)
print(f"DataFrame saved to: {output_filename}")

# Final summary of what was written
print(f"\nFinal DataFrame shape: {df.shape}")
print("Ready for analysis!")

# Preview the first ten rows of the saved data
print("\nSample of final data:")
print(df.head(10))