# WhatsApp Chat Parsing
This notebook demonstrates how to import and parse a WhatsApp chat export file to extract the date, time, sender's number, and message content.

In [1]:
# Import required libraries
import pandas as pd
import re
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px

# Define the path to the WhatsApp chat file
file_path = 'WhatsApp Chat with 1 Million Beers.txt'

## Read the chat file
Read the chat file into a list of lines for processing.

In [2]:
# Read the chat file
with open(file_path, 'r', encoding='utf-8') as f:
    lines = f.readlines()

## Parse the chat lines
Extract the date, time, sender's number, and message from each line using regular expressions. This assumes the chat export format is standard (e.g., "MM/DD/YY, HH:MM - Number: Message").

In [3]:
# Updated regex to match both messages with and without a colon after the sender
pattern = re.compile(r'^(\d{1,2}/\d{1,2}/\d{2,4}), (\d{1,2}:\d{2}) - ([^:]+?)(?:: (.*)| added .*)$')

parsed_data = []
for line in lines:
    match = pattern.match(line)
    if match:
        date, time, sender, message = match.groups()
        # If message is None, reconstruct for 'added' or similar system messages
        if message is None:
            # Remove date, time, and sender from the line to get the message
            message = line.split(' - ', 1)[-1].split(': ', 1)[-1] if ': ' in line else line.split(' - ', 1)[-1]
        parsed_data.append({
            'date': date,
            'time': time,
            'number': sender,
            'message': message.strip()
        })

# Convert to DataFrame
chat_df = pd.DataFrame(parsed_data)
chat_df

Unnamed: 0,date,time,number,message
0,25/05/2025,13:22,Walter Milez,Walter Milez added you
1,25/05/2025,13:22,Walter Milez,Walter Milez added Matt Powrie
2,25/05/2025,13:22,Walter Milez,Walter Milez added Frank Frankland
3,25/05/2025,13:22,+34 635 07 00 16,+34 635 07 00 16 added ~ A
4,25/05/2025,13:22,~ KA,~ KA added ~ Will
...,...,...,...,...
3674,01/06/2025,14:03,+44 7915 072000,27124
3675,01/06/2025,14:06,+44 7975 956822,<Media omitted>
3676,01/06/2025,14:08,+44 7946 365381,<Media omitted>
3677,01/06/2025,14:10,+44 7814 445887,<Media omitted>


In [4]:
def flag_message(msg):
    # Exclude numbers prefixed with @ from 'contains number' flag
    # Find all numbers not prefixed by @
    has_non_at_number = bool(re.search(r'(?<!@)(?<!@ )\b(\d+)\b', msg))
    if re.search(r'\badded\b', msg, re.IGNORECASE):
        return 'added'
    if re.search(r'\bremoved\b', msg, re.IGNORECASE):
        return 'removed'
    elif has_non_at_number:
        return 'contains number'
    else:
        return 'other'

chat_df['flag'] = chat_df['message'].apply(flag_message)

# Extract the first number from messages flagged as 'contains number'
def extract_number(msg):
    # Exclude numbers prefixed with @
    matches = re.findall(r'(?<!@)(?<!@ )\b(\d+)\b', msg)
    return int(matches[0]) if matches else None

chat_df['n_beers'] = chat_df.apply(lambda row: extract_number(row['message']) if row['flag'] == 'contains number' else None, axis=1)

chat_df['n_added'] = (chat_df['flag'] == 'added').cumsum() - (chat_df['flag'] == 'removed').cumsum()
chat_df.head()

Unnamed: 0,date,time,number,message,flag,n_beers,n_added
0,25/05/2025,13:22,Walter Milez,Walter Milez added you,added,,1
1,25/05/2025,13:22,Walter Milez,Walter Milez added Matt Powrie,added,,2
2,25/05/2025,13:22,Walter Milez,Walter Milez added Frank Frankland,added,,3
3,25/05/2025,13:22,+34 635 07 00 16,+34 635 07 00 16 added ~ A,added,,4
4,25/05/2025,13:22,~ KA,~ KA added ~ Will,added,,5


In [5]:
# Count
chat_df['flag'].value_counts()

flag
other              2925
contains number     677
added                77
Name: count, dtype: int64

## Rate of Beers
The first few parsed messages are shown below.

In [6]:
# Scatter plot of number and hour for 'added' rows using plotly

# Ensure 'datetime' column exists
chat_df['datetime'] = pd.to_datetime(chat_df['date'] + ' ' + chat_df['time'], errors='coerce')

# Filter for rows with a valid n_beers value
added_df = chat_df.dropna(subset=["n_beers"])
added_df = added_df.sort_values('datetime')
added_df['hour'] = added_df['datetime'].dt.floor('h')

# drop rows where n_beers is greater than 100_000
added_df = added_df[added_df['n_beers'] <= 100_000]


fig = px.scatter(
    added_df,
    x='hour',
    y='n_beers',
    labels={'hour': 'Hour', 'n_beers': '🍻'},
    title='Scatter Plot of Beers over time 🍻',
    hover_data=['message', 'n_beers'],
)
fig.update_layout(
    plot_bgcolor='white',
    paper_bgcolor='white',
    font=dict(color='black'),
    xaxis_title='Time',
    yaxis_title='Number of Beers 🍻'
)
fig.show()

  chat_df['datetime'] = pd.to_datetime(chat_df['date'] + ' ' + chat_df['time'], errors='coerce')


In [7]:
# Remove outliers from added_df based on linear trend
from sklearn.linear_model import LinearRegression

# Fit linear regression to hour vs n_beers
X = (added_df['hour'] - added_df['hour'].min()).dt.total_seconds().values.reshape(-1, 1) / 3600  # hours since start
y = added_df['n_beers'].values
model = LinearRegression()
model.fit(X, y)
pred = model.predict(X)
residuals = y - pred
std_resid = np.std(residuals)

# Keep only points within 3 standard deviations of the trend
added_df_no_outliers = added_df[np.abs(residuals) <= 3 * std_resid]

# Optionally, display or plot added_df_no_outliers
added_df_no_outliers.head()

Unnamed: 0,date,time,number,message,flag,n_beers,n_added,datetime,hour
10,25/05/2025,13:25,+44 7460 901716,23703,contains number,23703.0,5,2025-05-25 13:25:00,2025-05-25 13:00:00
14,25/05/2025,13:28,+44 7897 909912,23709,contains number,23709.0,5,2025-05-25 13:28:00,2025-05-25 13:00:00
17,25/05/2025,13:35,+44 7415 324822,23711,contains number,23711.0,5,2025-05-25 13:35:00,2025-05-25 13:00:00
21,25/05/2025,13:37,+44 7412 898559,23712,contains number,23712.0,5,2025-05-25 13:37:00,2025-05-25 13:00:00
27,25/05/2025,13:40,+44 7966 072012,23717,contains number,23717.0,5,2025-05-25 13:40:00,2025-05-25 13:00:00


In [8]:

fig = px.scatter(
    added_df_no_outliers,
    x='hour',
    y='n_beers',
    labels={'hour': 'Hour', 'n_beers': '🍻'},
    title='Scatter Plot of Beers over time 🍻',
    hover_data=['message', 'n_beers'],
)
fig.update_layout(
    plot_bgcolor='white',
    paper_bgcolor='white',
    font=dict(color='black'),
    xaxis_title='Time',
    yaxis_title='Number of Beers 🍻'
)
fig.show()

In [9]:
# Linear estimate of rate of beers using sklearn and added_df_no_outliers
from sklearn.linear_model import LinearRegression
import numpy as np

# Prepare X as hours since start, y as n_beers
X = (added_df_no_outliers['hour'] - added_df_no_outliers['hour'].min()).dt.total_seconds().values.reshape(-1, 1) / 3600
y = added_df_no_outliers['n_beers'].values

# Fit linear regression
model = LinearRegression()
model.fit(X, y)
rate_per_hour = model.coef_[0]
intercept = model.intercept_

print(f"Estimated rate of beers per hour: {rate_per_hour:.2f}")
print(f"Intercept: {intercept:.2f}")

# Add linear estimate to DataFrame for plotting
added_df_no_outliers['linear_estimate'] = model.predict(X)

import plotly.graph_objects as go
fig = go.Figure()
fig.add_trace(go.Scatter(x=added_df_no_outliers['hour'], y=added_df_no_outliers['n_beers'], mode='markers', name='Actual'))
fig.add_trace(go.Scatter(x=added_df_no_outliers['hour'], y=added_df_no_outliers['linear_estimate'], mode='lines', name='Linear Estimate (sklearn)', line=dict(dash='dash')))
fig.update_layout(
    title='n_beers per Hour (No Outliers) with Linear Estimate (sklearn)',
    xaxis_title='Hour',
    yaxis_title='n_beers',
    plot_bgcolor='white',
    paper_bgcolor='white',
    font=dict(color='black')
)
fig.show()

Estimated rate of beers per hour: 15.85
Intercept: 23765.36




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [11]:
# Calculate days to reach 30,000 from 29,584 at the estimated rate
start = 29584
end = 30000
beers_needed = end - start
hours_needed = beers_needed / rate_per_hour if rate_per_hour > 0 else np.nan
days_needed = hours_needed / 24
print(f"At a rate of {rate_per_hour:.2f} beers/hour, it will take approximately {days_needed:.2f} days to go from 29,584 to 30,000 beers.")
days_needed

At a rate of 15.85 beers/hour, it will take approximately 1.09 days to go from 29,584 to 30,000 beers.


1.0934727325730027