# WhatsApp Chat Parsing
This notebook demonstrates how to import and parse a WhatsApp chat export file to extract the date, time, sender's number, and message content.

In [1]:
# Import required libraries
import pandas as pd
import re
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
from whatsapp_parser import load_whatsapp_chat, load_whatsapp_chat_from_bucket, parse_chat_lines, process_chat_data
from plotting import plot_beer_counts
from data_cleaning import beer_errors, flag_outliers

from dotenv import load_dotenv
load_dotenv()

%load_ext autoreload
%autoreload 2

## Read the chat file
Read the chat file into a list of lines for processing.

In [2]:
file_path = "WhatsApp Chat with 1 Million Beers 20250615 1357.zip"
lines = load_whatsapp_chat_from_bucket('1-million-beers', file_path)

# file_path = 'WhatsApp Chat with 1 Million Beers.txt'
# lines = load_whatsapp_chat(file_path)



## Parse the chat lines
Extract the date, time, sender's number, and message from each line using regular expressions. This assumes the chat export format is standard (e.g., "MM/DD/YY, HH:MM - Number: Message").

In [3]:
# Parse chat lines and process the data
chat_df = parse_chat_lines(lines)
chat_df = process_chat_data(chat_df)

# Filter for rows with a valid n_beers value
chat_df = chat_df.dropna(subset=["n_beers"])

chat_df

  chat_df['datetime'] = pd.to_datetime(chat_df['date'] + ' ' + chat_df['time'], errors='coerce')


Unnamed: 0,date,time,number,message,flag,n_beers,n_added,datetime,hour
10,25/05/2025,13:25,+44 7460 901716,23703,contains number,23703.0,5,2025-05-25 13:25:00,2025-05-25 13:00:00
14,25/05/2025,13:28,+44 7897 909912,23709,contains number,23709.0,5,2025-05-25 13:28:00,2025-05-25 13:00:00
17,25/05/2025,13:35,+44 7415 324822,23711,contains number,23711.0,5,2025-05-25 13:35:00,2025-05-25 13:00:00
21,25/05/2025,13:37,+44 7412 898559,23712,contains number,23712.0,5,2025-05-25 13:37:00,2025-05-25 13:00:00
27,25/05/2025,13:40,+44 7966 072012,23717,contains number,23717.0,5,2025-05-25 13:40:00,2025-05-25 13:00:00
...,...,...,...,...,...,...,...,...,...
10481,15/06/2025,13:11,+44 7443 571912,33704,contains number,33704.0,207,2025-06-15 13:11:00,2025-06-15 13:00:00
10484,15/06/2025,13:16,+44 7709 110129,33706,contains number,33706.0,207,2025-06-15 13:16:00,2025-06-15 13:00:00
10488,15/06/2025,13:25,Henry,33708,contains number,33708.0,207,2025-06-15 13:25:00,2025-06-15 13:00:00
10494,15/06/2025,13:47,+31 6 42948260,33715,contains number,33715.0,207,2025-06-15 13:47:00,2025-06-15 13:00:00


In [4]:
# Show flag counts
chat_df['flag'].value_counts()

flag
contains number    2027
Name: count, dtype: int64

### Data Cleaning

In [5]:
plot_beer_counts(chat_df)

  v = v.dt.to_pydatetime()


In [6]:
# Errors where n_beers is too high or too low
chat_df = beer_errors(chat_df)

# Flag outliers based a linear trend
chat_df = flag_outliers(chat_df)

chat_df['error'] = np.select(
    [chat_df['error_high'], chat_df['error_low'], chat_df['outlier']],
    ['high', 'low', 'outlier'],
    default='none'
)


Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`



In [7]:
plot_beer_counts(chat_df, color='error')




The behavior of DatetimeProperties.to_pydatetime is deprecated, in a future version this will return a Series containing python datetime objects instead of an ndarray. To retain the old behavior, call `np.array` on the result


The behavior of DatetimeProperties.to_pydatetime is deprecated, in a future version this will return a Series containing python datetime objects instead of an ndarray. To retain the old behavior, call `np.array` on the result


The behavior of DatetimeProperties.to_pydatetime is deprecated, in a future version this will return a Series containing python datetime objects instead of an ndarray. To retain the old behavior, call `np.array` on the result


The behavior of DatetimeProperties.to_pydatetime is deprecated, in a future version this will return a Series containing python datetime objects instead of an ndarray. To retain the old behavior, call `np.array` on the result



## Rate of Beers
The first few parsed messages are shown below.

In [8]:
# Linear estimate of rate of beers using sklearn and added_df_no_outliers
from sklearn.linear_model import LinearRegression
import numpy as np

data_cleaned = chat_df[chat_df['error']=='none'].copy()

# Prepare X as hours since start, y as n_beers
X = (data_cleaned['hour'] - data_cleaned['hour'].min()).dt.total_seconds().values.reshape(-1, 1) / 3600
y = data_cleaned['n_beers'].values

# Fit linear regression
model = LinearRegression()
model.fit(X, y)
rate_per_hour = model.coef_[0]
intercept = model.intercept_

print(f"Estimated rate of beers per hour: {rate_per_hour:.2f}")
print(f"Intercept: {intercept:.2f}")

# Add linear estimate to DataFrame for plotting
data_cleaned.loc[:,'linear_estimate'] = model.predict(X)

# Create plot using plot_beer_counts and add linear estimate
fig = plot_beer_counts(data_cleaned, title='n_beers 🍻 (cleaned outliers) with Linear Forecast')

# Add linear estimate line
import plotly.graph_objects as go
fig.add_trace(go.Scatter(
    x=data_cleaned['datetime'], 
    y=data_cleaned['linear_estimate'], 
    mode='lines', 
    name='Linear Estimate (sklearn)',
    line=dict(dash='dash')
))

# Update legend position
fig.update_layout(legend=dict(x=1, y=0, traceorder='normal', xanchor='right', yanchor='bottom'))
fig.show()

Estimated rate of beers per hour: 19.78
Intercept: 23543.27



The behavior of DatetimeProperties.to_pydatetime is deprecated, in a future version this will return a Series containing python datetime objects instead of an ndarray. To retain the old behavior, call `np.array` on the result


The behavior of DatetimeProperties.to_pydatetime is deprecated, in a future version this will return a Series containing python datetime objects instead of an ndarray. To retain the old behavior, call `np.array` on the result



In [9]:
data_cleaned.tail()

Unnamed: 0,date,time,number,message,flag,n_beers,n_added,datetime,hour,error_high,error_low,outlier,error,linear_estimate
10451,15/06/2025,11:28,+44 7949 773920,33680,contains number,33680.0,207,2025-06-15 11:28:00,2025-06-15 11:00:00,False,False,False,none,33470.679419
10468,15/06/2025,12:17,+44 7824 726624,33692,contains number,33692.0,207,2025-06-15 12:17:00,2025-06-15 12:00:00,False,False,False,none,33490.455131
10478,15/06/2025,12:51,+44 7824 726624,33702,contains number,33702.0,207,2025-06-15 12:51:00,2025-06-15 12:00:00,False,False,False,none,33490.455131
10481,15/06/2025,13:11,+44 7443 571912,33704,contains number,33704.0,207,2025-06-15 13:11:00,2025-06-15 13:00:00,False,False,False,none,33510.230844
10484,15/06/2025,13:16,+44 7709 110129,33706,contains number,33706.0,207,2025-06-15 13:16:00,2025-06-15 13:00:00,False,False,False,none,33510.230844


In [10]:
# Calculate days to reach 1,000,000 from current max n_beers
start = data_cleaned['n_beers'].max()
end = 1000000
beers_needed = end - start
hours_needed = beers_needed / rate_per_hour if rate_per_hour > 0 else np.nan
days_needed = hours_needed / 24
years_needed = days_needed / 365
date_estimate = chat_df['datetime'].max() + pd.Timedelta(days=days_needed)
print(f"At a rate of {rate_per_hour:.2f} beers/hour, it will take approximately {days_needed:.1f} days to go from {start:,.0f} to 1,000,000 beers.")
print(f"At a rate of {rate_per_hour:.2f} beers/hour, it will take approximately {years_needed:.1f} years ({date_estimate.date()}) to reach 1,000,000 beers (from {start:,.0f}).")

At a rate of 19.78 beers/hour, it will take approximately 2035.5 days to go from 33,923 to 1,000,000 beers.
At a rate of 19.78 beers/hour, it will take approximately 5.6 years (2031-01-11) to reach 1,000,000 beers (from 33,923).
