# WhatsApp Chat Parsing
This notebook demonstrates how to import and parse a WhatsApp chat export file to extract the date, time, sender's number, and message content.

In [1]:
# Import required libraries
import pandas as pd
import re
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
from whatsapp_parser import load_whatsapp_chat, load_whatsapp_chat_from_bucket, parse_chat_lines, process_chat_data
from plotting import plot_beer_counts
from data_cleaning import beer_errors, flag_outliers

from dotenv import load_dotenv
# Read key/value pairs from a local .env file into the process environment.
# NOTE(review): presumably this supplies the credentials used by
# load_whatsapp_chat_from_bucket - confirm against whatsapp_parser.
load_dotenv()

# Re-import edited modules automatically before each cell runs, so changes to the
# local helper modules (whatsapp_parser, plotting, data_cleaning) take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

## Read the chat file
Read the chat file into a list of lines for processing.

In [2]:
# Fetch the zipped chat export from the '1-million-beers' bucket.
bucket_name = '1-million-beers'
file_path = "WhatsApp Chat with 1 Million Beers 20250608 1306.zip"
lines = load_whatsapp_chat_from_bucket(bucket_name, file_path)

# Alternative (disabled): read a local plain-text export instead of the bucket copy.
# file_path = 'WhatsApp Chat with 1 Million Beers.txt'
# lines = load_whatsapp_chat(file_path)



## Parse the chat lines
Extract the date, time, sender's number, and message from each line using regular expressions. This assumes the standard chat export format with day-first dates (e.g., "DD/MM/YYYY, HH:MM - Number: Message").

In [3]:
# Parse the raw lines, run the project's processing step, and keep only the
# rows where a beer count (n_beers) is present.
# NOTE(review): in the saved output, row 7434 has message "30657" but
# n_beers 30.0 - worth checking how process_chat_data extracts the number.
chat_df = (
    process_chat_data(parse_chat_lines(lines))
    .dropna(subset=["n_beers"])
)

chat_df

  chat_df['datetime'] = pd.to_datetime(chat_df['date'] + ' ' + chat_df['time'], errors='coerce')


Unnamed: 0,date,time,number,message,flag,n_beers,n_added,datetime,hour
10,25/05/2025,13:25,+44 7460 901716,23703,contains number,23703.0,5,2025-05-25 13:25:00,2025-05-25 13:00:00
14,25/05/2025,13:28,+44 7897 909912,23709,contains number,23709.0,5,2025-05-25 13:28:00,2025-05-25 13:00:00
17,25/05/2025,13:35,+44 7415 324822,23711,contains number,23711.0,5,2025-05-25 13:35:00,2025-05-25 13:00:00
21,25/05/2025,13:37,+44 7412 898559,23712,contains number,23712.0,5,2025-05-25 13:37:00,2025-05-25 13:00:00
27,25/05/2025,13:40,+44 7966 072012,23717,contains number,23717.0,5,2025-05-25 13:40:00,2025-05-25 13:00:00
...,...,...,...,...,...,...,...,...,...
7422,08/06/2025,10:07,+44 7990 704830,30643,contains number,30643.0,183,2025-06-08 10:07:00,2025-06-08 10:00:00
7430,08/06/2025,11:44,+31 6 42948260,30650,contains number,30650.0,183,2025-06-08 11:44:00,2025-06-08 11:00:00
7434,08/06/2025,12:21,+44 7949 558235,30657,contains number,30.0,183,2025-06-08 12:21:00,2025-06-08 12:00:00
7439,08/06/2025,12:36,+44 7415 324822,30664,contains number,30664.0,183,2025-06-08 12:36:00,2025-06-08 12:00:00


In [4]:
# Tally how many of the remaining rows carry each flag value
flag_counts = chat_df['flag'].value_counts()
flag_counts

flag
contains number    1411
Name: count, dtype: int64

### Data Cleaning

In [None]:
# Visual check of the counts before any cleaning is applied
# (plot_beer_counts is a project helper imported from plotting).
plot_beer_counts(chat_df)

  v = v.dt.to_pydatetime()


In [None]:
# Flag counts that are too high or too low (beer_errors), then flag outliers
# based on a linear trend (flag_outliers).
chat_df = flag_outliers(beer_errors(chat_df))

# Collapse the three flag columns into a single label. np.select takes the first
# condition that matches, so 'high' takes precedence over 'low', and 'low' over
# 'outlier'; rows matching none of them are labelled 'none'.
flag_columns = ['error_high', 'error_low', 'outlier']
flag_labels = ['high', 'low', 'outlier']
chat_df['error'] = np.select(
    [chat_df[column] for column in flag_columns],
    flag_labels,
    default='none',
)


Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`



In [None]:
# Same plot, now passing the 'error' label column ('high' / 'low' / 'outlier' / 'none')
# as the color argument - presumably colors the points by label; confirm in plotting.
plot_beer_counts(chat_df, color='error')




The behavior of DatetimeProperties.to_pydatetime is deprecated, in a future version this will return a Series containing python datetime objects instead of an ndarray. To retain the old behavior, call `np.array` on the result


The behavior of DatetimeProperties.to_pydatetime is deprecated, in a future version this will return a Series containing python datetime objects instead of an ndarray. To retain the old behavior, call `np.array` on the result


The behavior of DatetimeProperties.to_pydatetime is deprecated, in a future version this will return a Series containing python datetime objects instead of an ndarray. To retain the old behavior, call `np.array` on the result


The behavior of DatetimeProperties.to_pydatetime is deprecated, in a future version this will return a Series containing python datetime objects instead of an ndarray. To retain the old behavior, call `np.array` on the result



## Rate of Beers
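Fit a linear regression of the beer count against the hours elapsed since the first reading, using only the rows not flagged as errors, to estimate the rate of beers per hour. The fitted line is then plotted against the actual counts.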

In [None]:
# Straight-line estimate of the beer rate, fitted with sklearn on the rows
# whose 'error' label is 'none'.
from sklearn.linear_model import LinearRegression
import plotly.graph_objects as go

# Copy so that adding a column below does not write into chat_df
data_cleaned = chat_df.loc[chat_df['error'].eq('none')].copy()

# Feature: hours elapsed since the earliest 'hour' value; target: n_beers
elapsed = data_cleaned['hour'] - data_cleaned['hour'].min()
X = (elapsed.dt.total_seconds() / 3600).to_numpy().reshape(-1, 1)
y = data_cleaned['n_beers'].to_numpy()

# Fit the model; the slope is the estimated number of beers per hour
model = LinearRegression().fit(X, y)
rate_per_hour = model.coef_[0]
intercept = model.intercept_

print(f"Estimated rate of beers per hour: {rate_per_hour:.2f}")
print(f"Intercept: {intercept:.2f}")

# Store the fitted values next to the observations for plotting
data_cleaned['linear_estimate'] = model.predict(X)

fig = go.Figure(
    data=[
        go.Scatter(
            x=data_cleaned['hour'],
            y=data_cleaned['n_beers'],
            mode='markers',
            name='Actual',
        ),
        go.Scatter(
            x=data_cleaned['hour'],
            y=data_cleaned['linear_estimate'],
            mode='lines',
            name='Linear Estimate (sklearn)',
            line=dict(dash='dash'),
        ),
    ]
)
fig.update_layout(
    title='n_beers 🍻 (cleaned outliers) with Linear Forecast',
    xaxis_title='Hour',
    yaxis_title='n_beers 🍻',
    plot_bgcolor='white',
    paper_bgcolor='white',
    font=dict(color='black'),
    # Legend in the bottom-right corner, drawn on top of the plot area
    legend=dict(x=1, y=0, traceorder='normal', xanchor='right', yanchor='bottom'),
)
fig.show()


The behavior of DatetimeProperties.to_pydatetime is deprecated, in a future version this will return a Series containing python datetime objects instead of an ndarray. To retain the old behavior, call `np.array` on the result



Estimated rate of beers per hour: 19.27
Intercept: 23597.26


In [None]:
# Show the last rows of the cleaned data, including the linear_estimate column
data_cleaned.tail()

Unnamed: 0,date,time,number,message,flag,n_beers,n_added,datetime,hour,error_high,error_low,outlier,error,linear_estimate
7356,07/06/2025,23:33,+44 7824 726624,30447,contains number,30447.0,183,2025-06-07 23:33:00,2025-06-07 23:00:00,False,False,False,none,29801.461867
7382,08/06/2025,01:36,+353 83 321 3524,30468,contains number,30468.0,183,2025-06-08 01:36:00,2025-06-08 01:00:00,False,False,False,none,29839.997313
7385,08/06/2025,01:57,+44 7398 677620,30470,contains number,30470.0,183,2025-06-08 01:57:00,2025-06-08 01:00:00,False,False,False,none,29839.997313
7388,08/06/2025,03:38,+61 400 523 189,30475,contains number,30475.0,183,2025-06-08 03:38:00,2025-06-08 03:00:00,False,False,False,none,29878.532758
7418,08/06/2025,09:43,+44 7966 072012,30641,contains number,30641.0,183,2025-06-08 09:43:00,2025-06-08 09:00:00,False,False,False,none,29994.139095


In [None]:
# Project when the count reaches the target, starting from the highest cleaned
# reading and assuming the fitted linear rate stays constant.
end = 1_000_000

# Take the starting count and its timestamp from the same row of the cleaned data.
# Using chat_df's latest timestamp instead would anchor the projection on rows that
# were flagged as errors/outliers and excluded from the fit.
latest_reading = data_cleaned.iloc[data_cleaned['n_beers'].argmax()]
start = latest_reading['n_beers']
start_time = latest_reading['datetime']
beers_needed = end - start

if rate_per_hour > 0:
    hours_needed = beers_needed / rate_per_hour
    days_needed = hours_needed / 24
    years_needed = days_needed / 365  # calendar approximation; ignores leap days
    date_estimate = start_time + pd.Timedelta(days=days_needed)
    print(f"At a rate of {rate_per_hour:.2f} beers/hour, it will take approximately {days_needed:.1f} days to go from {start:,.0f} to {end:,} beers.")
    print(f"At a rate of {rate_per_hour:.2f} beers/hour, it will take approximately {years_needed:.1f} years ({date_estimate.date()}) to reach {end:,} beers (from {start:,.0f}).")
else:
    # A flat or negative trend never reaches the target. Keep every name defined
    # for later cells, but do not try to build or format a date from NaN.
    hours_needed = days_needed = years_needed = np.nan
    date_estimate = pd.NaT
    print(f"Estimated rate is {rate_per_hour:.2f} beers/hour; reaching {end:,} beers cannot be projected from a non-positive rate.")

At a rate of 19.27 beers/hour, it will take approximately 2096.2 days to go from 30,641 to 1,000,000 beers.
At a rate of 19.27 beers/hour, it will take approximately 5.7 years (2031-03-05) to reach 1,000,000 beers (from 30,641).


2096.249709604058