# WhatsApp Chat Parsing
This notebook demonstrates how to import and parse a WhatsApp chat export file to extract the date, time, sender's number, and message content.

In [1]:
# Import required libraries
import math
import re

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from sklearn.linear_model import LinearRegression

from data_cleaning import import_and_clean_chat
from plotting import plot_beer_counts, estimate_time_to_million_beers

from dotenv import load_dotenv
load_dotenv()

%load_ext autoreload
%autoreload 2

## Read the chat file
Load the exported chat archive and clean it into a DataFrame for analysis.

In [2]:
# WhatsApp export archive to analyse (a relative path to a .zip file).
file_path = "WhatsApp Chat with 1 Million Beers 20250620 0837.zip"

# Parse and clean the export with the project helper from data_cleaning.
# NOTE(review): presumably returns one row per message with the columns shown
# in the next cell's output — confirm against data_cleaning.import_and_clean_chat.
chat_df = import_and_clean_chat(file_path)



In [3]:
# Display the cleaned chat as this cell's output.
# NOTE(review): the rendered table includes senders' phone numbers — consider
# clearing or redacting this output before sharing the notebook.
chat_df

Unnamed: 0,date,time,number,message,flag,n_beers,n_added,datetime,hour,error_low,error_high,outlier,error
10,25/05/2025,13:25,+44 7460 901716,23703,contains number,23703.0,5,2025-05-25 13:25:00,2025-05-25 13:00:00,False,False,False,none
14,25/05/2025,13:28,+44 7897 909912,23709,contains number,23709.0,5,2025-05-25 13:28:00,2025-05-25 13:00:00,False,False,False,none
17,25/05/2025,13:35,+44 7415 324822,23711,contains number,23711.0,5,2025-05-25 13:35:00,2025-05-25 13:00:00,False,False,False,none
21,25/05/2025,13:37,+44 7412 898559,23712,contains number,23712.0,5,2025-05-25 13:37:00,2025-05-25 13:00:00,False,False,False,none
27,25/05/2025,13:40,+44 7966 072012,23717,contains number,23717.0,5,2025-05-25 13:40:00,2025-05-25 13:00:00,False,False,False,none
...,...,...,...,...,...,...,...,...,...,...,...,...,...
11980,20/06/2025,06:51,+44 7897 909912,35078,contains number,35078.0,227,2025-06-20 06:51:00,2025-06-20 06:00:00,False,False,False,none
11981,20/06/2025,06:58,+44 7875 595885,That’s 10k in 3 weeks,contains number,3.0,227,2025-06-20 06:58:00,2025-06-20 06:00:00,True,False,False,low
11984,20/06/2025,07:00,+61 400 523 189,35079,contains number,35079.0,227,2025-06-20 07:00:00,2025-06-20 07:00:00,False,False,False,none
11985,20/06/2025,07:03,+44 7897 909912,December 2030,contains number,2030.0,227,2025-06-20 07:03:00,2025-06-20 07:00:00,True,False,False,low


In [4]:
# Plot the beer counts from the cleaned chat using the project plotting helper.
# NOTE(review): color='error' presumably colours points by the 'error' column
# ('none' / 'low' / ...) so flagged rows stand out — confirm in plotting.plot_beer_counts.
fig = plot_beer_counts(chat_df, color='error')

fig.show()

In [5]:
# Drop the outliers that remain after cleaning, identified by their timestamps.
manual_outlier_times = [
    pd.Timestamp('2025-06-14 17:43:00'),
    pd.Timestamp('2025-06-19 23:45:00'),
]
chat_df = chat_df[~chat_df['datetime'].isin(manual_outlier_times)]

## Rate of Beers
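A linear regression of the running beer count against hours elapsed estimates the group's rate in beers per hour; the fitted line is overlaid on the cleaned counts below.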

In [6]:
# Estimate the rate of beers with a linear fit (scikit-learn) on the rows
# that the cleaning step did not flag as errors.
data_cleaned = chat_df[chat_df['error'] == 'none'].copy()

# X: hours elapsed since the earliest 'hour' value, as a column vector
# (scikit-learn expects a 2-D feature array); y: the running beer count.
hours_since_start = (data_cleaned['hour'] - data_cleaned['hour'].min()).dt.total_seconds() / 3600
X = hours_since_start.to_numpy().reshape(-1, 1)
y = data_cleaned['n_beers'].to_numpy()

# Fit the regression; the slope is the rate in beers per hour.
model = LinearRegression()
model.fit(X, y)
rate_per_hour = model.coef_[0]
intercept = model.intercept_

print(f"Estimated rate of beers per hour: {rate_per_hour:.2f}")
print(f"Intercept: {intercept:.2f}")

# Store the fitted values next to the data for plotting. data_cleaned is a
# copy, so chat_df is left untouched.
data_cleaned['linear_estimate'] = model.predict(X)

# Plot the cleaned counts, then overlay the fitted line as a dashed trace.
fig = plot_beer_counts(data_cleaned, title='n_beers 🍻 (cleaned outliers) with Linear Forecast')

fig.add_trace(go.Scatter(
    x=data_cleaned['datetime'],
    y=data_cleaned['linear_estimate'],
    mode='lines',
    name='Linear Forecast',
    line=dict(dash='dash')
))

# Anchor the legend to the bottom-right corner of the plot area.
fig.update_layout(legend=dict(x=1, y=0, traceorder='normal', xanchor='right', yanchor='bottom'))
fig.show()

Estimated rate of beers per hour: 19.42
Intercept: 23621.05


In [7]:
# Next round 10k: the first multiple of 10,000 strictly above the current count.
# floor(...) + 1 is used instead of ceil(...) so that a count sitting exactly on
# a multiple (e.g. 40,000) targets the following milestone rather than itself.
start = data_cleaned['n_beers'].max()
next_10k = (math.floor(start / 10000) + 1) * 10000
rate_per_hour, days_needed, years_needed, date_estimate, start, target = estimate_time_to_million_beers(data_cleaned, rate_per_hour, next_10k)
print(f"At a rate of {rate_per_hour:.2f} beers/hour, it will take approximately {days_needed:.1f} days to go from {start:,.0f} to {target:,} beers.")

# Target 1,000,000: no explicit target is passed, so the helper's own default
# applies (the printed output reports 1,000,000).
rate_per_hour, days_needed, years_needed, date_estimate, start, target = estimate_time_to_million_beers(data_cleaned, rate_per_hour)
print(f"At a rate of {rate_per_hour:.2f} beers/hour, it will take approximately {years_needed:.1f} years ({date_estimate.date()}) to reach {target:,} beers (from {start:,.0f}).")

At a rate of 19.42 beers/hour, it will take approximately 10.6 days to go from 35,079 to 40,000 beers.
At a rate of 19.42 beers/hour, it will take approximately 5.7 years (2031-02-19) to reach 1,000,000 beers (from 35,079).
