# WhatsApp Chat Parsing
This notebook demonstrates how to import and parse a WhatsApp chat export file to extract the date, time, sender's number, and message content.

In [1]:
# Import required libraries
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from sklearn.linear_model import LinearRegression

from data_cleaning import beer_errors, flag_outliers
from plotting import plot_beer_counts
from whatsapp_parser import load_whatsapp_chat, parse_chat_lines, process_chat_data

# Define the path to the WhatsApp chat file
# (relative path — presumably resolved against the kernel's working directory;
# confirm how load_whatsapp_chat opens it)
file_path = 'WhatsApp Chat with 1 Million Beers.txt'

# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up before each cell runs, without restarting the kernel.
%load_ext autoreload
%autoreload 2

## Read the chat file
Read the chat file into a list of lines for processing.

In [2]:
# Read the chat file
# load_whatsapp_chat is a project helper imported from whatsapp_parser; its
# result is handed to parse_chat_lines in the parsing cell.
lines = load_whatsapp_chat(file_path)

## Parse the chat lines
Extract the date, time, sender's number, and message from each line using regular expressions. This assumes the standard chat export format with day-first dates (e.g., "DD/MM/YYYY, HH:MM - Number: Message"). Rows without a parsed beer count (`n_beers`) are then dropped.

In [3]:
# Build the working DataFrame in one expression:
#   1. parse the raw lines,
#   2. run the project's post-processing step,
#   3. keep only the rows that have an n_beers value.
chat_df = process_chat_data(parse_chat_lines(lines)).dropna(subset=["n_beers"])

# Last expression of the cell: display the resulting frame
chat_df

  chat_df['datetime'] = pd.to_datetime(chat_df['date'] + ' ' + chat_df['time'], errors='coerce')


Unnamed: 0,date,time,number,message,flag,n_beers,n_added,datetime,hour
10,25/05/2025,13:25,+44 7460 901716,23703,contains number,23703.0,5,2025-05-25 13:25:00,2025-05-25 13:00:00
14,25/05/2025,13:28,+44 7897 909912,23709,contains number,23709.0,5,2025-05-25 13:28:00,2025-05-25 13:00:00
17,25/05/2025,13:35,+44 7415 324822,23711,contains number,23711.0,5,2025-05-25 13:35:00,2025-05-25 13:00:00
21,25/05/2025,13:37,+44 7412 898559,23712,contains number,23712.0,5,2025-05-25 13:37:00,2025-05-25 13:00:00
27,25/05/2025,13:40,+44 7966 072012,23717,contains number,23717.0,5,2025-05-25 13:40:00,2025-05-25 13:00:00
...,...,...,...,...,...,...,...,...,...
6438,07/06/2025,10:09,+44 7523 047707,29565,contains number,29565.0,165,2025-06-07 10:09:00,2025-06-07 10:00:00
6440,07/06/2025,10:33,+44 7956 904383,29569,contains number,29569.0,165,2025-06-07 10:33:00,2025-06-07 10:00:00
6444,07/06/2025,11:06,+31 6 36581481,29572,contains number,29572.0,165,2025-06-07 11:06:00,2025-06-07 11:00:00
6446,07/06/2025,11:11,+44 7956 904383,29578,contains number,29578.0,165,2025-06-07 11:11:00,2025-06-07 11:00:00


In [4]:
# Count how many rows carry each value of the 'flag' column and display the tally
flag_counts = chat_df['flag'].value_counts()
flag_counts

flag
contains number    1221
Name: count, dtype: int64

### Data Cleaning

In [5]:
# Baseline view: call the project's plotting helper on the data before the
# error/outlier flags are added in the next cell.
plot_beer_counts(chat_df)

In [6]:
# Run both cleaning helpers in sequence: beer_errors marks counts that are too
# high or too low, then flag_outliers marks outliers based on a linear trend.
# (They are expected to provide the 'error_high', 'error_low' and 'outlier'
# columns read below — confirm in data_cleaning.py.)
chat_df = flag_outliers(beer_errors(chat_df))

# Collapse the three boolean flags into a single label per row.
# np.select takes the first condition that is True, so the precedence is
# high > low > outlier; rows matching none of them are labelled 'none'.
error_conditions = [chat_df['error_high'], chat_df['error_low'], chat_df['outlier']]
error_labels = ['high', 'low', 'outlier']
chat_df['error'] = np.select(error_conditions, error_labels, default='none')


Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`



In [7]:
# Same plotting helper as before, now passing color='error' — presumably this
# colors each point by the 'error' label assigned in the cleaning cell
# (confirm in plotting.py).
plot_beer_counts(chat_df, color='error')

## Rate of Beers
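Estimate the rate at which beers are being logged by fitting a linear regression of the beer count (`n_beers`) against the hours elapsed since the first observation, using only the rows that were not flagged as errors or outliers.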

In [8]:
# Linear estimate of the rate of beers per hour, fitted with scikit-learn on
# the rows that were not flagged as errors or outliers.
# LinearRegression and plotly.graph_objects (go) come from the notebook's
# import cell.

# Work on an explicit copy so adding a column below does not touch chat_df
data_cleaned = chat_df[chat_df['error'] == 'none'].copy()

# Prepare X as hours since start, y as n_beers.
# 'hour' appears to be the message timestamp truncated to the hour (see the
# table output earlier), so X takes whole-hour values.
# reshape(-1, 1) produces the 2-D (n_samples, 1) array that fit() requires.
X = (data_cleaned['hour'] - data_cleaned['hour'].min()).dt.total_seconds().values.reshape(-1, 1) / 3600
y = data_cleaned['n_beers'].values

# Fit linear regression: the slope is the rate in beers per hour, and the
# intercept is the fitted count at the first hour (X = 0)
model = LinearRegression()
model.fit(X, y)
rate_per_hour = model.coef_[0]
intercept = model.intercept_

print(f"Estimated rate of beers per hour: {rate_per_hour:.2f}")
print(f"Intercept: {intercept:.2f}")

# Add linear estimate to DataFrame for plotting
data_cleaned['linear_estimate'] = model.predict(X)

# Plot the observed counts against the fitted line
fig = go.Figure()
fig.add_trace(go.Scatter(x=data_cleaned['hour'], y=data_cleaned['n_beers'], mode='markers', name='Actual'))
fig.add_trace(go.Scatter(x=data_cleaned['hour'], y=data_cleaned['linear_estimate'], mode='lines', name='Linear Estimate (sklearn)', line=dict(dash='dash')))
fig.update_layout(
    title='n_beers per Hour (No Outliers) with Linear Estimate (sklearn)',
    xaxis_title='Hour',
    yaxis_title='n_beers',
    plot_bgcolor='white',
    paper_bgcolor='white',
    font=dict(color='black'),
    # legend bottom right, on top of the plot
    legend=dict(x=1, y=0, traceorder='normal', xanchor='right', yanchor='bottom'),
)
fig.show()

Estimated rate of beers per hour: 18.31
Intercept: 23698.66


In [9]:
# Calculate how many days it takes to go from `start` to `end` beers at the
# estimated rate.
# NOTE(review): `start` is a hard-coded snapshot of the running total; update
# it (or derive it from the data) whenever the chat export is refreshed.
start = 29584
end = 30000
beers_needed = end - start
# A non-positive rate would never reach the target, so report NaN instead
hours_needed = beers_needed / rate_per_hour if rate_per_hour > 0 else np.nan
days_needed = hours_needed / 24
# Format the message from the variables so it cannot drift from the values used
print(
    f"At a rate of {rate_per_hour:.2f} beers/hour, it will take approximately "
    f"{days_needed:.2f} days to go from {start:,} to {end:,} beers."
)
days_needed

At a rate of 18.31 beers/hour, it will take approximately 0.95 days to go from 29,584 to 30,000 beers.


0.9468764753665649