In [1]:
from pathlib import Path
from loguru import logger
import pandas as pd
from datetime import datetime

Read the config file and locate the preprocessed data file

In [2]:
import tomllib

# Parse the project configuration; tomllib needs the file opened in binary mode.
configfile = Path("../config.toml").resolve()
with open(configfile, mode="rb") as f:
    config = tomllib.load(f)

# The input file name is taken from the "inputpath" key of the config and is
# looked up inside the processed-data directory.
processed = Path("../data/processed")
datafile = processed.joinpath(config["inputpath"])

# A missing file is only logged as a warning here; nothing is raised.
if not datafile.exists():
    logger.warning(
        f"{datafile} does not exist. Maybe first run src/preprocess.py, or check the timestamp!"
    )

In [3]:
# Load the preprocessed chat data; the "timestamp" column is parsed as datetimes.
df = pd.read_csv(
    filepath_or_buffer=datafile,
    parse_dates=["timestamp"],
)
# Preview the first five rows.
# NOTE(review): this preview shows the un-anonymized author names; consider
# clearing the output before sharing the notebook.
df.head(n=5)

Unnamed: 0,timestamp,author,message
0,2017-05-12 17:43:00,Unknown,12-05-2017 17:43 - ‎Thies Jan Weijmans heeft d...
1,2017-05-12 17:43:00,Unknown,12-05-2017 17:43 - Je bent toegevoegd\n
2,2022-09-16 09:26:00,Bas hooge Venterink,Wachten op dit bericht\n
3,2022-09-16 07:31:00,Smeerbeer van Dijk,Wachten op dit bericht\n
4,2022-09-16 07:41:00,Schjöpschen,16-09-2022 07:41 - Schjöpschen: Wachten op dit...


Check the datatypes

In [4]:
# Inspect the column dtypes to confirm that "timestamp" was parsed as a datetime.
df.dtypes

timestamp    datetime64[ns]
author               object
message              object
dtype: object

Remove the leading tilde from author names

In [5]:
import re

# Matches a leading "~" followed by U+202F (narrow no-break space). The string
# is raw, so the \u202f escape is interpreted by the regex engine, not by Python.
clean_tilde = r"^~\u202f"

# Vectorized and NaN-safe: a missing author stays missing instead of raising
# TypeError inside re.sub. A compiled pattern is passed so that pandas always
# evaluates it with Python's `re` module, which understands the \u escape.
df["author"] = df["author"].str.replace(re.compile(clean_tilde), "", regex=True)

Check unique author names

In [6]:
# Number of distinct author names after the tilde cleanup.
len(df["author"].unique())

10

Anonymize authors

In [7]:
import json
from wa_analyzer.humanhasher import humanize

# Map every distinct author name to the pseudonym returned by `humanize`.
authors = df.author.unique()
anon = {k: humanize(k) for k in authors}

# Invert the mapping (pseudonym -> original name) and sort it alphabetically
# by pseudonym.
ref = {v: k for k, v in anon.items()}
ref_sorted = {k: ref[k] for k in sorted(ref)}

# `anon` always has exactly one entry per author, so comparing its length to
# `authors` can never fail. Authors are lost when two names receive the same
# pseudonym: they collapse into a single key of the inverted mapping. Check
# that mapping instead, and do it before anything is written to disk.
if len(ref) != len(authors):
    raise ValueError("you lost some authors!")

# we save a reference file so we can look up the original author names if we want to
# NOTE: this file contains the original (non-anonymized) names.
reference_file = processed / "anon_reference.json"
with open(reference_file, "w") as f:
    json.dump(ref_sorted, f)

In [8]:
# Look up the pseudonym of every row's author in the `anon` mapping and store
# it next to the original name.
df["anon_author"] = df["author"].map(anon)
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,timestamp,author,message,anon_author
0,2017-05-12 17:43:00,Unknown,12-05-2017 17:43 - ‎Thies Jan Weijmans heeft d...,glittering-penguin
1,2017-05-12 17:43:00,Unknown,12-05-2017 17:43 - Je bent toegevoegd\n,glittering-penguin
2,2022-09-16 09:26:00,Bas hooge Venterink,Wachten op dit bericht\n,eye-catching-wolf
3,2022-09-16 07:31:00,Smeerbeer van Dijk,Wachten op dit bericht\n,spattered-duck
4,2022-09-16 07:41:00,Schjöpschen,16-09-2022 07:41 - Schjöpschen: Wachten op dit...,riotous-dingo


Drop original author

In [9]:
# Remove the column holding the original author names. Reassigning the result
# replaces the in-place mutation (`inplace=True`), which pandas discourages.
df = df.drop(columns=["author"])

Check that the original author column is gone

In [None]:
# Show the first five rows to inspect the remaining columns.
df.head(n=5)

Rename the anonymized author column to `author`

In [10]:
# Give the pseudonym column the name "author". Reassigning the result replaces
# the in-place mutation (`inplace=True`), which pandas discourages.
df = df.rename(columns={"anon_author": "author"})

Remove the first row of the data, which is a generated (system) message rather than a real chat message

In [11]:
# Drop the row with index label 0. errors="ignore" makes the cell safe to
# re-run: once the label is gone the call is a no-op instead of a KeyError.
# NOTE(review): in the earlier preview, row 1 also looks like a system message
# from "Unknown" — confirm whether it should be removed as well.
df = df.drop(index=[0], errors="ignore")

Find emojis and add their presence as a feature

In [12]:
# Character class covering the Unicode blocks that contain emoji.
# The former catch-all range U+24C2..U+1F251 also matched CJK, Hangul and many
# other non-emoji characters, and stopped short of U+1F900, so common emoji
# such as U+1F923 were not detected. It is replaced by explicit blocks.
emoji_pattern = re.compile(
    "["
    "\U0001f600-\U0001f64f"  # emoticons
    "\U0001f300-\U0001f5ff"  # symbols & pictographs
    "\U0001f680-\U0001f6ff"  # transport & map symbols
    "\U0001f1e0-\U0001f1ff"  # flags (iOS)
    "\U0001f000-\U0001f251"  # tiles, playing cards, enclosed alphanumerics/ideographs
    "\U0001f7e0-\U0001f7eb"  # coloured circles and squares
    "\U0001f900-\U0001f9ff"  # supplemental symbols & pictographs
    "\U0001fa00-\U0001faff"  # chess symbols, symbols & pictographs extended-A
    "\U000025a0-\U000026ff"  # geometric shapes, miscellaneous symbols
    "\U00002700-\U000027bf"  # Dingbats
    "\U00002b00-\U00002bff"  # miscellaneous symbols and arrows
    "\U000024c2\U00002934\U00002935\U00003030\U0000303d\U00003297\U00003299"  # isolated BMP emoji
    "\U0000fe0f"  # variation selector-16 (emoji presentation)
    "]+",
    flags=re.UNICODE,
)


def has_emoji(text):
    """Return True when `text` contains at least one emoji character.

    Args:
        text: The message to inspect.

    Returns:
        bool: True if `emoji_pattern` matches anywhere in `text`. Non-string
        values (for example NaN for a missing message) yield False instead of
        raising TypeError.
    """
    if not isinstance(text, str):
        return False
    return emoji_pattern.search(text) is not None


# Flag every message that contains at least one emoji.
df["has_emoji"] = df["message"].map(has_emoji)

Create timestamp

In [13]:
import pytz

# Current time in the Europe/Amsterdam zone, formatted as YYYYMMDD-HHMMSS.
amsterdam_tz = pytz.timezone("Europe/Amsterdam")
now = datetime.now(tz=amsterdam_tz).strftime("%Y%m%d-%H%M%S")
now

'20250923-192055'

In [14]:
# Timestamped CSV path inside the processed-data directory.
output = processed.joinpath(f"whatsapp-{now}.csv")
output

PosixPath('../data/processed/whatsapp-20250923-192055.csv')

Save as Parquet and as CSV

In [15]:
# Write the cleaned frame twice under the same base name: once as CSV and once
# as Parquet (".parq" suffix). The index is left out of both files.
parquet_output = output.with_suffix(".parq")
df.to_csv(output, index=False)
df.to_parquet(parquet_output, index=False)