In [23]:
from pathlib import Path
from loguru import logger
import pandas as pd
from datetime import datetime

Read the configuration and locate the preprocessed input file

In [24]:
import tomllib

# Load the project configuration from the TOML file one directory up.
configfile = Path("../config.toml").resolve()
with configfile.open("rb") as f:
    config = tomllib.load(f)

# The input file name comes from the config and is looked up in the
# processed-data folder.
processed = Path("../data/processed")
datafile = processed.joinpath(config["inputpath"])

# Only warn when the input is missing; execution deliberately continues.
datafile_found = datafile.exists()
if not datafile_found:
    logger.warning(
        f"{datafile} does not exist. Maybe first run src/preprocess.py, or check the timestamp!"
    )

In [25]:
# Read the preprocessed chat export; the `timestamp` column is parsed to
# datetimes at load time.
df = pd.read_csv(
    datafile,
    parse_dates=["timestamp"],
)
df.head(5)

Unnamed: 0,timestamp,author,message
0,2017-05-12 17:43:00,Unknown,12-05-2017 17:43 - ‎Thies Jan Weijmans heeft d...
1,2017-05-12 17:43:00,Unknown,12-05-2017 17:43 - Je bent toegevoegd\n
2,2022-09-16 09:26:00,Bas hooge Venterink,Wachten op dit bericht\n
3,2022-09-16 07:31:00,Smeerbeer van Dijk,Wachten op dit bericht\n
4,2022-09-16 07:41:00,Schjöpschen,16-09-2022 07:41 - Schjöpschen: Wachten op dit...


In [26]:
# Non-null value count per column, as a baseline before any filtering.
df.count()

timestamp    10143
author       10143
message      10143
dtype: int64

Remove texts from Unknown author

In [27]:
# Keep only the rows whose author is something other than 'Unknown'.
is_known_author = df['author'].ne('Unknown')
df = df.loc[is_known_author]
df.head()

Unnamed: 0,timestamp,author,message
2,2022-09-16 09:26:00,Bas hooge Venterink,Wachten op dit bericht\n
3,2022-09-16 07:31:00,Smeerbeer van Dijk,Wachten op dit bericht\n
4,2022-09-16 07:41:00,Schjöpschen,16-09-2022 07:41 - Schjöpschen: Wachten op dit...
5,2022-09-16 08:21:00,Jop van der Woning,Wachten op dit bericht\n
6,2022-09-16 08:23:00,Thomas Grundel,Wachten op dit bericht\n


In [28]:
# Non-null value count per column after dropping the 'Unknown' author rows.
df.count()

timestamp    10137
author       10137
message      10137
dtype: int64

Remove rows whose message is only the placeholder "Wachten op dit bericht" (no actual message content)

In [30]:
# Flag rows whose message contains the text 'Wachten op dit bericht'.
# Missing messages count as "no match" (na=False), so they are kept.
is_placeholder = df['message'].str.contains('Wachten op dit bericht', na=False)
df = df[~is_placeholder]
df.head()

Unnamed: 0,timestamp,author,message
12,2022-09-17 16:05:00,Bas hooge Venterink,Mensen die vnv nog willen chillen?\n
13,2022-09-17 16:06:00,Thomas Grundel,Ik ben nog in Haarlem/Amsterdam\n
14,2022-09-17 16:06:00,Robert te Vaarwerk,Sorry man ik heb feestje van familie\n
15,2022-09-17 16:21:00,Bas hooge Venterink,Ai jammer\n
16,2022-09-17 16:22:00,Bas hooge Venterink,Andere gegadigden?\n


In [31]:
# Non-null value count per column after removing the placeholder messages.
df.count()

timestamp    10127
author       10127
message      10127
dtype: int64

Strip the leading tilde (and the narrow no-break space after it) from author names

In [32]:
import re

# A leading "~" followed by a narrow no-break space (U+202F) at the very start
# of an author name. The `\u202f` escape is resolved by the regex engine, not
# by Python, because this is a raw string.
clean_tilde = r"^~\u202f"

# Vectorised replacement instead of `apply(lambda ...)` with `re.sub`:
# same regex semantics, but a missing (NaN) author is passed through
# unchanged instead of raising a TypeError.
df["author"] = df["author"].str.replace(clean_tilde, "", regex=True)

Check unique author names

In [33]:
# List the distinct author names that remain, to eyeball the cleanup result.
print(df.author.unique())

['Bas hooge Venterink' 'Thomas Grundel' 'Robert te Vaarwerk' 'Weda'
 'Jop van der Woning' 'Smeerbeer van Dijk' 'Schjöpschen'
 'Thies Jan Weijmans' 'Spiderman Spin']


In [36]:
import numpy as np

# Authors that receive living_in_city = 1; every other author gets 0.
city_authors = [
    'Bas hooge Venterink',
    'Robert te Vaarwerk',
    'Spiderman Spin',
    'Thies Jan Weijmans',
    'Smeerbeer van Dijk',
]

# Membership test replaces the chain of equality comparisons.
df['living_in_city'] = np.where(df['author'].isin(city_authors), 1, 0)

df.head()

Unnamed: 0,timestamp,author,message,living_in_city
12,2022-09-17 16:05:00,Bas hooge Venterink,Mensen die vnv nog willen chillen?\n,1
13,2022-09-17 16:06:00,Thomas Grundel,Ik ben nog in Haarlem/Amsterdam\n,0
14,2022-09-17 16:06:00,Robert te Vaarwerk,Sorry man ik heb feestje van familie\n,1
15,2022-09-17 16:21:00,Bas hooge Venterink,Ai jammer\n,1
16,2022-09-17 16:22:00,Bas hooge Venterink,Andere gegadigden?\n,1


Check the datatypes

In [37]:
# Show the dtype of every column.
df.dtypes

timestamp         datetime64[ns]
author                    object
message                   object
living_in_city             int64
dtype: object

Anonymize authors

In [38]:
import json
from wa_analyzer.humanhasher import humanize

# Map every distinct author name to the pseudonym returned by `humanize`.
authors = df.author.unique()
anon = {k: humanize(k) for k in authors}

# Invert the mapping (pseudonym -> original name) so the original author names
# can be looked up later if needed.
ref = {v: k for k, v in anon.items()}

# `anon` is keyed by the unique authors, so its length can never differ from
# `authors`. An author is only "lost" when two names map to the same pseudonym,
# which collapses entries in the inverted dict. Check that before writing
# anything, so a colliding reference file never reaches disk.
if not len(ref) == len(authors):
    raise ValueError("you lost some authors!")

# Sort alphabetically by pseudonym for a stable, readable reference file.
ref_sorted = {k: ref[k] for k in sorted(ref.keys())}

# we save a reference file so we can look up the original author names if we want to
reference_file = processed / "anon_reference.json"
with open(reference_file, "w") as f:
    json.dump(ref_sorted, f)

In [39]:
# Look up each row's pseudonym in the `anon` mapping and store it alongside
# the original author column.
df["anon_author"] = df["author"].map(anon)
df.head()

Unnamed: 0,timestamp,author,message,living_in_city,anon_author
12,2022-09-17 16:05:00,Bas hooge Venterink,Mensen die vnv nog willen chillen?\n,1,eye-catching-wolf
13,2022-09-17 16:06:00,Thomas Grundel,Ik ben nog in Haarlem/Amsterdam\n,0,hypnotic-rabbit
14,2022-09-17 16:06:00,Robert te Vaarwerk,Sorry man ik heb feestje van familie\n,1,nutty-chough
15,2022-09-17 16:21:00,Bas hooge Venterink,Ai jammer\n,1,eye-catching-wolf
16,2022-09-17 16:22:00,Bas hooge Venterink,Andere gegadigden?\n,1,eye-catching-wolf


Drop original author

In [40]:
# Remove the real author names. Reassigning instead of `inplace=True` follows
# the pandas recommendation and keeps the statement chainable.
df = df.drop(columns=["author"])

Check that the original author column is gone

In [41]:
# Preview the first rows to confirm which columns remain.
df.head()

Unnamed: 0,timestamp,message,living_in_city,anon_author
12,2022-09-17 16:05:00,Mensen die vnv nog willen chillen?\n,1,eye-catching-wolf
13,2022-09-17 16:06:00,Ik ben nog in Haarlem/Amsterdam\n,0,hypnotic-rabbit
14,2022-09-17 16:06:00,Sorry man ik heb feestje van familie\n,1,nutty-chough
15,2022-09-17 16:21:00,Ai jammer\n,1,eye-catching-wolf
16,2022-09-17 16:22:00,Andere gegadigden?\n,1,eye-catching-wolf


Rename the `anon_author` column to `author`

In [42]:
# Rename the `anon_author` column to `author`. Reassigning instead of
# `inplace=True` follows the pandas recommendation.
df = df.rename(columns={"anon_author": "author"})

Find emojis and add their presence as a feature

In [44]:
# Character class covering the Unicode blocks that hold emoji.
#
# The former catch-all range U+24C2..U+1F251 also matched CJK ideographs,
# Hangul, private-use characters and more. It is split into the sub-ranges
# that actually contain emoji. The Supplemental Symbols and Extended-A
# blocks (e.g. U+1F923, U+1F914) were missing entirely and are added.
emoji_pattern = re.compile(
    "["
    "\U0001f600-\U0001f64f"  # emoticons
    "\U0001f300-\U0001f5ff"  # symbols & pictographs
    "\U0001f680-\U0001f6ff"  # transport & map symbols
    "\U0001f1e0-\U0001f1ff"  # flags (iOS)
    "\U0001f900-\U0001f9ff"  # supplemental symbols & pictographs
    "\U0001fa70-\U0001faff"  # symbols & pictographs extended-A
    "\U0001f000-\U0001f251"  # mahjong/cards, enclosed alphanumerics & ideographs
    "\U00002600-\U000027bf"  # miscellaneous symbols + Dingbats
    "\U000024c2"  # circled M
    "\U000025aa-\U000025fe"  # geometric shapes used as emoji
    "\U00002934-\U00002935"  # curved arrows
    "\U00002b05-\U00002b55"  # arrows, squares, star, circle
    "\U00003030\U0000303d\U00003297\U00003299"  # CJK-area emoji symbols
    "\U000020e3"  # combining enclosing keycap
    "\U0000fe0f"  # variation selector-16 (emoji presentation)
    "]+",
    flags=re.UNICODE,
)


def has_emoji(text):
    """Return True when `text` contains at least one emoji character.

    Args:
        text: The message to inspect. Anything that is not a `str`
            (e.g. NaN for a missing message, or None) is treated as
            containing no emoji instead of raising a TypeError.

    Returns:
        bool: True if `emoji_pattern` matches anywhere in `text`.
    """
    if not isinstance(text, str):
        return False
    return emoji_pattern.search(text) is not None


# Evaluate `has_emoji` for every message and store the result as a new column.
df["has_emoji"] = df["message"].map(has_emoji)

Create timestamp

In [45]:
import pytz

# Current wall-clock time in the Europe/Amsterdam zone, formatted as
# YYYYMMDD-HHMMSS for use in the output file name.
amsterdam_tz = pytz.timezone('Europe/Amsterdam')
now = datetime.now(tz=amsterdam_tz).strftime("%Y%m%d-%H%M%S")
now

'20250928-214244'

In [46]:
# Timestamped CSV path inside the processed-data folder.
output = processed.joinpath(f"whatsapp-{now}.csv")
output

PosixPath('../data/processed/whatsapp-20250928-214244.csv')

Save as parquet and as csv

In [47]:
# Write the CSV first, then the same frame as Parquet next to it, using the
# same file name with a ".parq" suffix. The index is left out of both files.
df.to_csv(output, index=False)
parquet_output = output.with_suffix(".parq")
df.to_parquet(parquet_output, index=False)