Time series data

In [1]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from pathlib import Path
from loguru import logger
import warnings

# Install a global filter that ignores every warning of category FutureWarning
# from this point on (presumably to keep library deprecation notices out of the
# cell outputs — confirm none of them needs acting on).
warnings.filterwarnings("ignore", category=FutureWarning)

In [2]:
import tomllib

# Parse the TOML configuration found at ../config.toml (relative to the current
# working directory). tomllib only accepts binary file objects, hence "rb".
configfile = Path("../config.toml").resolve()
with open(configfile, "rb") as toml_stream:
    config = tomllib.load(toml_stream)

# Trailing expression: show the parsed configuration as the cell output.
config

{'directories': {'raw': 'data/raw',
  'processed': 'data/processed',
  'input': '_chat.txt',
  'current': 'whatsapp-20251013-212815.parq',
  'preprocess_csv': 'whatsapp-20251018-161422-preprocess.csv',
  'preprocess_parq': 'whatsapp-20251018-161422-preprocess.parq',
  'cleaned_csv': 'whatsapp-20251018-161422-cleaned.csv',
  'cleaned_parq': 'whatsapp-20251018-161422-cleaned.parq',
  'feature_engineered_csv': 'whatsapp-20251018-161422-feature.csv',
  'feature_engineered_parq': 'whatsapp-20251018-161422-feature.parq',
  'categories_plot_png': 'categories_plot.png',
  'distribution_plot_png': 'distribution_plot.png',
  'correlation_plot_png': 'correlation_plot.png',
  'datetime_format': '%d-%m-%Y %H:%M',
  'drop_authors': []}}

In [3]:
# Build the path to the feature-engineered parquet file from the
# [directories] table of the loaded configuration.
directories_cfg = config["directories"]

root = Path("..").resolve()
processed = root / directories_cfg["processed"]
datafile = processed / directories_cfg["feature_engineered_parq"]

# A missing file is only reported here, not raised.
file_is_missing = not datafile.exists()
if file_is_missing:
    logger.warning(
        f"{datafile} does not exist. First run src/preprocess.py, and check the timestamp!"
    )

In [4]:
# Read the parquet file located at `datafile` into a DataFrame.
df = pd.read_parquet(path=datafile)

# Trailing expression: list each column with its dtype as the cell output.
df.dtypes

timestamp                datetime64[ns, UTC]
message                               object
living_in_city                         int64
tech_background                        int64
author                                object
year                                   int32
month                                  int32
week                                  UInt32
day                                    int32
hour                                   int32
minute                                 int32
day_of_week                           object
is_weekend                             int64
word_count                             int64
react_time_sec                       float64
react_time_sec_plus_1                float64
react_time_sec_log                   float64
react_time_min                       float64
react_time_min_plus_1                float64
react_time_min_log                   float64
react_time_hr                        float64
react_time_hr_plus_1                 float64
react_time

Number of texts over time

In [10]:
# Long format: one row per (year, author) pair with the number of rows in `df`
# for that pair, stored in the `text_count` column.
texts_per_author_per_year = (
    df.groupby(['year', 'author'])
    .size()
    .reset_index(name='text_count')
)

# Wide format: authors as rows, years as columns.
# unstack(fill_value=0) fills author/year combinations that never occur with 0
# while keeping the counts as integers; pivot(...).fillna(0) would first insert
# NaN and thereby turn every affected column into floats (e.g. 222.0).
presentation_table = (
    texts_per_author_per_year
    .set_index(['author', 'year'])['text_count']
    .unstack(fill_value=0)
)

# Trailing expression instead of print(): lets the notebook use its rich
# DataFrame rendering.
presentation_table

year                  2022  2023  2024  2025
author                                      
brisk-sheep            222   626   585   194
crystalline-uakari     135   367   409   239
effervescent-penguin   121   413   439   222
eye-catching-wolf      107   369   248   113
hypnotic-rabbit         81   173   182    65
nutty-chough           149   517   200   196
riotous-dingo          145   490   388   159
spattered-duck          49   545   445   262
translucent-dog        192   455   431   203
