In [None]:
# Enable IPython's autoreload extension in mode 2: imported modules are
# reloaded before each cell runs, so edits to the project package are picked
# up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
from pathlib import Path
import pandas as pd
from archaeo_super_prompt.dataset.normalization.intervention_date import utils, transforms

In [None]:
# Load the raw interventions export; the file is a semicolon-separated CSV.
interventions_csv = Path("~/Documents/Mappa/ressources/interventi_thomas.csv")
df = pd.read_csv(interventions_csv, sep=";")

In [None]:
# Keep only the identifier and the date-related columns, cast the year to a
# nullable integer, and drop every row in which any date-related field is
# missing. The cell displays the number of remaining rows.
date_columns = ["data_protocollo", "data_intervento", "anno"]
gettable_data = (
    df[["idscheda", *date_columns]]
    .astype({"anno": pd.Int32Dtype()})
    .dropna(subset=date_columns)
)
len(gettable_data)

In [None]:
# Transforms handed to `utils.pipe` together with the filtered frame
# (presumably applied in the listed order -- confirm against `utils.pipe`).
date_transforms = (
    transforms.generic_period,
    transforms.generic_single_period,
    transforms.precised_numeric_start_date,
    transforms.before_day_month,
)
normalized = utils.pipe(gettable_data, date_transforms)

# A row counts as processed when its `norm_date` cell is not None.
is_date_processed = normalized["norm_date"].apply(
    lambda norm_date: norm_date is not None
)

# Report the share of processed rows as a percentage.
coverage = sum(is_date_processed) / len(is_date_processed)
print("covered_dates:", coverage * 100, "%")

# Display the rows that are still unprocessed.
normalized[~is_date_processed]

In [None]:
# Print each distinct raw `data_intervento` value whose `norm_date` is still
# None, so that the remaining unhandled date patterns can be inspected.
# The values are iterated directly: the former `enumerate` index was unused.
unprocessed_dates = normalized[~is_date_processed]["data_intervento"].unique()
for unprocessed_date in unprocessed_dates:
    print(unprocessed_date)

### Pre-normalized date to digital datetime

After this normalization, the column `norm_date` is a tuple with the following elements:
- a date string, or the `<UNKNOWN>` string when we cannot give a starting date to the intervention
- a date string for the most recent date before which the intervention could have started
- a precision string whose value is one of `day`, `month` or `year`

The date string has this format for now: `d/m/y`, where `d` is a one- or two-digit integer for the day, `m` is a string for the month (to be normalized into an integer between 1 and 12) and `y` is a 4-digit integer for the year.

**The aim of this section is to convert the date strings into a processable datetime object.**

In [None]:
from datetime import date as ddate
from archaeo_super_prompt.dataset.normalization.intervention_date import month_normalization
from logging import warning

# Not referenced by the cells visible here; kept in case later cells use it.
bad_words = set()


def to_datetime(date):
    """Convert a pre-normalized ``d/m/y`` date string into a ``datetime.date``.

    Args:
        date: either the ``"<UNKNOWN>"`` sentinel or a string of three
            ``/``-separated fields: day, month and year. The month field is
            resolved by ``month_normalization.to_int_month``.

    Returns:
        ``None`` for the ``"<UNKNOWN>"`` sentinel, otherwise the matching
        ``datetime.date``.

    Raises:
        ValueError: if the string does not split into exactly three fields,
            if the day or year is not an integer, or if the resulting date is
            invalid. Errors from the month lookup propagate unchanged.
    """
    if date != "<UNKNOWN>":
        day_text, month_text, year_text = date.split("/")
        # Resolve the month first, before the numeric fields are parsed.
        month_number = month_normalization.to_int_month(month_text)
        return ddate(int(year_text), month_number, int(day_text))
    return None

# Expand the `norm_date` objects of the processed rows into three columns:
# `start_date` and `end_date` as pandas datetimes (`to_datetime` returns None
# for the "<UNKNOWN>" sentinel) and `precision` copied as-is.
# The resulting frame is displayed and not stored.
processed_rows = normalized[is_date_processed]
norm_dates = processed_rows["norm_date"]
processed_rows.assign(
    start_date=pd.to_datetime(
        norm_dates.apply(lambda norm_date: to_datetime(norm_date.start_date))
    ),
    end_date=pd.to_datetime(
        norm_dates.apply(lambda norm_date: to_datetime(norm_date.end_date))
    ),
    precision=norm_dates.apply(lambda norm_date: norm_date.precision),
)