# APDF Checker

In [None]:
# TODO get campaign data from the API

from datetime import datetime, timedelta

# Hard-coded description of the campaign whose APDF exports are checked
# (stand-in until the data is fetched from the API).
campaign = {
    "_id": 1017,
    "name": "PDLL 2024",
    "start_date": "2024-01-01",
    "end_date": "2026-01-01",
    # NOTE(review): unit of max_amount is not stated here — confirm.
    "max_amount": 4_400_000,
    # Operator id (as a string) -> operator name.
    "operators": {
        "3": "Klaxit",
        "4": "Karos",
        "9": "BlablacarDaily",
        "14": "Mobicoop",
    },
    # presumably [lower, upper] bounds of each slice, the last one being
    # open-ended — TODO confirm the unit and the inclusiveness of the bounds
    "slices": [
        [5_000, 17_000],
        [17_000, 30_000],
        [30_000],
    ],
}

# Month under review, formatted as "YYYY-MM"; both parts stay strings here.
period = "2024-01"
year, month = period.split("-")

def last_day_of_month(year, month):
    """Return the day number (28-31) of the last day of the given month.

    Args:
        year: Calendar year as an integer.
        month: Calendar month as an integer, 1 (January) to 12 (December).

    Returns:
        The day of month of the last day of ``year``-``month``.
    """
    # December is special-cased because month + 1 would not be a valid month.
    if month == 12:
        return 31

    # Step back one day from the first day of the following month.
    first_of_next_month = datetime(year, month + 1, 1)
    return (first_of_next_month - timedelta(days=1)).day

# Day number of the last day of the checked period; `year` and `month` are the
# string halves of `period`, hence the int conversion.
last_day = last_day_of_month(year=int(year), month=int(month))


In [None]:
import glob
import os

import pandas as pd

# Folder that holds the APDF XLSX exports to check.
folder_path = 'sources/'

# Only the exports whose name starts with the checked period are considered.
file_pattern = f'APDF-{period}*.xlsx'

# Sorted so that the processing order (and the log output) is deterministic.
matching_files = sorted(glob.glob(folder_path + file_pattern))

# read_excel stops after this many data rows; a sheet reaching the limit is
# reported below because its sums and counts may then be incomplete.
max_rows = 100000

# Names of the columns of the trips sheet, in sheet order (A, B, C, ...).
colnames = [
    "journey_id", "start_datetime", "end_datetime", "rpc_incentive",
    "start_location", "start_insee", "end_location", "end_insee",
    "duration", "distance", "operator", "operator_class", "trip_id",
    "operator_trip_id", "driver_uuid", "operator_driver_id",
    "passenger_uuid", "operator_passenger_id", "incentive_type",
    "start_epci_name", "start_epci", "end_epci_name", "end_epci",
]

# One row per operator of the campaign, one column per computed indicator.
# All indicator columns are declared up front so the frame has the same
# columns whether or not a file was found.
stats = pd.DataFrame(columns=[
    "start_first",
    "start_last",
    "sum_all",
    "sum_sub",
    "count_all",
    "count_sub",
    "count_unsub",
    "first_on_first_day",
    "last_on_last_day",
], index=campaign["operators"].values())

for file in matching_files:
    print(f"Loading {file}...")

    # The file name carries 8 dash-separated fields after the 'APDF-' prefix.
    # basename/splitext are used instead of string replacement because glob
    # returns paths with the separator of the running OS; maxsplit keeps a
    # trailing name that itself contains dashes in one piece.
    stem = os.path.splitext(os.path.basename(file))[0]
    [f_year, f_month, f_campaign_id, f_operator, f_trips_sub, f_trips_unsub, f_money, f_name] = stem[len('APDF-'):].split('-', 7)
    f_year = int(f_year)
    f_month = int(f_month)
    f_campaign_id = int(f_campaign_id)
    f_operator = int(f_operator)
    f_trips_sub = int(f_trips_sub)
    f_trips_unsub = int(f_trips_unsub)
    f_money = int(f_money)

    # Raises KeyError when the file belongs to an operator that is not part
    # of the campaign definition.
    operator = campaign["operators"][str(f_operator)]

    # Read the second sheet, keeping only columns B, D and J
    # (start_datetime, rpc_incentive and distance in `colnames`).
    df = pd.read_excel(file, sheet_name=1, names=colnames, usecols="B,D,J", skiprows=1, nrows=max_rows, engine='openpyxl', dtype={'start_datetime': 'str', 'rpc_incentive': 'float', 'distance': 'int'})

    if len(df) >= max_rows:
        print(f"Warning: {file} reached the {max_rows} rows limit, its stats may be incomplete")

    # date and time
    first_start = df["start_datetime"].min()
    last_start = df["start_datetime"].max()
    stats.loc[operator, "start_first"] = first_start
    stats.loc[operator, "start_last"] = last_start
    # Characters 8-9 of the datetime string hold the day of month
    # (assumes a "YYYY-MM-DD..." layout — TODO confirm). str() keeps the
    # comparison safe when the sheet is empty and min()/max() return NaN.
    stats.loc[operator, "first_on_first_day"] = str(first_start)[8:10] == "01"
    stats.loc[operator, "last_on_last_day"] = str(last_start)[8:10] == str(last_day)

    # trips
    incentives = df["rpc_incentive"]
    subsidized = incentives[incentives > 0]
    stats.loc[operator, "sum_all"] = incentives.sum()
    stats.loc[operator, "sum_sub"] = subsidized.sum()
    stats.loc[operator, "count_all"] = incentives.count()
    stats.loc[operator, "count_sub"] = subsidized.count()
    stats.loc[operator, "count_unsub"] = incentives[incentives == 0].count()

    # TODO handle slices

stats