# Parameters
- `connection_string` : 'postgresql://postgres:postgres@localhost:5432/local'   -> PostgreSQL connection URL (env `PG_CONNECTION_STRING`)
- `delay` :                48                                                   -> end_date is 48 hours before now (env `DELAY`)
- `frame`:                 6                                                    -> start_date is 48 + 6 hours before now (env `FRAME`)
- `update_carpool_status`: 'true'                                               -> Whether the carpools' status should be updated (env `UPDATE_CARPOOL_STATUS`)

In [None]:
import datetime
import os
from datetime import timedelta
from datetime import datetime

import pandas as pd
from sqlalchemy import create_engine, text
import sqlalchemy as sa
from sqlalchemy.dialects.postgresql import insert

# Input params checks
# All four settings are read from required environment variables; a missing
# variable raises KeyError.
# The flag is compared case-insensitively so that "true", "True" and "TRUE"
# all enable the status update; any other value disables it.
update_carpool_status = os.environ['UPDATE_CARPOOL_STATUS'].strip().lower() == "true"
connection_string = os.environ['PG_CONNECTION_STRING']
delay = os.environ['DELAY']
frame = os.environ['FRAME']

# Window reported in the log line below. It mirrors the filter applied on
# START_DATETIME in the SQL query: [now - delay - frame, now - delay).
# A single clock reading is used so both bounds share the same reference.
# int() raises ValueError when DELAY or FRAME is not a whole number of hours.
# NOTE(review): datetime.now() is the local, naive clock of this process while
# the query uses the database NOW() -- the printed bounds are indicative only.
reference_time = datetime.now()
start_date_frame = reference_time - timedelta(hours=int(delay)) - timedelta(hours=int(frame))
end_date_frame = reference_time - timedelta(hours=int(delay))
print(f"processing carpools between {start_date_frame} and {end_date_frame} ")

In [None]:
# Database engine; TLS is requested through the driver's `sslmode` argument.
engine = create_engine(connection_string, connect_args={'sslmode':'require'})

# Every carpool in the time window is emitted twice by the UNION ALL:
#   * once from the driver's point of view    (IS_DRIVER = TRUE,  OTHER_* = passenger)
#   * once from the passenger's point of view (IS_DRIVER = FALSE, OTHER_* = driver)
# PHONE_TRUNC is the first 10 characters of the full phone number when it is
# present, otherwise the already truncated phone column.
# The window bounds are passed as bound parameters (:delay_hours, :frame_hours)
# instead of being spliced into the SQL text.
# NOTE(review): OPERATOR_ID 11 is excluded; the reason is not visible here.
query = """(
  SELECT
    CC._ID,
    CASE
      WHEN DRIVER_PHONE IS NOT NULL THEN SUBSTR(DRIVER_PHONE, 1, 10)
      ELSE CC.DRIVER_PHONE_TRUNC
    END AS PHONE_TRUNC,
    CC.DRIVER_IDENTITY_KEY AS IDENTITY_KEY,
    CC.DRIVER_OPERATOR_USER_ID AS OPERATOR_USER_ID,
    CC.START_DATETIME,
    EXTRACT(
      EPOCH
      FROM
        (CC.END_DATETIME - CC.START_DATETIME)
    )::INT AS DURATION,
    CC.END_DATETIME,
    CC.OPERATOR_ID,
    TRUE AS IS_DRIVER,
    CASE
      WHEN PASSENGER_PHONE IS NOT NULL THEN SUBSTR(PASSENGER_PHONE, 1, 10)
      ELSE CC.PASSENGER_PHONE_TRUNC
    END AS OTHER_PHONE_TRUNC,
    CC.PASSENGER_IDENTITY_KEY AS OTHER_IDENTITY_KEY,
    CC.PASSENGER_OPERATOR_USER_ID AS OTHER_OPERATOR_USER_ID
  FROM
    CARPOOL_V2.CARPOOLS CC
  WHERE
    CC.START_DATETIME >= NOW() - :delay_hours * INTERVAL '1 hour' - :frame_hours * INTERVAL '1 hour'
    AND CC.START_DATETIME < NOW() - :delay_hours * INTERVAL '1 hour'
    AND CC.OPERATOR_ID != 11
)
UNION ALL
(
  SELECT
    CC._ID,
    CASE
      WHEN PASSENGER_PHONE IS NOT NULL THEN SUBSTR(PASSENGER_PHONE, 1, 10)
      ELSE CC.PASSENGER_PHONE_TRUNC
    END AS PHONE_TRUNC,
    CC.PASSENGER_IDENTITY_KEY AS IDENTITY_KEY,
    CC.PASSENGER_OPERATOR_USER_ID AS OPERATOR_USER_ID,
    CC.START_DATETIME,
    EXTRACT(
      EPOCH
      FROM
        (CC.END_DATETIME - CC.START_DATETIME)
    )::INT AS DURATION,
    CC.END_DATETIME,
    CC.OPERATOR_ID,
    FALSE::BOOLEAN AS IS_DRIVER,
    CASE
      WHEN DRIVER_PHONE IS NOT NULL THEN SUBSTR(DRIVER_PHONE, 1, 10)
      ELSE CC.DRIVER_PHONE_TRUNC
    END AS OTHER_PHONE_TRUNC,
    CC.DRIVER_IDENTITY_KEY AS OTHER_IDENTITY_KEY,
    CC.DRIVER_OPERATOR_USER_ID AS OTHER_OPERATOR_USER_ID
  FROM
    CARPOOL_V2.CARPOOLS CC
  WHERE
    CC.START_DATETIME >= NOW() - :delay_hours * INTERVAL '1 hour' - :frame_hours * INTERVAL '1 hour'
    AND CC.START_DATETIME < NOW() - :delay_hours * INTERVAL '1 hour'
    AND CC.OPERATOR_ID != 11
)
"""

# int() rejects anything that is not a whole number of hours (ValueError)
# before the statement reaches the database.
carpool_stmt = text(query).bindparams(
    delay_hours=int(delay),
    frame_hours=int(frame),
)

with engine.connect() as conn:
    df_carpool = pd.read_sql_query(carpool_stmt, conn)

# Etape 1 

Suppression des trajets dont l'`identity_key` n'apparaît pas chez plusieurs opérateurs différents.
Permet de faire un tri simple sur le multi-opérateur avant d'ajouter les clés de regroupement.

In [None]:
# Keep only the (identity_key, other_identity_key) pairs whose trips carry more
# than one distinct operator_id, then order the remaining rows by identity_key
# and start_datetime.
def _seen_on_several_operators(pair_trips):
    return len(pd.unique(pair_trips['operator_id'])) > 1


grouped_idkey_tmp = df_carpool.groupby(['identity_key', 'other_identity_key'])
multi_operator_trips = grouped_idkey_tmp.filter(_seen_on_several_operators)
df_multi_op_sorted = multi_operator_trips.sort_values(by=['identity_key', 'start_datetime'])

# Etape 2

Ajout d'une colonne `time_to_previous_end` (écart entre le début d'un trajet et la fin du trajet précédent) permettant d'identifier les chevauchements temporels des trajets pour une `identity_key`

In [None]:
# Placeholder column, filled per group afterwards.
pair_keys = ['identity_key', 'other_identity_key']
df_multi_op_sorted['time_to_previous_end'] = None

# One group per (identity_key, other_identity_key) pair; group_keys=False keeps
# the group labels out of the index of the applied result.
grouped_tmp = df_multi_op_sorted.groupby(pair_keys, group_keys=False)

def stard_end_diff(df):
    """Fill `time_to_previous_end` with the gap to the preceding trip.

    Rows are taken in their current order. For every row except the first,
    the column receives the absolute difference between the `end_datetime`
    of the row just before it and its own `start_datetime`. The first row
    has no predecessor and ends up with a missing value.

    Args:
        df: DataFrame with `start_datetime` and `end_datetime` columns.

    Returns:
        The same DataFrame object, with `time_to_previous_end` set in place.
        A frame with fewer than two rows is returned untouched.
    """
    # Nothing to compare against with zero or one trip.
    if len(df) < 2:
        return df

    # shift() aligns each row with the end of the row right before it, which
    # is the only value a pairwise scan over all earlier rows would retain.
    previous_end = df['end_datetime'].shift(1)
    df['time_to_previous_end'] = (previous_end - df['start_datetime']).abs()
    return df

# Compute the gaps pair by pair, then renumber the rows from 0.
# NOTE(review): include_groups=True is deprecated in recent pandas releases --
# confirm against the pandas version pinned for this notebook.
df_with_diffs = (
    grouped_tmp
    .apply(stard_end_diff, include_groups=True)
    .reset_index(drop=True)
)

In [None]:
# Narrow view of the computed gaps, restricted to the columns worth inspecting.
overlap_columns = [
    'operator_id',
    'identity_key',
    'other_identity_key',
    'start_datetime',
    'end_datetime',
    'time_to_previous_end',
]
df_with_overlap_selected = df_with_diffs[overlap_columns]

In [None]:
# Normalise the gap column to a timedelta dtype so it can be compared with
# timedelta thresholds.
raw_gaps = df_with_diffs['time_to_previous_end']
df_with_diffs['time_to_previous_end'] = pd.to_timedelta(raw_gaps)

In [None]:
# Trips that start within 30 minutes of the end of the previous trip of the
# same pair.
# NOTE(review): a missing gap never satisfies `<=`, so rows without a previous
# trip are dropped here despite the `_with_na` suffix -- confirm this is intended.
max_gap = timedelta(minutes=30)
is_close_to_previous = df_with_diffs['time_to_previous_end'] <= max_gap
df_too_close_trip_with_na = df_with_diffs.loc[is_close_to_previous]

# Keep only the pairs that still have more than one row after the filter above.
def _has_several_rows(pair_trips):
    return len(pair_trips) > 1


close_trips_by_pair = df_too_close_trip_with_na.groupby(['identity_key', 'other_identity_key'])
df_too_close_trip = close_trips_by_pair.filter(_has_several_rows)

In [None]:
# Same narrow column view, this time over the trips flagged as too close.
too_close_columns = [
    'operator_id',
    'identity_key',
    'other_identity_key',
    'start_datetime',
    'end_datetime',
    'time_to_previous_end',
]
df_too_close_trip_selected = df_too_close_trip[too_close_columns]

In [None]:
# Retain only the rows whose gap to the previous trip is strictly positive.
has_positive_gap = df_too_close_trip['time_to_previous_end'] > timedelta(minutes=0)
df_final_result = df_too_close_trip[has_positive_gap]

Mise à jour des carpools retenus en statut `fraudcheck_error` (colonne `fraud_status` à `failed`)

In [None]:

# Update the carpool_v2 schema with the failed status (i.e. fraud detected)
if update_carpool_status is True:
    # The same carpool id may appear on several rows of the result, so the
    # ids are de-duplicated before being sent to the database.
    flagged_carpool_ids = df_final_result['_id'].drop_duplicates().to_list()

    if not flagged_carpool_ids:
        # Nothing was flagged: skip the reflection and the UPDATE round trip.
        print("0 carpools status updated to fraud_status=failed")
    else:
        metadata = sa.MetaData(schema='carpool_v2')
        # Reflect only the table that gets updated instead of the whole schema.
        metadata.reflect(bind=engine, only=['status'])

        table = metadata.tables['carpool_v2.status']

        where_clause = table.c.carpool_id.in_(flagged_carpool_ids)

        update_stmt = sa.update(table).where(where_clause).values(fraud_status='failed')

        # engine.begin() commits when the block exits normally and rolls back
        # if it raises, so the message is printed only after a successful commit.
        with engine.begin() as conn:
            result = conn.execute(update_stmt)
            updated_rows = result.rowcount
        print(f"{updated_rows} carpools status updated to fraud_status=failed")

Ajout des labels dans une table.
C'est cette table qui est utilisée pour renvoyer l'information du type de fraude aux opérateurs

In [None]:
# One row per retained result row: the carpool id plus the fraud label
# stored for it.
df_labels = (
    df_final_result[['_id']]
    .rename(columns={'_id': 'carpool_id'})
    .assign(label='interoperator_too_close_trips')
)

In [None]:
def insert_or_do_nothing_on_conflict(table, conn, keys, data_iter):
    """Insert rows, skipping those that hit the (carpool_id, label) conflict target.

    Written to be passed as the `method` callable of `DataFrame.to_sql`.

    Args:
        table: object exposing the target SQLAlchemy table as `table.table`.
        conn: connection used to execute the statement.
        keys: column names, in the same order as the values of each row.
        data_iter: iterable yielding one sequence of values per row.

    Returns:
        The row count reported by the database for the INSERT, or 0 when
        there is nothing to insert.
    """
    # Pair every value with its column name so the statement does not depend
    # on the positional order of the table's columns.
    rows = [dict(zip(keys, row)) for row in data_iter]
    if not rows:
        return 0

    insert_stmt = insert(table.table).values(rows)
    on_duplicate_key_stmt = insert_stmt.on_conflict_do_nothing(index_elements=['carpool_id', 'label'])
    result = conn.execute(on_duplicate_key_stmt)
    return result.rowcount

# Append the labels to fraudcheck.labels. The DataFrame index is not written,
# and rows that conflict on (carpool_id, label) are skipped by the custom
# insertion method.
df_labels.to_sql(
    "labels",
    con=engine,
    schema="fraudcheck",
    if_exists="append",
    index=False,
    method=insert_or_do_nothing_on_conflict,
)