In [None]:

import pandas as pd
from sqlalchemy import create_engine, text

# Build the SQLAlchemy engine. `connection_string` is not defined in this cell and
# must already exist in the kernel; the driver is asked for an SSL connection.
engine = create_engine(connection_string, connect_args={'sslmode':'require'})

# One row per (carpool row `cc`, counterpart row `cc2`):
#   * `cc` is joined to its identity (`ci`) and to the 2022 perimeters of its start
#     (`gps`) and end (`gpe`) geo codes;
#   * the incentive of policy 459 is LEFT JOINed; the CASE expression yields 0 when
#     the result is negative or when there is no matching incentive row;
#   * `cc2` is the row sharing the same operator_journey_id with the opposite
#     `is_driver` flag, and `ci2` is its identity (exposed as `other_*` columns).
# The time window is [start_date, end_date), both read as Europe/Paris local time.
# The AOM filter on start/end perimeter is only appended when `aom_insee` is truthy.
# `start_date`, `end_date` and `aom_insee` are not defined in this cell.
# NOTE(review): these values are interpolated straight into the SQL text; if they can
# ever come from outside the notebook, switch to bound parameters.
query = f"""SELECT cc._id, cc.is_driver, ci.phone_trunc, cc.datetime, cc.duration, cc.operator_id, 
ST_AsText(cc.start_position) as start_wkt, ST_AsText(cc.end_position) as end_wkt, 
cc.operator_journey_id,
cc.distance,
ci.operator_user_id,
cc.end_position, gmap_url(cc.start_position, cc.end_position),
CASE WHEN pi.result >= 0 THEN pi.result ELSE 0 END as incentive,
cc.operator_trip_id,
 
cc2.is_driver as other_is_driver,
ci2.phone_trunc as other_phone_trunc
FROM CARPOOL.CARPOOLS cc
   join carpool.identities ci on cc.identity_id = ci._id
   join geo.perimeters gps on cc.start_geo_code = gps.arr and gps.year = 2022
   join geo.perimeters gpe on cc.end_geo_code = gpe.arr and gpe.year = 2022
   LEFT JOIN policy.incentives pi on pi.carpool_id = cc._id and pi.policy_id = 459

   JOIN CARPOOL.CARPOOLS AS CC2 ON CC.OPERATOR_JOURNEY_ID = CC2.OPERATOR_JOURNEY_ID and CC.is_driver != cc2.is_driver
   JOIN CARPOOL.IDENTITIES AS CI2 on CC2.IDENTITY_ID = CI2._id
WHERE CC.DATETIME >= '{start_date}'::timestamp AT TIME ZONE 'EUROPE/PARIS'
	AND CC.DATETIME < '{end_date}'::timestamp AT TIME ZONE 'EUROPE/PARIS'
    {f"and (gps.aom = '{aom_insee}' or gpe.aom = '{aom_insee}') and gps.year = 2022 and gpe.year = 2022" if aom_insee else ""}
"""

# Run the query on a short-lived connection and load the result into a DataFrame.
with engine.connect() as conn:
    df_carpool = pd.read_sql_query(text(query), conn)

In [None]:
# Keep only the rows of identities (phone_trunc) that appear under more than one operator.
grouped_tmp = df_carpool.groupby(['phone_trunc'])
phone_trunc_grouped_filtered = grouped_tmp.filter(
    lambda identity_rows: identity_rows['operator_id'].nunique(dropna=False) > 1
)

In [None]:
def has_overalp(row1, row2):
  """Return how many seconds the time windows of two carpool rows overlap.

  Despite its name, this returns a duration, not a boolean.

  Parameters
  ----------
  row1, row2 : mapping-like (e.g. a DataFrame row)
      Each must expose ``'datetime'`` (window start; anything accepted by
      ``pd.to_datetime``) and ``'duration'`` (window length in seconds).

  Returns
  -------
  int or float
      Overlap in seconds, or ``0`` when the two windows do not intersect.
  """
  def _time_window(row):
    # Both rows go through the same normalisation, so string or otherwise
    # unparsed datetimes work on either side (previously only row1 was parsed).
    start = pd.to_datetime(row['datetime'])
    return start, start + pd.to_timedelta(row['duration'], unit='s')

  start1, end1 = _time_window(row1)
  start2, end2 = _time_window(row2)

  # The intersection of [start1, end1] and [start2, end2] is negative when disjoint.
  overlap_seconds = (min(end1, end2) - max(start1, start2)).total_seconds()
  return max(0, overlap_seconds)

def group_by_overlap(df):
  """Assign an ``overlap_group`` id to each row of ``df`` based on time overlap.

  Rows whose ``overlap_group`` equals the sentinel ``100`` on entry are treated as
  not yet grouped. Rows are visited in order; each still-ungrouped row ``i``
  becomes a seed, takes its own position ``i`` as group id, and every later row
  ``j`` overlapping it by more than 20 seconds (per ``has_overalp``) is given
  group ``i`` and the measured ``overlap_duration``. A row overlapping several
  seeds keeps the id of the last seed that matched it.

  Parameters
  ----------
  df : pandas.DataFrame
      Must contain ``overlap_group`` and ``overlap_duration`` plus the columns
      read by ``has_overalp``. It is modified in place.

  Returns
  -------
  pandas.DataFrame
      The same ``df`` object, with the two columns filled in.
  """
  unassigned_marker = 100
  min_overlap_seconds = 20
  row_count = len(df)
  labels = df.index

  # Track "still ungrouped" in a local mask instead of re-reading the sentinel
  # from the column: group ids are row positions, so in a frame with more than
  # 100 rows the id 100 would otherwise be mistaken for "unassigned".
  pending = (df['overlap_group'] == unassigned_marker).tolist()

  for i in range(row_count):
    if not pending[i]:
      continue
    # Row i is not modified by the inner loop, so fetch it once.
    seed_row = df.iloc[i]
    for j in range(i + 1, row_count):
      overlap_duration = has_overalp(seed_row, df.iloc[j])
      if overlap_duration > min_overlap_seconds:
        df.loc[labels[j], 'overlap_group'] = i
        df.loc[labels[j], 'overlap_duration'] = overlap_duration
        pending[j] = False
    df.loc[labels[i], 'overlap_group'] = i
    pending[i] = False
  return df

# Initialise the bookkeeping columns expected by group_by_overlap:
#   * overlap_group = 100 is the "not grouped yet" sentinel it looks for;
#   * overlap_duration starts as a float because overlap lengths are written as
#     float seconds, which would otherwise force an implicit int -> float upcast.
df_only_grouped_with_overlap_group_filled = phone_trunc_grouped_filtered.assign(
    overlap_group=100,
    overlap_duration=0.0,
)

grouped_tmp = df_only_grouped_with_overlap_group_filled.groupby(['phone_trunc'], group_keys=False)

# group_by_overlap writes into the frame it receives, so hand it a copy of each
# group rather than the object owned by GroupBy.apply.
df_only_grouped_with_overlap_group_filled = grouped_tmp.apply(
    lambda group: group_by_overlap(group.copy())
).reset_index(drop=True)

In [None]:
# Within each (identity, overlap group) bucket, keep the rows only when the bucket
# holds more than one row and more than one distinct operator.
grouped_tmp = df_only_grouped_with_overlap_group_filled.groupby(
    ['phone_trunc', 'overlap_group'],
    group_keys=False,
)
df_more_than_one_occ = grouped_tmp.filter(
    lambda bucket: bucket['operator_id'].nunique(dropna=False) > 1
    and bucket['overlap_group'].count() > 1
)

In [None]:
def remove_passenger_with_no_overlap(df):
    """Keep only the rows whose ``other_phone_trunc`` occurs more than once in ``df``.

    Occurrences are counted as the number of non-null ``_id`` values per
    ``other_phone_trunc``; rows with a missing ``other_phone_trunc`` are dropped.
    Returns a filtered frame that keeps the original index labels.
    """
    ids_per_counterpart = df.groupby(['other_phone_trunc'])['_id'].count()
    repeated_counterparts = ids_per_counterpart.index[ids_per_counterpart > 1]
    return df[df['other_phone_trunc'].isin(repeated_counterparts)]

# Inside every (identity, overlap group) bucket, drop the rows whose counterpart
# identity shows up only once, then renumber the rows from zero.
grouped_tmp = df_more_than_one_occ.groupby(
    ['phone_trunc', 'overlap_group'],
    group_keys=False,
)

df_more_than_one_occ_enhanced = (
    grouped_tmp
    .apply(remove_passenger_with_no_overlap)
    .reset_index(drop=True)
)

In [None]:
def filter_carpool_with_overlap_on_same_operator(df):
    """Drop the least-overlapping carpool(s) from a group sharing one counterpart.

    When ``df`` has more than one row and all rows share a single
    ``other_phone_trunc``, the rows whose ``overlap_duration`` equals the group
    minimum are removed. Any other frame is returned untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the ``other_phone_trunc`` and ``overlap_duration`` columns.
        It is not modified.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, or a filtered frame keeping the original index labels.

    Notes
    -----
    Every row tied at the minimum is removed, so a group whose rows all share the
    same ``overlap_duration`` comes back empty.
    NOTE(review): confirm that removing ties is the intended rule.
    """
    if len(df) <= 1 or df['other_phone_trunc'].nunique(dropna=False) != 1:
        return df

    min_overlap_value = df['overlap_duration'].min()
    # Compare with the minimum (the original assigned it with `=`, which
    # overwrote the whole column and produced a scalar instead of a mask).
    return df[df['overlap_duration'] != min_overlap_value]

# Apply the same-operator filter to every
# (identity, overlap group, operator, counterpart identity) bucket.
# NOTE(review): no later cell visible in this notebook reads
# `df_without_overlap_on_same_operator` — confirm whether it should feed the next step.
grouped_tmp = df_more_than_one_occ_enhanced.groupby(
    ['phone_trunc', 'overlap_group', 'operator_id', 'other_phone_trunc']
)

df_without_overlap_on_same_operator = grouped_tmp.apply(
    filter_carpool_with_overlap_on_same_operator
)

In [None]:
# Re-check each (identity, overlap group) bucket: keep it only when it still holds
# more than one row and more than one distinct operator.
# NOTE(review): this reads `df_more_than_one_occ_enhanced`, not the frame produced by
# the same-operator filtering cell — confirm that this is intentional.
grouped_tmp = df_more_than_one_occ_enhanced.groupby(['phone_trunc', 'overlap_group'])
df_more_than_one_occ_2 = grouped_tmp.filter(
    lambda bucket: bucket['operator_id'].nunique(dropna=False) > 1
    and bucket['overlap_group'].count() > 1
)

In [None]:
# Rows where the identity under scrutiny is the driver.
driver_mask = df_more_than_one_occ_2['is_driver'] == True

grouped_tmp = df_more_than_one_occ_2[driver_mask].groupby(
    ['phone_trunc', 'overlap_group'],
    group_keys=False,
)

# A driver bucket is "authorized" when it has several rows and every row has both
# a different counterpart identity and a different operator.
df_with_authorized_multiop_driver = grouped_tmp.filter(
    lambda trips: (
        len(trips) > 1
        and trips['other_phone_trunc'].nunique(dropna=False) == len(trips)
        and trips['operator_id'].nunique(dropna=False) == len(trips)
    )
)

# Everything that is not an authorized multi-operator driver row remains flagged.
df_final_result = df_more_than_one_occ_2.loc[
    ~df_more_than_one_occ_2['_id'].isin(df_with_authorized_multiop_driver['_id'].unique())
]


In [None]:
# is_driver_mask = df_final_result.is_driver == True

# Sanity check: every (identity, counterpart identity, overlap group) triple left in
# the final result must involve more than one operator.
grouped_tmp = df_final_result.groupby(['phone_trunc', 'other_phone_trunc', 'overlap_group'])

control_matrix = grouped_tmp.agg(
    unique_operator_count=('operator_id', 'nunique'),
    journey_id_list=('operator_journey_id', list),
).reset_index()

assert control_matrix['unique_operator_count'].gt(1).all()

In [None]:
# Mark every carpool retained in df_final_result as failing the fraud check.
# * The status value is a quoted SQL string literal; left unquoted it is parsed as
#   a column reference.
#   NOTE(review): assumes 'fraudcheck_error' is a valid value for
#   carpool.carpools.status — confirm against the schema.
# * The ids travel as a bound parameter (a Python list, adapted to a PostgreSQL
#   array by the driver) instead of being formatted into the SQL text.
# * The statement is wrapped in text(): SQLAlchemy 2.x does not execute raw strings.
update_sql_query = (
    "UPDATE carpool.carpools SET status = 'fraudcheck_error' "
    "WHERE _id = ANY(:carpool_ids)"
)

# One carpool can appear on several rows (one per counterpart), so de-duplicate.
flagged_carpool_ids = df_final_result['_id'].drop_duplicates().to_list()

# Nothing to update when the analysis flagged no carpool.
if flagged_carpool_ids:
    with engine.begin() as conn:
        conn.execute(text(update_sql_query), {"carpool_ids": flagged_carpool_ids})


In [None]:
# Build the label rows: one per flagged carpool id, exposed as `carpool_id`.
# NOTE(review): the second assign() writes `aom_insee` into the same `label` column,
# replacing 'interoperator_fraud'. This looks unintended (a separate column was
# probably meant) — confirm against the fraudcheck.labels schema before changing it.
df_labels = (
    df_final_result[['_id']]
    .rename(columns={'_id': 'carpool_id'})
    .assign(label='interoperator_fraud')
    .assign(label=aom_insee)
)

In [None]:
from sqlalchemy.dialects.postgresql import insert

def insert_or_do_nothing_on_conflict(table, conn, keys, data_iter):
    """Insertion callable for ``DataFrame.to_sql(method=...)`` that skips duplicates.

    Builds a single PostgreSQL ``INSERT ... ON CONFLICT (carpool_id, label)
    DO NOTHING`` statement for the batch of rows supplied by pandas.

    Parameters
    ----------
    table : pandas.io.sql.SQLTable
        Wrapper whose ``.table`` attribute is the SQLAlchemy table to insert into.
    conn : sqlalchemy connection
        Connection on which the statement is executed.
    keys : list of str
        Column names, in the same order as the values of each row.
    data_iter : iterable
        Iterable of row value sequences.

    Returns
    -------
    int
        Row count reported by the driver for the statement, so that recent pandas
        versions can report how many rows were written.
    """
    # Map values by column name rather than relying on positional column order.
    rows = [dict(zip(keys, row)) for row in data_iter]
    insert_stmt = insert(table.table).values(rows)
    on_duplicate_key_stmt = insert_stmt.on_conflict_do_nothing(index_elements=['carpool_id', 'label'])
    result = conn.execute(on_duplicate_key_stmt)
    return result.rowcount

# Append the label rows to fraudcheck.labels; the custom insertion method is
# responsible for ignoring rows that conflict with existing ones.
df_labels.to_sql(
    "labels",
    engine,
    schema="fraudcheck",
    if_exists="append",
    index=False,
    method=insert_or_do_nothing_on_conflict,
)