# Parameters
- `connection_string` : PostgreSQL connection URL, read from the `PG_CONNECTION_STRING` environment variable (e.g. 'postgresql://postgres:postgres@localhost:5432/local')

In [None]:
import os

import pandas as pd
from sqlalchemy import create_engine, text

# Input CSV; later cells read its `_id` and `suspicious_carpool_id` columns.
df_carpools = pd.read_csv(filepath_or_buffer='impacted_carpools.csv')
# Database URL comes from the environment; a missing variable raises KeyError
# here instead of failing later when the engine is created.
connection_string = os.environ['PG_CONNECTION_STRING']

In [None]:
# Collect every id found in either CSV column, once each.
# A single missing value turns the concatenated series into float64, which
# would render ids as `123.0` and the gap itself as `nan` inside the SQL text,
# so missing values are dropped and the ids are forced back to integers.
df_combined = (
    pd.concat(
        [df_carpools['_id'], df_carpools['suspicious_carpool_id']],
        ignore_index=True,
    )
    .dropna()
    .astype('int64')
    .drop_duplicates()
)
# PostgreSQL rejects an empty `IN ()`; `IN (NULL)` is valid and matches no row,
# so an empty CSV yields an empty result instead of a syntax error.
id_values = ', '.join(map(str, df_combined)) or 'NULL'

In [None]:
engine = create_engine(connection_string, connect_args={'sslmode':'require'})

# Fetch each selected carpool twice: once from the driver's point of view
# (IS_DRIVER = TRUE) and once from the passenger's (IS_DRIVER = FALSE), with the
# counterpart exposed through the OTHER_* columns.
#
# UNION ALL pairs columns by POSITION and takes the output names from the first
# branch, so both SELECT lists must enumerate their columns in the same order.
# OPERATOR_USER_ID therefore comes before OPERATOR_TRIP_ID in both branches;
# otherwise passenger rows would carry the two values swapped.
#
# NOTE(review): `id_values` is interpolated directly into the SQL text, so it
# must only ever contain numeric ids.
query = f"""(
 SELECT
    CC._ID,
    CASE
      WHEN DRIVER_PHONE IS NOT NULL THEN SUBSTR(DRIVER_PHONE, 1, 10)
      ELSE CC.DRIVER_PHONE_TRUNC
    END AS PHONE_TRUNC,
    CC.DRIVER_IDENTITY_KEY AS IDENTITY_KEY,
    CC.DRIVER_OPERATOR_USER_ID AS OPERATOR_USER_ID,
    CC.OPERATOR_TRIP_ID,
    CC.START_DATETIME,
    EXTRACT(
      EPOCH
      FROM
        (CC.END_DATETIME - CC.START_DATETIME)
    )::INT AS DURATION,
    CC.END_DATETIME,
    CC.OPERATOR_ID,
    TRUE AS IS_DRIVER,
    CASE
      WHEN PASSENGER_PHONE IS NOT NULL THEN SUBSTR(PASSENGER_PHONE, 1, 10)
      ELSE CC.PASSENGER_PHONE_TRUNC
    END AS OTHER_PHONE_TRUNC,
    CC.PASSENGER_IDENTITY_KEY AS OTHER_IDENTITY_KEY,
    CC.PASSENGER_OPERATOR_USER_ID AS OTHER_OPERATOR_USER_ID
  FROM
    CARPOOL_V2.CARPOOLS CC
  WHERE
    CC._id IN ({id_values})
)
UNION ALL
(
  SELECT
    CC._ID,
    CASE
      WHEN PASSENGER_PHONE IS NOT NULL THEN SUBSTR(PASSENGER_PHONE, 1, 10)
      ELSE CC.PASSENGER_PHONE_TRUNC
    END AS PHONE_TRUNC,
    CC.PASSENGER_IDENTITY_KEY AS IDENTITY_KEY,
    CC.PASSENGER_OPERATOR_USER_ID AS OPERATOR_USER_ID,
    CC.OPERATOR_TRIP_ID,
    CC.START_DATETIME,
    EXTRACT(
      EPOCH
      FROM
        (CC.END_DATETIME - CC.START_DATETIME)
    )::INT AS DURATION,
    CC.END_DATETIME,
    CC.OPERATOR_ID,
    FALSE::BOOLEAN AS IS_DRIVER,
    CASE
      WHEN DRIVER_PHONE IS NOT NULL THEN SUBSTR(DRIVER_PHONE, 1, 10)
      ELSE CC.DRIVER_PHONE_TRUNC
    END AS OTHER_PHONE_TRUNC,
    CC.DRIVER_IDENTITY_KEY AS OTHER_IDENTITY_KEY,
    CC.DRIVER_OPERATOR_USER_ID AS OTHER_OPERATOR_USER_ID
  FROM
    CARPOOL_V2.CARPOOLS CC
  WHERE
    CC._id IN ({id_values})
)
"""

# The connection is released as soon as the result has been loaded.
with engine.connect() as conn:
    df_carpool = pd.read_sql_query(text(query), conn)

In [None]:
# Calendar day on which each carpool starts; used below as the per-day grouping key.
# NOTE(review): the day is taken in whatever timezone `start_datetime` carries
# after loading; confirm it matches the day boundary of the rule being checked.
start_times = df_carpool['start_datetime']
df_carpool['date'] = start_times.dt.date

In [None]:
# One row per (identity, day): number of distinct trips, number of distinct
# operators, and the list of carpool ids that make up that day.
daily_groups = df_carpool.groupby(['identity_key', 'date'])
df_carpool_agg = daily_groups.agg(
    unique_operator_trip_id=pd.NamedAgg(column='operator_trip_id', aggfunc='nunique'),
    unique_operator_id=pd.NamedAgg(column='operator_id', aggfunc='nunique'),
    carpool_id_list=pd.NamedAgg(column='_id', aggfunc=list),
).reset_index()

In [None]:
# Keep the (identity, day) groups that involve more than one operator AND more
# than four distinct trips.
multi_op_mask = df_carpool_agg['unique_operator_id'].gt(1)
more_than_four_trip_mask = df_carpool_agg['unique_operator_trip_id'].gt(4)

df_carpool_target = df_carpool_agg.loc[multi_op_mask & more_than_four_trip_mask]

In [None]:
# Unfold the per-group id lists into one row per carpool id, under the column name `_id`.
exploded_targets = df_carpool_target.explode('carpool_id_list')
df_carpool_target = exploded_targets.rename(columns={'carpool_id_list': '_id'})

In [None]:
# Re-attach the per-carpool details to every targeted (carpool, identity) pair.
# Both frames carry a `date` column that is not a join key, so the result
# exposes it as `date_x` (from the targets) and `date_y` (from the details).
merged_targets = df_carpool_target.merge(
    df_carpool,
    how='left',
    on=['_id', 'identity_key'],
)
df_carpool_target_merged = merged_targets.sort_values(by=['identity_key', 'start_datetime'])

In [None]:
def get_every_trip_after_4_trip_id_occ(df: pd.DataFrame, max_trips: int = 4) -> pd.DataFrame:
    """Return the trips of *df* that come after the first ``max_trips`` trips.

    Rows are collapsed to one per ``operator_trip_id``, keeping the first
    ``start_datetime`` and the first ``_id`` encountered for that trip. The
    trips are then ordered by ``start_datetime`` and the earliest
    ``max_trips`` of them are dropped.

    Args:
        df: Frame with ``operator_trip_id``, ``start_datetime`` and ``_id``
            columns.
        max_trips: Number of earliest trips to skip. Defaults to 4, the
            threshold this function was originally written for.

    Returns:
        A frame with columns ``operator_trip_id``, ``start_datetime`` and
        ``_id``; empty when *df* holds ``max_trips`` trips or fewer.

    Raises:
        ValueError: If ``max_trips`` is negative.
    """
    if max_trips < 0:
        raise ValueError(f"max_trips must be non-negative, got {max_trips}")

    first_row_per_trip = (
        df.groupby('operator_trip_id')
        .agg(
            start_datetime=('start_datetime', 'first'),
            _id=('_id', 'first'),
        )
        .reset_index()
    )
    # A stable sort keeps trips that start at the same instant in groupby order
    # (sorted by operator_trip_id), so the cut below is deterministic.
    chronological = first_row_per_trip.sort_values(by=['start_datetime'], kind='stable')
    return chronological.iloc[max_trips:]

# Targets were selected per (identity, day), so the "after the first four
# trips" cut must be applied per day as well. Grouping on identity alone pools
# all flagged days of one identity and drops only four trips overall.
# `date_x` is the day column inherited from the target frame: the merge that
# builds `df_carpool_target_merged` suffixes the overlapping `date` columns.
df_final_result = (
    df_carpool_target_merged
    .groupby(['identity_key', 'date_x'])
    .apply(get_every_trip_after_4_trip_id_occ)
)

In [None]:
# Re-key the CSV on the suspicious carpool: one row per distinct
# `suspicious_carpool_id`, with the original `_id` column dropped and the
# suspicious id exposed as `_id`. Every step returns a new frame, so
# `df_carpools` is left untouched.
df_wrong_flagued_id = (
    df_carpools
    .drop_duplicates(subset='suspicious_carpool_id')
    .drop(columns=['_id'])
    .rename(columns={'suspicious_carpool_id': '_id'})
)

In [None]:
# Independent copy, so later cells cannot alter `df_final_result` through it.
df_final_result_id = df_final_result.copy(deep=True)
# Bare expression: rendered as the cell output in the notebook.
df_final_result_id

In [None]:
# Keep the CSV rows whose id does NOT appear among the recomputed results.
found_in_final_result = df_wrong_flagued_id['_id'].isin(df_final_result_id['_id'])
df_wrong_flagued_verified = df_wrong_flagued_id[~found_in_final_result]

In [None]:
engine = create_engine(connection_string, connect_args={'sslmode':'require'})

# Force the ids to plain integers before they are written into the SQL text:
# a float column would otherwise be rendered as `123.0`, and a missing value
# as `nan`.
verified_ids = df_wrong_flagued_verified['_id'].dropna().astype('int64')
# PostgreSQL rejects an empty `IN ()`. `IN (NULL)` is valid and matches no row,
# so when nothing is left to look up the query still runs and returns an empty
# frame with the expected columns.
verified_id_values = ', '.join(map(str, verified_ids)) or 'NULL'

# Fetch the 'interoperator_too_many_trips_by_day' label rows attached to the
# remaining carpools, together with the carpool's journey and operator ids.
query = f"""(
 SELECT
    CC._ID,
    CC.operator_journey_id,
    CC.operator_id,
    FL.*
  FROM
    CARPOOL_V2.CARPOOLS CC
    JOIN
    fraudcheck.labels fl on fl.carpool_id = CC._id
  WHERE
    CC._id IN ({verified_id_values}) and fl.label = 'interoperator_too_many_trips_by_day'
)
"""

# The connection is released as soon as the result has been loaded.
with engine.connect() as conn:
    df_unwanted_flagued_carpools = pd.read_sql_query(text(query), conn)

In [None]:
# Write the label rows to disk without the positional index column.
df_unwanted_flagued_carpools.to_csv(
    path_or_buf='unwanted_flagued_carpools.csv',
    index=False,
)