In [None]:
import os
import pandas as pd
from sqlalchemy import bindparam, create_engine, text

# The connection string comes from the environment so no credentials live in
# the notebook; a missing variable fails fast with a KeyError.
connection_string = os.environ['PG_CONNECTION_STRING']

# connect_args are handed to the DBAPI driver; sslmode=require is requested there.
engine = create_engine(connection_string, connect_args={'sslmode':'require'})

# Tunable query inputs, declared once instead of being repeated inside the SQL text.
AOM_CODES = ['287500078', '217500016']  # AOM values matched on the start or end perimeter
PERIMETER_YEAR = 2023                   # GEO.PERIMETERS year used by both joins
MAX_ROWS = 300000                       # cap on the number of most recent rows fetched

# Plain (non-f) string on purpose: every value is sent as a bound parameter,
# nothing is interpolated into the SQL text.
query = """SELECT
  CC._ID,
  CC.DATETIME,
  CC.OPERATOR_TRIP_ID,
  CC.OPERATOR_JOURNEY_ID,
  CC.OPERATOR_ID,
  CC.IS_DRIVER,
  CI.OPERATOR_USER_ID
FROM
  CARPOOL.CARPOOLS CC
  JOIN CARPOOL.IDENTITIES CI ON CC.IDENTITY_ID = CI._ID
  LEFT JOIN GEO.PERIMETERS GPS
    ON CC.START_GEO_CODE = GPS.ARR
    AND GPS.YEAR = :perimeter_year
  LEFT JOIN GEO.PERIMETERS GPE
    ON CC.END_GEO_CODE = GPE.ARR
    AND GPE.YEAR = :perimeter_year
WHERE
  (GPS.AOM IN :aom_codes OR GPE.AOM IN :aom_codes)
  AND CC.STATUS = 'ok'
ORDER BY CC.DATETIME DESC
LIMIT
  :max_rows;
"""

# An "expanding" bind parameter lets a Python list be rendered as an IN (...) list.
statement = text(query).bindparams(bindparam('aom_codes', expanding=True))
query_params = {
    'aom_codes': AOM_CODES,
    'perimeter_year': PERIMETER_YEAR,
    'max_rows': MAX_ROWS,
}

# The connection is scoped to the read and released when the block exits.
with engine.connect() as conn:
    df_carpool = pd.read_sql_query(statement, conn, params=query_params)

In [None]:
# Build the 'day_month' grouping key from the first 10 characters of each
# timestamp's string form (presumably 'YYYY-MM-DD' -- confirm against the
# dtype of the 'datetime' column).
timestamp_text = df_carpool['datetime'].astype(str)
df_carpool['day_month'] = timestamp_text.str.slice(stop=10)

In [None]:
def filter_func(x):
    """Return ``x`` unchanged.

    Identity placeholder; none of the cells visible here call it.
    """
    return x

# One group per (day, operator user).
rows_by_day_and_user = df_carpool.groupby(['day_month', 'operator_user_id'])

# Per-group summaries. The lambda must stay a lambda: the resulting column
# label ('<lambda_0>') is what the filtering cell looks up.
per_group_aggregates = {
    # how many journey ids were recorded, and which ones
    'operator_journey_id': ['count', list],
    # the recorded is_driver values, how many distinct ones, and how many in total
    'is_driver': [list, lambda roles: roles.nunique(), 'count'],
}

df_carpool_with_agg = rows_by_day_and_user.agg(per_group_aggregates)

In [None]:
# Aggregated columns sit under a two-level header, addressed as (column, aggregate).
journey_count = df_carpool_with_agg[('operator_journey_id', 'count')]
distinct_driver_flags = df_carpool_with_agg[('is_driver', '<lambda_0>')]
driver_flag_count = df_carpool_with_agg[('is_driver', 'count')]

# Keep groups with more than one journey, more than one distinct is_driver
# value, and more than two is_driver entries.
keep_mask = (journey_count > 1) & (distinct_driver_flags > 1) & (driver_flag_count > 2)
df_carpool_filtered = df_carpool_with_agg[keep_mask]



In [None]:
# Last expression of the cell: shows the filtered (day, operator user) aggregates.
df_carpool_filtered