In [23]:
import gc
import ast
import logging
from sys import stderr

import numpy as np
import dask_cudf as dcd
import pandas as pd

from utils import *

## Joining

In [25]:
# Field separator handed to the dask_cudf readers in the loading cells below.
SEP = ','
# NOTE(review): not referenced by any cell visible in this notebook — the S3
# paths below hard-code the bucket name instead of using this constant.
BUCKET_NAME = 'otg-prod-fraud-data'

# Business-date window used when loading order and payment details: rows are
# kept when CUTOFF_START_DATE <= bus_date < CUTOFF_END_DATE.
CUTOFF_START_DATE = "2023-05-01"
CUTOFF_END_DATE = "2023-05-03"

In [5]:
def non_nan_unique(row):
    """Collapse an iterable of per-row values into a single representative value.

    Missing values (as judged by ``pd.notna``) and empty strings are ignored.

    Returns:
        ``np.nan`` when nothing remains, the sole distinct value when exactly
        one remains, and ``1`` when the distinct values are exactly ``{0, 1}``.

    Raises:
        ValueError: when several distinct values remain and they are not
            exactly ``{0, 1}``.
    """
    distinct = {item for item in row if pd.notna(item) and item != ''}

    if not distinct:
        return np.nan
    if len(distinct) == 1:
        return next(iter(distinct))
    # A mix of 0 and 1 is resolved in favour of 1.
    if distinct == {0, 1}:
        return 1
    raise ValueError("Logic Error: More than one unique non-NaN value")
    
def sum_elements(lst):
    """Return the sum of the absolute values of the non-missing entries of ``lst``.

    Entries for which ``pd.isna`` is true are skipped. The result is a float,
    or ``np.nan`` when no entry survives the filter.
    """
    magnitudes = [abs(entry) for entry in lst if pd.notna(entry)]
    if not magnitudes:
        return np.nan
    return float(sum(magnitudes))

def max_element(lst):
    """Return the largest absolute value among the non-missing entries of ``lst``.

    Each surviving entry is converted with ``float`` before ``abs`` is applied.
    Returns ``np.nan`` when every entry is missing or ``lst`` is empty.
    """
    largest = max(
        (abs(float(entry)) for entry in lst if not pd.isna(entry)),
        default=None,
    )
    return np.nan if largest is None else largest

### Order and Payment Merge

In [6]:
# Order details
# Read every order_details extract (five parquet files plus one CSV) through
# dask_cudf, concatenate them, keep rows with
# CUTOFF_START_DATE <= bus_date < CUTOFF_END_DATE, and drop exact duplicate rows.
# `date_cols` is not defined in this notebook — presumably it comes from
# `from utils import *`; verify.
# NOTE(review): `sep` and `parse_dates` look like CSV-reader options; confirm
# that dcd.read_parquet accepts (or ignores) them.
# NOTE(review): oct24.csv is read with sep=',' while the commented-out nov24.csv
# cell below uses sep='\t' — confirm the delimiter of each CSV extract.
order_details = dcd.concat(
    [
        dcd.read_parquet(
            f"s3://otg-prod-fraud-data/prod_data/updated_unhashed_joins/order_details/may-sep.parquet",        
            sep=SEP,
            parse_dates=date_cols,
        ),
        dcd.read_parquet(
            f"s3://otg-prod-fraud-data/prod_data/updated_unhashed_joins/order_details/oct-nov.parquet",
            sep=SEP,
            parse_dates=date_cols,
        ),
        dcd.read_parquet(
            f"s3://otg-prod-fraud-data/prod_data/updated_unhashed_joins/order_details/dec-jan.parquet",
            sep=SEP,
            parse_dates=date_cols,
        ),
        dcd.read_parquet(
            f"s3://otg-prod-fraud-data/prod_data/updated_unhashed_joins/order_details/feb-april.parquet",
            sep=SEP,
            parse_dates=date_cols,
        ),
        dcd.read_parquet(
            f"s3://otg-prod-fraud-data/prod_data/updated_unhashed_joins/order_details/may-june.parquet",
            sep=SEP,
            parse_dates=date_cols,
        ),
        dcd.read_csv(
            f"s3://otg-prod-fraud-data/prod_data/updated_unhashed_joins/order_details/oct24.csv",
            sep=SEP,
            parse_dates=date_cols,
        )
    ]
).loc[lambda df: (df["bus_date"] < CUTOFF_END_DATE) & (df["bus_date"] >= CUTOFF_START_DATE)].drop_duplicates()

# Drop every column that is not listed in `data_cols`, then evaluate the lazy
# graph with .compute(). `data_cols` is presumably provided by utils — TODO confirm.
order_details = order_details.drop(order_details.columns.difference(data_cols), axis='columns').compute()
# Recode reduction_cd through the `reduction_cds` mapping (not defined in this
# notebook; presumably provided by utils).
order_details["reduction_cd"] = order_details["reduction_cd"].replace(to_replace=reduction_cds)

gc.collect()

9393

In [None]:
# dtypes = order_details.dtypes.to_dict()
# # dtypes['birth_date'] = 'object'
# # dtypes['cust_reduction_cd'] = 'object'
# # dtypes['scheduled_local_dtm']= 'object'

# del order_details
# gc.collect()

211

In [None]:
# order_details = dcd.read_csv('s3://otg-prod-fraud-data/prod_data/updated_unhashed_joins/order_details/nov24.csv',
#                              sep = '\t',
#                              dtype = dtypes)

# order_details = order_details.drop(order_details.columns.difference(data_cols), axis='columns').compute()
# order_details["reduction_cd"] = order_details["reduction_cd"].replace(to_replace=reduction_cds)

# gc.collect()

1910

In [7]:
# Payment details
# Same loading recipe as the order details cell: read, concatenate, keep rows
# with CUTOFF_START_DATE <= bus_date < CUTOFF_END_DATE, drop exact duplicates.
# Only the may-sep extract is active; the other extracts are commented out.
# NOTE(review): the order details cell loads all extracts — confirm that the
# selected date window is fully covered by may-sep before relying on the join.
# NOTE(review): `sep` and `parse_dates` look like CSV-reader options; confirm
# that dcd.read_parquet accepts (or ignores) them.
payment_details = dcd.concat(
    [
        dcd.read_parquet(
            f"s3://otg-prod-fraud-data/prod_data/updated_unhashed_joins/payment_details/may-sep.parquet",
            sep=SEP,
            parse_dates=date_cols,
        ),
        # dcd.read_parquet(
        #     f"s3://otg-prod-fraud-data/prod_data/updated_unhashed_joins/payment_details/oct-nov.parquet",
        #     sep=SEP,
        #     parse_dates=date_cols,
        # ),
        # dcd.read_parquet(
        #     f"s3://otg-prod-fraud-data/prod_data/updated_unhashed_joins/payment_details/dec-jan.parquet",
        #     sep=SEP,
        #     parse_dates=date_cols,
        # ),
        # dcd.read_parquet(
        #     f"s3://otg-prod-fraud-data/prod_data/updated_unhashed_joins/payment_details/feb-april.parquet",
        #     sep=SEP,
        #     parse_dates=date_cols,
        # ),
        # dcd.read_parquet(
        #     f"s3://otg-prod-fraud-data/prod_data/updated_unhashed_joins/payment_details/may-june.parquet",
        #     sep=SEP,
        #     parse_dates=date_cols,
        # ),
    ]
).loc[lambda df: (df["bus_date"] < CUTOFF_END_DATE) & (df["bus_date"] >= CUTOFF_START_DATE)].drop_duplicates()

# Drop every column that is not listed in `data_cols`, then evaluate the lazy
# graph with .compute(). `data_cols` is presumably provided by utils — TODO confirm.
payment_details = payment_details.drop(payment_details.columns.difference(data_cols), axis='columns').compute()
# Recode card_type / payment_type through the `card_types` / `payment_types`
# mappings (not defined in this notebook; presumably provided by utils).
payment_details["card_type"] = payment_details["card_type"].replace(to_replace=card_types)
payment_details["payment_type"] = payment_details["payment_type"].replace(to_replace=payment_types)

gc.collect()

407

In [None]:
# dtypes = payment_details.dtypes.to_dict()

# del payment_details
# gc.collect()

47

In [None]:
# payment_details = dcd.read_csv('s3://otg-prod-fraud-data/prod_data/updated_unhashed_joins/payment_details/nov24.csv',
#                                sep = '\t',
#                                dtype = dtypes)

# payment_details = payment_details.drop(payment_details.columns.difference(data_cols), axis='columns').compute()
# payment_details["card_type"] = payment_details["card_type"].replace(to_replace=card_types)
# payment_details["payment_type"] = payment_details["payment_type"].replace(to_replace=payment_types)

# gc.collect()

488

In [11]:
print(order_details.order_id.nunique())
print(payment_details.order_id.nunique())
print(order_details.shape)
print(payment_details.shape)
print(order_details.bus_date.min(), order_details.bus_date.max())
print(payment_details.bus_date.min(), payment_details.bus_date.max())

2362908
2362928
(2415918, 39)
(2381021, 13)
2024-11-01T00:00:00.000000000 2024-11-30T00:00:00.000000000
2024-11-01T00:00:00.000000000 2024-11-30T00:00:00.000000000


In [13]:
print(order_details.order_id.isin(payment_details.order_id).value_counts())
print(payment_details.order_id.isin(order_details.order_id).value_counts())

True    2415918
Name: order_id, dtype: int64
True     2381001
False         20
Name: order_id, dtype: int64


In [14]:
order_details.columns.intersection(payment_details.columns)

Index(['bus_date', 'pos_venue_id', 'order_id', 'vendor_id'], dtype='object')

In [8]:
# Left-join payments onto orders by order_id. Columns present on both sides are
# de-duplicated before the join: pos_venue_id / vendor_id are taken from the
# payment side and bus_date from the order side.
order_side = order_details.drop(columns=["pos_venue_id", "vendor_id"])
payment_side = payment_details.drop(columns=["bus_date"])

op = order_side.merge(payment_side, how="left", on="order_id")

# Release the inputs and the trimmed intermediates; only `op` is used from here on.
del order_side, payment_side, order_details, payment_details
gc.collect()

0

### Feature Engineering

In [16]:
#MultipleVoucherMarker
# voucher_count: for each row, the number of distinct device_order_id values
# that share the row's voucher_number; rows for which the transform yields a
# missing value are set to 0.
op["voucher_count"] = op.groupby('voucher_number')['device_order_id'].transform("nunique").fillna(0).astype(int)
# Initial flag: 1 when the voucher is attached to more than one device order.
op["FE_multiple_voucher_fl"] = (op["voucher_count"] > 1).astype(int)

# True on rows whose bus_date equals the earliest bus_date seen for the same
# voucher_number.
min_bus_date_mask = (
    op.groupby('voucher_number')['bus_date'].transform("min")
    == op['bus_date']
    )

# Clear the flag on those earliest-date rows. `~` is applied to the integer
# (0/1) form of the mask, giving -1 (mask 0) or -2 (mask 1); AND-ing the 0/1
# flag with -1 keeps it and AND-ing with -2 clears its lowest bit.
# NOTE(review): how rows with a missing voucher_number come out of this
# comparison/AND depends on the frame library's null handling — confirm.
op["FE_multiple_voucher_fl"] &= ~min_bus_date_mask.astype(int)

del min_bus_date_mask
gc.collect()

29

In [17]:
#Unique Counts
# For every (uid, main_column) pair, add a column holding the number of
# distinct main_column values observed for the row's uid value.
main_columns=["voucher_number", "order_id"]
uids=["device_order_id", "cust_prof_id"]
aggregations=["nunique"]

for main_column in main_columns:
    for col in uids:
        for agg in aggregations:
            # NOTE(review): the column name does not include `agg`, so adding a
            # second aggregation would overwrite the same column.
            new_col = "FE_" + col + "_" + main_column + "_ct_AG"
            # Series indexed by uid value -> aggregated main_column statistic.
            uniques_df = op.groupby(col)[main_column].agg([agg])[agg]
            # Broadcast the per-uid statistic back onto the rows.
            op[new_col] = op[col].map(uniques_df).astype('Float64')
            
# NOTE(review): this deletes the loop variable `main_column`, not the list
# `main_columns`; `uniques_df`, `new_col`, `col` and `agg` also stay alive.
del main_column, uids, aggregations
gc.collect()

0

In [12]:
# op.groupby('order_id')['FE_multiple_voucher_fl'].nunique().unique()
op.groupby('order_id')['FE_device_order_id_voucher_number_ct_AG'].nunique().unique()
op.groupby('order_id')['FE_cust_prof_id_voucher_number_ct_AG'].nunique().unique()
op.groupby('order_id')['FE_device_order_id_order_id_ct_AG'].nunique().unique()
op.groupby('order_id')['FE_cust_prof_id_order_id_ct_AG'].nunique().unique()

0    0
1    1
Name: FE_cust_prof_id_order_id_ct_AG, dtype: int32

In [10]:
print(op.shape)
print(op.bus_date.min(), op.bus_date.max())
op.head()

(89390, 48)
2023-05-01T00:00:00.000000000 2023-05-02T00:00:00.000000000


Unnamed: 0,bus_date,order_id,device_order_id,time_zone,order_local_time,sales_hr,ord_beg_time,ord_close_time,item_count,gross_total,...,card_type,payment_type,vendor_id,transaction_seq_nu,cash_recycler_tiny_code,payment_amount,account_id,payment_amt_rewards_points,voucher_number,emp_email
0,2023-05-01,124483513,230501011000170,America/New_York,2023-05-01 08:00:08,8,2023-05-01 12:00:08,2023-05-01 12:00:08,4,22.29,...,VISA,CARD,11104,1,,28.28,,,,
1,2023-05-01,124483517,230501001000163,America/New_York,2023-05-01 08:00:58,8,2023-05-01 12:00:58,2023-05-01 12:00:58,6,39.84,...,,APPLE_PAY,11318,1,,,,,,
2,2023-05-01,124483525,A0000728916829424499884,America/New_York,2023-05-01 08:01:32,8,2023-05-01 12:01:32,2023-05-01 12:01:32,1,6.15,...,VISA,CARD,11431,1,,6.15,,,,
3,2023-05-01,124483527,230501006000168,America/New_York,2023-05-01 07:59:28,7,2023-05-01 11:59:28,2023-05-01 11:59:28,2,25.0,...,VISA,CARD,11209,1,,31.5,,,,
4,2023-05-01,124483533,A361-247016829423830570,America/New_York,2023-05-01 08:01:43,8,2023-05-01 12:01:43,2023-05-01 12:01:43,2,7.0,...,MASTERCARD,CARD,1814,1,,13.06,,,,


### New aggregate joining implementation

In [None]:
%%time

# Collapse `op` to one row per order_id: every column (order_id included) is
# aggregated into the list of its values within the order.
aggregation_dict = {fea: list for fea in op.columns}

aop = op.groupby('order_id').agg(aggregation_dict)
# Discard the list-valued order_id column and restore order_id from the
# group index as a regular column.
aop = aop.drop(columns='order_id').reset_index()

del op, aggregation_dict
gc.collect()

CPU times: user 110 ms, sys: 0 ns, total: 110 ms
Wall time: 114 ms


157

In [20]:
gc.collect()

0

In [14]:
aop = aop.to_pandas()

In [None]:
sum_cols = ['gross_total', 'net_total', 'taxes', 'reduction_amt', 'fee_amt', 'tip_amount', 'item_count', 'payment_amount', 'voucher_count']
watch_cols = ['order_id', 'card_type', 'payment_type', 'transaction_seq_nu', 'voucher_number', 'menu_vendor_id']

# Amount-like columns: per-order sum of absolute values.
for col in sum_cols:
    aop[col] = aop[col].apply(sum_elements)

# Every column that is neither summed nor watched is reduced to its single
# distinct non-missing value (non_nan_unique raises if there are several).
skip_cols = set(sum_cols) | set(watch_cols)
for col in aop.columns:
    if col in skip_cols:
        continue
    aop[col] = aop[col].apply(non_nan_unique)

# transaction_seq_nu keeps the largest absolute sequence number of the order.
aop['transaction_seq_nu'] = aop['transaction_seq_nu'].apply(max_element)


In [23]:
def count_payment_types(payment_list):
    """Count how often each value occurs in ``payment_list``.

    Args:
        payment_list: iterable of payment/card type labels, or ``None``.

    Returns:
        dict mapping each label to its number of occurrences. Missing entries
        (``None``, float NaN, ``pd.NA``) are all counted under the string key
        ``'None'``. An empty dict is returned when ``payment_list`` is ``None``.
    """
    counts = {}
    if payment_list is None:
        return counts
    for payment in payment_list:
        # Fold every missing marker into 'None' so that all keys stay strings;
        # the caller lower-cases the keys to build column names, which would
        # fail on a float NaN key.
        key = 'None' if pd.isna(payment) else payment
        counts[key] = counts.get(key, 0) + 1
    return counts

# One {label: count} dict per order for payment types and for card types.
payment_counts = aop['payment_type'].apply(count_payment_types)
card_counts = aop['card_type'].apply(count_payment_types)

# Expand the dicts into one integer column per label (absent label -> 0) and
# prefix the lower-cased label with its origin.
payment_counts_df = (
    pd.DataFrame(payment_counts.tolist())
    .fillna(0)
    .astype(int)
    .rename(columns=lambda label: 'payment_' + label.lower())
)
card_counts_df = (
    pd.DataFrame(card_counts.tolist())
    .fillna(0)
    .astype(int)
    .rename(columns=lambda label: 'card_' + label.lower())
)

# Column-wise concatenation; rows are aligned on the index.
new_aop = pd.concat([aop, payment_counts_df, card_counts_df], axis=1)

del payment_counts, card_counts, payment_counts_df, card_counts_df, aop
gc.collect()

0

In [None]:
# Sanity check of the per-order frame: dimensions and covered business dates.
# Print the shape rather than the frame itself — printing the full frame dumps
# a truncated table instead of the (rows, columns) summary wanted here.
print(new_aop.shape)
print(new_aop.bus_date.min(), new_aop.bus_date.max())
new_aop.head()

(2362908, 77)
2024-11-01 00:00:00 2024-11-30 00:00:00


Unnamed: 0,order_id,bus_date,device_order_id,time_zone,order_local_time,sales_hr,ord_beg_time,ord_close_time,item_count,gross_total,...,card_mastercard,card_discover,card_none,card_voucher,card_credit,card_jcb,card_cup,card_checkout,card_up,card_paypal
0,15949263101,2024-11-12,240826007000822,America/New_York,2024-11-12 08:20:25,18,2024-11-12 13:20:25,2024-11-12 13:20:25,2.0,12.0,...,0,0,0,0,0,0,0,0,0,0
1,16131696201,2024-11-24,240919025000761,America/New_York,2024-11-24 09:37:10,15,2024-11-24 14:37:10,2024-11-24 14:37:10,2.0,13.0,...,0,0,0,0,0,0,0,0,0,0
2,16189478228347189,2024-11-07,240926090000130,America/Chicago,2024-11-07 14:41:52,13,2024-11-07 20:41:52,2024-11-07 20:41:52,2.0,22.99,...,0,0,0,0,0,0,0,0,0,0
3,16243485101,2024-11-19,241003003000248,America/New_York,2024-11-19 10:41:07,9,2024-11-19 15:41:07,2024-11-19 15:41:07,1.0,18.0,...,0,0,0,0,0,0,0,0,0,0
4,16247091201,2024-11-22,241003024000874,America/Chicago,2024-11-22 09:33:21,15,2024-11-22 15:33:21,2024-11-22 15:33:21,2.0,57.99,...,0,0,0,0,0,0,0,0,0,0


In [23]:
# new_op = new_aop.fillna(0)

# print(((new_op[abs
#          (
#              abs(new_op['net_total']) + abs(new_op['taxes']) + abs(new_op['fee_amt']) - abs(new_op['payment_amount']) + abs(new_op['tip_amount'])
#              ) > 0.1
#          ]).shape[0] / (new_op.shape[0]) ) * 100 )

# print(((new_op[abs
#          (
#              abs(new_op['net_total']) - abs(new_op['gross_total']) + abs(new_op['reduction_amt'])
#              ) > 0.2
#          ]).shape[0] / (new_op.shape[0])) * 100 )

# del new_op
# gc.collect()


In [24]:
# print(new_aop.reduction_cd.unique())
# new_aop['card_type'].unique()

### Label Join

#### May 2023 - Jan 2024

In [25]:
# Column dtypes for the label files: identifiers are read as strings (object),
# the label as an integer.
dtypes = {"device_order_id": "object",
          "is_fraud": "int",
          "order_id": "object",}

# Each label file is read with dask_cudf, de-duplicated, evaluated and moved to
# a pandas frame with a fresh 0..n-1 index.
# doid_label additionally drops the "Unnamed: 0" column; it is joined on
# device_order_id in the label-merge cell below.
doid_label = dcd.read_csv(
                f"s3://otg-prod-fraud-data/labels/final/may_jan_labels_final.csv",
                dtype=dtypes,
            ).drop(columns = {"Unnamed: 0"}).drop_duplicates().compute().to_pandas().reset_index(drop = True)

# oid_label1 / oid_label2 are joined on order_id in the label-merge cell below.
oid_label1 = dcd.read_csv(
                f"s3://otg-prod-fraud-data/labels/final/sept_labels_order_id.csv",
                dtype=dtypes,
            ).drop_duplicates().compute().to_pandas().reset_index(drop = True)

oid_label2 = dcd.read_csv(
                f"s3://otg-prod-fraud-data/labels/final/oct_jan_labels/oct_jan_labels_order_id.csv",
                dtype=dtypes,
            ).drop_duplicates().compute().to_pandas().reset_index(drop = True)

# Displayed as the cell output: (rows, columns) of the three label frames.
doid_label.shape, oid_label1.shape, oid_label2.shape

((18489, 2), (523, 2), (547, 2))

In [26]:
# doid_label['device_order_id'].isin(new_aop['order_id']).value_counts()
# doid_label['device_order_id'].isin(new_aop['device_order_id']).value_counts()

# oid_label1['order_id'].isin(new_aop['order_id']).value_counts()
# oid_label1['order_id'].isin(new_aop['device_order_id']).value_counts()

# oid_label2['order_id'].isin(new_aop['order_id']).value_counts()
# oid_label2['order_id'].isin(new_aop['device_order_id']).value_counts()


In [27]:
# Attach the three label sources one after another. Each left join brings in
# an `is_fraud` column; unmatched rows get 0 and the column is renamed so that
# the next join can bring in its own `is_fraud`.
label_sources = (
    (doid_label, "device_order_id", "is_fraud_x"),
    (oid_label1, "order_id", "is_fraud_y"),
    (oid_label2, "order_id", "is_fraud_z"),
)

opl = new_aop
for label_frame, join_key, flag_name in label_sources:
    opl = (
        opl.merge(label_frame.drop_duplicates(), on=join_key, how="left")
        .fillna({'is_fraud': 0})
        .rename(columns={'is_fraud': flag_name})
    )

# An order is fraudulent when any of the three sources marks it as such.
source_flags = ['is_fraud_x', 'is_fraud_y', 'is_fraud_z']
opl['is_fraud'] = opl[source_flags].any(axis=1).astype(int)
opl = opl.drop(columns=source_flags)

del label_sources, label_frame, join_key, flag_name, source_flags
del new_aop, doid_label, oid_label1, oid_label2
gc.collect()

57

In [28]:
opl[opl['is_fraud'] == 1].shape[0]

3257

In [29]:
opl.to_parquet(
    f"s3://otg-prod-fraud-data/prod_data/updated_unhashed_joins/experiment_join/join_data/mayjune_fin23.parquet",
    index=False
    )

#### Feb 2024 - June 2024

In [22]:
%%time

# Read both identifier columns of the label file as strings (object).
label_dtype={
    "Appetize OrderId": "object",
             "Flo OrderID": "object"
             }

# Load the Feb-Jun 2024 label file, drop the free-text / audit / date columns,
# de-duplicate, evaluate and convert to pandas.
satish_fraud = dcd.read_csv(
    's3://otg-prod-fraud-data/labels/2024_label/feb_jun_labels.csv', 
    dtype = label_dtype
).drop(columns = ['Observation Remark', 'Audit', 'Order Dt']).drop_duplicates().compute().to_pandas()

# Keep only rows where both identifiers are present.
satish_fraud = satish_fraud.dropna(subset=['Flo OrderID', 'Appetize OrderId'])

print(satish_fraud.shape)
satish_fraud.head()

(21171, 3)
CPU times: user 60.9 ms, sys: 9.39 ms, total: 70.3 ms
Wall time: 344 ms


Unnamed: 0,Month,Flo OrderID,Appetize OrderId
0,Feb-24,A0000394617077566012556,42838061
3,Feb-24,A0000828217081100930318,43134669
6,Feb-24,A0000644917090757972363,43967642
9,Feb-24,A0000644917090766142364,43968056
12,Feb-24,A0000705417067952952103,42075960


In [23]:
# satish_fraud['Flo OrderID'].isin(op['order_id']).value_counts()
# satish_fraud['Flo OrderID'].isin(op['device_order_id']).value_counts()

# satish_fraud['Appetize OrderId'].isin(op['order_id']).value_counts()
# satish_fraud['Appetize OrderId'].isin(op['device_order_id']).value_counts()

In [24]:
# Flag an order as fraud when either of its identifiers (order_id or
# device_order_id) appears in either identifier column of the label file.
# NOTE(review): `new_aop` is deleted at the end of the "May 2023 - Jan 2024"
# label-merge cell, so on a top-to-bottom run this cell needs `new_aop` to be
# rebuilt (or that deletion skipped) first.
fraud_hits = pd.concat(
    [
        new_aop['order_id'].isin(satish_fraud['Flo OrderID']),
        new_aop['order_id'].isin(satish_fraud['Appetize OrderId']),
        new_aop['device_order_id'].isin(satish_fraud['Flo OrderID']),
        new_aop['device_order_id'].isin(satish_fraud['Appetize OrderId']),
    ],
    axis=1,
).any(axis=1)

new_aop = new_aop.assign(is_fraud=fraud_hits.astype(int))

In [25]:
new_aop[new_aop['is_fraud'] == 1].shape[0]

2400

In [None]:
# new_aop.to_parquet(
#     f"s3://otg-prod-fraud-data/prod_data/updated_unhashed_joins/experiment_join/join_data/june_24.parquet",
#     index=False
#     )

#### Feb 24 to Sep 24

In [26]:
%%time

# Read both identifier columns of the label file as strings (object).
label_dtype={
    "Appetize OrderId": "object",
             "Flo OrderID": "object"
             }

# Load the Feb-Sep 2024 label file, drop the free-text / audit / date columns,
# de-duplicate, evaluate and convert to pandas. This rebinds `satish_fraud`,
# replacing the Feb-Jun labels loaded earlier.
satish_fraud = dcd.read_csv(
    's3://otg-prod-fraud-data/prod_data/labels/feb24-sep24.csv', 
    dtype = label_dtype
).drop(columns = ['Observation Remark', 'Audit', 'Order Dt']).drop_duplicates().compute().to_pandas()

# Keep only rows where both identifiers are present.
satish_fraud = satish_fraud.dropna(subset=['Flo OrderID', 'Appetize OrderId'])

print(satish_fraud.shape)
satish_fraud.head()

(31172, 4)
CPU times: user 107 ms, sys: 0 ns, total: 107 ms
Wall time: 484 ms


Unnamed: 0,Month,Unique,Flo OrderID,Appetize OrderId
0,Feb-24,1,240201001000502,42093287
1,Feb-24,1,240201001001279,42130254
2,Feb-24,1,240201002000061,42065395
3,Feb-24,1,240201002000484,42094835
4,Feb-24,1,240201002000880,42117981


In [27]:
# Flag an order as fraud when either of its identifiers appears in either
# identifier column of the label file currently held in `satish_fraud`.
# NOTE(review): this recomputes `is_fraud` from scratch, replacing any value
# set by the earlier Feb-Jun cell; it also relies on `new_aop` still existing
# although the "May 2023 - Jan 2024" label-merge cell deletes it.
is_labelled = pd.Series(False, index=new_aop.index)
for id_col in ('order_id', 'device_order_id'):
    for label_col in ('Flo OrderID', 'Appetize OrderId'):
        is_labelled = is_labelled | new_aop[id_col].isin(satish_fraud[label_col])

new_aop = new_aop.assign(is_fraud=is_labelled.astype(int))

In [28]:
new_aop[new_aop['is_fraud'] == 1].shape[0]


0

In [30]:
new_aop.bus_date.min(), new_aop.bus_date.max(), 

(Timestamp('2024-11-01 00:00:00'), Timestamp('2024-11-30 00:00:00'))

In [31]:
new_aop.to_parquet(
    f"s3://otg-prod-fraud-data/prod_data/updated_unhashed_joins/experiment_join/join_data/kgk_24.parquet",
    index=False
    )

## Train

In [62]:
import xgboost as xgb
import sklearn
from sklearn.compose import (
    ColumnTransformer,
    make_column_selector,
    make_column_transformer,
)
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import FunctionTransformer, OrdinalEncoder
from sklearn.base import BaseEstimator, OneToOneFeatureMixin, TransformerMixin

import feature_engineering as fe

In [26]:
logging.basicConfig(stream=stderr, level="INFO")
sklearn.set_config(enable_metadata_routing=True)

In [64]:
print(opl.shape)
print(opl.bus_date.min(), opl.bus_date.max())
opl.head()

(1864132, 73)
2023-08-01 00:00:00 2023-08-31 00:00:00


Unnamed: 0,order_id,bus_date,device_order_id,time_zone,order_local_time,sales_hr,ord_beg_time,ord_close_time,item_count,gross_total,...,payment_gift_voucher,card_none,card_visa,card_mastercard,card_american express,card_discover,card_credit,card_voucher,card_jcb,is_fraud
0,129215229,2023-08-01,A0000304516908747337249,America/New_York,2023-08-01 03:26:23,3,2023-08-01 07:26:23,2023-08-01 07:26:23,2.0,25.98,...,0,1,0,0,0,0,0,0,0,0
1,129215230,2023-08-01,A0000130716908686011801,America/New_York,2023-08-01 03:29:04,3,2023-08-01 07:29:04,2023-08-01 07:29:04,3.0,20.63,...,0,1,0,0,0,0,0,0,0,0
2,129215231,2023-08-01,A0000095116908750927953,America/New_York,2023-08-01 03:32:05,3,2023-08-01 07:32:05,2023-08-01 07:32:05,1.0,5.99,...,0,1,0,0,0,0,0,0,0,0
3,129215232,2023-08-01,A0000095116908751337955,America/New_York,2023-08-01 03:32:35,3,2023-08-01 07:32:35,2023-08-01 07:32:35,1.0,5.99,...,0,1,0,0,0,0,0,0,0,0
4,129215233,2023-08-01,A0000304516908750737251,America/New_York,2023-08-01 03:33:09,3,2023-08-01 07:33:09,2023-08-01 07:33:09,5.0,35.47,...,0,1,0,0,0,0,0,0,0,0


In [None]:
# opl['menu_vendor_id'] = opl['menu_vendor_id'].apply(lambda x: x[0])
opl['payment_amt_rewards_points'] = opl['payment_amt_rewards_points'].astype('Float64')

# The raw list-valued type columns are no longer needed.
opl.drop(columns = ['payment_type', 'card_type'], inplace= True)

# A negative payment amount is treated as a refund by the features below.
is_refund = opl['payment_amount'] < 0
has_original = opl['original_order_id'].notna()

# Refund that does not point back to an original order.
opl['FE_refund_without_reference'] = (is_refund & ~has_original).astype(int)

# Refund whose order status code is missing.
opl['FE_incomplete_refund'] = (is_refund & opl['order_status_cd'].isna()).astype(int)

# An original order is referenced but is not among the order_ids of this frame.
references_unknown_order = ~opl['original_order_id'].isin(opl['order_id']) & has_original
opl['FE_original_missing'] = references_unknown_order.astype(int)

# Normalise missing values in object columns to None.
object_cols = opl.select_dtypes("object").columns
opl[object_cols] = opl[object_cols].where(opl[object_cols].notna(), None)

In [66]:
opl.bus_date.min(), opl.bus_date.max()

(Timestamp('2023-08-01 00:00:00'), Timestamp('2023-08-31 00:00:00'))

In [67]:
# Time-based split: everything before the split date trains the model,
# everything on or after it is held out for testing.
split_date = '2023-08-01'
train = opl[opl['bus_date'].lt(split_date)]
test = opl[opl['bus_date'].ge(split_date)]

print(train.shape, test.shape)

del opl
gc.collect()

(0, 75) (1864132, 75)


0

In [68]:
TARGET_COL = 'is_fraud'

# Separate features from the target for both partitions.
X_train_full = train.drop(columns=TARGET_COL)
y_train_full = train[TARGET_COL]
X_test = test.drop(columns=TARGET_COL)
y_test = test[TARGET_COL]

del train, test
gc.collect()

0

In [34]:
X_train_full.shape, y_train_full.shape

((3264038, 77), (3264038,))

In [70]:
X_test.shape, y_test.shape

((1864132, 74), (1864132,))

In [35]:
# Columns removed by the fe.drop_columns step of the preprocessing pipeline.
# That step sits after the feature-engineering steps, so these columns are
# still present while the engineered features are computed.
drop_features = [
    "account_id",
    "birth_date",
    "bus_date",
    "cash_recycler_tiny_code",
    "company_id",
    "cust_prof_id",
    "device_order_id",
    "exclusive_tax",
    "extra_ph_number",
    "inclusive_tax",
    "ord_beg_time",
    "ord_close_time",
    "ord_email",
    "ord_first_name",
    "ord_last_name",
    "ord_phonenumber",
    "order_id",
    "order_local_time",
    "order_tab_id",
    "original_order_id",
    "refund_vendor_id",
    "refund_venue_id",
    "src_sys_id",
    "transaction_id",
    "transaction_seq_nu",
    "united_account_id",
]

# Columns handed to fe.FrequencyEncoder at the end of the pipeline; anything
# that is also in drop_features is excluded. Commented-out entries are
# candidates that are currently not encoded.
fe_features = pd.Index(
    (
        "activity_status_code",
        # "card_type",
        # "employee_role_id",
        "employee_role_name",
        "employer_id",
        "employer_name",
        "menu_vendor_id",
        "order_status_cd",
        # "payment_type",
        "pos_terminal_id",
        "pos_venue_id",
        "vendor_id",
        "vendor_loc_id",
        # "voucher_number",
    )
).difference(drop_features)

# Columns whose missing values are imputed with 0: every column whose name ends
# in "_fl" or "_flag", plus the amount/count columns listed explicitly.
# The alternation uses a non-capturing group: str.contains warns about
# patterns with capture groups, and no group needs to be extracted here.
zero_features = X_train_full.columns[X_train_full.columns.str.contains(r"_(?:fl|flag)$")].union(
    (
        "item_count",
        "taxes",
        # "exclusive_tax",
        # "inclusive_tax",
        "reduction_amt",
        "fee_amt",
        "tip_amount",
        "payment_amt_rewards_points",
    )
)
# Columns imputed with the median: the columns selected by
# select_dtypes(include=["int", "float"]) that are neither zero-imputed nor
# birth_date.
num_features = X_train_full.select_dtypes(include=["int", "float"]).columns.difference(
    zero_features.union(["birth_date"])
)

# Print the four feature groups for inspection.
for name, feats in zip(
    ("Drop", "Frequency", "Numeric", "Zero"),
    (drop_features, fe_features, num_features, zero_features),
):
    print(f"{name}:\n{feats}\n")

Drop:
['account_id', 'birth_date', 'bus_date', 'cash_recycler_tiny_code', 'company_id', 'cust_prof_id', 'device_order_id', 'exclusive_tax', 'extra_ph_number', 'inclusive_tax', 'ord_beg_time', 'ord_close_time', 'ord_email', 'ord_first_name', 'ord_last_name', 'ord_phonenumber', 'order_id', 'order_local_time', 'order_tab_id', 'original_order_id', 'refund_vendor_id', 'refund_venue_id', 'src_sys_id', 'transaction_id', 'transaction_seq_nu', 'united_account_id']

Frequency:
Index(['activity_status_code', 'employee_role_name', 'employer_id',
       'employer_name', 'menu_vendor_id', 'order_status_cd', 'pos_terminal_id',
       'pos_venue_id', 'vendor_id', 'vendor_loc_id'],
      dtype='object')

Numeric:
Index(['FE_cust_prof_id_order_id_ct_AG',
       'FE_cust_prof_id_voucher_number_ct_AG',
       'FE_device_order_id_order_id_ct_AG',
       'FE_device_order_id_voucher_number_ct_AG', 'FE_incomplete_refund',
       'FE_mismatch_refund', 'FE_original_missing',
       'FE_refund_without_reference'

  zero_features = X_train_full.columns[X_train_full.columns.str.contains(r"_(fl|flag)$")].union(


In [36]:
# Metric reported on the evaluation set and monitored by early stopping.
EVAL_METRIC = "aucpr"
# Keyword arguments unpacked into xgb.XGBClassifier below.
HYPERPARAMS = {
    "max_depth": 25,
    "learning_rate": 0.025,  # eta -> learning_rate
    "eval_metric": EVAL_METRIC,
    "subsample": 0.65,
    "colsample_bytree": 0.7,
    "tree_method": "gpu_hist",
    "objective": "binary:logistic",
    "scale_pos_weight": 5,  # 0.1
    # Upper bound on boosting rounds; early stopping is expected to end
    # training long before this is reached.
    "n_estimators": 4 * 50000,
    "random_state": 500,
    # NOTE(review): early stopping is configured twice with the same patience —
    # here and through the EarlyStopping callback below. Confirm whether both
    # are needed for the installed xgboost version.
    "early_stopping_rounds": 500,
    "callbacks": [
        xgb.callback.EarlyStopping(
            rounds=500,
            metric_name=EVAL_METRIC,
            maximize=True,
            save_best=True,
        )
    ],
}

# Preprocessing pipeline: imputation -> feature engineering -> column drop ->
# ordinal encoding -> frequency encoding. The `fe.*` steps come from the
# project's feature_engineering module; their behaviour is not visible here.
# The walrus assignments keep handles (`imputer`, `cat_encoder`) to the two
# column transformers.
preprocess_pipeline = make_pipeline(
    (
        # Imputation, emitting pandas output with unprefixed column names:
        #  - category/object columns: None -> "missing"
        #  - zero_features: missing -> 0
        #  - num_features: missing -> median
        #  - all remaining columns pass through unchanged
        imputer := make_column_transformer(
            (
                SimpleImputer(
                    missing_values=None, strategy="constant", fill_value="missing"
                ),
                make_column_selector(dtype_include=("category", "object")),
            ),
            (SimpleImputer(strategy="constant", fill_value=0), zero_features),
            (SimpleImputer(strategy="median"), num_features),
            remainder="passthrough",
            verbose_feature_names_out=False,
        ).set_output(transform="pandas")
    ),
    fe.ComputeDateFeatures(),
    fe.CustomerDaywiseSubtotal(),
    fe.SubtotalOverFifty(),
    fe.RefundOverFifteenDays(),
    fe.RefundAgainstIncomplete(),
    # fe.MultipleVoucherMarker(),
    fe.TipGreaterThan50EmpMarker(),
    fe.TipGreaterThan50Marker(),
    fe.TipGreaterThan50VoucherMarker(),
    FunctionTransformer(func=fe.calculate_ratio_features),
    # Generic Feature Engineering
    fe.CombineFeatures(),
    fe.UniqueCounts(
        main_columns=["cust_prof_id"],
        uids=["bus_date"],
        aggregations=["count"],
    ),
    fe.Aggregation(),
    # fe.UniqueCounts(),
    fe.TimeblockFrequencyEncoder(
        ["bus_date", "day_of_week"], ["order_id", "cust_prof_id"]
    ),
    fe.TimeblockFrequencyEncoder(
        ["day_of_month", "day_of_week"], ["vendor_loc_id", "employee_role_name"]
    ),
    # Identifier / raw columns are removed only after the engineered features
    # that may depend on them have been computed.
    FunctionTransformer(func=fe.drop_columns, kw_args={"cols_to_drop": drop_features}),
    (
        # Ordinal-encode whatever category/object columns remain; categories
        # not seen during fit are encoded as -1.
        cat_encoder := make_column_transformer(
            (
                OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1),
                lambda df: df.select_dtypes(["category", "object"]).columns,
            ),
            remainder="passthrough",
            verbose_feature_names_out=False,
        ).set_output(transform="pandas")
    ),
    fe.FrequencyEncoder(fe_features),
)

# End-to-end pipeline: preprocessing followed by the XGBoost classifier. The
# classifier requests `eval_set` and `verbose` to be routed to its fit().
# NOTE(review): the later cells call `preprocess_pipeline` and `xgb_clf`
# directly; `train_pipeline` itself is not used in the cells visible here.
train_pipeline = make_pipeline(
    preprocess_pipeline,
    (
        xgb_clf := xgb.XGBClassifier(**HYPERPARAMS).set_fit_request(
            eval_set=True, verbose=True
        )
    ),
)

gc.collect()

0

In [37]:
%%time

X_t = preprocess_pipeline.fit_transform(X_train_full)

del X_train_full
gc.collect()

INFO:root:ComputeDateFeatures
INFO:root:CustomerDaywiseSubtotal
INFO:root:SubtotalOverFifty
INFO:root:RefundOverFifteenDays
INFO:root:RefundAgainstIncomplete
INFO:root:TipGreaterThan50EmpMarker
INFO:root:TipGreaterThan50Marker
INFO:root:TipGreaterThan50VoucherMarker
INFO:root:calculate_ratio_features
INFO:root:CombineFeatures
INFO:root:UniqueCounts
INFO:root:Aggregation
INFO:root:TimeblockFrequencyEncoder
INFO:root:TimeblockFrequencyEncoder
INFO:root:drop_columns
INFO:root:FreqEncoder - Fit
INFO:root:FreqEncoder - Transform


CPU times: user 2min 55s, sys: 21.9 s, total: 3min 17s
Wall time: 3min 17s


8

In [38]:
X_train, X_val, y_train, y_val = train_test_split(
    X_t, y_train_full, test_size=0.1, stratify=y_train_full, random_state=42
)

del X_t, y_train_full
gc.collect()

0

In [39]:
X_train.shape, y_train.shape, X_val.shape, y_val.shape

((2937634, 111), (2937634,), (326404, 111), (326404,))

In [40]:
xgb_clf.fit(
    X_train,
    y_train,
    eval_set=[(X_val, y_val)],
)

[0]	validation_0-aucpr:0.78465
[1]	validation_0-aucpr:0.81456
[2]	validation_0-aucpr:0.83413
[3]	validation_0-aucpr:0.84230
[4]	validation_0-aucpr:0.84996
[5]	validation_0-aucpr:0.85423
[6]	validation_0-aucpr:0.85494
[7]	validation_0-aucpr:0.85587
[8]	validation_0-aucpr:0.85650
[9]	validation_0-aucpr:0.85902
[10]	validation_0-aucpr:0.86073
[11]	validation_0-aucpr:0.86071
[12]	validation_0-aucpr:0.86179
[13]	validation_0-aucpr:0.86391
[14]	validation_0-aucpr:0.86385
[15]	validation_0-aucpr:0.86473
[16]	validation_0-aucpr:0.86491
[17]	validation_0-aucpr:0.86637
[18]	validation_0-aucpr:0.86766
[19]	validation_0-aucpr:0.86735
[20]	validation_0-aucpr:0.86946
[21]	validation_0-aucpr:0.86917
[22]	validation_0-aucpr:0.86890
[23]	validation_0-aucpr:0.86880
[24]	validation_0-aucpr:0.86896
[25]	validation_0-aucpr:0.86870
[26]	validation_0-aucpr:0.86877
[27]	validation_0-aucpr:0.86887
[28]	validation_0-aucpr:0.86895
[29]	validation_0-aucpr:0.86927
[30]	validation_0-aucpr:0.86920
[31]	validation_0-

In [41]:
val_pred_prob = xgb_clf.predict_proba(X_val)[:, 1]
val_aucpr = sklearn.metrics.average_precision_score(y_val, val_pred_prob)
val_aucpr

0.8986934137663368

In [42]:
del X_train, y_train, X_val, y_val
gc.collect()

0

In [72]:
# Card types that do not occur in the test period have no count column there.
# Add them so that the test frame has the columns the fitted pipeline expects.
# The card_* columns are integer occurrence counts, so an absent card type is
# a count of 0 (filling with None would create object-dtype columns instead).
X_test['card_cup'] = 0
X_test['card_paypal'] = 0
X_test['card_up'] = 0

In [73]:
%%time

X_ts = preprocess_pipeline.transform(X_test)

del X_test
gc.collect()

INFO:root:ComputeDateFeatures
INFO:root:CustomerDaywiseSubtotal
INFO:root:SubtotalOverFifty
INFO:root:RefundOverFifteenDays
INFO:root:RefundAgainstIncomplete
INFO:root:TipGreaterThan50EmpMarker
INFO:root:TipGreaterThan50Marker
INFO:root:TipGreaterThan50VoucherMarker
INFO:root:calculate_ratio_features
INFO:root:CombineFeatures
INFO:root:UniqueCounts
INFO:root:Aggregation
INFO:root:TimeblockFrequencyEncoder
INFO:root:TimeblockFrequencyEncoder
INFO:root:drop_columns
INFO:root:FreqEncoder - Transform


CPU times: user 1min 34s, sys: 10 s, total: 1min 44s
Wall time: 1min 44s


0

In [74]:
X_ts.shape, y_test.shape

((1864132, 111), (1864132,))

In [75]:
test_pred_prob = xgb_clf.predict_proba(X_ts)[:, 1]
test_aucpr = sklearn.metrics.average_precision_score(y_test, test_pred_prob)
test_aucpr

0.7104448080284247

In [None]:
test_pred_prob = xgb_clf.predict_proba(X_ts)[:, 1]
test_aucpr = sklearn.metrics.average_precision_score(y_test, test_pred_prob)
test_aucpr

0.853812600777446

In [None]:
# Turn the predicted fraud probabilities into hard labels at a fixed threshold
# and report the confusion-matrix cells.
threshold = 0.1
y_pred = np.where(test_pred_prob >= threshold, 1, 0)
# labels=[0, 1] pins the matrix to 2x2 even when the test period contains no
# fraud (or no prediction reaches the threshold); without it the matrix can be
# 1x1 and the four-way unpacking below fails.
cm = sklearn.metrics.confusion_matrix(y_test, y_pred, labels=[0, 1])
TN, FP, FN, TP = cm.ravel()
print(f"TN: {TN}\nFP: {FP}\nFN: {FN}\nTP: {TP}")

TN: 38697
FP: 112
FN: 4
TP: 36
