### Import necessary libraries

In [1]:
import gc
import logging
from sys import stderr
import dask_cudf as dcd
import pandas as pd
from utils import data_cols, date_cols, reduction_cds, card_types, payment_types, dtypes
import pickle
import numpy as np

logging.basicConfig(stream=stderr, level="INFO")

### Loading Data

Loading the May–June order and payment views for joining

In [2]:
%%time

# Read the may-june order_details extract and put it in business-date order.
order_details = (
    pd.read_parquet(
        "s3://otg-prod-fraud-data/prod_data/updated_unhashed_joins/order_details/may-june.parquet"
    )
    .sort_values(by="bus_date")
    .reset_index(drop=True)
)

# Keep only the columns listed in data_cols (imported from utils).
order_details = order_details.drop(
    columns=order_details.columns.difference(data_cols)
)

# Recode reduction_cd using the reduction_cds replacement spec from utils.
order_details["reduction_cd"] = order_details["reduction_cd"].replace(reduction_cds)

gc.collect()

INFO:aiobotocore.credentials:Found credentials from IAM Role: S3_Access


CPU times: user 54.9 s, sys: 8.76 s, total: 1min 3s
Wall time: 1min 13s


12

In [3]:
%%time

%%time

payment_details = pd.read_parquet(
        "s3://otg-prod-fraud-data/prod_data/updated_unhashed_joins/payment_details/may-june.parquet",
        # dtype=dtype,
        # parse_dates=["bus_date", "birth_date"],
    ).sort_values(by="bus_date").reset_index(drop=True)

payment_details = payment_details.drop(
    payment_details.columns.difference(data_cols), axis="columns"
)

payment_details["card_type"] = payment_details["card_type"].replace(
    to_replace=card_types
)
payment_details["payment_type"] = payment_details["payment_type"].replace(
    to_replace=payment_types
)
print(payment_details.shape)
print(payment_details.bus_date.min(), payment_details.bus_date.max())
gc.collect()

(5516291, 13)
2024-05-01 00:00:00 2024-06-30 00:00:00
CPU times: user 28.9 s, sys: 2.03 s, total: 30.9 s
Wall time: 30.7 s
CPU times: user 28.9 s, sys: 2.03 s, total: 30.9 s
Wall time: 30.7 s


49

In [4]:
# Pre-join sanity checks on both frames, printed in pairs (orders first, payments second):
# distinct order_id count, shape, and the min/max bus_date covered.
print(order_details.order_id.nunique())
print(payment_details.order_id.nunique())
print(order_details.shape)
print(payment_details.shape)
print(order_details.bus_date.min(), order_details.bus_date.max())
print(payment_details.bus_date.min(), payment_details.bus_date.max())

5443872
5443872
(5566992, 37)
(5516291, 13)
2024-05-01 00:00:00 2024-06-30 00:00:00
2024-05-01 00:00:00 2024-06-30 00:00:00


In [5]:
order_details.columns.intersection(payment_details.columns)

Index(['bus_date', 'pos_venue_id', 'order_id', 'vendor_id'], dtype='object')

In [6]:
%%time

# Left-join payments onto orders by order_id. The other shared columns are
# dropped from one side first (pos_venue_id / vendor_id from orders, bus_date
# from payments) so each appears exactly once in the result.
dataset = pd.merge(
    order_details.drop(columns=["pos_venue_id", "vendor_id"]),
    payment_details.drop(columns=["bus_date"]),
    how="left",
    on="order_id",
)

# The source frames are no longer needed; release them before feature engineering.
del order_details, payment_details
gc.collect()

CPU times: user 23 s, sys: 1.56 s, total: 24.5 s
Wall time: 24.5 s


0

### Feature Engineering

Voucher Counts

In [7]:
%%time

# MultipleVoucherMarker
# voucher_count: number of distinct device_order_id values sharing the row's
# voucher_number (0 where the groupby transform yields NaN).
dataset["voucher_count"] = (
    dataset.groupby("voucher_number")["device_order_id"]
    .transform("nunique")
    .fillna(0)
    .astype(int)
)

# True on rows whose bus_date equals the earliest bus_date of their voucher.
min_bus_date_mask = (
    dataset.groupby("voucher_number")["bus_date"].transform("min")
    == dataset["bus_date"]
)

# Flag rows whose voucher is used by more than one device order, except the
# rows on the voucher's earliest business date. The boolean masks are combined
# first and cast to int afterwards; applying `~` to an int-cast mask yields
# -1/-2 and only worked through two's-complement arithmetic.
dataset["FE_multiple_voucher_fl"] = (
    (dataset["voucher_count"] > 1) & ~min_bus_date_mask
).astype(int)

del min_bus_date_mask
gc.collect()

CPU times: user 3.32 s, sys: 0 ns, total: 3.32 s
Wall time: 3.32 s


0

Unique Counts

In [8]:
%%time
# Unique Counts
# For every (id column, counted column) pair, add a feature holding the number
# of distinct counted-column values seen for the row's id.
main_columns = ["voucher_number", "order_id"]
uids = ["device_order_id", "cust_prof_id"]
aggregations = ["nunique"]

for main_column in main_columns:
    for col in uids:
        for agg in aggregations:
            new_col = f"FE_{col}_{main_column}_ct_AG"
            # Per-id statistic, indexed by the id value ...
            uniques_df = dataset.groupby(col)[main_column].agg(agg)
            # ... broadcast back to the rows through the id column.
            dataset[new_col] = dataset[col].map(uniques_df).astype("Float64")

del main_column, uids, aggregations, uniques_df, new_col
gc.collect()

CPU times: user 51.4 s, sys: 167 ms, total: 51.6 s
Wall time: 51.6 s


0

In [9]:
print(dataset.shape)
print(dataset.bus_date.min(), dataset.bus_date.max())
dataset.head()

(5642266, 52)
2024-05-01 00:00:00 2024-06-30 00:00:00


Unnamed: 0,bus_date,order_id,device_order_id,time_zone,order_local_time,sales_hr,ord_beg_time,ord_close_time,item_count,gross_total,...,account_id,payment_amt_rewards_points,voucher_number,emp_email,voucher_count,FE_multiple_voucher_fl,FE_device_order_id_voucher_number_ct_AG,FE_cust_prof_id_voucher_number_ct_AG,FE_device_order_id_order_id_ct_AG,FE_cust_prof_id_order_id_ct_AG
0,2024-05-01,14864239601,240425017000824,America/New_York,2024-04-25 16:30:16,16,2024-04-25 20:30:16,2024-04-25 20:30:16,3,-32.25,...,,,,,0,0,0.0,,1.0,
1,2024-05-01,149058443,A0000916917145627971504,America/New_York,2024-05-01 07:38:39,7,2024-05-01 11:38:39,2024-05-01 11:38:39,1,4.55,...,,,,,0,0,0.0,,1.0,
2,2024-05-01,149058438,240501009000119,America/New_York,2024-05-01 07:38:37,7,2024-05-01 11:38:37,2024-05-01 11:38:37,5,42.75,...,,,,,0,0,0.0,0.0,1.0,2.0
3,2024-05-01,149058431,240501026000109,America/New_York,2024-05-01 07:38:33,7,2024-05-01 11:38:33,2024-05-01 11:38:33,4,7.6,...,,,,,0,0,0.0,0.0,1.0,1.0
4,2024-05-01,149058431,240501026000109,America/New_York,2024-05-01 07:38:33,7,2024-05-01 11:38:33,2024-05-01 11:38:33,6,21.19,...,,,,,0,0,0.0,0.0,1.0,1.0


### Utility Functions

In [10]:
import numpy as np
def non_nan_unique(row):
    """Collapse an iterable of values to the single value they agree on.

    Missing values (as judged by ``pd.notna``) and empty strings are ignored.

    Returns:
        ``np.nan`` when nothing is left, the value itself when exactly one
        distinct value is left, and ``1`` when the distinct values are
        exactly ``{0, 1}``.

    Raises:
        ValueError: for any other combination of distinct values.
    """
    distinct = {item for item in row if pd.notna(item) and item != ''}

    if not distinct:
        return np.nan
    if len(distinct) == 1:
        return next(iter(distinct))
    if distinct == {0, 1}:
        # A mix of 0 and 1 is resolved to 1.
        return 1
    raise ValueError("Logic Error: More than one unique non-NaN value")
    
def sum_elements(lst):
    """Return the sum of the absolute values of the non-missing items as a float.

    Returns ``np.nan`` when ``lst`` holds no non-missing items.
    """
    magnitudes = [abs(item) for item in lst if pd.notna(item)]
    if not magnitudes:
        return np.nan
    return float(sum(magnitudes))

def max_element(lst):
    """Return the largest absolute value among the non-missing items as a float.

    Each item is converted with ``float`` before ``abs`` is taken. Returns
    ``np.nan`` when ``lst`` holds no non-missing items.
    """
    magnitudes = [abs(float(item)) for item in lst if pd.notna(item)]
    if not magnitudes:
        return np.nan
    return float(max(magnitudes))

### Aggregate Joining Implementation

In [11]:
%%time

# Every column is aggregated into a plain Python list per order_id.
aggregation_dict = {fea: list for fea in dataset.columns}

# order_ids that occur on more than one row (count once, reuse below).
order_row_counts = dataset.order_id.value_counts()
moids = order_row_counts[order_row_counts > 1].index

# Split the frame into multi-row orders (to be aggregated) and single-row orders.
is_multi_row = dataset.order_id.isin(moids)
dups_df = dataset.loc[is_multi_row, :]
nondups_df = dataset.loc[~is_multi_row, :]

del order_row_counts, is_multi_row

CPU times: user 17.8 s, sys: 610 ms, total: 18.4 s
Wall time: 18.4 s


In [12]:
%%time

# Collect every column of the multi-row orders into per-order lists.
# aggregation_dict also lists order_id itself, so that aggregated column is
# dropped and the group key is restored as a regular column instead.
aop = (
    dups_df.groupby('order_id')
    .agg(aggregation_dict)
    .drop(columns='order_id')
    .reset_index()
)

CPU times: user 4min 4s, sys: 684 ms, total: 4min 5s
Wall time: 4min 5s


In [13]:
%%time
sum_cols = ['gross_total', 'net_total', 'taxes', 'reduction_amt', 'fee_amt', 'tip_amount', 'item_count', 'payment_amount', 'voucher_count']
watch_cols = ['order_id', 'card_type', 'payment_type', 'transaction_seq_nu', 'voucher_number', 'menu_vendor_id']

# Additive columns: each per-order list becomes the sum of its absolute values.
for col in sum_cols:
    aop[col] = aop[col].apply(sum_elements)

# Columns that are neither summed nor on the watch list must agree within an
# order, so each list is collapsed to its single value.
handled_elsewhere = set(sum_cols) | set(watch_cols)
for col in aop.columns:
    if col in handled_elsewhere:
        continue
    aop[col] = aop[col].apply(non_nan_unique)

# transaction_seq_nu keeps only the largest absolute value seen in the order.
aop['transaction_seq_nu'] = aop['transaction_seq_nu'].apply(max_element)

CPU times: user 15.5 s, sys: 455 ms, total: 16 s
Wall time: 16 s


In [14]:
# Watch-list columns that stay list-valued: the original watch list without
# transaction_seq_nu and order_id.
watch_cols = ['card_type', 'payment_type', 'voucher_number', 'menu_vendor_id']

In [15]:
# nondups_df was sliced out of `dataset`, so writing into it triggers pandas'
# SettingWithCopyWarning. Take an explicit copy first so the assignments below
# clearly target an independent frame.
nondups_df = nondups_df.copy()

# Wrap each value in a one-element list so these columns hold lists here too,
# matching the list-valued columns produced by the per-order aggregation.
for col in watch_cols:
    nondups_df[col] = nondups_df[col].apply(lambda x: [x])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nondups_df.loc[:,col] = nondups_df[col].apply(lambda x:[x])
  nondups_df.loc[:,col] = nondups_df[col].apply(lambda x:[x])


In [18]:
%%time

# Stack the single-row orders with the aggregated multi-row orders, then
# restore business-date order with a fresh 0..n-1 index.
dataset = pd.concat([nondups_df, aop]).sort_values(by="bus_date").reset_index(drop=True)

print(aop.shape, dups_df.shape, nondups_df.shape, dataset.shape)

# Intermediates of the aggregation step are no longer needed.
del aop, moids, dups_df, nondups_df, aggregation_dict
gc.collect()

(170299, 52) (368693, 52) (5273573, 52) (5443872, 52)
CPU times: user 10.6 s, sys: 2.87 s, total: 13.5 s
Wall time: 13.5 s


0

In [19]:
dataset['payment_type'][:]

0                [CARD]
1                [CARD]
2                [CARD]
3                [CARD]
4                [CARD]
               ...     
5443867     [APPLE_PAY]
5443868          [CARD]
5443869          [CARD]
5443870          [CARD]
5443871    [CARD, CARD]
Name: payment_type, Length: 5443872, dtype: object

In [20]:
def count_payment_types(payment_list):
    """Tally how often each entry occurs in ``payment_list``.

    ``None`` entries are counted under the string key ``"None"``; every other
    entry is counted under itself. A ``None`` list yields an empty dict.
    """
    if payment_list is None:
        return {}

    tally = {}
    for entry in payment_list:
        label = "None" if entry is None else entry
        tally[label] = tally.get(label, 0) + 1
    return tally

# Replace pd.NA entries inside each list with the string "None"; an empty or
# missing list becomes None.
for list_col in ("payment_type", "card_type"):
    dataset[list_col] = dataset[list_col].apply(
        lambda items: ["None" if item is pd.NA else item for item in items] if items else None
    )

# One {label: occurrences} dict per order ...
payment_counts = dataset["payment_type"].apply(count_payment_types)
card_counts = dataset["card_type"].apply(count_payment_types)

# ... expanded to one integer column per label (0 where a label is absent).
payment_counts_df = pd.DataFrame(payment_counts.tolist()).fillna(0).astype(int)
card_counts_df = pd.DataFrame(card_counts.tolist()).fillna(0).astype(int)

In [21]:
# Lower-case the label columns and prefix them with their origin.
payment_counts_df = payment_counts_df.rename(
    columns=lambda label: 'payment_' + label.lower()
)
card_counts_df = card_counts_df.rename(
    columns=lambda label: 'card_' + label.lower()
)

In [30]:
%%time

# Append the per-label count columns to the dataset. Column-wise concat aligns
# on the index, so all three frames must share the same row labels.
# NOTE(review): assumes dataset still has a default 0..n-1 index here — confirm.
dataset = pd.concat([dataset, payment_counts_df, card_counts_df], axis=1)

# The intermediate count objects are no longer needed once merged into dataset.
del payment_counts, card_counts, payment_counts_df, card_counts_df
gc.collect()

CPU times: user 5.02 s, sys: 650 ms, total: 5.67 s
Wall time: 5.66 s


862

In [33]:
dataset.columns

Index(['bus_date', 'order_id', 'device_order_id', 'time_zone',
       'order_local_time', 'sales_hr', 'ord_beg_time', 'ord_close_time',
       'item_count', 'gross_total', 'net_total', 'taxes', 'reduction_amt',
       'cust_prof_id', 'mobile_ord_fl', 'original_order_id', 'vendor_loc_id',
       'fee_amt', 'tip_amount', 'sync_status', 'menu_vendor_id',
       'refund_vendor_id', 'refund_venue_id', 'pos_terminal_id',
       'reduction_cd', 'order_tab_id', 'order_status_cd', 'united_account_id',
       'cust_first_name', 'cust_last_name', 'activity_status_code',
       'employee_role_name', 'employer_id', 'employer_name',
       'default_email_address', 'pos_venue_id', 'card_type', 'payment_type',
       'vendor_id', 'transaction_seq_nu', 'cash_recycler_tiny_code',
       'payment_amount', 'account_id', 'payment_amt_rewards_points',
       'voucher_number', 'emp_email', 'voucher_count',
       'FE_multiple_voucher_fl', 'FE_device_order_id_voucher_number_ct_AG',
       'FE_cust_prof_id_vou

In [31]:
dataset.bus_date.dt.month.value_counts()

6    2728742
5    2715130
Name: bus_date, dtype: int64

In [34]:
# Write one joined parquet file per calendar month of bus_date.
monthly_targets = {
    5: "s3://otg-prod-fraud-data/test_data/new/may24joined.parquet",
    6: "s3://otg-prod-fraud-data/test_data/new/june24joined.parquet",
}
for month_number, target_path in monthly_targets.items():
    dataset[dataset.bus_date.dt.month == month_number].to_parquet(
        target_path, index=False
    )

In [24]:
del agg, col, main_columns, sum_cols, watch_cols
gc.collect()

0

### Inference part

In [35]:
dataset.shape

(5443872, 73)

In [5]:
%%time
# Read the October and November joined files with dask_cudf, concatenate them
# and materialise the result.
# NOTE(review): sep / dtype / date_cols look like CSV-reader options; confirm
# that read_parquet actually uses them.
dataset = dcd.concat(
    [
        dcd.read_parquet(joined_path, sep=",", dtype=dtypes, date_cols=date_cols)
        for joined_path in (
            "s3://otg-prod-fraud-data/test_data/oct24joined.parquet",
            "s3://otg-prod-fraud-data/test_data/nov24joined.parquet",
        )
    ]
).compute()
dataset.shape

CPU times: user 2.08 s, sys: 627 ms, total: 2.71 s
Wall time: 3.83 s


(4873407, 72)

In [6]:
dataset.drop(columns=["card_type", "payment_type"], inplace=True)
dataset.bus_date.min(),dataset.bus_date.max()

(numpy.datetime64('2024-10-01T00:00:00.000000'),
 numpy.datetime64('2024-11-30T00:00:00.000000'))

In [None]:
# dataset = dataset[dataset.bus_date.ge("2024-05-15")]

In [7]:
dataset = dataset.to_pandas()

In [8]:
%%time
import numpy as np
# Reduce the list-valued columns to their first non-missing entry
# (np.nan when every entry is missing).
for list_col in ("voucher_number", "menu_vendor_id"):
    dataset[list_col] = dataset[list_col].apply(
        lambda entries: next((item for item in entries if pd.notna(item)), np.nan)
    )

CPU times: user 12 s, sys: 875 ms, total: 12.9 s
Wall time: 12.9 s


In [9]:
%%time

# Load the pickled training pipeline used for inference.
# NOTE(security): pickle.load can execute arbitrary code from the file; only
# load pickles from a trusted source.
# The `with` block guarantees the file handle is closed after loading.
with open(
    "/home/ec2-user/ai-studio-fraud-ml/aayush_exp/evaluation_notebook/New-Join/train_pipeline_new_join_data.pkl",
    "rb",
) as pipeline_file:
    INFERENCE_PIPELINE = pickle.load(pipeline_file)

gc.collect()

CPU times: user 24.3 s, sys: 2.2 s, total: 26.5 s
Wall time: 31.8 s


16

In [10]:
# del val
gc.collect()

0

In [11]:
%%time

# Negative payment with no original_order_id recorded.
dataset["FE_refund_without_reference"] = (
    dataset["payment_amount"].lt(0) & dataset["original_order_id"].isna()
).astype(int)

# Negative payment with no order_status_cd recorded.
dataset["FE_incomplete_refund"] = (
    dataset["payment_amount"].lt(0) & dataset["order_status_cd"].isna()
).astype(int)

# original_order_id is present but does not occur among this frame's order_ids.
dataset["FE_original_missing"] = (
    dataset["original_order_id"].notna()
    & ~dataset["original_order_id"].isin(dataset["order_id"])
).astype(int)

# Normalise missing values in object columns to Python None.
object_cols = dataset.select_dtypes("object").columns
dataset[object_cols] = dataset[object_cols].where(
    dataset[object_cols].notna(), None
)

CPU times: user 38.4 s, sys: 9.67 s, total: 48.1 s
Wall time: 48 s


In [12]:
dataset.shape

(4873407, 73)

In [14]:
# from shap_pipeline import FraudDetectionModel

final_pipeline = FraudDetectionModel(INFERENCE_PIPELINE,0.05)

In [18]:
%%time

# Add the columns that are absent from this data as all-None placeholders,
# in the same order as before.
for placeholder_col in [
    "exclusive_tax",
    "inclusive_tax",
    'card_cash',
    'card_glory',
    'card_none',
    'payment_none',
    'card_other',
    'card_visa credit',
    'card_visa debit',
    'payment_gift',
    'card_paypal',  # June
]:
    dataset[placeholder_col] = None

dataset = dataset.sort_values(by='bus_date').reset_index(drop=True)


CPU times: user 9.1 s, sys: 3.52 s, total: 12.6 s
Wall time: 12.6 s


In [18]:
import sklearn
sklearn.set_config(enable_metadata_routing=True)

In [10]:
dataset.shape
dataset[dataset.bus_date.dt.month==6].shape

(2728742, 75)

In [20]:
# dataset.index = dataset.index.astype('int')

In [20]:
%%time

# Run the inference pipeline on the prepared frame; it returns two frames,
# unpacked here as (full_data, predicted_data).
# The second argument is the number of rows whose bus_date falls in month 11.
# NOTE(review): the evaluation cells in this notebook compare against labels
# with Month == "Jun-24" — confirm which month is intended here.
full_data, predicted_data = final_pipeline.predict(
    dataset,
    dataset[dataset.bus_date.dt.month == 11].shape[0],
)

INFO:root:ComputeDateFeatures
INFO:root:CustomerDaywiseSubtotal
INFO:root:SubtotalOverFifty
INFO:root:RefundOverFifteenDays
INFO:root:RefundAgainstIncomplete
INFO:root:TipGreaterThan50EmpMarker
INFO:root:TipGreaterThan50Marker
INFO:root:TipGreaterThan50VoucherMarker
INFO:root:calculate_ratio_features
INFO:root:CombineFeatures
INFO:root:UniqueCounts
INFO:root:Aggregation
INFO:root:TimeblockFrequencyEncoder
INFO:root:TimeblockFrequencyEncoder
INFO:root:drop_columns
INFO:root:FreqEncoder - Transform
INFO:root:Prediction data shape:(2362908, 117)
INFO:root:Filter columns over 0.05 pred prob
INFO:root:SHAP explainer initialized
INFO:root:SHAP feature generation
  top_features = df.columns[
INFO:root:Sorting dataframes by probability
INFO:root:Sorting dataframes by probability
INFO:root:Computing Fraud Categories
INFO:root:Generating refined SHAP explanations
INFO:root:Finalizing SHAP explanations


CPU times: user 24min, sys: 57.1 s, total: 24min 57s
Wall time: 23min 15s


In [21]:
print(predicted_data.shape, full_data.shape)
# full_data.to_csv("s3://otg-prod-fraud-data/test_date/preds/june_preds_full.csv",index=False)
# predicted_data.to_csv("s3://otg-prod-fraud-data/test_date/preds/june_preds.csv",index=False)

(2987, 139) (2362908, 86)


In [20]:
predicted_data.to_csv('test.csv',index=False)

In [22]:
predicted_data.fraud_category.value_counts()

Reduction Fraud                   2170
Tips Fraud                         444
Voucher Fraud                      201
Uncategorized Fraud                145
Refund Fraud                        24
Refund Fraud / Reduction Fraud       2
Tips Fraud / Reduction Fraud         1
Name: fraud_category, dtype: int64

In [None]:
full_data.shape

### Loading Labels

In [23]:
# Load the labels, align column names with the dataset, parse the order date
# and keep one row per device_order_id.
labels = (
    pd.read_csv(
        "s3://otg-prod-fraud-data/prod_data/labels/feb24-sep24.csv",
        dtype={"Flo OrderID": "object", "Appetize OrderId": "object"},
    )
    .rename(
        columns={
            "Order Dt": "bus_date",
            "Flo OrderID": "device_order_id",
            "Appetize OrderId": "order_id",
            "Observation Remark": "remarks",
        }
    )
    .assign(bus_date=lambda frame: pd.to_datetime(frame.bus_date))
    .drop_duplicates(subset=['device_order_id'])
)

# Label volume per calendar month of bus_date.
print(labels.bus_date.dt.month.value_counts().sort_index())

# Rows whose Month column reads "Jun-24", one per device_order_id.
june = labels.loc[labels["Month"] == "Jun-24"].drop_duplicates('device_order_id')
june.bus_date.min(), june.bus_date.max()

1        42
2     10747
3      2415
4      2521
5      2684
6      2798
7      2107
8      2586
9      2352
10        1
12        7
Name: bus_date, dtype: int64


  labels["bus_date"] = pd.to_datetime(labels.bus_date)


(Timestamp('2024-05-29 00:00:00'), Timestamp('2024-06-30 00:00:00'))

In [24]:
# Compare the distinct predicted device_order_ids against the June labels.
predicted_ids = predicted_data.device_order_id.drop_duplicates()
is_labelled = predicted_ids.isin(june["device_order_id"])

TP = predicted_ids[is_labelled].shape[0]   # predicted and labelled
FP = predicted_ids[~is_labelled].shape[0]  # predicted but not labelled
FN = (~june['device_order_id'].isin(predicted_ids)).sum()  # labelled but not predicted
print(TP, FP, FN)

2439 1090 380


In [25]:
%%time

# Sweep the decision threshold from 0.05 up to (but excluding) 1 in steps of
# 0.01 and record TP / FP / FN counts against the June labels at each step.
thresholds = np.round(np.arange(0.05, 1, 0.01), 3)

cm_rows = []
for t in thresholds:
    # Distinct device orders whose probability is at or above the threshold.
    data = full_data.loc[full_data.prediction_probabilities >= t].drop_duplicates('device_order_id')

    flagged_is_labelled = data.device_order_id.isin(june["device_order_id"])
    TP = data.device_order_id[flagged_is_labelled]
    FP = data.device_order_id[~flagged_is_labelled]
    FN = june.device_order_id[~june.device_order_id.isin(data["device_order_id"])]

    cm_rows.append(dict(Threshold=t, FP=len(FP), TP=len(TP), FN=len(FN)))

cm_data = pd.DataFrame(cm_rows)
# Display the result
cm_data


CPU times: user 649 ms, sys: 0 ns, total: 649 ms
Wall time: 648 ms


Unnamed: 0,Threshold,FP,TP,FN
0,0.05,1090,2439,380
1,0.06,920,2424,395
2,0.07,812,2414,405
3,0.08,726,2407,412
4,0.09,684,2392,427
...,...,...,...,...
90,0.95,5,1635,1184
91,0.96,3,1582,1237
92,0.97,0,1523,1296
93,0.98,0,1368,1451


In [13]:
import pandas as pd
import shap
import re
import numpy as np
import logging
from sklearn.base import BaseEstimator, OneToOneFeatureMixin, TransformerMixin


class FraudDetectionModel(BaseEstimator):
    def __init__(self, train_pipeline, THRESHOLD=0.05):
        """
        Wrap a trained pipeline for prediction and SHAP explanation.

        Parameters:
        train_pipeline: Fitted pipeline that is indexable by the step names "pipeline" (preprocessing) and "xgbclassifier" (model).
        THRESHOLD (float): Minimum prediction probability for a row to be kept by the threshold filter.
        """
        # Disables pandas' chained-assignment (SettingWithCopy) warning for the whole process.
        pd.options.mode.chained_assignment = None

        # Keep the argument under its constructor name as well: BaseEstimator's
        # get_params() (used by repr() and clone()) looks up an attribute named
        # after every __init__ parameter and fails if it is missing.
        self.train_pipeline = train_pipeline
        self.pipeline = train_pipeline
        self.preprocess_pipeline = self.pipeline["pipeline"]
        self.model = self.pipeline["xgbclassifier"]
        self.explainer = shap.Explainer(self.model)
        self.THRESHOLD = THRESHOLD

    def _filter_columns_over_threshold(self, X):
        """
        Keep only the rows whose prediction probability reaches the configured threshold.

        Parameters:
        X (pd.DataFrame | cudf.DataFrame): Dataframe with a 'prediction_probabilities' column.

        Returns:
        pd.DataFrame | cudf.DataFrame: The rows of X where 'prediction_probabilities' is greater than or equal to self.THRESHOLD.
        """
        logging.info(f"Filter columns over {self.THRESHOLD} pred prob")

        meets_threshold = X["prediction_probabilities"] >= self.THRESHOLD
        return X[meets_threshold]

    def _shap_feature_generation(self, df, shap_values):
        """
        Name the highest-SHAP features of every row.

        Parameters:
        df (pd.DataFrame): Dataframe whose columns correspond, position by position, to the columns of shap_values.
        shap_values (numpy.ndarray): Matrix of SHAP values, one row per row of df.

        Returns:
        pd.DataFrame: Copy of df with a fresh 0..n-1 index and a column 'SHAP_features' holding, per row, the names of the 3 columns with the largest SHAP values, sorted alphabetically and joined with " + ".
        """

        logging.log(level=logging.INFO, msg="SHAP feature generation")

        # Column positions of the 3 largest SHAP values in each row.
        top_features_indices = np.argsort(shap_values)[:, -3:]

        # Look the names up through a plain ndarray: indexing a pandas Index
        # with a 2-D key is deprecated and rejected by newer pandas versions.
        top_features = df.columns.to_numpy()[top_features_indices]

        # reset_index returns a new frame, so the caller's df is left untouched.
        df = df.reset_index(drop=True)
        df["SHAP_features"] = [
            " + ".join(sorted(feature_names)) for feature_names in top_features
        ]
        return df

    def _sort_by_probability(self, df):
        """
        Order a dataframe from highest to lowest prediction probability.

        Parameters:
        df (pd.DataFrame): Dataframe with a 'prediction_probabilities' column.

        Returns:
        pd.DataFrame: The rows of df sorted by 'prediction_probabilities' in descending order, with a fresh 0..n-1 index.
        """
        logging.info("Sorting dataframes by probability")
        ranked = df.sort_values(by="prediction_probabilities", ascending=False)
        return ranked.reset_index(drop=True)

    def _shap_explanation(self, shap_data_detailed):
        logging.log(level=logging.INFO, msg="Generating refined SHAP explanations")
        # If id features needs to be converted into integer
        non_exp_columns = [
            x
            for x in shap_data_detailed.columns
            if x.endswith("_ratio") or x.endswith("_ct_AG")
        ]
        list_to_integer = ["day_of_month", "day_of_week", "voucher_count"]

        shap_data_detailed[list_to_integer] = (
            shap_data_detailed[list_to_integer]
            .apply(pd.to_numeric, errors="coerce")
            .astype(pd.Int64Dtype())
        )

        # Rounding off the amount related columns to 2 digit places
        round_columns = [
            "cust_daywise_subtotal",
            "gross_total",
            "net_total",
            "taxes",
            "exclusive_tax",
            "inclusive_tax",
            "reduction_amt",
            "fee_amt",
            "tip_amount",
            "payment_amount",
        ]

        shap_data_detailed[round_columns] = shap_data_detailed[round_columns].round(2)

        # Replacing flag features with True and False value
        flag_features = [
            "FE_subtotal_over_50_fl",
            "FE_refund_over_15_days_fl",
            "FE_refund_against_incomplete_fl",
            "FE_multiple_voucher_fl",
            "FE_tip_gt_50_fl",
            "FE_tip_gt_50_voucher_fl",
            "FE_tip_gt_50_emp_fl",
        ]
        for feature in flag_features:
            shap_data_detailed[feature] = shap_data_detailed[feature].replace(
                {1: True, -1: False, 0: False}
            )

        shap_data_detailed["shap_detailed_data"] = shap_data_detailed[
            "SHAP_features"
        ].apply(lambda x: [i for i in x.split(" + ")])

        def shap_helper(x):
            values = []
            for features in x:
                for y in shap_data_detailed.columns:
                    if (
                        features not in non_exp_columns
                        and pd.notna(features)
                        and re.search(re.escape(y), features)
                    ):
                        values.append(y)
                    elif features in non_exp_columns:
                        if features in values:
                            continue
                        values.append(features)
                    else:
                        continue
            return values

        shap_data_detailed["shap_detailed_data"] = shap_data_detailed[
            "shap_detailed_data"
        ].apply(shap_helper)

        shap_data_detailed["shap_detailed_data"] = shap_data_detailed[
            "shap_detailed_data"
        ].apply(
            lambda features: [
                item
                for item in features
                if not item.endswith(("CB", "FE", "id_AG", "FE_subtotal_over_50_fl"))
                # if not item.endswith(("CB", "FE", "AG", "ratio", "order_id"))
            ]
            if features
            else []
        )

        # Adding cust_prof_id in the beginning if it is present in the list else not adding.
        shap_data_detailed["shap_detailed_data_filtered"] = shap_data_detailed[
            "shap_detailed_data"
        ].apply(
            lambda features: (
                ", ".join(
                    ["cust_prof_id"]
                    + sorted(set(item.strip() for item in features) - {"cust_prof_id"})
                )
                if features and "cust_prof_id" in features
                else ", ".join(set(item.strip() for item in features))
                if features
                else np.nan
            )
        )

        # Adding gross_total and net_total if reduction_amt is present
        shap_data_detailed["shap_detailed_data_filtered"] = shap_data_detailed[
            "shap_detailed_data_filtered"
        ].apply(
            lambda features: features + ", gross_total, net_total"
            if pd.notna(features)
            and "reduction_amt" in features
            and "gross_total" not in features
            and "net_total" not in features
            else features
        )

        # Adding prediction probabilities in front of every explanation.
        shap_data_detailed["shap_detailed_data_filtered"] = shap_data_detailed[
            "shap_detailed_data_filtered"
        ].apply(
            lambda features: "prediction_probabilities, " + features
            if pd.notna(features)
            else features
        )

        # Feature mapping has been done manually for all feature engineered columns.
        feature_mapping = {
            "prediction_probabilities": "This transaction is predicted",
            "cust_prof_id": "for Customer ID",
            "sales_hr": "Transaction done on hour",
            "cust_first_name": "Customer First Name",
            "cust_last_name": "Customer last name",
            "day_of_month": "Day of month",
            "day_of_week": "Day of week",
            "reduction_cd": "Reduction Code used by employee",
            "vendor_loc_id": "Vendor location ID",
            "vendor_id": "Vendor ID",
            "pos_venue_id": "Pos Venue ID",
            "payment_type": "Payment type used",
            "order_status_cd": "Order Status Code",
            "menu_vendor_id": "Menu Vendor ID",
            "cust_daywise_subtotal": "Customer Daywise Subtotal",
            "voucher_count": """Voucher is used {count} times""",
            "Refund_without_Reference": "Refund is claimed but no original order id value mentioned",
            "Incomplete_Refund": "Refund is claimed and order status cd",
            "Original_id_missing": "Order ID corresponding to original order id within 1 month not found",
            "taxes": "Taxes paid",
            "payment_amount": "Payment amount for the transaction",
            "reduction_amt": "reduction_amt",
            "net_total": "Net Total",
            "gross_total": "Gross Total for the transaction",
            "tip_amount": "Tip amount received for the transacation",
            "bus_date": "Business Transaction date",
            "device_order_id": "Device order ID",
            # "voucher_number": "Voucher Number Used for payment",
            "voucher_number": "",
            "order_id": "Order ID",
            "activity_status_code": "Activity status code",
            "card_type": "Card Type used for payment",
            "pos_terminal_id": "Pos Terminal ID",
        }

        combined_feature_mapping = {
            # Flag Features
            "FE_subtotal_over_50_fl": "Subtotal over $50 given",
            "FE_refund_over_15_days_fl": "Refund Over 15 days",
            "FE_refund_against_incomplete_fl": "Refund given to original order id having null status",
            "FE_multiple_voucher_fl": "Multiple voucher used",
            "FE_tip_gt_50_fl": "Tip greater than 50% of gross total",
            "FE_tip_gt_50_emp_fl": "Tip greater than 50% of gross total and employee id given",
            "FE_tip_gt_50_voucher_fl": "Tip greater than 50% of gross total and voucher used",
            # Ratio Features
            "FE_taxes_by_payment_amount_ratio": """Tax amount is {ratio}% of Payment Amount""",
            "FE_reduction_amt_by_payment_amount_ratio": """Reduction Amount is {ratio}% of Payment Amount""",
            "FE_net_total_by_gross_total_ratio": """Net Total Amount is {ratio}% of Gross Total""",
            "FE_tip_amount_by_payment_amount_ratio": """Tip amount is {ratio}% of Payment Amount""",
            "FE_tip_amount_by_gross_total_ratio": """Tip amount is {ratio}% of Gross Total""",
            # Count Agregation Features
            "FE_device_order_id_voucher_number_ct_AG": """this device order id has {count} different voucher numbers""",
            "FE_cust_prof_id_voucher_number_ct_AG": """this customer prof id has {count} different voucher numbers""",
            "FE_device_order_id_order_id_ct_AG": """this device order id has {count} different order ids""",
            "FE_cust_prof_id_order_id_ct_AG": """this customer prof id has {count} different order ids""",
        }

        dollar_features = [
            "gross_total",
            "net_total",
            "taxes",
            "exclusive_tax",
            "inclusive_tax",
            "fee_amt",
            "tip_amount",
            "payment_amount",
            "cust_daywise_subtotal",
        ]

        shap_data_detailed["Detailed_features_with_values"] = shap_data_detailed.apply(
            lambda row: ",  ".join(
                [
                    f"{feature_mapping.get(feature, feature)} is ${row[feature]}"
                    if feature in dollar_features
                    else f"{feature_mapping.get(feature, feature)} {row['fraud_category']} with probability {int(row[feature] * 100)}%"
                    if feature == "prediction_probabilities"
                    and "cust_prof_id" in row["shap_detailed_data_filtered"].split(", ")
                    else f"{feature_mapping.get(feature, feature)} {row['fraud_category']} with probability {int(row[feature]*100)}%  because"
                    if feature == "prediction_probabilities"
                    and "cust_prof_id"
                    not in row["shap_detailed_data_filtered"].split(", ")
                    else f"{feature_mapping.get(feature, feature)} {row[feature]} because"
                    if feature == "cust_prof_id"
                    else f"{feature_mapping.get(feature,feature)} is ${abs(row[feature])} which is {(abs(row['reduction_amt'])/row['gross_total'])*100:.2f}% of gross total"
                    if feature == "reduction_amt" and (row["gross_total"] != 0)
                    else f"{feature_mapping.get(feature,feature)} is {row[feature]}, reduction amount is ${abs(row['reduction_amt'])} which is {(abs(row['reduction_amt'])/row['gross_total'])*100:.2f}% of gross total"
                    if feature == "reduction_cd" and (row["gross_total"] != 0)
                    else f"{feature_mapping.get(feature,feature)} is {row['order_status_cd']}"
                    if feature == "Incomplete_Refund"
                    # Handling Voucher number and voucher count feature
                    else f"Voucher number is {row['voucher_number']}"
                    if (feature == "voucher_count") and (row["voucher_count"] >= 20)
                    else f"{feature_mapping.get(feature, feature).format(count=row['voucher_count'])} and voucher number is {row['voucher_number']}"
                    if (feature == "voucher_count") and (row["voucher_count"] < 20)
                    # Handling Voucher number duplication
                    else ""
                    if feature == "voucher_number"
                    else f"{combined_feature_mapping.get(feature,feature)}"
                    if feature == "FE_refund_against_incomplete_fl"
                    # Handling the remaining `_fl` features
                    else f"{combined_feature_mapping.get(feature,feature)} is {row[feature]}"
                    if feature.endswith("_fl")
                    # Handling ratio features
                    else f"{combined_feature_mapping.get(feature,feature).format(ratio=f'{row[feature]* 100:.2f}')}"
                    if feature.endswith("ratio")
                    # Handling Count Aggregate features
                    else f"{combined_feature_mapping.get(feature,feature).format(count=row[feature])}"
                    if (feature.endswith("_ct_AG")) & (row[feature] != 1)
                    else ""
                    if feature.endswith("_ct_AG")
                    # Remaining Missed Feature Mapping
                    else f"{feature_mapping.get(feature, feature)} is {row[feature]}"
                    for feature in row["shap_detailed_data_filtered"].split(", ")
                ]
            )
            if pd.notna(row["shap_detailed_data_filtered"])
            else np.nan,
            axis=1,
        )

        shap_data_detailed["shap_detailed_data_filtered"] = shap_data_detailed[
            "shap_detailed_data_filtered"
        ].apply(lambda features: features if pd.notna(features) else np.nan)

        return shap_data_detailed

    def _classify_fraud_category(self, row):
        features = row["SHAP_features"].split(" + ")
        tip_amount = row["tip_amount"] if pd.notna(row["tip_amount"]) else 0
        # tip_amount = row["tip_amount"]
        fraud_categories = []

        refund_features = [
            "Original_id_missing",
            "Incomplete_Refund",
            "FE_refund_against_incomplete_fl",
        ]
        refund_count = sum(feature in features for feature in refund_features)

        tips_feature = [
            "FE_tip_gt_50_emp_fl",
            "FE_tip_amount_by_payment_amount_ratio",
            "FE_tip_amount_by_gross_total_ratio",
        ]
        tips_count = sum(feature in features for feature in tips_feature)

        if refund_count >= 2:
            return "Refund Fraud"
        elif refund_count == 1 and "FE_device_order_id_order_id_ct_AG" in features:
            return "Refund Fraud"
        elif refund_count == 1:
            fraud_categories.append("Refund Fraud")

        if tips_count >= 2:
            return "Tips Fraud"
        elif tips_count == 1 and int(tip_amount) > 0:
            fraud_categories.append("Tips Fraud")

        if sum("reduction" in feature.lower() for feature in features) >= 2:
            return "Reduction Fraud"

        discount_conditions = [
            "reduction_cd",
            "FE_subtotal_over_50_fl",
            "mean_reduction_amt",
            "FE_employee_role_name_reduction_cd_CB",
            "std_reduction_amt",
            "mean_discount_per_cust",
            "cust_daywise_subtotal",
            "gross_total",
            "net_total",
        ]

        if any(feature in features for feature in discount_conditions) and any(
            "reduction" in feature.lower() for feature in features
        ):
            fraud_categories.append("Reduction Fraud")

        if sum("voucher" in feature.lower() for feature in features) >= 2:
            return "Voucher Fraud"

        # Voucher Fraud
        if ("voucher_count" in features or "voucher_number" in features) and any(
            feature in features
            for feature in ["taxes", "pos_terminal_id", "gross_total", "net_total"]
        ):
            fraud_categories.append("Voucher Fraud")

        elif "FE_device_order_id_order_id_ct_AG" in features and any(
            feature in features
            for feature in [
                "taxes",
                "voucher_count",
                "voucher_number",
                "net_total",
                "gross_total",
            ]
        ):
            fraud_categories.append("Voucher Fraud")

        if fraud_categories:
            return " / ".join(fraud_categories)
        else:
            return "Uncategorized Fraud"

    def predict(self, raw_test_data, raw_minute_shape):
        """Score the most recent rows of ``raw_test_data`` and attach SHAP explanations.

        The full frame is sorted by ``bus_date`` and run through
        ``self.preprocess_pipeline``; only the last ``raw_minute_shape`` rows are
        then scored with ``self.model`` and explained with ``self.explainer``.

        Parameters
        ----------
        raw_test_data : pandas.DataFrame
            Raw rows; must contain a ``bus_date`` column.
        raw_minute_shape : int
            Number of trailing rows (after sorting) to score.
            NOTE: a value of 0 selects *every* row, because ``iloc[-0:]`` is
            ``iloc[0:]``.

        Returns
        -------
        tuple[pandas.DataFrame, pandas.DataFrame]
            ``(result_df, shap_data_refined)``: the scored raw rows with
            ``prediction_probabilities``, ``Detailed_features_with_values`` and
            ``fraud_category`` columns, and the explained subset of rows whose
            probability is at least ``self.THRESHOLD``.

        Notes
        -----
        Assumes the preprocessing pipeline preserves row order and index, so the
        trailing slices of the raw and preprocessed frames line up and the
        index-based joins below match -- TODO confirm against the pipeline.
        """
        # Columns coerced to numeric dtype before they are handed to the model.
        numeric_columns = (
            "FE_multiple_voucher_fl",
            "fee_amt",
            "item_count",
            "mobile_ord_fl",
            "payment_amt_rewards_points",
            "reduction_amt",
            "taxes",
            "tip_amount",
            "FE_taxes_by_payment_amount_ratio",
            "FE_reduction_amt_by_payment_amount_ratio",
            "FE_tip_amount_by_payment_amount_ratio",
            "FE_tip_amount_by_gross_total_ratio",
        )

        raw_test_data = raw_test_data.sort_values(by="bus_date")
        # Explicit copy: a column is added to this slice further down.
        raw_minute_data = raw_test_data.iloc[-raw_minute_shape:].copy()

        preprocessed_test_data = self.preprocess_pipeline.transform(raw_test_data)
        for col in numeric_columns:
            preprocessed_test_data[col] = pd.to_numeric(preprocessed_test_data[col])

        # Explicit copy for the same reason as above.
        preprocessed_test_data = preprocessed_test_data.iloc[-raw_minute_shape:].copy()

        logging.info(f"Prediction data shape:{preprocessed_test_data.shape}")

        # Array of class probabilities; column 1 is used as the fraud probability.
        predictions = self.model.predict_proba(preprocessed_test_data)

        # Add column of prediction probabilities to both raw and preprocessed data
        raw_minute_data["prediction_probabilities"] = predictions[:, 1]
        preprocessed_test_data["prediction_probabilities"] = predictions[:, 1]

        # Keep only the rows the helper selects for explanation
        _data_for_shap = self._filter_columns_over_threshold(preprocessed_test_data)

        # Generate SHAP feature importance values for the data
        shap_values = self.explainer(
            _data_for_shap.drop(columns=["prediction_probabilities"]),
            check_additivity=False,
        )
        logging.info("SHAP explainer initialized")

        # Generate SHAP features
        SHAP_data = self._shap_feature_generation(_data_for_shap, shap_values.values)

        # Sort both raw and SHAP data by probability
        sorted_raw_data = self._sort_by_probability(raw_minute_data)
        sorted_SHAP_data = self._sort_by_probability(SHAP_data)

        # Columns present in the SHAP data but not in the raw data
        additional_columns = [
            col
            for col in sorted_SHAP_data.columns
            if col not in sorted_raw_data.columns
        ]

        # Join df of SHAP data with raw data (aligned on index)
        result_df = sorted_raw_data.join(sorted_SHAP_data[additional_columns])

        # Explicit copy: the filtered frame gets a new column below.
        shap_data_detailed = result_df[
            result_df["prediction_probabilities"] >= self.THRESHOLD
        ].copy()

        logging.info("Computing Fraud Categories")
        shap_data_detailed["fraud_category"] = shap_data_detailed.apply(
            self._classify_fraud_category, axis=1
        )

        shap_data_refined = self._shap_explanation(shap_data_detailed)

        logging.info("Finalizing SHAP explanations")

        shap_data_refined = shap_data_refined.drop(
            columns=["shap_detailed_data_filtered", "shap_detailed_data"]
        )
        result_df = result_df.drop(columns=additional_columns).join(
            shap_data_refined[["Detailed_features_with_values", "fraud_category"]],
            how="left",
        )
        # Collapse the gap left by an empty explanation fragment. ``regex=True``
        # is required for a substring replacement: without it ``Series.replace``
        # only matches cells whose entire value equals the pattern.
        result_df["Detailed_features_with_values"] = result_df[
            "Detailed_features_with_values"
        ].replace(",  ,  ", ",", regex=True)

        return result_df, shap_data_refined


  from .autonotebook import tqdm as notebook_tqdm
