In [4]:
import pandas as pd

# Load the two transaction-metric tables. Going by the file names, df1 is the
# AML set and df2 is the fraud set.
df1, df2 = (
    pd.read_parquet(path)
    for path in (
        '../data/jp_morgan/sorted/transaction_metrics_final_aml_v4_cbflag_mod.parquet',
        '../data/jp_morgan/sorted/transaction_metrics_final_fraud_v4_cbflag_mod.parquet',
    )
)

# Show the column index of each table, one per line.
print(df1.columns, df2.columns, sep='\n')

Index(['Time_step', 'Label', 'Transaction_Id', 'Transaction_Type',
       'std_txn_type', 'std_txn_method', 'is_credit', 'USD_amount', 'party_Id',
       'party_Account', 'party_Country', 'cparty_Id', 'cparty_Account',
       'cparty_Country', 'txn_time_hr', 'txn_time_mm', 'txn_age_days',
       'prev_std_txn_type', 'prev_std_txn_method', 'prev_USD_amount',
       'prev_age_delta', 'volume_7d_sum', 'velocity_7d_count',
       'stat_7d_median', 'stat_7d_mad', 'under_threshold_7d_count',
       'under_threshold_7d_sum', 'volume_14d_sum', 'velocity_14d_count',
       'stat_14d_median', 'stat_14d_mad', 'under_threshold_14d_count',
       'under_threshold_14d_sum', 'volume_30d_sum', 'velocity_30d_count',
       'stat_30d_median', 'stat_30d_mad', 'under_threshold_30d_count',
       'under_threshold_30d_sum', 'is_crossborder', 'stat_7d_modzscr',
       'stat_14d_modzscr', 'stat_30d_modzscr'],
      dtype='object')
Index(['Time_step', 'Label', 'Transaction_Id', 'Transaction_Type',
       'std_

In [5]:
# Graph metric tables for the AML data (betweenness, degree, weighted degree).
# reset_index() moves whatever index was stored in the parquet file into an
# ordinary column.
df1_bwn, df1_deg, df1_wdg = (
    pd.read_parquet(path).reset_index()
    for path in (
        '../data/graph/aml/betweenness.parquet',
        '../data/graph/aml/degree.parquet',
        '../data/graph/aml/weighted_degree.parquet',
    )
)

# TODO (author's note): recreate df2 because the wrong files were saved in v4;
# use the train test split.
df2_bwn, df2_deg, df2_wdg = (
    pd.read_parquet(path).reset_index()
    for path in (
        '../data/graph/fraud/betweenness.parquet',
        '../data/graph/fraud/degree.parquet',
        '../data/graph/fraud/weighted_degree.parquet',
    )
)

In [6]:
def add_graph_acct_ids(df):
    """Add composite account identifiers for the party and the counterparty.

    Two columns are written onto ``df`` **in place** and the same object is
    returned:

    * ``party_account_id``  - from ``party_Id`` and ``party_Account``
    * ``cparty_account_id`` - from ``cparty_Id`` and ``cparty_Account``

    Per-row rule (missing means ``pd.isna`` is true):

    * both values missing   -> ``None``
    * only the id missing   -> the account value, unchanged
    * only account missing  -> the id value, unchanged
    * both present          -> ``f'{id}_{account}'``

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the four source columns named above.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the two new columns added.
    """
    def create_account_id(id_col, account_col):
        # Scalar rule for a single row; see the docstring for the four cases.
        id_missing = pd.isna(id_col)
        account_missing = pd.isna(account_col)
        if id_missing and account_missing:
            return None
        if id_missing:
            return account_col
        if account_missing:
            return id_col
        return f'{id_col}_{account_col}'

    def build_ids(id_name, account_name):
        # Walk the two columns in lockstep instead of using
        # DataFrame.apply(axis=1): no per-row Series is constructed, and an
        # empty frame simply yields an empty Series.
        values = [
            create_account_id(id_val, account_val)
            for id_val, account_val in zip(df[id_name], df[account_name])
        ]
        return pd.Series(values, index=df.index)

    df['party_account_id'] = build_ids('party_Id', 'party_Account')
    df['cparty_account_id'] = build_ids('cparty_Id', 'cparty_Account')

    return df

# Add the composite party / counterparty account-id columns to both tables.
df1, df2 = (add_graph_acct_ids(frame) for frame in (df1, df2))

In [7]:
# Give every graph-metric table the same two-column layout: 'node_id' plus a
# metric-specific value column. The AML (df1_*) and fraud (df2_*) tables get
# identical names.
df1_bwn.columns = df2_bwn.columns = ['node_id', 'graph_metric_btw']
df1_deg.columns = df2_deg.columns = ['node_id', 'graph_metric_deg']
df1_wdg.columns = df2_wdg.columns = ['node_id', 'graph_metric_wdg']

In [21]:
# Independent (deep) copy of df1, taken before the graph-metric merges below.
test_df = df1.copy(deep=True)

In [23]:
# Attach graph metrics (betweenness and degree) to the AML transactions.
# Each (key column, output prefix) pair is looked up against both metric
# tables on 'node_id', producing '<prefix>_btw' and '<prefix>_deg'.
graph_lookups = [
    ('party_Id', 'party_entity'),
    ('party_account_id', 'party_account'),
    ('cparty_Id', 'cparty_l1'),
    ('cparty_account_id', 'cparty_l2'),
]
graph_tables = [
    (df1_bwn, 'graph_metric_btw', 'btw'),
    (df1_deg, 'graph_metric_deg', 'deg'),
]

for key_col, prefix in graph_lookups:
    for metric_df, metric_col, suffix in graph_tables:
        df1 = (
            df1
            # validate='many_to_one' makes pandas raise MergeError if a
            # node_id appears more than once in the metric table, instead of
            # silently duplicating transaction rows.
            .merge(metric_df, how='left', left_on=key_col, right_on='node_id',
                   validate='many_to_one')
            .drop('node_id', axis=1)
            .rename(columns={metric_col: f'{prefix}_{suffix}'})
        )

# Presence masks, computed once after the merges so they align with df1's
# current index.
cparty_has_id = df1['cparty_Id'].notna()
cparty_has_account = df1['cparty_Account'].notna()

# Cash: no counterparty id at all -> every counterparty metric is set to 0.
# cash_columns / l2_columns stay module-level names because later cells may
# reference them.
df1['cparty_is_cash'] = (~cparty_has_id).astype(int)
cash_columns = ['cparty_l1_btw', 'cparty_l1_deg', 'cparty_l2_btw', 'cparty_l2_deg']
df1.loc[df1['cparty_is_cash'] == 1, cash_columns] = 0

# Standalone: counterparty id present but no account -> only the l2
# (account-id keyed) metrics are set to 0.
df1['cparty_is_standalone'] = (cparty_has_id & ~cparty_has_account).astype(int)
l2_columns = ['cparty_l2_btw', 'cparty_l2_deg']
df1.loc[df1['cparty_is_standalone'] == 1, l2_columns] = 0

# Account: both counterparty id and account present; metrics are left as merged.
df1['cparty_is_account'] = (cparty_has_id & cparty_has_account).astype(int)

# The composite ids were only needed as merge keys.
df1 = df1.drop(['party_account_id', 'cparty_account_id'], axis=1)

In [24]:
# Attach graph metrics (betweenness and degree) to the fraud transactions.
# Each (key column, output prefix) pair is looked up against both metric
# tables on 'node_id', producing '<prefix>_btw' and '<prefix>_deg'.
graph_lookups = [
    ('party_Id', 'party_entity'),
    ('party_account_id', 'party_account'),
    ('cparty_Id', 'cparty_l1'),
    ('cparty_account_id', 'cparty_l2'),
]
graph_tables = [
    (df2_bwn, 'graph_metric_btw', 'btw'),
    (df2_deg, 'graph_metric_deg', 'deg'),
]

for key_col, prefix in graph_lookups:
    for metric_df, metric_col, suffix in graph_tables:
        df2 = (
            df2
            # validate='many_to_one' makes pandas raise MergeError if a
            # node_id appears more than once in the metric table, instead of
            # silently duplicating transaction rows.
            .merge(metric_df, how='left', left_on=key_col, right_on='node_id',
                   validate='many_to_one')
            .drop('node_id', axis=1)
            .rename(columns={metric_col: f'{prefix}_{suffix}'})
        )

# Column lists are defined here so this cell does not depend on names created
# by another cell.
cash_columns = ['cparty_l1_btw', 'cparty_l1_deg', 'cparty_l2_btw', 'cparty_l2_deg']
l2_columns = ['cparty_l2_btw', 'cparty_l2_deg']

# Presence masks, computed once after the merges so they align with df2's
# current index.
cparty_has_id = df2['cparty_Id'].notna()
cparty_has_account = df2['cparty_Account'].notna()

# Cash: no counterparty id at all -> every counterparty metric is set to 0.
df2['cparty_is_cash'] = (~cparty_has_id).astype(int)
df2.loc[df2['cparty_is_cash'] == 1, cash_columns] = 0

# Standalone: counterparty id present but no account -> only the l2
# (account-id keyed) metrics are set to 0.
df2['cparty_is_standalone'] = (cparty_has_id & ~cparty_has_account).astype(int)
df2.loc[df2['cparty_is_standalone'] == 1, l2_columns] = 0

# Account: both counterparty id and account present; metrics are left as merged.
df2['cparty_is_account'] = (cparty_has_id & cparty_has_account).astype(int)

# The composite ids were only needed as merge keys.
df2 = df2.drop(['party_account_id', 'cparty_account_id'], axis=1)
