# import

In [1]:
import pandas as pd
import os
import math
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
from autogluon.tabular import TabularDataset, TabularPredictor
from sklearn.model_selection import train_test_split
from node2vec import Node2Vec
from node2vec.edges import HadamardEmbedder, AverageEmbedder, WeightedL1Embedder, WeightedL2Embedder

# sklearn
import sklearn
from sklearn import metrics
from sklearn.metrics import precision_score, recall_score, f1_score

    

In [2]:
    def throw(df, fraud_rate):
        """Downsample non-fraud rows so that fraud makes up ``fraud_rate`` of the result.

        Every row with ``is_fraud == 1`` is kept. Rows with ``is_fraud == 0`` are
        randomly sampled (fixed ``random_state=42``) so that the returned frame
        has approximately the requested fraud proportion.

        Parameters
        ----------
        df : pandas.DataFrame
            Frame with an ``is_fraud`` column holding 0 / 1.
        fraud_rate : float
            Target share of fraud rows, ``0 < fraud_rate <= 1``.

        Returns
        -------
        pandas.DataFrame
            Fraud rows followed by the sampled non-fraud rows; the original
            index labels are preserved.

        Raises
        ------
        ValueError
            If ``fraud_rate`` is outside ``(0, 1]``, or if it is lower than what
            can be reached by only discarding non-fraud rows.
        """
        if not 0 < fraud_rate <= 1:
            raise ValueError(f"fraud_rate must be in (0, 1], got {fraud_rate!r}")

        df1 = df[df['is_fraud'] == 1].copy()
        df0 = df[df['is_fraud'] == 0].copy()

        # Nothing to discard: avoid dividing by zero below.
        if len(df0) == 0:
            return df1

        # Fraction of non-fraud rows to keep so that
        # len(df1) / (len(df1) + kept) == fraud_rate.
        df0_downsample = (len(df1) * (1 - fraud_rate)) / (len(df0) * fraud_rate)
        if df0_downsample > 1:
            raise ValueError(
                "fraud_rate is lower than the fraud share already present; "
                "it cannot be reached by discarding non-fraud rows"
            )

        df0_down = df0.sample(frac=df0_downsample, random_state=42)
        return pd.concat([df1, df0_down])
    
    def split_dataframe(data_frame, test_fraud_rate, test_rate=0.3, random_state=None):
        """Split a frame into train/test parts with a chosen fraud share in the test part.

        The test part has ``int(len(data_frame) * test_rate)`` rows, of which
        ``int(test_fraud_rate * len(data_frame) * test_rate)`` are drawn from the
        rows with ``is_fraud == 1`` and the rest from the rows with
        ``is_fraud == 0``. Every row whose index label is not in the test part
        goes to the training part.

        Parameters
        ----------
        data_frame : pandas.DataFrame
            Frame with an ``is_fraud`` column holding 0 / 1.
            NOTE(review): train/test separation is done by index label, so the
            index is assumed to be unique -- confirm for the caller's data.
        test_fraud_rate : float
            Share of fraud rows wanted in the test part.
        test_rate : float, default 0.3
            Share of all rows that goes to the test part.
        random_state : int, numpy RandomState/Generator or None, default None
            Forwarded to ``DataFrame.sample`` for both draws. ``None`` keeps the
            previous (non-reproducible) behaviour.

        Returns
        -------
        (pandas.DataFrame, pandas.DataFrame)
            ``(train_data, test_data)``; ``test_data`` lists the normal rows
            first, then the fraud rows.

        Raises
        ------
        ValueError
            Raised by pandas when more fraud or normal rows are requested than
            are available.
        """
        n = len(data_frame)

        # Separate fraud and normal transactions.
        fraud_data = data_frame[data_frame['is_fraud'] == 1]
        normal_data = data_frame[data_frame['is_fraud'] == 0]

        # Sizes of the two groups inside the test part.
        test_samples = int(test_fraud_rate * (n * test_rate))
        remaining_test_samples = int(n * test_rate) - test_samples

        # Draw the test rows without replacement from each group.
        test_fraud_data = fraud_data.sample(
            n=test_samples, replace=False, random_state=random_state
        )
        test_normal_data = normal_data.sample(
            n=remaining_test_samples, replace=False, random_state=random_state
        )

        test_data = pd.concat([test_normal_data, test_fraud_data])

        # Everything that was not drawn for the test part is training data.
        train_data = data_frame[~data_frame.index.isin(test_data.index)]

        return train_data, test_data
    
    def concat(df_tr, df_tst):
        """Stack the train and test frames and return positional row masks.

        Returns
        -------
        (pandas.DataFrame, (numpy.ndarray, numpy.ndarray))
            The concatenated frame (train rows first, original index labels
            kept) and a ``(train_mask, test_mask)`` pair of boolean arrays, one
            entry per row of the concatenated frame.
        """
        n_train = len(df_tr)
        n_total = n_train + len(df_tst)
        stacked = pd.concat([df_tr, df_tst])

        # The masks are positional (derived from the two lengths only), so they
        # do not depend on the index labels of either frame.
        is_train = np.arange(n_total) < n_train
        is_test = ~is_train
        return stacked, (is_train, is_test)
        
    def evaluation(y, yhat):
        """Score predictions with five scikit-learn metrics.

        Parameters
        ----------
        y : array-like
            True labels.
        yhat : array-like
            Predicted values; the same ``yhat`` is passed to every metric.
            NOTE(review): ``roc_auc_score`` is normally given scores or
            probabilities rather than hard class labels -- confirm what the
            callers pass.

        Returns
        -------
        pandas.DataFrame
            One row; one column per metric, named after the metric function
            (``accuracy_score``, ``precision_score``, ``recall_score``,
            ``f1_score``, ``roc_auc_score``), each rounded to 6 decimals.
        """
        # Named `scorers` so the list does not shadow the imported `metrics` module.
        scorers = [sklearn.metrics.accuracy_score,
                   sklearn.metrics.precision_score,
                   sklearn.metrics.recall_score,
                   sklearn.metrics.f1_score,
                   sklearn.metrics.roc_auc_score]
        # Built-in round() works for both numpy scalars and plain Python floats;
        # a plain float has no `.round` method.
        return pd.DataFrame(
            {scorer.__name__: [round(float(scorer(y, yhat)), 6)] for scorer in scorers}
        )
        
    def compute_time_difference(group):
        """Return the absolute time gap, in seconds, for every ordered pair of rows.

        Parameters
        ----------
        group : pandas.DataFrame
            Rows with a ``trans_date_trans_time`` column whose values support
            subtraction yielding an object with ``total_seconds()``
            (assumes the column was parsed to datetimes -- TODO confirm).

        Returns
        -------
        list[list]
            One ``[index_label_i, index_label_j, seconds]`` entry for each of
            the ``n * n`` ordered pairs, including the pairs with ``i == j``
            (whose gap is 0).
        """
        # Read labels and timestamps once instead of rebuilding a row Series
        # with `group.iloc[...]` on every step of the quadratic loop.
        labels = list(group.index)
        times = list(group['trans_date_trans_time'])

        result = []
        for label_i, time_i in zip(labels, times):
            for label_j, time_j in zip(labels, times):
                seconds = abs((time_i - time_j).total_seconds())
                result.append([label_i, label_j, seconds])
        return result

    def edge_index_save(df, unique_col, theta, gamma):
        """Build a time-proximity edge index, saving the raw pair table to disk.

        Rows are grouped by ``unique_col``; within each group every ordered pair
        of rows gets the weight ``exp(-dt / theta)``, where ``dt`` is the time
        gap in seconds from ``compute_time_difference``. Weights equal to
        exactly 1 (e.g. a zero gap, which includes every row paired with
        itself) are set to 0. Pairs whose weight is greater than ``gamma``
        become edges.

        Before weighting, the ``[label_i, label_j, dt]`` table is written with
        ``np.save`` to ``edge_index_attempt<k>_<col>.npy`` in the current
        directory, using the first ``k`` (starting at 0) whose file does not
        exist yet.

        Parameters
        ----------
        df : pandas.DataFrame
            NOTE(review): index labels are cast to float and then to int, so
            they are assumed to be integers -- confirm against the caller.
        unique_col : str or list
            Grouping key passed to ``DataFrame.groupby``.
        theta : float
            Scale of the exponential decay.
        gamma : float
            Weight threshold; only pairs with weight ``> gamma`` are kept.

        Returns
        -------
        torch.Tensor
            ``long`` tensor of shape ``(2, n_edges)``.
        """
        # Local import: torch is not imported in the notebook's import cell.
        import torch

        pairs = [
            item
            for _, group in df.groupby(unique_col)
            for item in compute_time_difference(group)
        ]
        # reshape keeps a well-formed (0, 3) table when there are no pairs.
        pair_table = np.array(pairs, dtype=np.float64).reshape(-1, 3)

        # The original referenced `self.save_attempt` although this is a plain
        # function; a local counter gives the same "first free file name" logic.
        col_tag = str(unique_col).replace(' ', '').replace('_', '')
        save_attempt = 0
        filename = f"edge_index_attempt{save_attempt}_{col_tag}.npy"
        while os.path.exists(filename):
            save_attempt += 1
            filename = f"edge_index_attempt{save_attempt}_{col_tag}.npy"
        np.save(filename, pair_table)

        weights = np.exp(-pair_table[:, 2] / theta)
        weights[weights == 1] = 0

        kept = pair_table[weights > gamma, :2].astype(np.int64)
        return torch.tensor(kept, dtype=torch.long).t()
    
    def edge_index(df, unique_col, theta, gamma):
        """Build a time-proximity edge index between rows that share ``unique_col``.

        Rows are grouped by ``unique_col``; within each group every ordered pair
        of rows gets the weight ``exp(-dt / theta)``, where ``dt`` is the time
        gap in seconds from ``compute_time_difference``. Weights equal to
        exactly 1 (e.g. a zero gap, which includes every row paired with
        itself) are set to 0. Pairs whose weight is greater than ``gamma``
        become edges.

        Parameters
        ----------
        df : pandas.DataFrame
            NOTE(review): index labels are cast to float and then to int, so
            they are assumed to be integers -- confirm against the caller.
        unique_col : str or list
            Grouping key passed to ``DataFrame.groupby``.
        theta : float
            Scale of the exponential decay.
        gamma : float
            Weight threshold; only pairs with weight ``> gamma`` are kept.

        Returns
        -------
        torch.Tensor
            ``long`` tensor of shape ``(2, n_edges)``.
        """
        # Local import: torch is not imported in the notebook's import cell.
        import torch

        pairs = [
            item
            for _, group in df.groupby(unique_col)
            for item in compute_time_difference(group)
        ]
        # reshape keeps a well-formed (0, 3) table when there are no pairs.
        pair_table = np.array(pairs, dtype=np.float64).reshape(-1, 3)

        weights = np.exp(-pair_table[:, 2] / theta)
        weights[weights == 1] = 0

        kept = pair_table[weights > gamma, :2].astype(np.int64)
        return torch.tensor(kept, dtype=torch.long).t()

In [3]:
# Load the raw transactions from a CSV file in the user's Desktop folder.
# NOTE(review): the path is machine-specific; consider a configurable data directory.
df = pd.read_csv("~/Desktop/fraudTrain.csv")

# 삼분그래프

In [4]:
def build_graph_tripartite(df_input, graph_type=None):
    """Build a transaction / card / merchant graph from a transactions frame.

    Every index label of ``df_input`` (one per transaction), every ``cc_num``
    value and every ``merchant`` value becomes a node, identified by an
    integer id. Each transaction node is linked to its card node and to its
    merchant node; both edges carry ``label`` (the row's ``is_fraud``) and
    ``weight`` (the row's ``amt``).

    Parameters
    ----------
    df_input : pandas.DataFrame
        Needs the columns ``cc_num``, ``merchant``, ``is_fraud`` and ``amt``.
        It is not modified.
    graph_type : networkx graph class or instance, optional
        Passed as ``create_using`` to ``nx.from_edgelist``. ``None`` (default)
        creates a fresh ``nx.Graph`` on every call. The previous default was a
        single ``nx.Graph()`` instance created at definition time, which made
        every call return the same, re-populated graph object.

    Returns
    -------
    networkx graph
        Edges are listed card -> transaction first, then merchant -> transaction.

    Notes
    -----
    Node ids are assigned in first-seen order (index labels, then ``cc_num``,
    then ``merchant``), so they are reproducible between runs.
    NOTE(review): an index label that equals a ``cc_num`` or ``merchant`` value
    shares one node, as in the original set-based mapping -- confirm this
    cannot happen for the data in use.
    """
    txn_keys = df_input.index.tolist()
    card_keys = df_input["cc_num"].tolist()
    merchant_keys = df_input["merchant"].tolist()
    labels = df_input["is_fraud"].tolist()
    amounts = df_input["amt"].tolist()

    # dict.fromkeys de-duplicates while preserving order.
    mapping = {
        key: node_id
        for node_id, key in enumerate(dict.fromkeys(txn_keys + card_keys + merchant_keys))
    }

    card_edges = []
    merchant_edges = []
    for txn, card, merchant, label, amount in zip(
        txn_keys, card_keys, merchant_keys, labels, amounts
    ):
        txn_node = mapping[txn]
        card_edges.append((mapping[card], txn_node, {"label": label, "weight": amount}))
        merchant_edges.append((mapping[merchant], txn_node, {"label": label, "weight": amount}))

    # Attributes travel with the edges, so no separate set_edge_attributes passes
    # are needed.
    return nx.from_edgelist(card_edges + merchant_edges, create_using=graph_type)
    

# 지도학습

### 0.3 / 0.2

In [5]:
# Downsample normal transactions so that fraud is about 30% of the rows.
# NOTE: this rebinds `df`; re-running the cell operates on the already reduced frame.
df = throw(df, 0.3)

In [6]:
# Train/test split; the test part (default test_rate=0.3) is drawn with a 20% fraud share.
df_tr, df_tst = split_dataframe(df, 0.2)

In [7]:
# (rows, columns) of the training split.
df_tr.shape

(14014, 23)

In [8]:
# (rows, columns) of the test split.
df_tst.shape

(6006, 23)

In [9]:
# Fraud share in the training split.
df_tr.is_fraud.mean()

0.34287141429998574

In [10]:
# Fraud share in the test split.
df_tst.is_fraud.mean()

0.19996669996669997

In [11]:
# Stack train and test rows again; `mask` holds the (train_mask, test_mask) pair.
df_, mask = concat(df_tr, df_tst)

In [12]:
# Transaction / card / merchant graph over all downsampled rows.
G_down = build_graph_tripartite(df_)


In [33]:
# Edge and node counts of the graph.
G_down.number_of_edges(), G_down.number_of_nodes()

(40040, 21656)

In [62]:
# Keep a reference to the graph's edge view.
edges = G_down.edges

In [None]:
# NOTE(review): `sg` is only imported in a later cell, and `edges` is a networkx
# edge view. This cell has no execution count, so it appears never to have run.
# TODO confirm whether it should be removed or rewritten.
sg.StellarGraph(edges=edges, edge_type_column="label")

In [49]:
from sklearn.model_selection import train_test_split


# Split edge *positions* (indices into the edge list) 80/20, together with the
# fraud label stored on each edge.
edge_label_by_edge = nx.get_edge_attributes(G_down, "label")
train_edges, test_edges, train_labels, test_labels = train_test_split(
    list(range(len(G_down.edges))),
    list(edge_label_by_edge.values()),
    test_size=0.20,
    random_state=42,
)

In [50]:
# Mean label (fraud share) among training edges and among test edges.
np.array(train_labels).mean(), np.array(test_labels).mean()

(0.2995442057942058, 0.30182317682317683)

In [55]:
# Materialise the edge list so that edges can be addressed by position.
edgs = list(G_down.edges)
# Graph containing only the training edges; copy() makes it an independent graph.
train_graph = G_down.edge_subgraph([edgs[x] for x in train_edges]).copy()      
# Add back the nodes of G_down that have no training edge, so both graphs share one node set.
train_graph.add_nodes_from(list(set(G_down.nodes) - set(train_graph.nodes)))

In [60]:
import stellargraph as sg
from stellargraph.mapper import GraphSAGENodeGenerator
from stellargraph.layer import GraphSAGE
from sklearn.model_selection import train_test_split
from tensorflow import keras
from sklearn.preprocessing import LabelEncoder


In [61]:
# Wrap the networkx graph in a StellarGraph object.
graph = sg.StellarGraph(G_down)

  graph = sg.StellarGraph(G_down)


In [35]:
from node2vec import Node2Vec
from node2vec.edges import HadamardEmbedder, AverageEmbedder, WeightedL1Embedder, WeightedL2Embedder

# Node2Vec on the training graph, reading edge weights from the 'weight' attribute.
# NOTE(review): no seed is passed, so the embeddings presumably differ between runs.
node2vec_train = Node2Vec(train_graph, weight_key='weight')
# Fit the embedding model with window=10.
model_train = node2vec_train.fit(window=10)

Computing transition probabilities:   0%|          | 0/21656 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 10/10 [00:44<00:00,  4.47s/it]


In [36]:
# Edge embeddings are derived from the node vectors with WeightedL2Embedder.
# The original loop over four embedder classes rebound `embeddings_train` on
# every pass, so only its last element (WeightedL2Embedder) was ever used;
# it is instantiated directly here to make that choice explicit.
embeddings_train = WeightedL2Embedder(keyed_vectors=model_train.wv)

# One embedding per edge, looked up by the string ids of its two end nodes.
train_embeddings = [embeddings_train[str(edgs[x][0]), str(edgs[x][1])] for x in train_edges]
test_embeddings = [embeddings_train[str(edgs[x][0]), str(edgs[x][1])] for x in test_edges]

In [37]:
# (number of training edges, embedding dimension).
np.array(train_embeddings).shape

(32032, 128)

In [38]:
# Display the training edge embeddings as one 2-D array.
np.array(train_embeddings)

array([[8.79964884e-03, 1.64067835e-01, 7.30881765e-02, ...,
        1.81038718e-04, 1.96826290e-02, 6.05850630e-02],
       [3.18445265e-04, 9.41526145e-02, 2.41454929e-01, ...,
        5.79439476e-02, 9.22752500e-01, 2.49633682e-03],
       [1.09851332e-02, 4.12013801e-03, 1.61135435e-01, ...,
        1.24859456e-02, 2.32662242e-02, 1.07970215e-01],
       ...,
       [6.13867212e-03, 8.51532519e-02, 7.86146245e-07, ...,
        6.33678436e-02, 7.13208392e-02, 2.39914820e-01],
       [2.59715295e-03, 1.24797074e-03, 1.26128927e-01, ...,
        7.45704547e-02, 2.37582775e-04, 2.18050033e-01],
       [3.34992632e-02, 2.74142623e-03, 4.02400009e-02, ...,
        2.08475683e-02, 4.62760888e-02, 3.03567946e-01]], dtype=float32)

In [39]:
# Number of training labels; should match the number of training embeddings.
np.array(train_labels).shape

(32032,)

In [40]:

# Build the training table: one X_<i> column per embedding dimension.
embedding_dim = np.array(train_embeddings).shape[1]
columns = [f'X_{i}' for i in range(embedding_dim)]
df_data = pd.DataFrame(data=train_embeddings, columns=columns)

df_labels = pd.DataFrame({'label': train_labels})

# Put features and label side by side.
# NOTE: this rebinds `df`, which held the transaction data until now.
df = pd.concat([df_data, df_labels], axis=1)


In [41]:
# Display the training table (embedding features plus 'label').
df

Unnamed: 0,X_0,X_1,X_2,X_3,X_4,X_5,X_6,X_7,X_8,X_9,...,X_119,X_120,X_121,X_122,X_123,X_124,X_125,X_126,X_127,label
0,8.799649e-03,0.164068,7.308818e-02,0.007493,0.000196,0.037998,0.026417,0.030260,0.045050,0.047880,...,0.001140,0.082948,0.018060,0.041995,0.073492,0.011187,1.810387e-04,0.019683,0.060585,1
1,3.184453e-04,0.094153,2.414549e-01,0.016275,0.078201,0.015268,0.000326,0.331411,0.067376,0.061926,...,0.223335,0.000269,0.041893,0.051856,0.191570,0.000165,5.794395e-02,0.922752,0.002496,0
2,1.098513e-02,0.004120,1.611354e-01,0.548673,0.025099,0.001323,0.005411,0.005461,0.021607,0.094634,...,0.070138,0.038877,0.014588,0.003246,0.205133,0.064290,1.248595e-02,0.023266,0.107970,0
3,5.125533e-04,0.069008,2.394260e-02,0.038580,0.000345,0.058848,0.005284,0.168507,0.327026,0.002053,...,0.001018,0.037014,0.015680,0.193011,0.000812,0.001481,4.044086e-02,0.065931,0.131638,0
4,2.551593e-01,0.011703,4.802953e-02,0.020095,0.177149,0.022541,0.196618,0.041765,0.000156,0.158181,...,0.233692,0.104250,0.091681,0.002522,0.874027,0.003707,3.469664e-05,0.015489,0.005535,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32027,1.361575e+00,0.023050,1.305244e-01,0.504093,0.014287,0.312939,0.000587,0.015945,0.049380,0.332058,...,0.002336,0.000683,0.191192,0.356860,0.227310,0.000661,1.307885e-01,0.023592,0.041412,0
32028,9.765987e-07,0.037045,1.420536e-02,0.026103,0.000017,0.116524,0.065952,0.054716,0.154811,0.011176,...,0.130505,0.006745,0.039672,0.052033,0.125356,0.015103,2.027648e-08,0.016023,0.169818,1
32029,6.138672e-03,0.085153,7.861462e-07,0.099862,0.408811,0.021244,0.006153,0.115799,0.000933,0.317840,...,0.177379,0.565970,0.022457,0.043790,0.059339,0.257367,6.336784e-02,0.071321,0.239915,0
32030,2.597153e-03,0.001248,1.261289e-01,0.000002,0.001143,0.248826,0.006462,0.016343,0.015120,0.108578,...,0.000021,0.000859,0.000894,0.496311,0.065650,0.007386,7.457045e-02,0.000238,0.218050,0


In [42]:
# Training labels as an array (not referenced by the visible cells that follow).
label = np.array(train_labels)

In [43]:
# AutoGluon predictor whose target is the 'label' column.
predictr = TabularPredictor(label='label')

No path specified. Models will be saved in: "AutogluonModels/ag-20240124_110704/"


In [44]:
# Train on `df`, which at this point is the embedding table with its 'label' column.
predictr.fit(df) 

Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20240124_110704/"
AutoGluon Version:  0.8.2
Python Version:     3.8.18
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #38~22.04.1-Ubuntu SMP PREEMPT_DYNAMIC Thu Nov  2 18:01:13 UTC 2
Disk Space Avail:   623.47 GB / 982.82 GB (63.4%)
Train Data Rows:    32032
Train Data Columns: 128
Label Column: label
Preprocessing data ...
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
	2 unique label values:  [1, 0]
	If 'binary' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Selected class <--> label mapping:  class 1 = 1, class 0 = 0
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    25264.96 MB
	Train Data (Original)  M

<autogluon.tabular.predictor.predictor.TabularPredictor at 0x7fcd0b85c640>

In [46]:
# Test edge embeddings as one 2-D array.
test = np.array(test_embeddings)

In [47]:
# (number of test edges, embedding dimension).
test.shape

(8008, 128)

In [48]:
# Same X_<i> column names as the training table.
columns = [f'X_{i}' for i in range(test.shape[1])]

# Build the test feature table.
test_df = pd.DataFrame(data=test, columns=columns)

# Show the first rows; as the last expression of the cell the frame gets the
# notebook's rich table display instead of print()'s wrapped plain text.
test_df.head()

        X_0       X_1       X_2       X_3       X_4       X_5       X_6  \
0  0.413663  2.075492  0.097850  0.431519  0.026412  0.046006  0.139835   
1  0.012225  0.000547  0.024713  0.703247  0.020913  0.419119  0.352671   
2  0.195774  0.000752  0.009002  0.204248  0.070720  0.906959  0.507191   
3  0.260361  0.329479  0.229454  0.023667  0.001113  0.002806  0.034591   
4  0.427026  0.024197  0.695513  0.057896  0.270117  0.026265  0.320806   

        X_7       X_8       X_9  ...     X_118     X_119     X_120     X_121  \
0  0.014063  0.473687  0.014283  ...  0.048837  0.010120  0.005523  0.961210   
1  1.669539  0.126232  0.027797  ...  0.728475  0.378981  1.377197  0.203648   
2  0.267592  0.115088  0.409537  ...  0.295417  0.440117  0.088578  0.060138   
3  0.045546  0.623125  0.282554  ...  0.174775  0.159482  0.001048  0.035379   
4  0.029051  0.263759  0.028128  ...  0.008239  0.063498  0.810952  0.013463   

      X_122     X_123     X_124     X_125     X_126     X_127  
0  0

In [49]:
# Mean of the predicted labels on the test edges.
predictr.predict(test_df).mean()

Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x7fcd8f12cb80>
Traceback (most recent call last):
  File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/threadpoolctl.py", line 400, in match_module_callback
    self._make_module_from_path(filepath)
  File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/threadpoolctl.py", line 515, in _make_module_from_path
    module = module_class(filepath, prefix, user_api, internal_api)
  File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/threadpoolctl.py", line 606, in __init__
    self.version = self.get_version()
  File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/threadpoolctl.py", line 646, in get_version
    config = get_config().split()
AttributeError: 'NoneType' object has no attribute 'split'


0.0241008991008991

In [50]:
# True labels of the test edges.
y = np.array(test_labels)

In [54]:
# Predicted labels for the test edges.
yhat = predictr.predict(test_df)

Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x7fcd0b8c03a0>
Traceback (most recent call last):
  File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/threadpoolctl.py", line 400, in match_module_callback
    self._make_module_from_path(filepath)
  File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/threadpoolctl.py", line 515, in _make_module_from_path
    module = module_class(filepath, prefix, user_api, internal_api)
  File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/threadpoolctl.py", line 606, in __init__
    self.version = self.get_version()
  File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/threadpoolctl.py", line 646, in get_version
    config = get_config().split()
AttributeError: 'NoneType' object has no attribute 'split'


In [56]:
# sklearn
import sklearn
from sklearn import metrics
from sklearn.metrics import precision_score, recall_score, f1_score

    

In [57]:
# One-row table of accuracy, precision, recall, F1 and ROC AUC.
evaluation(y,yhat)

Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score,roc_auc_score
0,0.690684,0.264249,0.021375,0.03955,0.498058


---

### 0.3 / 0.3

In [67]:
# Reload the raw transactions first: the previous experiment rebound `df` to its
# embedding table, which has no `is_fraud` column, so `throw` would fail on a
# fresh top-to-bottom run.
df = pd.read_csv("~/Desktop/fraudTrain.csv")

# Experiment 0.3 / 0.3: downsample to a 30% fraud share, test split with 30% fraud.
df = throw(df, 0.3)
df_tr, df_tst = split_dataframe(df, 0.3)
df_, mask = concat(df_tr, df_tst)

# Transaction / card / merchant graph over all rows.
G_down = build_graph_tripartite(df_)

# 80/20 split over edge positions, paired with the per-edge fraud labels.
train_edges, test_edges, train_labels, test_labels = train_test_split(
    list(range(len(G_down.edges))),
    list(nx.get_edge_attributes(G_down, "label").values()),
    test_size=0.20,
    random_state=42,
)

# Training graph: training edges only, with the remaining nodes of G_down added
# back so both graphs share one node set.
edgs = list(G_down.edges)
train_graph = G_down.edge_subgraph([edgs[x] for x in train_edges]).copy()
train_graph.add_nodes_from(list(set(G_down.nodes) - set(train_graph.nodes)))

# Node embeddings learned on the training graph.
node2vec_train = Node2Vec(train_graph, weight_key='weight')
model_train = node2vec_train.fit(window=10)

# The original loop over four embedder classes kept only its last element
# (WeightedL2Embedder); it is instantiated directly to make the choice explicit.
embeddings_train = WeightedL2Embedder(keyed_vectors=model_train.wv)

train_embeddings = [embeddings_train[str(edgs[x][0]), str(edgs[x][1])] for x in train_edges]
test_embeddings = [embeddings_train[str(edgs[x][0]), str(edgs[x][1])] for x in test_edges]

# Training table: one X_<i> column per embedding dimension plus the label.
columns = [f'X_{i}' for i in range(np.array(train_embeddings).shape[1])]
df_data = pd.DataFrame(data=train_embeddings, columns=columns)
df_labels = pd.DataFrame(data=train_labels, columns=['label'])

# NOTE: rebinds `df` from the transaction data to the embedding table.
df = pd.concat([df_data, df_labels], axis=1)

label = np.array(train_labels)

Computing transition probabilities:   0%|          | 0/21656 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 10/10 [00:46<00:00,  4.61s/it]


In [68]:
# Fresh AutoGluon predictor for this experiment; target column is 'label'.
predictr = TabularPredictor(label='label')

No path specified. Models will be saved in: "AutogluonModels/ag-20240124_112803/"


In [69]:
# Train on the embedding table built for this experiment.
predictr.fit(df) 

Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20240124_112803/"
AutoGluon Version:  0.8.2
Python Version:     3.8.18
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #38~22.04.1-Ubuntu SMP PREEMPT_DYNAMIC Thu Nov  2 18:01:13 UTC 2
Disk Space Avail:   622.65 GB / 982.82 GB (63.4%)
Train Data Rows:    32032
Train Data Columns: 128
Label Column: label
Preprocessing data ...
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
	2 unique label values:  [0, 1]
	If 'binary' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Selected class <--> label mapping:  class 1 = 1, class 0 = 0
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    24167.01 MB
	Train Data (Original)  M

<autogluon.tabular.predictor.predictor.TabularPredictor at 0x7fcd8f0e0190>

In [70]:
# Test edge embeddings as one 2-D array.
test = np.array(test_embeddings)

In [71]:
# Same X_<i> column names as the training table.
columns = [f'X_{i}' for i in range(test.shape[1])]

# Build the test feature table.
test_df = pd.DataFrame(data=test, columns=columns)

In [72]:
# True labels of the test edges.
y = np.array(test_labels)

# Predicted labels for the test edges.
yhat = predictr.predict(test_df)

Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x7fccf01554c0>
Traceback (most recent call last):
  File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/threadpoolctl.py", line 400, in match_module_callback
    self._make_module_from_path(filepath)
  File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/threadpoolctl.py", line 515, in _make_module_from_path
    module = module_class(filepath, prefix, user_api, internal_api)
  File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/threadpoolctl.py", line 606, in __init__
    self.version = self.get_version()
  File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/threadpoolctl.py", line 646, in get_version
    config = get_config().split()
AttributeError: 'NoneType' object has no attribute 'split'


In [73]:
# One-row table of accuracy, precision, recall, F1 and ROC AUC for this experiment.
evaluation(y,yhat)

  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score,roc_auc_score
0,0.69493,0.0,0.0,0.0,0.5


---

### 0.3 / 0.4

In [75]:
# Reload the raw transactions; `df` was rebound to an embedding table by the previous experiment.
df = pd.read_csv("~/Desktop/fraudTrain.csv")

In [76]:
# Experiment 0.3 / 0.4: downsample to a 30% fraud share, test split with 40% fraud.
df = throw(df, 0.3)
df_tr, df_tst = split_dataframe(df, 0.4)
df_, mask = concat(df_tr, df_tst)

# Transaction / card / merchant graph over all rows.
G_down = build_graph_tripartite(df_)

# 80/20 split over edge positions, paired with the per-edge fraud labels.
train_edges, test_edges, train_labels, test_labels = train_test_split(
    list(range(len(G_down.edges))),
    list(nx.get_edge_attributes(G_down, "label").values()),
    test_size=0.20,
    random_state=42,
)

# Training graph: training edges only, with the remaining nodes of G_down added
# back so both graphs share one node set.
edgs = list(G_down.edges)
train_graph = G_down.edge_subgraph([edgs[x] for x in train_edges]).copy()
train_graph.add_nodes_from(list(set(G_down.nodes) - set(train_graph.nodes)))

# Node embeddings learned on the training graph.
node2vec_train = Node2Vec(train_graph, weight_key='weight')
model_train = node2vec_train.fit(window=10)

# The original loop over four embedder classes kept only its last element
# (WeightedL2Embedder); it is instantiated directly to make the choice explicit.
embeddings_train = WeightedL2Embedder(keyed_vectors=model_train.wv)

train_embeddings = [embeddings_train[str(edgs[x][0]), str(edgs[x][1])] for x in train_edges]
test_embeddings = [embeddings_train[str(edgs[x][0]), str(edgs[x][1])] for x in test_edges]

# Training table: one X_<i> column per embedding dimension plus the label.
columns = [f'X_{i}' for i in range(np.array(train_embeddings).shape[1])]
df_data = pd.DataFrame(data=train_embeddings, columns=columns)
df_labels = pd.DataFrame(data=train_labels, columns=['label'])

# NOTE: rebinds `df` from the transaction data to the embedding table.
df = pd.concat([df_data, df_labels], axis=1)
label = np.array(train_labels)

# Train AutoGluon on the edge embeddings.
predictr = TabularPredictor(label='label')
predictr.fit(df)

# Test table with the same X_<i> column names.
test = np.array(test_embeddings)
columns = [f'X_{i}' for i in range(test.shape[1])]
test_df = pd.DataFrame(data=test, columns=columns)

y = np.array(test_labels)
yhat = predictr.predict(test_df)

evaluation(y, yhat)

Computing transition probabilities:   0%|          | 0/21656 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 10/10 [00:48<00:00,  4.86s/it]
No path specified. Models will be saved in: "AutogluonModels/ag-20240124_113911/"
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20240124_113911/"
AutoGluon Version:  0.8.2
Python Version:     3.8.18
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #38~22.04.1-Ubuntu SMP PREEMPT_DYNAMIC Thu Nov  2 18:01:13 UTC 2
Disk Space Avail:   621.83 GB / 982.82 GB (63.3%)
Train Data Rows:    32032
Train Data Columns: 128
Label Column: label
Preprocessing data ...
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
	2 unique label values:  [0, 1]
	If 'binary' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Selected class <--> label mapping:  class 1 = 1, class 0 = 0
Using Featur

Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score,roc_auc_score
0,0.688686,0.350318,0.045852,0.081091,0.504741


---

### 0.4 / 0.4

In [77]:
# Reload the raw transactions; `df` was rebound to an embedding table by the previous experiment.
df = pd.read_csv("~/Desktop/fraudTrain.csv")

In [78]:
# Experiment 0.4 / 0.4: downsample to a 40% fraud share, test split with 40% fraud.
df = throw(df, 0.4)
df_tr, df_tst = split_dataframe(df, 0.4)
df_, mask = concat(df_tr, df_tst)

# Transaction / card / merchant graph over all rows.
G_down = build_graph_tripartite(df_)

# 80/20 split over edge positions, paired with the per-edge fraud labels.
train_edges, test_edges, train_labels, test_labels = train_test_split(
    list(range(len(G_down.edges))),
    list(nx.get_edge_attributes(G_down, "label").values()),
    test_size=0.20,
    random_state=42,
)

# Training graph: training edges only, with the remaining nodes of G_down added
# back so both graphs share one node set.
edgs = list(G_down.edges)
train_graph = G_down.edge_subgraph([edgs[x] for x in train_edges]).copy()
train_graph.add_nodes_from(list(set(G_down.nodes) - set(train_graph.nodes)))

# Node embeddings learned on the training graph.
node2vec_train = Node2Vec(train_graph, weight_key='weight')
model_train = node2vec_train.fit(window=10)

# The original loop over four embedder classes kept only its last element
# (WeightedL2Embedder); it is instantiated directly to make the choice explicit.
embeddings_train = WeightedL2Embedder(keyed_vectors=model_train.wv)

train_embeddings = [embeddings_train[str(edgs[x][0]), str(edgs[x][1])] for x in train_edges]
test_embeddings = [embeddings_train[str(edgs[x][0]), str(edgs[x][1])] for x in test_edges]

# Training table: one X_<i> column per embedding dimension plus the label.
columns = [f'X_{i}' for i in range(np.array(train_embeddings).shape[1])]
df_data = pd.DataFrame(data=train_embeddings, columns=columns)
df_labels = pd.DataFrame(data=train_labels, columns=['label'])

# NOTE: rebinds `df` from the transaction data to the embedding table.
df = pd.concat([df_data, df_labels], axis=1)
label = np.array(train_labels)

# Train AutoGluon on the edge embeddings.
predictr = TabularPredictor(label='label')
predictr.fit(df)

# Test table with the same X_<i> column names.
test = np.array(test_embeddings)
columns = [f'X_{i}' for i in range(test.shape[1])]
test_df = pd.DataFrame(data=test, columns=columns)

y = np.array(test_labels)
yhat = predictr.predict(test_df)

evaluation(y, yhat)

Computing transition probabilities:   0%|          | 0/16650 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 10/10 [00:32<00:00,  3.23s/it]
No path specified. Models will be saved in: "AutogluonModels/ag-20240125_002023/"
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20240125_002023/"
AutoGluon Version:  0.8.2
Python Version:     3.8.18
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #38~22.04.1-Ubuntu SMP PREEMPT_DYNAMIC Thu Nov  2 18:01:13 UTC 2
Disk Space Avail:   620.00 GB / 982.82 GB (63.1%)
Train Data Rows:    24024
Train Data Columns: 128
Label Column: label
Preprocessing data ...
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
	2 unique label values:  [1, 0]
	If 'binary' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Selected class <--> label mapping:  class 1 = 1, class 0 = 0
Using Featur

Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score,roc_auc_score
0,0.599234,0.0,0.0,0.0,0.499584


---

### 0.4 / 0.3

In [79]:
# Reload the raw transactions; `df` was rebound to an embedding table by the previous experiment.
df = pd.read_csv("~/Desktop/fraudTrain.csv")

In [80]:
# Experiment 0.4 / 0.3: downsample to a 40% fraud share, test split with 30% fraud.
df = throw(df, 0.4)
df_tr, df_tst = split_dataframe(df, 0.3)
df_, mask = concat(df_tr, df_tst)

# Transaction / card / merchant graph over all rows.
G_down = build_graph_tripartite(df_)

# 80/20 split over edge positions, paired with the per-edge fraud labels.
train_edges, test_edges, train_labels, test_labels = train_test_split(
    list(range(len(G_down.edges))),
    list(nx.get_edge_attributes(G_down, "label").values()),
    test_size=0.20,
    random_state=42,
)

# Training graph: training edges only, with the remaining nodes of G_down added
# back so both graphs share one node set.
edgs = list(G_down.edges)
train_graph = G_down.edge_subgraph([edgs[x] for x in train_edges]).copy()
train_graph.add_nodes_from(list(set(G_down.nodes) - set(train_graph.nodes)))

# Node embeddings learned on the training graph.
node2vec_train = Node2Vec(train_graph, weight_key='weight')
model_train = node2vec_train.fit(window=10)

# The original loop over four embedder classes kept only its last element
# (WeightedL2Embedder); it is instantiated directly to make the choice explicit.
embeddings_train = WeightedL2Embedder(keyed_vectors=model_train.wv)

train_embeddings = [embeddings_train[str(edgs[x][0]), str(edgs[x][1])] for x in train_edges]
test_embeddings = [embeddings_train[str(edgs[x][0]), str(edgs[x][1])] for x in test_edges]

# Training table: one X_<i> column per embedding dimension plus the label.
columns = [f'X_{i}' for i in range(np.array(train_embeddings).shape[1])]
df_data = pd.DataFrame(data=train_embeddings, columns=columns)
df_labels = pd.DataFrame(data=train_labels, columns=['label'])

# NOTE: rebinds `df` from the transaction data to the embedding table.
df = pd.concat([df_data, df_labels], axis=1)
label = np.array(train_labels)

# Train AutoGluon on the edge embeddings.
predictr = TabularPredictor(label='label')
predictr.fit(df)

# Test table with the same X_<i> column names.
test = np.array(test_embeddings)
columns = [f'X_{i}' for i in range(test.shape[1])]
test_df = pd.DataFrame(data=test, columns=columns)

y = np.array(test_labels)
yhat = predictr.predict(test_df)

evaluation(y, yhat)

Computing transition probabilities:   0%|          | 0/16650 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 10/10 [00:32<00:00,  3.28s/it]
No path specified. Models will be saved in: "AutogluonModels/ag-20240125_002828/"
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20240125_002828/"
AutoGluon Version:  0.8.2
Python Version:     3.8.18
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #38~22.04.1-Ubuntu SMP PREEMPT_DYNAMIC Thu Nov  2 18:01:13 UTC 2
Disk Space Avail:   619.33 GB / 982.82 GB (63.0%)
Train Data Rows:    24024
Train Data Columns: 128
Label Column: label
Preprocessing data ...
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
	2 unique label values:  [1, 0]
	If 'binary' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Selected class <--> label mapping:  class 1 = 1, class 0 = 0
Using Featur

Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score,roc_auc_score
0,0.594406,0.464115,0.040066,0.073764,0.504412


---

### 0.5 / 0.3

In [85]:
# Reload the raw transactions; `df` was rebound to an embedding table by the previous experiment.
df = pd.read_csv("~/Desktop/fraudTrain.csv")

In [86]:
# Experiment 0.5 / 0.3: downsample to a 50% fraud share, test split with 30% fraud.
df = throw(df, 0.5)
df_tr, df_tst = split_dataframe(df, 0.3)
df_, mask = concat(df_tr, df_tst)

# Transaction / card / merchant graph over all rows.
G_down = build_graph_tripartite(df_)

# 80/20 split over edge positions, paired with the per-edge fraud labels.
train_edges, test_edges, train_labels, test_labels = train_test_split(
    list(range(len(G_down.edges))),
    list(nx.get_edge_attributes(G_down, "label").values()),
    test_size=0.20,
    random_state=42,
)

# Training graph: training edges only, with the remaining nodes of G_down added
# back so both graphs share one node set.
edgs = list(G_down.edges)
train_graph = G_down.edge_subgraph([edgs[x] for x in train_edges]).copy()
train_graph.add_nodes_from(list(set(G_down.nodes) - set(train_graph.nodes)))

# Node embeddings learned on the training graph.
node2vec_train = Node2Vec(train_graph, weight_key='weight')
model_train = node2vec_train.fit(window=10)

# The original loop over four embedder classes kept only its last element
# (WeightedL2Embedder); it is instantiated directly to make the choice explicit.
embeddings_train = WeightedL2Embedder(keyed_vectors=model_train.wv)

train_embeddings = [embeddings_train[str(edgs[x][0]), str(edgs[x][1])] for x in train_edges]
test_embeddings = [embeddings_train[str(edgs[x][0]), str(edgs[x][1])] for x in test_edges]

# Training table: one X_<i> column per embedding dimension plus the label.
columns = [f'X_{i}' for i in range(np.array(train_embeddings).shape[1])]
df_data = pd.DataFrame(data=train_embeddings, columns=columns)
df_labels = pd.DataFrame(data=train_labels, columns=['label'])

# NOTE: rebinds `df` from the transaction data to the embedding table.
df = pd.concat([df_data, df_labels], axis=1)
label = np.array(train_labels)

# Train AutoGluon on the edge embeddings.
predictr = TabularPredictor(label='label')
predictr.fit(df)

# Test table with the same X_<i> column names.
test = np.array(test_embeddings)
columns = [f'X_{i}' for i in range(test.shape[1])]
test_df = pd.DataFrame(data=test, columns=columns)

y = np.array(test_labels)
yhat = predictr.predict(test_df)

evaluation(y, yhat)

Computing transition probabilities:   0%|          | 0/13641 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 10/10 [00:25<00:00,  2.54s/it]
No path specified. Models will be saved in: "AutogluonModels/ag-20240125_004947/"
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20240125_004947/"
AutoGluon Version:  0.8.2
Python Version:     3.8.18
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #38~22.04.1-Ubuntu SMP PREEMPT_DYNAMIC Thu Nov  2 18:01:13 UTC 2
Disk Space Avail:   617.43 GB / 982.82 GB (62.8%)
Train Data Rows:    19219
Train Data Columns: 128
Label Column: label
Preprocessing data ...
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
	2 unique label values:  [0, 1]
	If 'binary' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Selected class <--> label mapping:  class 1 = 1, class 0 = 0
Using Featur

Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score,roc_auc_score
0,0.508637,0.470588,0.00339,0.006731,0.499854


---

### 0.5 / 0.4 (downsampled fraud rate / test-set fraud rate)

In [83]:
# Load the raw transactions (fraudTrain.csv) fresh for this experiment.
df = pd.read_csv("~/Desktop/fraudTrain.csv")

In [84]:
# Downsample the non-fraud rows to a 0.5 fraud rate, then carve out a test
# split with a 0.4 fraud rate. Stage-specific names leave the raw `df`
# untouched, so this cell can be re-run without re-reading the CSV.
df_down = throw(df, 0.5)
df_tr, df_tst = split_dataframe(df_down, 0.4)
# NOTE(review): `mask` is never read below. The edges of the combined graph
# are re-split at random by train_test_split, so the df_tr/df_tst split (and
# its 0.4 test fraud rate) does not decide which edges are evaluated.
# TODO: confirm whether the edge split should follow `mask` instead.
df_, mask = concat(df_tr, df_tst)
G_down = build_graph_tripartite(df_)

# Randomly hold out 20% of the edges of G_down (referenced by their position
# in the edge list) together with their "label" attribute.
train_edges, test_edges, train_labels, test_labels = train_test_split(
    list(range(len(G_down.edges))),
    list(nx.get_edge_attributes(G_down, "label").values()),
    test_size=0.20,
    random_state=42,
)
edgs = list(G_down.edges)

# The embedding graph contains the training edges only, but every node of
# G_down is kept (isolated if necessary) -- presumably so that the endpoints
# of held-out edges still receive a vector.
train_graph = G_down.edge_subgraph([edgs[x] for x in train_edges]).copy()
train_graph.add_nodes_from(G_down.nodes)  # nodes already present are left as they are

# NOTE(review): no seed is passed to Node2Vec, so the random walks (and the
# metrics reported below) may vary from run to run.
node2vec_train = Node2Vec(train_graph, weight_key='weight')
model_train = node2vec_train.fit(window=10)

# Edge features come from the WeightedL2 embedder. This cell used to loop over
# [Hadamard, Average, WeightedL1, WeightedL2] and reassign `embeddings_train`
# on every pass, so only this last embedder ever reached the classifier.
embeddings_train = WeightedL2Embedder(keyed_vectors=model_train.wv)

# One embedding vector per edge, looked up by the str() form of both endpoints.
train_embeddings = [embeddings_train[str(edgs[x][0]), str(edgs[x][1])] for x in train_edges]
test_embeddings = [embeddings_train[str(edgs[x][0]), str(edgs[x][1])] for x in test_edges]

# Training frame: one X_i column per embedding dimension plus the label.
columns = [f'X_{i}' for i in range(np.array(train_embeddings).shape[1])]
train_df = pd.DataFrame(data=train_embeddings, columns=columns)
train_df['label'] = train_labels

predictr = TabularPredictor(label='label')
predictr.fit(train_df)

# The test frame reuses the training feature columns.
test_df = pd.DataFrame(data=np.array(test_embeddings), columns=columns)

y = np.array(test_labels)
yhat = predictr.predict(test_df)

evaluation(y, yhat)

Computing transition probabilities:   0%|          | 0/13641 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 10/10 [00:25<00:00,  2.57s/it]
No path specified. Models will be saved in: "AutogluonModels/ag-20240125_004313/"
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20240125_004313/"
AutoGluon Version:  0.8.2
Python Version:     3.8.18
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #38~22.04.1-Ubuntu SMP PREEMPT_DYNAMIC Thu Nov  2 18:01:13 UTC 2
Disk Space Avail:   617.99 GB / 982.82 GB (62.9%)
Train Data Rows:    19219
Train Data Columns: 128
Label Column: label
Preprocessing data ...
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
	2 unique label values:  [0, 1]
	If 'binary' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Selected class <--> label mapping:  class 1 = 1, class 0 = 0
Using Featur

Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score,roc_auc_score
0,0.501977,0.0,0.0,0.0,0.5


---

### 0.5 / 0.2 (downsampled fraud rate / test-set fraud rate)

In [87]:
# Load the raw transactions (fraudTrain.csv) fresh for this experiment.
df = pd.read_csv("~/Desktop/fraudTrain.csv")

In [88]:
# Downsample the non-fraud rows to a 0.5 fraud rate, then carve out a test
# split with a 0.2 fraud rate. Stage-specific names leave the raw `df`
# untouched, so this cell can be re-run without re-reading the CSV.
df_down = throw(df, 0.5)
df_tr, df_tst = split_dataframe(df_down, 0.2)
# NOTE(review): `mask` is never read below. The edges of the combined graph
# are re-split at random by train_test_split, so the df_tr/df_tst split (and
# its 0.2 test fraud rate) does not decide which edges are evaluated.
# TODO: confirm whether the edge split should follow `mask` instead.
df_, mask = concat(df_tr, df_tst)
G_down = build_graph_tripartite(df_)

# Randomly hold out 20% of the edges of G_down (referenced by their position
# in the edge list) together with their "label" attribute.
train_edges, test_edges, train_labels, test_labels = train_test_split(
    list(range(len(G_down.edges))),
    list(nx.get_edge_attributes(G_down, "label").values()),
    test_size=0.20,
    random_state=42,
)
edgs = list(G_down.edges)

# The embedding graph contains the training edges only, but every node of
# G_down is kept (isolated if necessary) -- presumably so that the endpoints
# of held-out edges still receive a vector.
train_graph = G_down.edge_subgraph([edgs[x] for x in train_edges]).copy()
train_graph.add_nodes_from(G_down.nodes)  # nodes already present are left as they are

# NOTE(review): no seed is passed to Node2Vec, so the random walks (and the
# metrics reported below) may vary from run to run.
node2vec_train = Node2Vec(train_graph, weight_key='weight')
model_train = node2vec_train.fit(window=10)

# Edge features come from the WeightedL2 embedder. This cell used to loop over
# [Hadamard, Average, WeightedL1, WeightedL2] and reassign `embeddings_train`
# on every pass, so only this last embedder ever reached the classifier.
embeddings_train = WeightedL2Embedder(keyed_vectors=model_train.wv)

# One embedding vector per edge, looked up by the str() form of both endpoints.
train_embeddings = [embeddings_train[str(edgs[x][0]), str(edgs[x][1])] for x in train_edges]
test_embeddings = [embeddings_train[str(edgs[x][0]), str(edgs[x][1])] for x in test_edges]

# Training frame: one X_i column per embedding dimension plus the label.
columns = [f'X_{i}' for i in range(np.array(train_embeddings).shape[1])]
train_df = pd.DataFrame(data=train_embeddings, columns=columns)
train_df['label'] = train_labels

predictr = TabularPredictor(label='label')
predictr.fit(train_df)

# The test frame reuses the training feature columns.
test_df = pd.DataFrame(data=np.array(test_embeddings), columns=columns)

y = np.array(test_labels)
yhat = predictr.predict(test_df)

evaluation(y, yhat)

Computing transition probabilities:   0%|          | 0/13641 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 10/10 [00:25<00:00,  2.54s/it]
No path specified. Models will be saved in: "AutogluonModels/ag-20240125_005617/"
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20240125_005617/"
AutoGluon Version:  0.8.2
Python Version:     3.8.18
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #38~22.04.1-Ubuntu SMP PREEMPT_DYNAMIC Thu Nov  2 18:01:13 UTC 2
Disk Space Avail:   616.86 GB / 982.82 GB (62.8%)
Train Data Rows:    19219
Train Data Columns: 128
Label Column: label
Preprocessing data ...
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
	2 unique label values:  [0, 1]
	If 'binary' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Selected class <--> label mapping:  class 1 = 1, class 0 = 0
Using Featur

Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score,roc_auc_score
0,0.510926,0.0,0.0,0.0,0.499796
