# ref 

- [그래프 머신러닝](https://product.kyobobook.co.kr/detail/S000200738068)

- [github](https://github.com/PacktPublishing/Graph-Machine-Learning)

- [Credit Card Transactions Fraud Detection Dataset](https://www.kaggle.com/datasets/kartik2112/fraud-detection?select=fraudTrain.csv)

- [컬러 이미지](https://static.packt-cdn.com/downloads/9781800204492_ColorImages.pdf)

- [networkx](https://networkx.org/documentation/stable/reference/generated/networkx.classes.function.set_edge_attributes.html)

# 신용카드 거래에 대한 그래프 분석

- 신용카드 거래 그래프 생성

- 그래프에서 속성 및 커뮤니티 추출

- 사기 거래 분류에 지도 및 비지도 머신러닝 알고리즘 적용

In [5]:
import pandas as pd

import os
import math
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
%matplotlib inline

# Colour constants, presumably meant for graph drawing: "default" vs.
# "enhanced" (highlighted) nodes and edges.
# NOTE(review): none of these is referenced in the visible part of the notebook.
default_edge_color = 'gray'
default_node_color = '#407cc9'
enhanced_node_color = '#f5b042'
enhanced_edge_color = '#cc2f04'

# 샘플 = 0.4

In [1]:
import pandas as pd
df = pd.read_csv("fraudTrain.csv")
# Keep a reproducible 40% sample of the non-fraud rows followed by every fraud
# row.  pd.concat replaces DataFrame.append, which was deprecated in
# pandas 1.4 and removed in pandas 2.0; row order is the same as before.
df = pd.concat([
    df[df["is_fraud"] == 0].sample(frac=0.40, random_state=42),
    df[df["is_fraud"] == 1],
])
df.head()

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
669418,669418,2019-10-12 18:21,4.0891e+18,"fraud_Haley, Jewess and Bechtelar",shopping_pos,7.53,Debra,Stark,F,686 Linda Rest,...,32.3836,-94.8653,24536,Multimedia programmer,1983-10-14,d313353fa30233e5fab5468e852d22fc,1350066071,32.202008,-94.371865,0
32567,32567,2019-01-20 13:06,4247920000000.0,fraud_Turner LLC,travel,3.79,Judith,Moss,F,46297 Benjamin Plains Suite 703,...,39.537,-83.455,22305,Television floor manager,1939-03-09,88c65b4e1585934d578511e627fe3589,1327064760,39.156673,-82.930503,0
156587,156587,2019-03-24 18:09,4026220000000.0,fraud_Klein Group,entertainment,59.07,Debbie,Payne,F,204 Ashley Neck Apt. 169,...,41.5224,-71.9934,4720,Broadcast presenter,1977-05-18,3bd9ede04b5c093143d5e5292940b670,1332612553,41.657152,-72.595751,0
1020243,1020243,2020-02-25 15:12,4957920000000.0,fraud_Monahan-Morar,personal_care,25.58,Alan,Parsons,M,0547 Russell Ford Suite 574,...,39.6171,-102.4776,207,Network engineer,1955-12-04,19e16ee7a01d229e750359098365e321,1361805120,39.080346,-103.213452,0
116272,116272,2019-03-06 23:19,4178100000000000.0,fraud_Kozey-Kuhlman,personal_care,84.96,Jill,Flores,F,639 Cruz Islands,...,41.9488,-86.4913,3104,"Horticulturist, commercial",1981-03-29,a0c8641ca1f5d6e243ed5a2246e66176,1331075954,42.502065,-86.732664,0


In [2]:
# Absolute number of rows per is_fraud value after sampling.
df["is_fraud"].value_counts()

0    417028
1      6006
Name: is_fraud, dtype: int64

In [3]:
# Same counts expressed as a fraction of all rows (class balance).
df["is_fraud"].value_counts()/len(df)

0    0.985803
1    0.014197
Name: is_fraud, dtype: float64

`-` 이분그래프

In [7]:
def build_graph_bipartite(df_input, graph_type=None):
    """Build a card/merchant bipartite transaction graph.

    Every distinct ``cc_num`` and every distinct ``merchant`` becomes one
    integer-labelled node.  All transactions between the same card and the
    same merchant are collapsed into a single edge with two attributes:

    * ``label``  -- 1 if at least one collapsed transaction has
      ``is_fraud`` set, otherwise 0.
    * ``weight`` -- the summed ``amt`` of the collapsed transactions.

    Args:
        df_input: DataFrame with ``cc_num``, ``merchant``, ``amt`` and
            ``is_fraud`` columns.  It is copied, never modified.
        graph_type: NetworkX graph handed to ``nx.from_edgelist`` as
            ``create_using``.  Defaults to a fresh ``nx.Graph()`` per call.

    Returns:
        The populated graph.
    """
    # A default of ``nx.Graph()`` in the signature would be one shared
    # instance: NetworkX clears an instance passed as ``create_using``, so a
    # later call would wipe the graph returned by an earlier one.
    if graph_type is None:
        graph_type = nx.Graph()

    df = df_input.copy()

    # dict.fromkeys de-duplicates while keeping first-appearance order, so
    # node ids are reproducible between runs (set iteration order is not
    # stable for strings across interpreter sessions).
    node_keys = df["cc_num"].values.tolist() + df["merchant"].values.tolist()
    mapping = {x: node_id for node_id, x in enumerate(dict.fromkeys(node_keys))}

    df["from"] = df["cc_num"].apply(lambda x: mapping[x])   # edge source
    df["to"] = df["merchant"].apply(lambda x: mapping[x])   # edge target

    # One row per (card, merchant) pair: fraud count and total amount.
    df = (
        df[["from", "to", "amt", "is_fraud"]]
        .groupby(["from", "to"])
        .agg({"is_fraud": "sum", "amt": "sum"})
        .reset_index()
    )
    # Any fraudulent transaction marks the whole edge as fraudulent.
    df["is_fraud"] = df["is_fraud"].apply(lambda x: 1 if x > 0 else 0)

    G = nx.from_edgelist(df[["from", "to"]].values, create_using=graph_type)

    # Attach both attributes in a single pass over the columns instead of two
    # row-wise ``iterrows`` passes.
    edge_attrs = {
        (int(u), int(v)): {"label": label, "weight": amount}
        for u, v, label, amount in zip(
            df["from"].tolist(),
            df["to"].tolist(),
            df["is_fraud"].tolist(),
            df["amt"].tolist(),
        )
    }
    nx.set_edge_attributes(G, edge_attrs)

    return G

- 판매자, 고객에게 node 할당

In [8]:
# Undirected bipartite graph (cards and merchants as nodes) built from df.
G_bu = build_graph_bipartite(df, nx.Graph(name="Bipartite Undirect"))

- 무향 그래프 작성

`-` 삼분그래프

In [9]:
def build_graph_tripartite(df_input, graph_type=None):
    """Build a card/transaction/merchant tripartite graph.

    Every row of ``df_input`` (identified by its index label), every distinct
    ``cc_num`` and every distinct ``merchant`` becomes one integer-labelled
    node.  Each row contributes two edges, ``(card, transaction)`` and
    ``(merchant, transaction)``, both carrying:

    * ``label``  -- the row's ``is_fraud`` value.
    * ``weight`` -- the row's ``amt`` value.

    Args:
        df_input: DataFrame with ``cc_num``, ``merchant``, ``amt`` and
            ``is_fraud`` columns.  It is copied, never modified.
        graph_type: NetworkX graph handed to ``nx.from_edgelist`` as
            ``create_using``.  Defaults to a fresh ``nx.Graph()`` per call.

    Returns:
        The populated graph.
    """
    # A default of ``nx.Graph()`` in the signature would be one shared
    # instance: NetworkX clears an instance passed as ``create_using``, so a
    # later call would wipe the graph returned by an earlier one.
    if graph_type is None:
        graph_type = nx.Graph()

    df = df_input.copy()

    tx_keys = df.index.values.tolist()
    card_keys = df["cc_num"].values.tolist()
    merchant_keys = df["merchant"].values.tolist()

    # dict.fromkeys de-duplicates while keeping first-appearance order, so
    # node ids are reproducible between runs.
    # NOTE(review): index labels and cc_num values share this one key space;
    # a row whose index label equals a card number would map to the same node.
    mapping = {
        x: node_id
        for node_id, x in enumerate(
            dict.fromkeys(tx_keys + card_keys + merchant_keys)
        )
    }

    tx_nodes = [mapping[key] for key in tx_keys]
    df["in_node"] = [mapping[key] for key in card_keys]
    df["out_node"] = [mapping[key] for key in merchant_keys]
    in_nodes = df["in_node"].tolist()
    out_nodes = df["out_node"].tolist()

    # Card edges first, then merchant edges (same insertion order as before).
    G = nx.from_edgelist(
        list(zip(in_nodes, tx_nodes)) + list(zip(out_nodes, tx_nodes)),
        create_using=graph_type,
    )

    # Build both attributes for both edge families column-wise instead of
    # running six separate ``iterrows`` passes over the frame.
    labels = df["is_fraud"].tolist()
    amounts = df["amt"].tolist()
    edge_attrs = {}
    for endpoint_nodes in (in_nodes, out_nodes):
        for node, tx_node, label, amount in zip(
            endpoint_nodes, tx_nodes, labels, amounts
        ):
            edge_attrs[(node, tx_node)] = {"label": label, "weight": amount}
    nx.set_edge_attributes(G, edge_attrs)

    return G
    

- 판매자, 고객, 거래에 노드 할당

In [10]:
# Undirected tripartite graph (cards, merchants and transactions as nodes).
G_tu = build_graph_tripartite(df, nx.Graph())


In [11]:
# Compare graph sizes: node count of the bipartite vs. the tripartite graph.
for G in [G_bu, G_tu]:
    print(nx.number_of_nodes(G))

1636
424670


## 사기 탐지를 위한 지도 및 비지도 임베딩

- 트랜잭션 간선으로 표기

- 각 간선을 올바른 클래스(사기 또는 정상)로 분류

### 지도학습

In [23]:
from sklearn.utils import resample

# Split the sampled data by class.
df_majority = df[df.is_fraud==0]
df_minority = df[df.is_fraud==1]

# Draw as many majority rows as there are minority rows (random
# under-sampling of the majority class).
# NOTE(review): `replace` is not passed, so sklearn's default applies --
# confirm whether sampling with replacement is intended here.
df_maj_dowsampled = resample(df_majority,
                             n_samples=len(df_minority),
                             random_state=42)

# Balanced frame: all minority rows followed by the sampled majority rows.
df_downsampled = pd.concat([df_minority, df_maj_dowsampled])

print(df_downsampled.is_fraud.value_counts())
# Bipartite graph of the balanced data, built with the function's default graph_type.
G_down = build_graph_bipartite(df_downsampled)

1    6006
0    6006
Name: is_fraud, dtype: int64


- 무작위 언더샘플링 사용

- 소수 클래스(사기거래)의 샘플 수와 일치하도록 다수 클래스(정상거래)에서 하위 샘플을 가져옴

- 데이터 불균형을 처리하기 위해서

In [24]:
from sklearn.model_selection import train_test_split


# Split edge *positions* (0 .. number_of_edges-1) and their "label" attribute
# values 80/20.  The positions are later used to index list(G_down.edges).
train_edges, test_edges, train_labels, test_labels = train_test_split(list(range(len(G_down.edges))), 
                                                                      list(nx.get_edge_attributes(G_down, "label").values()), 
                                                                      test_size=0.20, 
                                                                      random_state=42)

In [25]:
# Materialise the edge list once so split positions can be turned into edges.
edgs = list(G_down.edges)
# Training graph: only the training edges, copied so it can be modified.
train_graph = G_down.edge_subgraph([edgs[x] for x in train_edges]).copy()
# Re-add nodes that lost all their edges to the test split, so every node of
# G_down is still present in the training graph.
train_graph.add_nodes_from(list(set(G_down.nodes) - set(train_graph.nodes)))

- 데이터를 8:2 비율로 학습 세트와 검증 세트로 분할

In [53]:
pip install node2vec

Collecting node2vec
  Downloading node2vec-0.4.6-py3-none-any.whl (7.0 kB)
Collecting gensim<5.0.0,>=4.1.2
  Downloading gensim-4.3.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.5/26.5 MB[0m [31m71.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting tqdm<5.0.0,>=4.55.1
  Downloading tqdm-4.65.0-py3-none-any.whl (77 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.1/77.1 kB[0m [31m18.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting networkx<3.0,>=2.5
  Downloading networkx-2.8.8-py3-none-any.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m90.4 MB/s[0m eta [36m0:00:00[0m
Collecting smart-open>=1.8.1
  Downloading smart_open-6.3.0-py3-none-any.whl (56 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.8/56.8 kB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages

In [26]:
from node2vec import Node2Vec
from node2vec.edges import HadamardEmbedder, AverageEmbedder, WeightedL1Embedder, WeightedL2Embedder

# Node2Vec on the training graph only, using the edge attribute "weight"
# (summed transaction amount) as the walk weight.
node2vec_train = Node2Vec(train_graph, weight_key='weight')
# Fit the embedding model with a context window of 10.
model_train = node2vec_train.fit(window=10)

Computing transition probabilities:   0%|          | 0/1630 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 10/10 [00:03<00:00,  2.62it/s]


- Node2Vec 알고리즘 사용해 특징 공간 구축

In [27]:
from sklearn.ensemble import RandomForestClassifier 
from sklearn import metrics 

# Edge-embedding operators to compare; each combines two node vectors into one edge vector.
classes = [HadamardEmbedder, AverageEmbedder, WeightedL1Embedder, WeightedL2Embedder]
for cl in classes:
    embeddings_train = cl(keyed_vectors=model_train.wv) 
    # Project each edge into the vector space.

    # Node vectors are looked up by the string form of the node ids.  Both the
    # training and the test edges are embedded with the model fitted on
    # train_graph.
    train_embeddings = [embeddings_train[str(edgs[x][0]), str(edgs[x][1])] for x in train_edges]
    test_embeddings = [embeddings_train[str(edgs[x][0]), str(edgs[x][1])] for x in test_edges]
    
    # Fit a random forest on the training edge embeddings.
    rf = RandomForestClassifier(n_estimators=1000, random_state=42) 
    rf.fit(train_embeddings, train_labels); 

    # Evaluate on the held-out edges.
    y_pred = rf.predict(test_embeddings)
    print(cl)
    print('Precision:', metrics.precision_score(test_labels, y_pred)) 
    print('Recall:', metrics.recall_score(test_labels, y_pred)) 
    print('F1-Score:', metrics.f1_score(test_labels, y_pred)) 

<class 'node2vec.edges.HadamardEmbedder'>
Precision: 0.7451737451737451
Recall: 0.16538131962296487
F1-Score: 0.270687237026648
<class 'node2vec.edges.AverageEmbedder'>
Precision: 0.691699604743083
Recall: 0.7497857754927164
F1-Score: 0.7195723684210527
<class 'node2vec.edges.WeightedL1Embedder'>
Precision: 0.7142857142857143
Recall: 0.029991431019708654
F1-Score: 0.0575657894736842
<class 'node2vec.edges.WeightedL2Embedder'>
Precision: 0.64
Recall: 0.027420736932305057
F1-Score: 0.052588331963845526


- Node2Vec 알고리즘 사용해 각 Edge2Vec 알고리즘으로 특징 공간 생성

- sklearn 파이썬 라이브러리의 RandomForestClassifier은 이전 단계에서 생성한 특징에 대해 학습

- 검증 테스트 위해 정밀도, 재현율, F1-score 성능 지표 측정

### 비지도학습

- k-means 알고리즘 사용

- 지도학습과의 차이점은 특징 공간을 학습-검증으로 분할하지 않는다는 점이다.

In [28]:
# Node2Vec on the whole down-sampled graph (no train/test split), weighted by "weight".
nod2vec_unsup = Node2Vec(G_down, weight_key='weight')
# Fit the embedding model with a context window of 10.
unsup_vals = nod2vec_unsup.fit(window=10)

Computing transition probabilities:   0%|          | 0/1630 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 10/10 [00:04<00:00,  2.32it/s]


- 다운샘플링된 전체 그래프에 대해 Node2Vec 알고리즘을 계산

In [29]:
from sklearn.cluster import KMeans

# Edge-embedding operators to compare.
classes = [HadamardEmbedder, AverageEmbedder, WeightedL1Embedder, WeightedL2Embedder]
# Ground-truth "label" attribute of every edge of G_down.
true_labels = [x for x in nx.get_edge_attributes(G_down, "label").values()]

for cl in classes:
    embedding_edge = cl(keyed_vectors=unsup_vals.wv) 

    # Embed every edge of G_down; node vectors are looked up by str(node id).
    embedding = [embedding_edge[str(x[0]), str(x[1])] for x in G_down.edges()]
    # Two clusters, to be compared against the two label values.
    kmeans = KMeans(2, random_state=42).fit(embedding)
    
    
    # NOTE(review): this value is printed as 'NMI' below but is computed with
    # adjusted_mutual_info_score (AMI) -- confirm which metric is intended.
    nmi = metrics.adjusted_mutual_info_score(true_labels, kmeans.labels_)
    ho = metrics.homogeneity_score(true_labels, kmeans.labels_)
    co = metrics.completeness_score(true_labels, kmeans.labels_)
    vmeasure = metrics.v_measure_score(true_labels, kmeans.labels_)
    
    print(cl)
    print('NMI:', nmi)
    print('Homogeneity:', ho)
    print('Completeness:', co)
    print('V-Measure:', vmeasure)



<class 'node2vec.edges.HadamardEmbedder'>
NMI: 0.04336246478827236
Homogeneity: 0.0383178531539123
Completeness: 0.05011351404941123
V-Measure: 0.043428985282038965




<class 'node2vec.edges.AverageEmbedder'>
NMI: 0.11206093720015026
Homogeneity: 0.10817496918905492
Completeness: 0.11635805522328385
V-Measure: 0.11211739628609738




<class 'node2vec.edges.WeightedL1Embedder'>
NMI: 0.16558117117175825
Homogeneity: 0.16557714823761863
Completeness: 0.16568764408717976
V-Measure: 0.16563237773404058




<class 'node2vec.edges.WeightedL2Embedder'>
NMI: 0.1349652677966787
Homogeneity: 0.1337881599748603
Completeness: 0.1362723387302234
V-Measure: 0.13501882386803338


`-` NMI(Normalized Mutual Information)

- 두 개의 군집 결과 비교

- 0~1이며 1에 가까울수록 높은 성능

`-` Homogeneity

- 각 예측 군집이 하나의 실제 클래스에 속한 샘플들로만 구성되어 있는 정도

- 1에 가까울수록 높은 성능

`-` Completeness

- 같은 실제 클래스에 속한 샘플들이 군집화 결과에서 모두 같은 군집에 할당되는 정도

- 0~1이며 1에 가까울수록 높은 성능

`-` V-measure

- Homogeneity와 Completeness의 조화 평균

- 0~1이며 1에 가까울수록 높은 성능

- 비지도 학습에 이상치 탐지 방법

- k-means/LOF/One-class SVM 등이 있다.. 한번 같이 해보자.

- 조금씩 다 커졌넹..

`-` 지도학습에서 정상거래에서 다운샘플링을 했는데

만약, 사기거래에서 업샘플링을 하게되면 어떻게 될까?