# **\[FRAUD\]** 데이터정리

김보람  
2023-08-10

# imports

In [78]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import networkx as nx
import sklearn
import torch

# sklearn
from sklearn import model_selection # split함수이용
from sklearn import ensemble # RF,GBM
from sklearn import metrics 

# embedding 
from node2vec import Node2Vec
from node2vec.edges import HadamardEmbedder, AverageEmbedder, WeightedL1Embedder, WeightedL2Embedder

In [79]:
def build_graph_bipartite(df_input, graph_type=None):
    """Build a card/merchant graph with one edge per (cc_num, merchant) pair.

    Args:
        df_input: DataFrame with the columns "cc_num", "merchant", "amt" and
            "is_fraud". It is copied, not modified.
        graph_type: Forwarded to ``nx.from_edgelist`` as ``create_using``.
            When omitted, a new ``nx.Graph()`` is created for this call.

    Returns:
        The graph. Nodes are integer ids assigned to the distinct card numbers
        and merchants. Each edge carries "label" (1 if the summed "is_fraud"
        of the pair is > 0, else 0) and "weight" (summed "amt" of the pair).
    """
    if graph_type is None:
        # A graph instance in the signature would be created once and then
        # cleared and refilled by every call, so earlier results would alias
        # later ones. Create a fresh graph per call instead.
        graph_type = nx.Graph()

    df = df_input.copy()
    # One integer node id per distinct card number / merchant.
    mapping = {
        x: node_id
        for node_id, x in enumerate(
            set(df["cc_num"].values.tolist() + df["merchant"].values.tolist())
        )
    }

    df["from"] = df["cc_num"].apply(lambda x: mapping[x])  # edge start (card)
    df["to"] = df["merchant"].apply(lambda x: mapping[x])  # edge end (merchant)

    # Collapse repeated transactions between the same pair into a single edge.
    df = (
        df[["from", "to", "amt", "is_fraud"]]
        .groupby(["from", "to"])
        .agg({"is_fraud": "sum", "amt": "sum"})
        .reset_index()
    )
    df["is_fraud"] = df["is_fraud"].apply(lambda x: 1 if x > 0 else 0)

    G = nx.from_edgelist(df[["from", "to"]].values, create_using=graph_type)

    # Column-wise access replaces the two row-by-row iterrows() passes.
    edge_keys = list(zip(df["from"].tolist(), df["to"].tolist()))
    # Edge attribute: whether the pair has any fraudulent transaction.
    nx.set_edge_attributes(G, dict(zip(edge_keys, df["is_fraud"].tolist())), "label")
    # Edge attribute: total transaction amount of the pair.
    nx.set_edge_attributes(G, dict(zip(edge_keys, df["amt"].tolist())), "weight")

    return G


def build_graph_tripartite(df_input, graph_type=None):
    """Build a card/transaction/merchant graph with one node per transaction.

    Every row of ``df_input`` becomes a transaction node (keyed by the row's
    index label) that is linked to its card node and to its merchant node.

    Args:
        df_input: DataFrame with the columns "cc_num", "merchant", "amt" and
            "is_fraud". It is copied, not modified.
        graph_type: Forwarded to ``nx.from_edgelist`` as ``create_using``.
            When omitted, a new ``nx.Graph()`` is created for this call.

    Returns:
        The graph. Both edges of a transaction carry "label" (the row's
        "is_fraud") and "weight" (the row's "amt").
    """
    if graph_type is None:
        # A graph instance in the signature would be created once and then
        # cleared and refilled by every call; create a fresh one per call.
        graph_type = nx.Graph()

    df = df_input.copy()
    # NOTE(review): index labels, card numbers and merchants share one id
    # space, so an index label equal to a cc_num value would collapse into
    # the same node - confirm this cannot happen for the data used.
    mapping = {
        x: node_id
        for node_id, x in enumerate(
            set(
                df.index.values.tolist()
                + df["cc_num"].values.tolist()
                + df["merchant"].values.tolist()
            )
        )
    }
    df["in_node"] = df["cc_num"].apply(lambda x: mapping[x])
    df["out_node"] = df["merchant"].apply(lambda x: mapping[x])

    # Pull the columns out once instead of walking the frame six times.
    txn_nodes = [mapping[idx] for idx in df.index]
    labels = df["is_fraud"].tolist()
    amounts = df["amt"].tolist()
    card_edges = list(zip(df["in_node"].tolist(), txn_nodes))
    merchant_edges = list(zip(df["out_node"].tolist(), txn_nodes))

    G = nx.from_edgelist(card_edges + merchant_edges, create_using=graph_type)

    # Card-side edges first, merchant-side edges second, matching the order
    # in which the attributes were previously assigned.
    edge_attrs = {}
    for edges in (card_edges, merchant_edges):
        for edge, label, amt in zip(edges, labels, amounts):
            edge_attrs[edge] = {"label": label, "weight": amt}
    nx.set_edge_attributes(G, edge_attrs)

    return G
    
    
def down_sample_textbook(df):
    """Return a class-balanced frame by undersampling the non-fraud rows.

    All rows with ``is_fraud == 1`` are kept; the same number of rows with
    ``is_fraud == 0`` is drawn without replacement using a fixed seed (42).
    The result lists the fraud rows first, followed by the sampled rows.
    """
    fraud_rows = df.loc[df["is_fraud"] == 1].copy()
    legit_rows = df.loc[df["is_fraud"] == 0].copy()
    legit_sample = sklearn.utils.resample(
        legit_rows,
        replace=False,
        n_samples=fraud_rows.shape[0],
        random_state=42,
    )
    return pd.concat([fraud_rows, legit_sample])

def embedding(Graph):
    """Turn a labelled graph into edge features ``X`` and edge labels ``y``.

    Node2Vec is fitted (``weight_key='weight'``, ``window=10``) on a copy of
    the graph; each edge is then embedded with ``AverageEmbedder`` applied to
    its two endpoints.

    Returns:
        (X, y): ``X`` is a list with one embedding per edge of ``Graph``, and
        ``y`` is a NumPy array of the edges' "label" attributes.
    """
    edge_list = list(Graph.edges)

    # Graph -> X (feature)
    # Independent copy holding every edge; nodes that have no edge are added
    # back so the copy has the same node set as Graph.
    walk_graph = Graph.edge_subgraph(edge_list).copy()
    missing_nodes = set(Graph.nodes) - set(walk_graph.nodes)
    walk_graph.add_nodes_from(list(missing_nodes))

    node_model = Node2Vec(walk_graph, weight_key='weight').fit(window=10)
    edge_vectors = AverageEmbedder(node_model.wv)
    # The embedder is indexed with the string form of the two endpoint ids.
    X = [edge_vectors[str(edge[0]), str(edge[1])] for edge in edge_list]

    # Graph -> y (label)
    edge_labels = nx.get_edge_attributes(Graph, "label")
    y = np.array(list(edge_labels.values()))
    return X, y

def anal(df):
    """Fit a random forest on edge embeddings and score it on held-out edges.

    The transactions in ``df`` are turned into a bipartite graph, every edge
    is embedded, the edges are split into train and test parts, and a random
    forest trained on the train part is scored on the test part.

    Returns:
        A one-row DataFrame with the columns "acc", "pre", "rec" and "f1".
    """
    Graph = build_graph_bipartite(df)
    # embedding() returns a single (X, y) pair, so the train/test split is
    # made here; the split proportion is scikit-learn's default.
    # NOTE(review): the embeddings are learned on the full graph before the
    # split - confirm that is acceptable for the intended evaluation.
    X, y = embedding(Graph)
    X, XX, y, yy = model_selection.train_test_split(X, y, random_state=42)

    # RandomForestClassifier is reached through the imported `ensemble`
    # module; the bare class name is not imported in this file.
    lrnr = ensemble.RandomForestClassifier(n_estimators=100, random_state=42)
    lrnr.fit(X, y)
    yyhat = lrnr.predict(XX)

    return pd.DataFrame({
        'acc': [sklearn.metrics.accuracy_score(yy, yyhat)],
        'pre': [sklearn.metrics.precision_score(yy, yyhat)],
        'rec': [sklearn.metrics.recall_score(yy, yyhat)],
        'f1': [sklearn.metrics.f1_score(yy, yyhat)],
    })

def our_sampling1(df):
    """Keep every transaction of the cards that have at least one fraud row.

    Returns the rows of ``df`` whose "cc_num" also appears on a row with
    ``is_fraud == 1``, in their original order and with their original index.
    """
    fraud_cards = set(df.loc[df["is_fraud"] == 1, "cc_num"].tolist())
    keep = df["cc_num"].isin(fraud_cards)
    return df[keep]

`-` 모든엣지를 고려하는 방법

In [19]:
# Toy example: connect every node to every node (self-pairs included).
# edge_index has shape (2, N*N); column k is the pair (k // N, k % N).
N = 10
edge_index = torch.cartesian_prod(torch.arange(N), torch.arange(N)).T
# edge_attr = the weights of the graph

In [20]:
edge_index

In [5]:
# Load the training CSV and drop its first column by position (.iloc[:, 1:]).
fraudTrain = pd.read_csv("~/Desktop/fraudTrain.csv").iloc[:,1:]

In [6]:
# Parse the timestamp column with one vectorized call instead of calling
# pd.to_datetime once per row through map(), which is far slower on a large
# frame. NOTE(review): assumes all timestamps share one format - confirm.
fraudTrain = fraudTrain.assign(
    trans_date_trans_time=pd.to_datetime(fraudTrain.trans_date_trans_time)
)
fraudTrain

`-` 시간 차이 계산하려면?

In [5]:
# Time elapsed between the rows labelled 0 and 101 (used below via total_seconds()).
diff = fraudTrain.trans_date_trans_time[101]-fraudTrain.trans_date_trans_time[0]

In [6]:
diff

In [93]:
diff.total_seconds()

`-` 적당한 theta값을 정하자.

In [55]:
# Decay scale in seconds: 86400 s (one day) times 1.2.
theta = 86400*1.2
theta

In [8]:
# Exponential-decay weight of the example gap: exp(-seconds / theta).
theta = 86400*1.2
np.exp(-diff.total_seconds()/theta)

## 시도

In [20]:
# Parse the timestamp column with one vectorized call instead of calling
# pd.to_datetime once per row through map(), which is far slower on a large
# frame. NOTE(review): assumes all timestamps share one format - confirm.
fraudTrain = fraudTrain.assign(
    trans_date_trans_time=pd.to_datetime(fraudTrain.trans_date_trans_time)
)
fraudTrain

In [25]:
# Number of rows in the full training frame.
N = len(fraudTrain)
N

-   `df02`을 이용해서 해보자.

In [7]:
# df02: a seeded 20% random sample of the non-fraud rows plus every fraud row.
_df1 = fraudTrain[fraudTrain["is_fraud"] == 0].sample(frac=0.20, random_state=42)
_df2 = fraudTrain[fraudTrain["is_fraud"] == 1]
df02 = pd.concat([_df1,_df2])
df02.shape

In [12]:
214520*214520

In [16]:
# N = len(df02)
# edge_index = torch.tensor([[i,j] for i in range(N) for j in range(N)]).T

-   `df50`

In [14]:
# df50: df02 undersampled so fraud and non-fraud rows are equally many.
df50 = down_sample_textbook(df02)
df50.shape

In [15]:
12012*12012

### 고려할 것(230810)

-   df50 의 shape이 12000개 이므로 9000개의 T, 3000개의 F를 train mask로
    만들자.

-   고객정보가 동일하면 edge를 1로, 아니면 0으로 놓고 1에대한 weight를
    만들자.

-   g(V,E,W)에서의 weight

In [16]:
# Renumber the rows from 0; without drop=True the previous index is kept as a column.
df50 = df50.reset_index()

In [17]:
# Seeded random train/test split; the split proportion is scikit-learn's default.
df50_tr,df50_test = sklearn.model_selection.train_test_split(df50, random_state=42)

In [18]:
df50_tr.is_fraud.mean().round(5), df50_test.is_fraud.mean().round(5)

### 고려할 것(230810)2

-   현재 df50의 fraud 비율은 5:5 인데, 다른 비율을 가진 데이터로도
    해보자

-   GNN으로 돌려본 것과 다른 방법들과 비교를 해보자

-   undersampling한 다른 데이터들과 비교해 볼 수 있을 듯(boost, logis,
    …)

-   9000/3000 데이터를 통해 합성 데이터를 만드는데, 12000개를 그대로
    만드는 방법, 고객별로(cc_num) 합성 데이터를 만드는 방법, 똑같은
    cc_num로 특이한 데이터가 있다면 normal데이터와 특이 데이터를
    생각해서 돌리는 방법 등을 고려하자.

In [19]:
df50_tr.shape, df50_test.shape

In [20]:
# Number of training rows; the dense all-pairs edge_index is left disabled below.
N = len(df50_tr)
#edge_index = torch.tensor([[i,j] for i in range(N) for j in range(N)]).T
#edge_index

In [21]:
# Relabel the training rows from 0 so the integer lookups col[i] used below find them.
df50_tr = df50_tr.reset_index()

In [31]:
# Signed time gap in seconds for every ordered pair (i, j) of training rows.
# Broadcasting replaces the N*N Python loop, which was the slow step of this
# notebook. Rows are [i, j, seconds(t_i - t_j)] in the same order the nested
# loop produced (i outer, j inner).
# NOTE: the result is an ndarray rather than a list of lists; the following
# np.array(edge_index_list) yields the same values either way.
_ts = (
    df50_tr['trans_date_trans_time']
    .loc[np.arange(N)]  # label-based lookup, as col[i] did
    .to_numpy(dtype='datetime64[ns]')
)
_seconds = (_ts[:, None] - _ts[None, :]) / np.timedelta64(1, 's')

edge_index_list = np.empty((N * N, 3))
edge_index_list[:, 0] = np.repeat(np.arange(N), N)
edge_index_list[:, 1] = np.tile(np.arange(N), N)
edge_index_list[:, 2] = _seconds.ravel()

In [32]:
# One row per ordered pair: [i, j, time difference].
edge_index = np.array(edge_index_list)

In [33]:
edge_index.shape

In [34]:
# Use the magnitude of the time difference, regardless of order.
edge_index[:,2] = np.abs(edge_index[:,2])

In [35]:
# Decay scale: the mean absolute time difference over all pairs.
theta = edge_index[:,2].mean()
theta

In [36]:
# Convert each gap to the weight exp(-gap / theta); a weight of exactly 1
# (a gap of zero) is replaced by 0.
_decay = np.exp(-edge_index[:, 2] / theta)
edge_index[:, 2] = np.where(_decay != 1, _decay, 0)

In [37]:
edge_index

In [159]:
# Basic slicing gives a view: eee shares its data with edge_index, it is not a copy.
eee = edge_index[:,:]

In [161]:
eee[:,1]

In [38]:
# Nested-list form of edge_index: one [i, j, weight] entry per pair.
edge_index_list_updated = edge_index.tolist()

In [39]:
edge_index_list_updated[:5]

In [53]:
df50_tr

-   cc_num로 그룹별로 묶자.

In [88]:
# Rows of one card. NOTE(review): 3.543590e+15 looks like a rounded display
# value; exact equality may match no rows - confirm the full card number.
df50_tr[df50_tr['cc_num']==3.543590e+15]

In [43]:
# Group the training rows by card number (not used by the cells shown here).
df50_grouped=df50_tr.groupby(by='cc_num')

In [22]:
# Signed time gap in seconds for every ordered pair (i, j) of training rows
# that belong to the SAME card; pairs from different cards get 0.
# Broadcasting replaces the N*N Python loop, which was the slow step of this
# notebook. Rows are [i, j, value] in the same order the nested loop produced
# (i outer, j inner).
# NOTE: the result is an ndarray rather than a list of lists; the following
# np.array(edge_index_list) yields the same values either way.
_rows = df50_tr.loc[np.arange(N)]  # label-based lookup, as col[i] did
_ts = _rows['trans_date_trans_time'].to_numpy(dtype='datetime64[ns]')
_cc = _rows['cc_num'].to_numpy()

_same_card = _cc[:, None] == _cc[None, :]
_seconds = (_ts[:, None] - _ts[None, :]) / np.timedelta64(1, 's')
_seconds = np.where(_same_card, _seconds, 0.0)

edge_index_list = np.empty((N * N, 3))
edge_index_list[:, 0] = np.repeat(np.arange(N), N)
edge_index_list[:, 1] = np.tile(np.arange(N), N)
edge_index_list[:, 2] = _seconds.ravel()


In [23]:
# One row per ordered pair: [i, j, time difference (0 for different cards)].
edge_index = np.array(edge_index_list)

In [24]:
edge_index.shape

In [25]:
edge_index

In [26]:
# Use the magnitude of the time difference, regardless of order.
edge_index[:,2] = np.abs(edge_index[:,2])

In [27]:
# Decay scale: the mean of the third column over all pairs (zero entries included).
theta = edge_index[:,2].mean()
theta

In [28]:
edge_index

In [29]:
# Convert each gap to the weight exp(-gap / theta); a weight of exactly 1
# (a gap of zero, which includes every different-card pair) is replaced by 0.
_decay = np.exp(-edge_index[:, 2] / theta)
edge_index[:, 2] = np.where(_decay != 1, _decay, 0)

In [30]:
edge_index

In [31]:
# Nested-list form of edge_index: one [i, j, weight] entry per pair.
edge_index_list_updated = edge_index.tolist()

In [32]:
np.array(edge_index_list_updated)[:,2].mean()

In [33]:
# Mean weight over all pairs; used below as the cut-off for keeping an edge.
mm = np.array(edge_index_list_updated)[:,2].mean()

`edge_index_list_updated`의 세 번째 열(`row[2]`)이 가중치 w이다.

In [34]:
# Keep only the (i, j) pairs whose weight is above the mean weight mm.
selected_edges = [(int(row[0]), int(row[1])) for row in edge_index_list_updated if row[2] > mm]

In [35]:
# Transpose the list of pairs into a 2-row long tensor (sources on row 0, targets on row 1).
edge_index_selected = torch.tensor(selected_edges, dtype=torch.long).t()

In [36]:
edge_index_selected.shape

In [59]:
edge_index

### np.save('edge_index.npy', edge_index)

-   edge_index 돌아가는 게 너무 오래걸려서 이렇게 저장해놓으면 빠르게
    실행할 수 있다.

In [60]:
# Cache the computed edge_index on disk and read it back, so the slow
# pairwise computation does not have to be repeated.
#import numpy as np

#data = np.array([1, 2, 3, 4, 5])
np.save('edge_index.npy', edge_index)

loaded_data = np.load('edge_index.npy')


-   `.npy`로 끝나는 파일은 위와 같이 저장하고, 아니면 pickle이나 torch 방법으로 저장한다.

In [37]:
# Node feature: the transaction amount of each training row.
x = df50_tr['amt']

In [38]:
x

In [39]:
# Build the tensor from the column's NumPy values. Passing the Series itself
# makes torch read it element by element through integer label lookups,
# which is slow and depends on the index being exactly 0..N-1.
a = torch.tensor(x.to_numpy(), dtype=torch.float)

In [40]:
# Shape the features as one column: (number of rows, 1).
a = a.reshape(-1,1)
a

In [41]:
# Node label: the is_fraud flag of each training row.
y = df50_tr['is_fraud']

In [42]:
# Build the tensor from the column's NumPy values. Passing the Series itself
# makes torch read it element by element through integer label lookups,
# which is slow and depends on the index being exactly 0..N-1.
b = torch.tensor(y.to_numpy(), dtype=torch.int64)

In [43]:
b

In [44]:
import torch_geometric

In [45]:
# Graph object: amount as node feature, selected pairs as edges, is_fraud as label.
# The edge weights themselves are not passed in.
data = torch_geometric.data.Data(x=a, edge_index = edge_index_selected, y=b)

In [46]:
data

`-` pyg lesson6

In [49]:
# A single graph-convolution layer taking 1 input feature to 4 output features.
gconv = torch_geometric.nn.GCNConv(1,4)
gconv

In [50]:
gconv(data.x, data.edge_index)

In [51]:
list(gconv.parameters())

In [203]:
# Unpack the layer's two parameter tensors and keep the second one.
# NOTE(review): presumably the first is the bias and W is the weight - confirm.
_,W = list(gconv.parameters())
W

`-` pyg lesson5

In [53]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv

class GCN(torch.nn.Module):
    """Two-layer graph convolutional network for node classification.

    Args:
        in_channels: Number of input features per node (default 1).
        hidden_channels: Width of the hidden layer (default 16).
        out_channels: Number of output classes (default 2).

    The defaults reproduce the previously hard-coded 1 -> 16 -> 2 layout, so
    ``GCN()`` builds the same model as before.
    """

    def __init__(self, in_channels=1, hidden_channels=16, out_channels=2):
        super().__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, out_channels)

    def forward(self, data):
        """Return per-node log-probabilities for ``data`` (log_softmax over dim 1).

        Only ``data.x`` and ``data.edge_index`` are read.
        """
        x, edge_index = data.x, data.edge_index

        x = self.conv1(x, edge_index)
        x = F.relu(x)
        # Dropout is active only while the module is in training mode.
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)

        return F.log_softmax(x, dim=1)

In [54]:
model = GCN()

In [55]:
model

In [57]:
# Adam with learning rate 0.01 and weight decay 5e-4; put the model in training mode.
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
model.train()

In [61]:
out

In [62]:
data.y

In [64]:
# 200 full-graph training steps; the loss is taken over every node in data.
for epoch in range(200):
    optimizer.zero_grad()
    out = model(data)
    # The model outputs log-probabilities, hence the negative log-likelihood loss.
    loss = F.nll_loss(out, data.y)
    loss.backward()
    optimizer.step()

In [66]:
# Accuracy over all nodes of `data`.
# NOTE: this is the graph the model was trained on; no held-out test split is
# used here, so the figure is a training accuracy.
model.eval()
pred = model(data).argmax(dim=1)
correct = (pred == data.y).sum()
# Divide by the actual number of labels instead of a hard-coded 9009, so the
# figure stays right when the size of the training set changes.
acc = int(correct) / data.y.numel()
print(f'Accuracy: {acc:.4f}')

Accuracy: 0.9633

In [73]:
# Boolean mask selecting the nodes labelled as fraud.
fraud_mask = (data.y == 1)

In [77]:
# Recall on the fraud class: share of fraud-labelled nodes predicted as fraud.
# The value is stored in `acc` even though it is a recall.
model.eval()
pred = model(data).argmax(dim=1)
correct = (pred[fraud_mask] == data.y[fraud_mask]).sum() # computed on the training graph; no test split is used here
acc = int(correct) / int(fraud_mask.sum())
print(f'recall: {acc:.4f}')

recall: 0.9619

-   위의 recall은 test가 없어서 train으로만 했던 거..!