# **\[FRAUD\]** 데이터정리 시도(df50다른것들 시드고정)

김보람  
2023-09-18

# imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import networkx as nx
import sklearn
import xgboost as xgb

# sklearn
from sklearn import model_selection # split함수이용
from sklearn import ensemble # RF,GBM
from sklearn import metrics
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

# gnn
import torch
import torch.nn.functional as F
import torch_geometric
from torch_geometric.nn import GCNConv





In [2]:
def down_sample_textbook(df, random_state=42):
    """Balance the two classes by down-sampling the non-fraud rows.

    Every row with ``is_fraud == 1`` is kept, and the same number of rows
    with ``is_fraud == 0`` is drawn without replacement.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``is_fraud`` column compared against 0 and 1.
    random_state : int, optional
        Seed forwarded to ``sklearn.utils.resample``. Defaults to 42, the
        value that used to be hard-coded, so existing calls are unchanged.

    Returns
    -------
    pandas.DataFrame
        The fraud rows followed by the sampled non-fraud rows (the frames
        are concatenated without resetting the index).
    """
    df_majority = df[df.is_fraud == 0].copy()
    df_minority = df[df.is_fraud == 1].copy()
    df_majority_downsampled = sklearn.utils.resample(
        df_majority,
        n_samples=len(df_minority),
        replace=False,
        random_state=random_state,
    )
    return pd.concat([df_minority, df_majority_downsampled])

def compute_time_difference(group):
    """Return the absolute time gap for every ordered pair of rows in *group*.

    Parameters
    ----------
    group : pandas.DataFrame
        Frame with a ``trans_date_trans_time`` column whose elements expose
        ``.value`` (pandas ``Timestamp`` objects).

    Returns
    -------
    list[list]
        One ``[label_i, label_j, abs(value_i - value_j)]`` entry per ordered
        pair ``(i, j)``, self-pairs included, in row-major order. Labels are
        the index labels of the rows. An empty frame yields ``[]``.
    """
    n = len(group)
    stamps = group.trans_date_trans_time
    # Read each row's label and timestamp once; the pair loop below is O(n^2)
    # and previously re-materialised whole rows with `group.iloc[...]` four
    # times per pair.
    labels = [group.index[i] for i in range(n)]
    values = [stamps.iloc[i].value for i in range(n)]
    return [
        [labels[i], labels[j], abs(values[i] - values[j])]
        for i in range(n)
        for j in range(n)
    ]


class GCN(torch.nn.Module):
    """Two-layer graph convolutional classifier.

    Parameters
    ----------
    in_channels : int, optional
        Number of input features per node (default 1).
    hidden_channels : int, optional
        Width of the hidden layer (default 16).
    num_classes : int, optional
        Number of output classes (default 2).
    dropout : float, optional
        Dropout probability applied after the first layer while training
        (default 0.5, the default of ``F.dropout``).

    The defaults reproduce the previously hard-coded architecture, so
    ``GCN()`` builds the same model as before.
    """

    def __init__(self, in_channels=1, hidden_channels=16, num_classes=2, dropout=0.5):
        super().__init__()
        self.dropout = dropout
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, num_classes)

    def forward(self, data):
        """Return per-node log-probabilities for ``data.x`` / ``data.edge_index``."""
        x, edge_index = data.x, data.edge_index

        x = self.conv1(x, edge_index)
        x = F.relu(x)
        # Dropout is active only in training mode (`self.training`).
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = self.conv2(x, edge_index)

        # log_softmax over the class dimension pairs with F.nll_loss.
        return F.log_softmax(x, dim=1)

In [3]:
# Load the raw transactions, then drop the first column by position.
# NOTE(review): presumably that column is a saved row index - confirm.
fraudTrain = pd.read_csv("~/Desktop/fraudTrain.csv")
fraudTrain = fraudTrain.iloc[:, 1:]

In [4]:
# Parse the timestamp column in one vectorised call instead of calling
# pd.to_datetime once per row through map/lambda.
# NOTE(review): assumes every value shares one timestamp format - confirm.
fraudTrain = fraudTrain.assign(
    trans_date_trans_time=pd.to_datetime(fraudTrain.trans_date_trans_time)
)
fraudTrain

## 데이터정리

In [5]:
# Keep a seeded 20% sample of the non-fraud rows plus every fraud row.
_is_legit = fraudTrain["is_fraud"] == 0
_is_fraud = fraudTrain["is_fraud"] == 1
_df1 = fraudTrain[_is_legit].sample(frac=0.20, random_state=42)
_df2 = fraudTrain[_is_fraud]
df02 = pd.concat([_df1, _df2])
df02.shape

In [6]:
# Balance the classes, then replace the index with 0..N-1 positions
# (reset_index keeps the old labels as a column).
df50 = down_sample_textbook(df02).reset_index()
df50.shape

------------------------------------------------------------------------

### tr/test

In [7]:
# Seeded train/test split of the balanced frame (library-default proportions).
_split = sklearn.model_selection.train_test_split(df50, random_state=42)
df50_tr, df50_test = _split
df50_tr.shape, df50_test.shape

In [8]:
# Boolean masks of length N: position i is True when label i appears in the
# index of the train (resp. test) split.
N = len(df50)
_train_labels = df50_tr.index
_test_labels = df50_test.index
train_mask = np.array([pos in _train_labels for pos in range(N)])
test_mask = np.array([pos in _test_labels for pos in range(N)])
train_mask.sum(), test_mask.sum()

In [9]:
# Display both mask shapes.
train_mask.shape, test_mask.shape

------------------------------------------------------------------------

### edge_index 설정

In [10]:
# Kept for provenance (disabled): these lines group df50 by `cc_num`, build the
# pairwise [row_i, row_j, time_difference] table within each group, and save it
# to 'edge_index_list_plus50.npy'.
# NOTE(review): presumably disabled because the pairwise step is slow - confirm.
# groups = df50.groupby('cc_num')
# edge_index_list_plus = [compute_time_difference(group) for _, group in groups]
# edge_index_list_plus_flat = [item for sublist in edge_index_list_plus for item in sublist]
# edge_index_list_plus_nparr = np.array(edge_index_list_plus_flat)
# np.save('edge_index_list_plus50.npy', edge_index_list_plus_nparr)

In [11]:
# Load the saved edge table from disk and show its shape.
# NOTE(review): presumably rows are [row_i, row_j, time_difference] - confirm.
edge_index = np.load('edge_index_list_plus50.npy')
edge_index.shape

In [12]:
# Load the raw table first so this cell is self-contained and can be re-run:
# it ends by turning `edge_index` into a list, which cannot be sliced with
# `[:, 2]` on a second run.
edge_index = np.load('edge_index_list_plus50.npy').astype(np.float64)
# theta: mean of the raw third column, used as the decay scale.
theta = edge_index[:,2].mean()
# Exponential-decay weight, evaluated once; weights exactly equal to 1
# (zero time difference) are zeroed out.
_weight = np.exp(-edge_index[:,2]/theta)
edge_index[:,2] = (_weight != 1)*_weight
# Mean weight, taken before the list conversion so no array is rebuilt.
mean_ = edge_index[:,2].mean()
edge_index = edge_index.tolist()
mean_

In [13]:
# Preview the first five rows of the edge table.
edge_index[:5]

In [14]:
# Keep only edges whose weight (third entry) exceeds the mean weight, then
# build an integer tensor with one column per kept edge.
_heavy_rows = (row for row in edge_index if row[2] > mean_)
selected_edges = [(int(row[0]), int(row[1])) for row in _heavy_rows]
edge_index_selected = torch.tensor(selected_edges, dtype=torch.long).t()
edge_index_selected.shape

------------------------------------------------------------------------

### data설정(x, edge_index, y)

In [15]:
# Node features: the transaction amount as a single float column.
x = torch.tensor(df50['amt'], dtype=torch.float).reshape(-1,1)
# Node labels: the fraud flag.
y = torch.tensor(df50['is_fraud'],dtype=torch.int64)
# Store the masks as torch bool tensors rather than NumPy arrays so every
# attribute of `data` is a tensor; indexing with them selects the same rows.
data = torch_geometric.data.Data(
    x=x,
    edge_index=edge_index_selected,
    y=y,
    train_mask=torch.tensor(train_mask, dtype=torch.bool),
    test_mask=torch.tensor(test_mask, dtype=torch.bool),
)
data

------------------------------------------------------------------------

# 정리

| 구분  | Train   | Test      | 모형          | 설명변수 | 비고 |
|-------|---------|-----------|---------------|----------|------|
| 분석1 | df50_tr | df50_test | GNN           | amt      |      |
| 분석2 | df50_tr | df50_test | 로지스틱 회귀 | amt      |      |
| 분석3 | df50_tr | df50_test | SVM           | amt      |      |
| 분석4 | df50_tr | df50_test | 랜덤포레스트  | amt      |      |
| 분석5 | df50_tr | df50_test | 부스팅        | amt      |      |
| 분석6 | df50_tr | df50_test | Naive Bayes   | amt      |      |

In [46]:
# Summary table: stack the one-row result frames of the six analyses.
# This cell is numbered In [46], i.e. it needs the _results* frames that the
# analysis sections further down create.
lst = [
    _results1, _results2, _results3,
    _results4, _results5, _results6,
]
pd.concat(lst)

------------------------------------------------------------------------

## 분석1(GNN)

In [16]:
# Fix the torch RNG seed before the model is created and trained.
# NOTE(review): the seed has nine digits; if a date was intended, confirm.
torch.manual_seed(202250926)

In [17]:
# NumPy views of the train/test features and labels.
# `y` is rebound here; the full label tensor remains available as `data.y`.
_tr, _te = data.train_mask, data.test_mask
X, XX = data.x[_tr].numpy(), data.x[_te].numpy()
y, yy = data.y[_tr].numpy(), data.y[_te].numpy()

In [18]:
# Full-graph training: 400 Adam steps on the NLL loss over the training nodes.
model = GCN()
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.01, weight_decay=5e-4)
# The training targets do not change between steps, so select them once.
_train_target = data.y[data.train_mask]
model.train()
for epoch in range(400):
    optimizer.zero_grad()
    out = model(data)
    loss = F.nll_loss(out[data.train_mask], _train_target)
    loss.backward()
    optimizer.step()
# Switch to evaluation mode so dropout is disabled for prediction.
model.eval()

In [19]:
# Inference only: skip autograd bookkeeping for the forward pass.
with torch.no_grad():
    pred = model(data).argmax(dim=1)
# Predicted class for the test nodes.
yyhat = pred[data.test_mask]

In [20]:
# Scoring functions applied to (yy, yyhat); their __name__ becomes the column name.
# NOTE: this rebinds `metrics`, hiding the module imported by
# `from sklearn import metrics`; the functions are therefore referenced
# through `sklearn.metrics` or their directly imported names.
metrics = [
    sklearn.metrics.accuracy_score,
    precision_score,
    recall_score,
    f1_score,
]

In [21]:
# One-row score table for analysis 1.
# Builtin round() is used instead of the `.round` method so the cell works
# whether a metric returns a NumPy scalar or a plain Python float.
_results1 = pd.DataFrame(
    {m.__name__: [round(m(yy, yyhat), 6)] for m in metrics},
    index=['분석1'],
)
_results1

------------------------------------------------------------------------

## 분석2(로지스틱 회귀)

In [22]:
# Single-feature design matrices (amt) and label vectors for train/test.
X, XX = np.array(df50_tr[['amt']]), np.array(df50_test[['amt']])
y, yy = np.array(df50_tr['is_fraud']), np.array(df50_test['is_fraud'])

In [23]:
# Import the submodule explicitly: none of the notebook's import lines loads
# `sklearn.linear_model`, so attribute access on the bare `sklearn` package
# would only work if another import happened to load it.
import sklearn.linear_model

lrnr = sklearn.linear_model.LogisticRegression()

In [24]:
# Fit the logistic regression on the training amounts and labels.
lrnr.fit(X,y)

In [25]:
# Disabled alternative: threshold the predicted probability at the mean of
# the training labels instead of using predict().
#thresh = y.mean()
#yyhat = (lrnr.predict_proba(XX)> thresh)[:,-1]
# Class predictions for the test rows.
yyhat = lrnr.predict(XX) 

In [26]:
# Display the predicted labels.
yyhat

In [27]:
# Scoring functions applied to (yy, yyhat); their __name__ becomes the column name.
# NOTE: this rebinds `metrics`, hiding the module imported by
# `from sklearn import metrics`.
metrics = [
    sklearn.metrics.accuracy_score,
    precision_score,
    recall_score,
    f1_score,
]

In [28]:
# One-row score table for analysis 2.
# Builtin round() is used instead of the `.round` method so the cell works
# whether a metric returns a NumPy scalar or a plain Python float.
_results2 = pd.DataFrame(
    {m.__name__: [round(m(yy, yyhat), 6)] for m in metrics},
    index=['분석2'],
)
_results2

## 분석3(서포트 벡터 머신)

In [29]:
# Single-feature design matrices (amt) and label vectors for train/test.
X, XX = np.array(df50_tr[['amt']]), np.array(df50_test[['amt']])
y, yy = np.array(df50_tr['is_fraud']), np.array(df50_test['is_fraud'])

In [30]:
# Linear-kernel SVM: fit() returns the estimator, so build and fit in one step.
lrnr = SVC(kernel='linear').fit(X, y)
yyhat = lrnr.predict(XX)


In [31]:
# Scoring functions applied to (yy, yyhat); their __name__ becomes the column name.
# NOTE: this rebinds `metrics`, hiding the module imported by
# `from sklearn import metrics`.
metrics = [
    sklearn.metrics.accuracy_score,
    precision_score,
    recall_score,
    f1_score,
]

In [32]:
# One-row score table for analysis 3.
# Builtin round() is used instead of the `.round` method so the cell works
# whether a metric returns a NumPy scalar or a plain Python float.
_results3 = pd.DataFrame(
    {m.__name__: [round(m(yy, yyhat), 6)] for m in metrics},
    index=['분석3'],
)
_results3

## 분석4(랜덤 포레스트)

In [33]:
# Single-feature design matrices (amt) and label vectors for train/test.
X, XX = np.array(df50_tr[['amt']]), np.array(df50_test[['amt']])
y, yy = np.array(df50_tr['is_fraud']), np.array(df50_test['is_fraud'])

In [34]:
# Seed the forest so repeated runs give the same scores; without a
# random_state the fitted trees differ from run to run.
lrnr = RandomForestClassifier(random_state=42)
lrnr.fit(X, y)
yyhat = lrnr.predict(XX)

In [35]:
# Scoring functions applied to (yy, yyhat); their __name__ becomes the column name.
# NOTE: this rebinds `metrics`, hiding the module imported by
# `from sklearn import metrics`.
metrics = [
    sklearn.metrics.accuracy_score,
    precision_score,
    recall_score,
    f1_score,
]

In [36]:
# One-row score table for analysis 4.
# Builtin round() is used instead of the `.round` method so the cell works
# whether a metric returns a NumPy scalar or a plain Python float.
_results4 = pd.DataFrame(
    {m.__name__: [round(m(yy, yyhat), 6)] for m in metrics},
    index=['분석4'],
)
_results4

## 분석5(부스팅)

In [37]:
# Single-feature design matrices (amt) and label vectors for train/test.
X, XX = np.array(df50_tr[['amt']]), np.array(df50_test[['amt']])
y, yy = np.array(df50_tr['is_fraud']), np.array(df50_test['is_fraud'])

In [38]:
# Gradient-boosted trees with library defaults; fit() returns the estimator.
lrnr = xgb.XGBClassifier().fit(X, y)
yyhat = lrnr.predict(XX)

In [39]:
# Scoring functions applied to (yy, yyhat); their __name__ becomes the column name.
# NOTE: this rebinds `metrics`, hiding the module imported by
# `from sklearn import metrics`.
metrics = [
    sklearn.metrics.accuracy_score,
    precision_score,
    recall_score,
    f1_score,
]

In [40]:
# One-row score table for analysis 5.
# Builtin round() is used instead of the `.round` method so the cell works
# whether a metric returns a NumPy scalar or a plain Python float.
_results5 = pd.DataFrame(
    {m.__name__: [round(m(yy, yyhat), 6)] for m in metrics},
    index=['분석5'],
)
_results5

## 분석6(Naive Bayes)

In [41]:
# Single-feature design matrices (amt) and label vectors for train/test.
X, XX = np.array(df50_tr[['amt']]), np.array(df50_test[['amt']])
y, yy = np.array(df50_tr['is_fraud']), np.array(df50_test['is_fraud'])

In [42]:
# Gaussian naive Bayes with library defaults; fit() returns the estimator.
lrnr = GaussianNB().fit(X, y)
yyhat = lrnr.predict(XX)

In [43]:
# Scoring functions applied to (yy, yyhat); their __name__ becomes the column name.
# NOTE: this rebinds `metrics`, hiding the module imported by
# `from sklearn import metrics`.
metrics = [
    sklearn.metrics.accuracy_score,
    precision_score,
    recall_score,
    f1_score,
]

In [44]:
# One-row score table for analysis 6.
# Builtin round() is used instead of the `.round` method so the cell works
# whether a metric returns a NumPy scalar or a plain Python float.
_results6 = pd.DataFrame(
    {m.__name__: [round(m(yy, yyhat), 6)] for m in metrics},
    index=['분석6'],
)
_results6