## Trade 

* trade 데이터를 이용하여 거래 네트워크 생성

* 거래에 참여한 유저 즉, 네트워크의 node의 수는 약 12만명이다

In [1]:
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
import plotly.graph_objects as go
import matplotlib.pyplot as plt

import networkx as nx
from sklearn.preprocessing import LabelEncoder

In [2]:
trade = pd.read_csv('./train_trade.csv')

In [3]:
le = LabelEncoder()
trade.source_acc_id = le.fit_transform(trade['source_acc_id'])
trade.target_acc_id = le.fit_transform(trade['target_acc_id'])

In [9]:
trade.item_type.value_counts() / trade.shape[0]

money        0.538428
grocery      0.461038
costume      0.000202
accessory    0.000173
gem          0.000105
weapon       0.000053
Name: item_type, dtype: float64

## Filter normal trade

* 재화간의 거래에 해당되는 normal trade(exchange) 를 제거하려고 하였다.

같은 trade week 의 같은 시간에 거래된 사건들을 제외하려고 필터링하였으나 주어진 데이터는 모두 one-way trade였다.

In [73]:
tr = trade.copy()

In [74]:
tr = tr.drop(['item_type','item_amount'], axis=1)
tr = tr.drop_duplicates()

In [75]:
tr['trade_week'] = tr['trade_week'].astype('str')
tr['trade_day'] = tr['trade_day'].astype('str')
tr['time'] = tr['trade_week'] + tr['trade_day'] + tr['trade_time']

tr['cmt'] = (tr['source_acc_id'] + tr['target_acc_id']).astype('str')
tr['cm']  = tr['time'] + tr['cmt']

gr = tr.groupby('cm')[['cmt']].count()

In [78]:
gr.value_counts()

cmt
1      3203993
2            8
dtype: int64

In [5]:
print(f'unique source node nums : {trade.source_acc_id.nunique()}')
print(f'unique target node nums : {trade.target_acc_id.nunique()}')

unique source node nums : 128812
unique target node nums : 71661


* item_type 을 money와 grocery 로 제한

In [104]:
trade = trade[(trade.item_type == 'grocery')|(trade.item_type == 'money')]

In [275]:
trade.to_csv('./p_trade.csv')

## Trade Network 생성

In [256]:
## node label 정보
def make_node(data):
    node1 = list(set(data.source_acc_id))
    node2 = list(set(data.target_acc_id))
    node_list = list(set(node1+node2))
    
    df = pd.DataFrame({'node_name' : node_list})  
    df['k'] = df['node_name'].apply(lambda x : node_k.get(x))
    return df

df = make_node(trade)

In [258]:
## node label
node_dict = df.set_index('node_name').T.to_dict()

In [261]:
## edge
G_imp = nx.from_pandas_edgelist(trade,'source_acc_id','target_acc_id', create_using=nx.DiGraph(), edge_attr= 'item_amount')
nx.set_node_attributes(G_imp, node_dict)

In [262]:
##save gexf file 
nx.write_gexf(G_imp, "./trade_net2.gexf")

## Network features

In [119]:
net = pd.read_csv('./net_features.csv')
print(net.shape)

(128777, 18)


##### 논문에서 Network structure 계산하는데 사용한 피쳐들

* degree centrality 의 평균 및 분산
* betweenness centrality 의 평균 및 분산
* degree assortativity
* clustering coefficient
* radius
* community size

In [148]:
#community size
community_size = list(net.groupby(['modularity_class'])['modularity_class'].count())

#degree centrality mean
degree_mean = list(net.groupby(['modularity_class'])['Degree'].mean())

#degree centrality std
degree_std = list(net.groupby(['modularity_class'])['Degree'].std())

#betweeness centrality_mean
bet_mean = list(net.groupby(['modularity_class'])['betweenesscentrality'].mean())

#betweeness centrality_std
bet_std = list(net.groupby(['modularity_class'])['betweenesscentrality'].std())

#clustering coefficient
cluster_coef = list(net.groupby(['modularity_class'])['clustering'].mean())

In [263]:
net

Unnamed: 0,Id,Label,timeset,indegree,outdegree,Degree,clustering,eigencentrality,Eccentricity,closnesscentrality,harmonicclosnesscentrality,betweenesscentrality,modularity_class,componentnumber,strongcompnum,weighted indegree,weighted outdegree,Weighted Degree,k
0,31156,31156,,3,3,6,0.0,0.000200,6,0.306452,0.413158,6.225601e+01,14,0,0,3,3,6,4
1,42315,42315,,239,1,240,0.0,0.016104,9,0.229508,0.321117,1.349609e+04,14,0,0,239,1,240,4
2,35392,35392,,255,1,256,0.0,0.017162,5,0.333333,0.456667,5.523348e+03,14,0,0,255,1,256,4
3,54071,54071,,7,1,8,0.0,0.000461,20,0.086965,0.090116,4.359875e+02,18,0,0,7,1,8,0
4,53441,53441,,341,1,342,0.0,0.022799,19,0.095245,0.099498,2.904267e+07,30,0,0,341,1,342,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
128772,126005,126005,,0,1,1,0.0,0.000000,22,0.079154,0.081865,0.000000e+00,6,0,0,0,1,1,2
128773,87414,87414,,0,1,1,0.0,0.000000,20,0.098622,0.103964,0.000000e+00,6,0,0,0,1,1,2
128774,108074,108074,,0,1,1,0.0,0.000000,4,0.400000,0.520833,0.000000e+00,13,0,0,0,1,1,0
128775,94126,94126,,0,1,1,0.0,0.000000,5,0.333333,0.456667,0.000000e+00,6,0,0,0,1,1,2


In [210]:
df = pd.DataFrame({'community_size' : community_size, 'degree_mean': degree_mean,'degree_std':degree_std,'betweenness_mean':bet_mean,
                   'betweenness_std':bet_std, 'clustering_coefficient':cluster_coef})
df.head()

Unnamed: 0,community_size,degree_mean,degree_std,betweenness_mean,betweenness_std,clustering_coefficient
0,12650,5.109486,150.351595,49461.583156,3801407.0,0.000392
1,5193,3.575005,103.894788,3959.037541,247122.7,0.0
2,5246,3.822531,103.234685,16315.746748,789889.0,0.00647
3,2542,4.254917,52.406499,7638.946502,201580.1,0.005034
4,1461,4.387406,24.960966,50469.719,887107.9,0.000592


In [212]:
from sklearn.cluster import KMeans

In [213]:
def make_cluster(data, columns, K):
    '''
    Set the optimal K to add a cluster column to the original data frame.

    data : dataframe
    columns : Features to be used for clustering
    K : optimum K
    '''
    data2 = data[columns]
    X = data2.copy()

    kmeanModel = KMeans(n_clusters=K, random_state=25).fit(X)
    kmeanModel.fit(X)
    cluster_labels = kmeanModel.labels_
    data['Cluster'] = cluster_labels

    return data

In [265]:
#k-means clustering
k_df = make_cluster(df, df.columns,5 )

unf_dic = dict(zip(k_df.index, k_df['Cluster']))

net['k'] = net['modularity_class'].apply(lambda x : unf_dic.get(x))

* 결과값을 다시 graph 처리를 위한 node 정보로 변환

In [267]:
net.head()

Unnamed: 0,Id,Label,timeset,indegree,outdegree,Degree,clustering,eigencentrality,Eccentricity,closnesscentrality,harmonicclosnesscentrality,betweenesscentrality,modularity_class,componentnumber,strongcompnum,weighted indegree,weighted outdegree,Weighted Degree,k
0,31156,31156,,3,3,6,0.0,0.0002,6,0.306452,0.413158,62.25601,14,0,0,3,3,6,0
1,42315,42315,,239,1,240,0.0,0.016104,9,0.229508,0.321117,13496.09,14,0,0,239,1,240,0
2,35392,35392,,255,1,256,0.0,0.017162,5,0.333333,0.456667,5523.348,14,0,0,255,1,256,0
3,54071,54071,,7,1,8,0.0,0.000461,20,0.086965,0.090116,435.9875,18,0,0,7,1,8,0
4,53441,53441,,341,1,342,0.0,0.022799,19,0.095245,0.099498,29042670.0,30,0,0,341,1,342,0
