In [70]:
# bigcon03_review

import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")
import os
from IPython.display import display
import scipy as sp

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

pd.options.display.max_columns = None   # To see the entire columns of dataframes
pd.options.display.max_rows = 100    

In [71]:
train_combat = pd.read_csv("data/bigCon/bigcon_data/train_combat.csv")
train_payment = pd.read_csv("data/bigCon/bigcon_data/train_payment.csv")
train_pledge = pd.read_csv("data/bigCon/bigcon_data/train_pledge.csv")
train_trade = pd.read_csv("data/bigCon/bigcon_data/train_trade.csv")
train_activity = pd.read_csv("data/bigCon/bigcon_data/train_activity.csv")
train_label = pd.read_csv("data/bigCon/bigcon_data/train_label.csv")

In [72]:
combat = train_combat.copy()
pledge = train_pledge.copy()
payment = train_payment.copy()
trade = train_trade.copy()
activity = train_activity.copy()

In [73]:
# Total amount spent per account the whole period
train_label["total_spent"] = train_label["survival_time"] * train_label["amount_spent"]
label = train_label["acc_id"]

In [74]:
# No need to keep the rows whose "acc_id" does not appear in Label data
combat = combat[combat["acc_id"].isin(label)]
pledge = pledge[pledge["acc_id"].isin(label)]
payment = payment[payment["acc_id"].isin(label)]
trade = trade[trade["source_acc_id"].isin(label)]
trade = trade[trade["target_acc_id"].isin(label)]
activity = activity[activity["acc_id"].isin(label)]
# activity
# isin() - 반환값 : df내 각각의 요소가 값에 포함되어 있는 지 여부를 보여주는 불린형의 df
# isin() 메소드를 사용하여 리스트에 

In [75]:
# Common Features 정리

# The number of characters per account
all_char = pd.concat([train_combat[["char_id", "acc_id"]], 
                      train_pledge[["char_id", "acc_id"]], 
                      train_activity[["char_id", "acc_id"]]])
# all_char
char_count = all_char.groupby("char_id")["acc_id"].count()
# char_count
char_count = pd.DataFrame({"char_count" : char_count})  # in case of merging
# char_count
# { 딕셔너리 칼럼명 , : 변수기입 }

In [76]:
# Find all the unique servers
total_server = pd.concat([train_combat["server"], train_pledge["server"], train_trade["server"], train_activity["server"]])
server_list = total_server.unique()

# Encode servers into integer values
le = LabelEncoder()
le.fit(server_list)
for df in [combat, pledge, trade, activity]:
    df["server"] = le.transform(df["server"])

# 해당 열들에 대해서 server 열 series를 를 인코더에 넣어 자동맞춤시킴

In [77]:
## trade
trade.rename(columns={"source_acc_id" : "acc_id"}, inplace=True)
# return : DataFrame with the renamed axis labels.
# inplace : bool, default False
trade.columns
# source_add_id ==> acc_id

Index(['day', 'time', 'type', 'server', 'acc_id', 'source_char_id',
       'target_acc_id', 'target_char_id', 'item_type', 'item_amount',
       'item_price'],
      dtype='object')

In [78]:

# Remove "source_char_idd" and "target_char_id"
trade.drop(["source_char_id", "target_char_id"], axis=1, inplace=True)
# 데이터 옆으로 연결 
trade.columns
# source_char_id 와 target_char_id 열 drop 

Index(['day', 'time', 'type', 'server', 'acc_id', 'target_acc_id', 'item_type',
       'item_amount', 'item_price'],
      dtype='object')

In [79]:
trade['item_price']
# 해당 열의 결측치를 item_price 중간값으로 변환 해준다 

3               NaN
6               NaN
9          0.013231
10         0.032727
11              NaN
13              NaN
15              NaN
23         0.002231
24              NaN
25         0.001316
28              NaN
38              NaN
39         0.060408
46              NaN
49              NaN
59         0.037764
61         0.130907
63         0.007372
65         0.411534
66              NaN
69              NaN
71         0.513337
74              NaN
78         0.009119
83         0.013740
84              NaN
87              NaN
90         0.082921
91              NaN
92         0.000813
95              NaN
96              NaN
97         0.010794
98              NaN
105        0.005355
108        0.001970
115        0.449871
119        0.253614
122             NaN
123        0.448562
124        0.639051
128        0.034953
131        0.248723
133             NaN
134        0.001701
138        0.043087
139             NaN
141        0.028052
142             NaN
147             NaN


In [80]:
# Substitute NaNs with median at "item_price"
trade["item_price"].fillna(trade["item_price"].median(), inplace=True)
# 변수를 직접 변경하겠다 
type(trade["item_price"])
trade["item_price"].value_counts()



0.018362    424699
0.005610      2544
0.006545      1581
0.002805       755
0.009351       592
0.000561       536
0.004675       520
0.092570       441
0.018701       427
0.000281       414
0.001870       392
0.001122       392
0.014026       351
0.000935       351
0.001683       347
0.028052       342
0.046753       326
0.140258       317
0.001403       315
0.005610       302
0.003740       298
0.003273       294
0.000655       290
0.032727       250
0.023376       237
0.001964       224
0.000468       224
0.056103       214
0.093505       213
0.007480       213
0.002338       210
0.037402       207
0.370280       204
0.000112       203
0.002244       196
0.280515       195
0.016831       189
0.008228       183
0.004582       179
0.011221       178
0.065454       177
0.008415       177
0.004208       175
0.185140       171
0.000842       170
0.001309       163
0.051428       162
0.000187       162
0.002431       162
0.006545       157
             ...  
0.135966         1
0.705846    

In [81]:
type(trade["time"])

pandas.core.series.Series

In [82]:
# Categorize transaction time
bins = [0, 60000, 120000, 180000, 239999]
bin_label = [0, 1, 2, 3]   
trade["time_bin"] = pd.cut(pd.to_numeric(trade["time"].str.replace(":", "")),
                           bins=bins, labels=bin_label)
type(trade["time_bin"])
# cut
# 실수 값의 경계선을 지정하는 경우
# trade df의 time column 의 문자열, : 기호를 공백으로 바꾸고 numeric형으로 변환 
# cut(bins, labels : 배열, 불형 bins의 결과값과 같은 길이여야한다 ) 
# 반환값 series, 범주형, ndarray형이며, 스칼라값의 연속 
# 반환값 bins : 

pandas.core.series.Series

In [83]:


# Those who play during the time other than evening must be differentiated
trade["time_bin"] = trade["time_bin"].map({0:1, 1:1, 2:1, 3:0}) 
trade.drop("time", axis=1, inplace=True)
# type(trade["time"])

# map(함수, 반복가능한 자료형)
# : 입력받은 자료형 요소의 함수 결과값을 묶어 반환함 



In [84]:
# Total price per each trade
trade["total_item_price"] = trade["item_amount"] * trade["item_price"]

# Make two trade DataFrames for each source and target
source_trade = trade.drop("target_acc_id", axis=1)
target_trade = trade.drop("acc_id", axis=1)
target_trade.rename(columns={"target_acc_id" : "acc_id"}, inplace=True)



In [85]:
# Total number of trade occurence per account the whole period
# 해당 도메인 카운팅 

source_trade_count = source_trade["acc_id"].value_counts()
# source_trade_count
target_trade_count = target_trade["acc_id"].value_counts()
# target_trade_count

source_trade_count = pd.DataFrame({"acc_id" : source_trade_count.index,
                                   "count" : source_trade_count})  
# source_trade_count
# Make a dataframe to merge based on "acc_id"
target_trade_count = pd.DataFrame({"acc_id" : target_trade_count.index,
                                   "count" : target_trade_count})
# target_trade_count

trade_count = pd.merge(source_trade_count, 
                       target_trade_count,
                       on = "acc_id")
trade_count
# trade_count["count"] = trade_count["count_x"] + trade_count["count_y"]
# trade_count.drop(["count_x", "count_y"], axis=1, inplace=True)



Unnamed: 0,acc_id,count_x,count_y
0,50413,3552,3480
1,111023,2496,1741
2,59264,2039,191
3,120428,1834,2854
4,97968,1727,870
5,3139,1619,1609
6,116315,1451,1344
7,52260,1448,1245
8,91062,1409,1848
9,92665,1384,1151


In [86]:
# Remove columns that do not seem necessary
trade.drop(["server", "target_acc_id", "item_type", 
            "item_amount", "item_price"], axis=1, inplace=True)

trade = pd.concat(
    [ pd.pivot_table(data=trade, index=["acc_id", "day"], 
                     values=["time_bin", "total_item_price"], aggfunc=sum), # sum all the prices
    pd.pivot_table(data=trade, index=["acc_id", "day"], 
                   values=["type"])], # mean for trade type
axis=1)

In [87]:
# pledge

In [88]:
pledge.columns
# play_char_cnt
# 접속률에 따른 점수 부여 
# 접속률 1위 혈맥 1 
# 나머지 혈명이 1위 혈맹 대비 비례한 점수 부여.

Index(['day', 'acc_id', 'char_id', 'server', 'pledge_id', 'play_char_cnt',
       'combat_char_cnt', 'pledge_combat_cnt', 'random_attacker_cnt',
       'random_defender_cnt', 'same_pledge_cnt', 'temp_cnt', 'etc_cnt',
       'combat_play_time', 'non_combat_play_time'],
      dtype='object')

In [89]:
# 각 혈맹의 평균 유저 접속률 지표
pledge_pivot = pd.pivot_table(data=pledge, index=['pledge_id'], 
                              values='play_char_cnt', aggfunc='mean')

# 정렬 후 1위 값으로 나눔 
# => 접속률 1위인 혈맹 1.0, 나머지 1.0보다 차례로 낮은 값.
avg_play_rate_per_pledge =\
pledge_pivot.play_char_cnt.sort_values(ascending=False)\
/ pledge_pivot.play_char_cnt.sort_values(ascending=False).iloc[0]

In [90]:
to_be_merged = pd.DataFrame({'avg_play_rate_per_pledge' : avg_play_rate_per_pledge})
to_be_merged.head(10)

Unnamed: 0_level_0,avg_play_rate_per_pledge
pledge_id,Unnamed: 1_level_1
17429,1.0
7891,0.958941
27662,0.941473
3703,0.939236
33924,0.934305
23128,0.87329
4499,0.83641
9652,0.82583
28873,0.784145
3659,0.773146


In [91]:
pledge = pd.merge(pledge, to_be_merged, on='pledge_id') # pledge_id 칼럼에 
to_be_merged
pledge 
# 혈맹 data + to_be_merged
# 명령은 두 데이터 프레임의 공통 열 혹은 인덱스를 기준으로 
# 두 개의 테이블을 합친다.
# 이 때 기준이 되는 열, 행의 데이터를 키(key)라고 한다

Unnamed: 0,day,acc_id,char_id,server,pledge_id,play_char_cnt,combat_char_cnt,pledge_combat_cnt,random_attacker_cnt,random_defender_cnt,same_pledge_cnt,temp_cnt,etc_cnt,combat_play_time,non_combat_play_time,avg_play_rate_per_pledge
0,1,106660,345122,0,381,0.036089,0.036089,0.0,0.000000,0.000000,0.0,0.000000,0.007078,0.027568,0.000000,0.005088
1,2,106660,345122,0,381,0.036089,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.017490,0.005088
2,3,106660,345122,0,381,0.036089,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.022008,0.005088
3,4,106660,345122,0,381,0.036089,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.035908,0.005088
4,5,106660,345122,0,381,0.036089,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.032201,0.005088
5,6,106660,345122,0,381,0.036089,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.029653,0.005088
6,7,106660,345122,0,381,0.036089,0.036089,0.0,0.000000,0.000000,0.0,0.098883,0.000000,0.045869,0.000000,0.005088
7,8,106660,345122,0,381,0.036089,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.023977,0.005088
8,9,106660,345122,0,381,0.036089,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.020270,0.005088
9,10,106660,345122,0,381,0.036089,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.027915,0.005088


In [92]:
# 접속률 1위 혈맹확인
# pledge[pledge.pledge_id==17429]

In [93]:
# pledge_combat_cnt
# 마찬가지로 혈맹간 전투(공성 등) 같은 엔드 컨텐츠를 
# 많이 즐기는 혈맹의 유저일 수록 생존률이 높을거라 예상
# --> 혈맹간 전투 횟수에 점수 부여. 위의 접속률과 같은 방식으로 점수 부여.

In [94]:
# 혈맹간 총 전투 수 체크
pledge_pivot = pd.pivot_table(data=pledge, index=['pledge_id'], 
                              values='pledge_combat_cnt', aggfunc='sum')
pledge_pivot
total_combat_cnt_per_pledge =\
pledge_pivot.pledge_combat_cnt.sort_values(ascending=False)\
# 내림차순 정렬 
/ pledge_pivot.pledge_combat_cnt.sort_values(ascending=False).iloc[0]

17835.39592205797

In [95]:
to_be_merged = pd.DataFrame({'total_combat_cnt_per_pledge':
                             total_combat_cnt_per_pledge})
to_be_merged.head(10)

Unnamed: 0_level_0,total_combat_cnt_per_pledge
pledge_id,Unnamed: 1_level_1
36551,17835.395922
13214,9993.761308
38266,8200.333324
36909,7974.25752
9427,7425.418443
21739,6162.747989
17546,5611.362779
17834,5506.155418
10521,4717.020078
3582,4626.311831


In [96]:
pledge = pd.merge(pledge, to_be_merged, on='pledge_id')
pledge
# pledge + to_be_merged => pledge_id // merge

Unnamed: 0,day,acc_id,char_id,server,pledge_id,play_char_cnt,combat_char_cnt,pledge_combat_cnt,random_attacker_cnt,random_defender_cnt,same_pledge_cnt,temp_cnt,etc_cnt,combat_play_time,non_combat_play_time,avg_play_rate_per_pledge,total_combat_cnt_per_pledge
0,1,106660,345122,0,381,0.036089,0.036089,0.0,0.000000,0.000000,0.0,0.000000,0.007078,0.027568,0.000000,0.005088,0.0
1,2,106660,345122,0,381,0.036089,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.017490,0.005088,0.0
2,3,106660,345122,0,381,0.036089,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.022008,0.005088,0.0
3,4,106660,345122,0,381,0.036089,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.035908,0.005088,0.0
4,5,106660,345122,0,381,0.036089,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.032201,0.005088,0.0
5,6,106660,345122,0,381,0.036089,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.029653,0.005088,0.0
6,7,106660,345122,0,381,0.036089,0.036089,0.0,0.000000,0.000000,0.0,0.098883,0.000000,0.045869,0.000000,0.005088,0.0
7,8,106660,345122,0,381,0.036089,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.023977,0.005088,0.0
8,9,106660,345122,0,381,0.036089,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.020270,0.005088,0.0
9,10,106660,345122,0,381,0.036089,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.027915,0.005088,0.0


In [97]:
# 혈맹간 전투 수 1위 혈맹 확인
pledge[pledge.pledge_id==36551]

Unnamed: 0,day,acc_id,char_id,server,pledge_id,play_char_cnt,combat_char_cnt,pledge_combat_cnt,random_attacker_cnt,random_defender_cnt,same_pledge_cnt,temp_cnt,etc_cnt,combat_play_time,non_combat_play_time,avg_play_rate_per_pledge,total_combat_cnt_per_pledge
428286,1,86413,35092,14,36551,6.207363,2.526252,0.327828,1.258770,0.071531,2.387152,2.570959,3.397567,6.900750,0.0,0.677177,17835.395922
428287,1,128409,20809,14,36551,6.207363,2.526252,0.327828,1.258770,0.071531,2.387152,2.570959,3.397567,6.900750,0.0,0.677177,17835.395922
428288,1,57985,1233,14,36551,6.207363,2.526252,0.327828,1.258770,0.071531,2.387152,2.570959,3.397567,6.900750,0.0,0.677177,17835.395922
428289,1,117569,202916,14,36551,6.207363,2.526252,0.327828,1.258770,0.071531,2.387152,2.570959,3.397567,6.900750,0.0,0.677177,17835.395922
428290,1,42561,395794,14,36551,6.207363,2.526252,0.327828,1.258770,0.071531,2.387152,2.570959,3.397567,6.900750,0.0,0.677177,17835.395922
428291,1,82662,377980,14,36551,6.207363,2.526252,0.327828,1.258770,0.071531,2.387152,2.570959,3.397567,6.900750,0.0,0.677177,17835.395922
428292,1,40748,80418,14,36551,6.207363,2.526252,0.327828,1.258770,0.071531,2.387152,2.570959,3.397567,6.900750,0.0,0.677177,17835.395922
428293,1,12203,298219,14,36551,6.207363,2.526252,0.327828,1.258770,0.071531,2.387152,2.570959,3.397567,6.900750,0.0,0.677177,17835.395922
428294,1,44591,241221,14,36551,6.207363,2.526252,0.327828,1.258770,0.071531,2.387152,2.570959,3.397567,6.900750,0.0,0.677177,17835.395922
428295,1,89162,287358,14,36551,6.207363,2.526252,0.327828,1.258770,0.071531,2.387152,2.570959,3.397567,6.900750,0.0,0.677177,17835.395922


In [98]:
# The number of members in each pledge
pledge_num_people = pledge["pledge_id"].value_counts()
# pledge_num_people
# Remove "char_id", "server", "pledge_id" (doesn't seem to matter)
pledge.drop(["char_id", "server", "pledge_id"], axis=1, inplace=True)

# Sum all the values according to days per account
pledge = pd.concat(
    [pd.pivot_table(data=pledge.drop(
        ['avg_play_rate_per_pledge','total_combat_cnt_per_pledge'],
        axis=1), index=["acc_id", "day"], aggfunc=sum),
     pd.pivot_table(data=pledge, index=['acc_id','day'], 
                    values=['avg_play_rate_per_pledge',
                            'total_combat_cnt_per_pledge'])], axis=1)

# Change name of the columns which are same with the ones in combat file
rename_dict = {"etc_cnt" : "p_etc_cnt", "random_attacker_cnt" : "p_random_attacker_cnt",
              'same_pledge_cnt' : 'p_same_pledge_cnt', "temp_cnt" : "p_temp_cnt", 
               "random_defender_cnt" : "p_random_defender_cnt"}
pledge.rename(columns=rename_dict, inplace=True)

In [99]:
## activity
# Remove "char_id"
activity.drop("char_id", axis=1, inplace=True)

# Total play time per account the whole period
total_play = activity.groupby("acc_id")["playtime"].sum()
total_play = pd.DataFrame({"total_play_time" : total_play})  # in case of merging

# activity["cum_play_time"] = activity.groupby("acc_id")["playtime"].cumsum()

# Sum all the values per day for each account
activity = pd.concat([pd.pivot_table(data=activity.drop("server", axis=1), 
                                     index=["acc_id", "day"], aggfunc=sum),
                     pd.pivot_table(data=activity, index=["acc_id", "day"], 
                                    values=["server"])],axis=1)
# aggfunc 으로 정의된 함수 도출 대표값을 삼는다 

# Total exp
activity["total_exp"] = activity["solo_exp"] + activity["party_exp"] + activity["quest_exp"]

# How long do users spend time fishing? # 피싱시간비율
activity["fishing_prop"] = activity["fishing"] / activity["playtime"]
activity["fishing_prop"]

acc_id  day
2       1      0.000000
        2      0.000000
        3      0.000000
        4      0.000000
        5      0.000000
        6      0.000000
        7      0.000000
        8      0.000000
        9      0.000000
        10     0.000000
        11     0.000000
        12     0.000000
        13     0.000000
        14     0.000000
        15     0.000000
        16     0.000000
        17     0.000000
        18     0.000000
        19     0.000000
        20     0.000000
        21     0.000000
        22     0.000000
        23     0.000000
        24     0.000000
        25     0.000000
        26     0.000000
        27     0.000000
        28     0.000000
5       9      0.000000
        14     0.000000
        20     0.000000
        22     0.000000
        23     0.034685
        24     0.000000
        25     0.000000
        26     0.454693
        27     0.000000
        28     0.000000
8       1      0.000000
        2      0.000000
        3      0.445545
    

In [101]:
## combat
combat['class'] = combat['class'].astype('category') # 범주형욿 변환 
type(combat['class']) # series = category

pandas.core.series.Series

In [102]:
# 클래스별 전체 비율 조사
prop_class = combat['class'].value_counts() / combat['class'].value_counts().sum()
prop_class
# 전체로 나눈다 

1    0.208301
2    0.170270
3    0.156541
7    0.137134
4    0.135188
5    0.084303
0    0.081987
6    0.026275
Name: class, dtype: float64

In [None]:
#################### 뇌진탕 증상중 #######################

In [103]:
# 전체 누적합 60% 미만을 차지하는 주류 클래스 조사
class60 = prop_class[~(prop_class.cumsum()>0.6)].index 
# 기사, 요정, 마법사 (70%로 하면 전사까지.)
# cumsum은 배열에서 주어진 축에 따라 누적되는 
# 원소들의 누적 합을 계산하는 함수.

def isMajorClass(classs):
    if classs in class60:
        return 1
    else:
        return 0

combat['isMajorClass'] = combat['class'].apply(isMajorClass)
combat['isMajorClass'].head(10)

0    1
1    1
2    1
3    0
4    1
5    1
6    0
7    1
8    1
9    0
Name: isMajorClass, dtype: int64

In [104]:
# Remove "char_id", "class", "server"
combat.drop(["char_id", "class", "server"], axis=1, inplace=True)

# Max level? or Mean level?
max_level = combat.groupby("acc_id")["level"].max()
mean_level = combat.groupby("acc_id")["level"].mean()
combat.drop("level", axis=1, inplace=True)

# Sum all the values per day for each account
combat = pd.concat([pd.pivot_table(data=combat.drop('isMajorClass', axis=1), 
                                   index=["acc_id", "day"], aggfunc=sum),
                   pd.pivot_table(data=combat, index=['acc_id', 'day'], 
                                  values='isMajorClass', aggfunc='mean')], axis=1)
# isMajorClass mean 처리해서 주류 클래스만 플레이했을 경우 1.0, 그 외에는 비주류 클래스 캐릭터 수에 따라 평균값 떨어짐

In [105]:
### payment #### 

In [106]:
train_label[train_label["acc_id"] == 8]

Unnamed: 0,acc_id,survival_time,amount_spent,total_spent
5494,8,64,0.02031,1.299843


In [107]:
display(combat.tail())
display(payment.head())
display(pledge.head())
display(trade.head())
display(activity.head())

Unnamed: 0_level_0,Unnamed: 1_level_0,etc_cnt,num_opponent,pledge_cnt,random_attacker_cnt,random_defender_cnt,same_pledge_cnt,temp_cnt,isMajorClass
acc_id,day,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
130473,24,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
130473,25,0.0,1.079416,1.026948,0.0,0.0,0.0,0.479728,0.0
130473,26,0.225223,5.985854,7.317004,0.0,0.0,0.0,0.0,0.0
130473,27,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
130473,28,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Unnamed: 0,day,acc_id,amount_spent
0,18,27835,0.826123
1,23,27835,0.011735
2,27,27835,0.035204
3,17,12351,0.528062
4,11,125437,0.633674


Unnamed: 0_level_0,Unnamed: 1_level_0,combat_char_cnt,combat_play_time,p_etc_cnt,non_combat_play_time,play_char_cnt,pledge_combat_cnt,p_random_attacker_cnt,p_random_defender_cnt,p_same_pledge_cnt,p_temp_cnt,avg_play_rate_per_pledge,total_combat_cnt_per_pledge
acc_id,day,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
5,9,0.252625,0.399849,0.120331,0.0,0.649608,0.0,0.0,0.143063,0.0,0.692181,0.078765,0.382466
5,14,0.180447,0.441895,0.219426,0.0,0.396983,0.0,0.0,0.0,0.0,0.197766,0.078765,0.382466
5,20,0.108268,0.593982,0.056626,0.0,0.50525,0.0,0.0,0.071531,0.0,0.296649,0.078765,0.382466
5,22,0.252625,0.610198,0.092017,0.000695,0.721786,0.0,0.548695,0.357657,0.0,0.098883,0.030899,0.127489
5,23,0.036089,0.339385,0.0,0.001274,0.54134,0.0,0.710075,0.0,0.0,0.0,0.030899,0.127489


Unnamed: 0_level_0,Unnamed: 1_level_0,time_bin,total_item_price,type
acc_id,day,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2,1,3.0,1.86778e-06,0.0
2,2,1.0,0.006393257,1.0
2,3,1.0,1.249094e-06,0.0
2,5,0.0,1.99843e-09,0.0
2,12,0.0,5.00534e-09,0.0


Unnamed: 0_level_0,Unnamed: 1_level_0,boss_monster,death,enchant_count,exp_recovery,fishing,game_money_change,npc_kill,party_exp,playtime,private_shop,quest_exp,revive,solo_exp,server,total_exp,fishing_prop
acc_id,day,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2,1,0,0.0,0.0,0.0,0.0,1.022652,0.0,0.0,2.73749,2.493091,0.0,0.0,0.0,20.0,0.0,0.0
2,2,0,0.0,0.0,0.0,0.0,-1.042002,0.0,0.0,3.405864,3.188761,0.0,0.0,0.0,20.0,0.0,0.0
2,3,0,0.0,0.0,0.0,0.0,0.450366,0.0,0.0,3.412974,3.223522,0.0,0.0,0.0,20.0,0.0,0.0
2,4,0,0.0,0.0,0.0,0.0,-0.007215,0.0,0.0,3.412974,3.171419,0.0,0.0,0.0,20.0,0.0,0.0
2,5,0,0.0,0.0,0.0,0.0,0.015365,0.0,0.0,3.412974,3.175172,0.0,0.0,0.0,20.0,0.0,0.0


In [108]:
df = combat.join(pledge).join(trade).join(activity)

In [109]:
df.isnull().sum()

etc_cnt                             0
num_opponent                        0
pledge_cnt                          0
random_attacker_cnt                 0
random_defender_cnt                 0
same_pledge_cnt                     0
temp_cnt                            0
isMajorClass                        0
combat_char_cnt                196818
combat_play_time               196818
p_etc_cnt                      196818
non_combat_play_time           196818
play_char_cnt                  196818
pledge_combat_cnt              196818
p_random_attacker_cnt          196818
p_random_defender_cnt          196818
p_same_pledge_cnt              196818
p_temp_cnt                     196818
avg_play_rate_per_pledge       196818
total_combat_cnt_per_pledge    196818
time_bin                       725934
total_item_price               725934
type                           725934
boss_monster                        0
death                               0
enchant_count                       0
exp_recovery

In [110]:
df[100:200]

Unnamed: 0_level_0,Unnamed: 1_level_0,etc_cnt,num_opponent,pledge_cnt,random_attacker_cnt,random_defender_cnt,same_pledge_cnt,temp_cnt,isMajorClass,combat_char_cnt,combat_play_time,p_etc_cnt,non_combat_play_time,play_char_cnt,pledge_combat_cnt,p_random_attacker_cnt,p_random_defender_cnt,p_same_pledge_cnt,p_temp_cnt,avg_play_rate_per_pledge,total_combat_cnt_per_pledge,time_bin,total_item_price,type,boss_monster,death,enchant_count,exp_recovery,fishing,game_money_change,npc_kill,party_exp,playtime,private_shop,quest_exp,revive,solo_exp,server,total_exp,fishing_prop
acc_id,day,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1
20,7,0.0,0.098129,0.256737,0.0,0.0,0.0,0.0,0.0,0.938322,2.940139,6.632335,0.0,1.948823,0.411607,0.0,0.071531,0.795717,0.791064,0.212793,181.757071,,,,0,0.0,0.0,0.0,0.0,0.020187,3.628471,0.005748,0.729997,0.0,0.000433,0.0,0.025144,6.0,0.031325,0.0
20,8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.154858,2.282219,1.762488,0.0,1.804466,0.030962,0.0,1.573689,0.795717,1.681012,0.212793,181.757071,,,,0,0.0,0.0,0.0,0.0,-0.000473,0.001687,0.0,0.113766,0.0,0.000216,0.0,0.0001,6.0,0.000316,0.0
20,9,6.193634,2.453219,0.128368,0.0,0.0,0.0,0.0,0.0,0.830054,2.655311,1.231618,0.0,1.840555,0.163914,1.452427,0.071531,0.0,1.087714,0.212793,181.757071,,,,1,0.0,0.0,0.0,0.0,-0.033363,0.503701,0.0,0.663634,0.0,0.000866,0.0,0.004258,6.0,0.005124,0.0
20,10,0.112612,0.098129,0.0,0.0,0.0,0.0,0.0,0.0,0.974412,2.763844,1.649236,0.0,1.840555,0.315079,0.0,0.071531,0.0,0.593298,0.212793,181.757071,,,,0,0.0,0.0,0.0,0.562272,0.03435,0.346484,0.002822,1.17084,0.0,0.000216,0.0,0.005207,6.0,0.008246,0.480229
20,11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.974412,2.784926,1.543062,0.0,1.948823,0.333292,0.0,0.071531,0.0,1.087714,0.212793,181.757071,0.0,1.126127e-08,0.0,0,0.0,0.0,0.0,0.171834,-0.359197,0.25573,0.0,1.512137,0.575307,0.066535,0.0,0.008341,6.0,0.074875,0.113636
20,12,0.0,0.098129,0.128368,0.0,0.0,0.0,0.0,0.0,0.938322,2.79176,1.677549,0.0,1.876645,0.198518,0.0,0.143063,0.0,0.692181,0.212793,181.757071,,,,0,0.49074,0.0,0.0,0.0,3.806138,0.142372,0.014041,2.874957,2.37893,0.000216,0.493639,0.000407,6.0,0.014664,0.0
20,13,3.941403,2.649476,0.513474,0.0,0.0,0.0,0.0,0.0,0.721786,2.622646,2.102245,0.0,1.984913,0.22948,0.0,0.0,0.0,1.582129,0.212793,181.757071,3.0,1.9816e-06,0.0,0,0.0,0.0,0.0,0.0,-3.855009,0.193316,0.001684,0.407661,0.0,0.000216,0.0,0.001697,6.0,0.003597,0.0
20,14,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.974412,2.515271,2.28628,0.0,2.057091,0.265905,0.387314,0.0,0.0,1.087714,0.212793,181.757071,,,,0,0.0,0.0,0.0,0.0,0.019751,0.185556,0.003793,0.360258,0.0,0.0,0.0,0.000928,6.0,0.004721,0.0
20,15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.938322,1.83071,1.033427,0.0,1.804466,0.109276,0.0,0.286125,0.0,0.98883,0.212793,181.757071,,,,0,0.0,0.0,0.0,0.0,7e-05,0.075909,0.0,0.161168,0.0,0.000433,0.0,0.000893,6.0,0.001326,0.0
20,16,1.013504,0.981288,0.064184,0.0,0.0,0.0,0.0,0.0,1.227037,2.422606,1.543062,0.0,2.309716,0.60284,2.582092,0.071531,0.0,0.395532,0.212793,181.757071,0.0,0.007032583,1.0,0,0.0,0.0,0.0,0.0,0.016907,0.656532,0.0,1.38178,0.0,0.001082,0.0,0.006394,19.0,0.007476,0.0


In [111]:
# '196818' 개의 NaN 값 생성 이유:
# 해당 유저가 '혈맹 전투 관련 활동'을 안한 날짜 존재.
# 평균으로 채워줄지, 0으로 채워줄지 고민
display(df.loc[38].total_combat_cnt_per_pledge)
display(pledge.loc[38].total_combat_cnt_per_pledge)

day
1          NaN
2          NaN
3          NaN
4          NaN
5          NaN
6          NaN
7          NaN
8          NaN
9          NaN
10         NaN
11         NaN
12         NaN
13         NaN
14         NaN
15         NaN
16         NaN
18         NaN
19         NaN
20         NaN
21    5.210647
22    5.210647
23    5.210647
24    5.210647
25    5.210647
26    5.210647
27    5.210647
28    5.210647
Name: total_combat_cnt_per_pledge, dtype: float64

day
21    5.210647
22    5.210647
23    5.210647
24    5.210647
25    5.210647
26    5.210647
27    5.210647
28    5.210647
Name: total_combat_cnt_per_pledge, dtype: float64

In [112]:
to_be_dropped = ['play_char_cnt', 'pledge_combat_cnt']
to_be_filled0 = ['combat_char_cnt', 'combat_play_time', 
                 'p_etc_cnt', 'non_combat_play_time', 
                 'p_random_attacker_cnt', 'p_random_defender_cnt', 
                 'p_same_pledge_cnt', 'p_temp_cnt', 
                 'avg_play_rate_per_pledge', 'total_combat_cnt_per_pledge']
df = df.drop(to_be_dropped, axis=1) # 이 컬럼은 혈맹 단위로 대체됨
df[to_be_filled0] = df[to_be_filled0].fillna(0) # 이 컬럼은 혈맹 관련 활동을 안한 날 --> 0으로.

In [113]:
df.isnull().sum()

etc_cnt                             0
num_opponent                        0
pledge_cnt                          0
random_attacker_cnt                 0
random_defender_cnt                 0
same_pledge_cnt                     0
temp_cnt                            0
isMajorClass                        0
combat_char_cnt                     0
combat_play_time                    0
p_etc_cnt                           0
non_combat_play_time                0
p_random_attacker_cnt               0
p_random_defender_cnt               0
p_same_pledge_cnt                   0
p_temp_cnt                          0
avg_play_rate_per_pledge            0
total_combat_cnt_per_pledge         0
time_bin                       725934
total_item_price               725934
type                           725934
boss_monster                        0
death                               0
enchant_count                       0
exp_recovery                        0
fishing                             0
game_money_c