In [1]:
from tqdm import tqdm
import pandas as pd
import orjson
import os
import numpy as np
import pickle

In [2]:
# Rank tier to process; used as the input sub-directory name and as the
# prefix of every output file.
tier = 'diamond'

# Directory holding the raw match files for the selected tier.
stored_dir = os.path.join("/home/piddle/hdd/matches", tier)

# Directory that receives the generated CSV / pickle artifacts.
target_dir = "/home/piddle/myetc/data"

# json파일에 필요한 column 뽑아서 csv로 저장

In [3]:
# Keys copied as-is from each participant record.
prepro_cols1 = ['champion_id', 'team_key', 'position', 'trinket_item']

# Keys copied as-is from each participant's 'stats' record.
# Each key appears exactly once ('time_ccing_others' used to be listed twice);
# 'vision_score' is copied separately by parse_match.
prepro_cols2 = [
    'champion_level',
    'damage_self_mitigated',
    'damage_dealt_to_objectives',
    'damage_dealt_to_turrets',
    'total_damage_taken',
    'total_damage_dealt',
    'total_damage_dealt_to_champions',
    'time_ccing_others',
    'vision_wards_bought_in_game',
    'sight_wards_bought_in_game',
    'ward_kill',
    'ward_place',
    'turret_kill',
    'kill',
    'death',
    'assist',
    'neutral_minion_kill',
    'gold_earned',
    'total_heal',
]

# Column order of the per-summoner CSV (one row per summoner per match).
recent_10_game_by_summoner_col = [
    'summoner_id', 'match_id', 'game_length_second', 'summoner_level',
    'champion_id', 'team_key', 'position', 'trinket_item',
    'item_0', 'item_1', 'item_2', 'item_3', 'item_4', 'item_5',
    'rune_0', 'rune_1', 'spell_0', 'spell_1',
    'champion_level', 'damage_self_mitigated', 'damage_dealt_to_objectives', 'damage_dealt_to_turrets',
    'total_damage_taken', 'total_damage_dealt', 'total_damage_dealt_to_champions', 'time_ccing_others',
    'vision_score', 'vision_wards_bought_in_game', 'sight_wards_bought_in_game', 'ward_kill', 'ward_place',
    'turret_kill', 'kill', 'death', 'assist', 'neutral_minion_kill', 'gold_earned', 'total_heal',
    'result',
]

# Column order of the per-match CSV (one row per participant per match).
recent_10_game_by_match_col = ['match_id', 'summoner_id', 'team_key', 'result']

In [4]:
def _load_match_file(path):
    """Return the decoded JSON content of ``path``, or ``None`` when the file
    cannot be opened or does not contain valid JSON (the error is printed)."""
    try:
        with open(path, 'rb') as f:
            return orjson.loads(f.read())
    except (OSError, ValueError) as e:
        # orjson.JSONDecodeError is a ValueError subclass; OSError covers
        # unreadable files. Either way the file is skipped, not fatal.
        print('Error reading {}: {}'.format(path, e))
        return None


def _summoner_row(match, participant):
    """Build the detailed per-summoner row for one participant of one match."""
    row = {
        'match_id': match['id'],
        'game_length_second': match['game_length_second'],
        'summoner_id': participant['summoner']['summoner_id'],
        'summoner_level': participant['summoner']['level'],
    }

    for col in prepro_cols1:
        row[col] = participant[col]

    # One column per item slot: item_0, item_1, ...
    for i, item in enumerate(participant['items']):
        row[f'item_{i}'] = item

    row['rune_0'] = participant['rune']["primary_rune_id"]
    row['rune_1'] = participant['rune']["secondary_page_id"]
    row['spell_0'] = participant['spells'][0]
    row['spell_1'] = participant['spells'][1]

    stats = participant['stats']
    for col in prepro_cols2:
        row[col] = stats[col]
    row['vision_score'] = stats['vision_score']
    row['result'] = stats['result']
    return row


def _collect_match_rows(json_data, summoner_id, match_rows, summoner_rows):
    """Append the rows extracted from one file's matches to the two row lists."""
    for match in json_data:
        # Zero-length matches are skipped entirely; the check does not depend
        # on the participant, so it is done once per match.
        if match['game_length_second'] == 0:
            continue

        for participant in match['participants']:
            participant_id = participant['summoner']['summoner_id']

            match_rows.append({
                'match_id': match['id'],
                'summoner_id': participant_id,
                'team_key': participant['team_key'],
                'result': participant['stats']['result'],
            })

            # Detailed stats are only kept for the summoner the file belongs to.
            if participant_id == summoner_id:
                summoner_rows.append(_summoner_row(match, participant))


def _rows_to_frames(match_rows, summoner_rows):
    """Convert the accumulated row dicts into the (match_df, summoner_df) pair."""
    match_df = pd.DataFrame(match_rows, columns=recent_10_game_by_match_col)
    summoner_df = pd.DataFrame(summoner_rows, columns=recent_10_game_by_summoner_col)
    return match_df, summoner_df


def parse_match(file_iter, chunk_size=20000):
    """Read match files and yield their rows as chunked DataFrames.

    Each file is expected to hold a JSON list of matches and to be named
    after the summoner it belongs to (file name without extension). For every
    participant of every match a short row (``recent_10_game_by_match_col``)
    is produced; for the participant whose id equals the file's summoner id a
    detailed row (``recent_10_game_by_summoner_col``) is produced as well.

    Args:
        file_iter: Iterable of directory entries exposing ``.path`` and
            ``.name`` (e.g. the result of ``os.scandir``).
        chunk_size: Number of input files after which the accumulated rows
            are flushed. Skipped files count too, so a flush is never missed.

    Yields:
        ``(match_df, summoner_df)`` tuples: one per full chunk of files, plus
        a final tuple with the remaining rows (possibly empty).

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError('chunk_size must be >= 1')

    match_rows = []
    summoner_rows = []

    # Count files from 1 so the first flush happens after chunk_size files,
    # not right after the very first one.
    for n, file in enumerate(tqdm(file_iter), start=1):
        json_data = _load_match_file(file.path)

        # Skip files that failed to load, are empty / not a list, or whose
        # first entry carries no 'participants'.
        if isinstance(json_data, list) and json_data and 'participants' in json_data[0]:
            summoner_id = os.path.splitext(file.name)[0]
            _collect_match_rows(json_data, summoner_id, match_rows, summoner_rows)

        if n % chunk_size == 0:
            yield _rows_to_frames(match_rows, summoner_rows)
            match_rows = []
            summoner_rows = []

    # Flush whatever is left after the last full chunk.
    yield _rows_to_frames(match_rows, summoner_rows)

In [None]:
# Build each output path once so the header write and the appends below
# cannot drift apart.
match_csv_path = os.path.join(target_dir, f'{tier}_match_by_match.csv')
summoner_csv_path = os.path.join(target_dir, f'{tier}_match_by_summoner.csv')

# Start both files fresh with only the header row; data chunks are appended.
pd.DataFrame(columns=recent_10_game_by_match_col).to_csv(match_csv_path, mode='w', index=False)
pd.DataFrame(columns=recent_10_game_by_summoner_col).to_csv(summoner_csv_path, mode='w', index=False)

# scandir is used as a context manager so the directory iterator is closed
# even if parsing raises part-way through.
with os.scandir(stored_dir) as file_iter:
    for match_df, summoner_df in parse_match(file_iter):
        match_df.to_csv(match_csv_path, mode='a', index=False, header=False)
        summoner_df.to_csv(summoner_csv_path, mode='a', index=False, header=False)

# match_by_summoner_df 전처리

In [None]:
# Load the per-summoner rows produced by the extraction step.
match_by_summoner_df = pd.read_csv(
    os.path.join(target_dir, f'{tier}_match_by_summoner.csv'),
)

In [11]:
### Drop duplicated (match, summoner) rows first, then keep only rows whose
### result is WIN or LOSE.
match_by_summoner_df = (
    match_by_summoner_df
    .drop_duplicates(subset=['match_id', 'summoner_id'])
    .loc[lambda frame: frame['result'].isin(['WIN', 'LOSE'])]
)

In [12]:
# Columns treated as categorical in the per-summoner table.
cate_col = [
    'match_id', 'summoner_id',
    'champion_id', 'team_key', 'position', 'trinket_item',
    'item_0', 'item_1', 'item_2', 'item_3', 'item_4', 'item_5',
    'rune_0', 'rune_1',
    'spell_0', 'spell_1',
    'result',
]

In [13]:
### Prefix every categorical value with its column name ("<column>.<value>")
### so that values are unique per column.
for col in cate_col:
    # `name=col` binds the current column name into the callback.
    match_by_summoner_df[col] = match_by_summoner_df[col].map(
        lambda value, name=col: name + '.' + str(value)
    )
match_by_summoner_df

Unnamed: 0,summoner_id,match_id,game_length_second,summoner_level,champion_id,team_key,position,trinket_item,item_0,item_1,...,ward_kill,ward_place,turret_kill,kill,death,assist,neutral_minion_kill,gold_earned,total_heal,result
0,summoner_id.Pki8T7HBL-aeecO7mmcMIFeKWWRc-zPfmj...,match_id.CNrwP4AK2Xd8w3fsBE51tSQaA30xjKBw8Axth...,1560,266,champion_id.13,team_key.BLUE,position.MID,trinket_item.3363,item_0.4630,item_1.6657,...,0,9,1,1,4,3,12,8334,2751,result.LOSE
1,summoner_id.Pki8T7HBL-aeecO7mmcMIFeKWWRc-zPfmj...,match_id.CNrwP4AK2Xe1SGRd-otNS5nV6TMvh7jgs2tXk...,1320,266,champion_id.268,team_key.BLUE,position.MID,trinket_item.3363,item_0.3100,item_1.3115,...,1,3,0,2,0,2,0,8892,2896,result.LOSE
2,summoner_id.Pki8T7HBL-aeecO7mmcMIFeKWWRc-zPfmj...,match_id.CNrwP4AK2Xdi2_kFJSvTtn2dBruz3jlZt1W92...,1418,266,champion_id.268,team_key.RED,position.MID,trinket_item.3363,item_0.3113,item_1.3115,...,0,6,0,2,5,0,0,7683,57,result.LOSE
3,summoner_id.Pki8T7HBL-aeecO7mmcMIFeKWWRc-zPfmj...,match_id.CNrwP4AK2Xc7lbRKnMKmbGRWJa5HoFEi1ukJg...,1558,266,champion_id.268,team_key.RED,position.MID,trinket_item.3363,item_0.1058,item_1.2420,...,2,9,1,6,1,6,0,10677,1816,result.WIN
4,summoner_id.Pki8T7HBL-aeecO7mmcMIFeKWWRc-zPfmj...,match_id.CNrwP4AK2XdtwndtdTggt5-DDndWH9WOTCTcD...,1740,266,champion_id.3,team_key.BLUE,position.MID,trinket_item.3340,item_0.3152,item_1.2055,...,0,13,0,2,6,11,0,8925,2265,result.LOSE
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
824671,summoner_id.ULAwYO4uGJy-JsnKTRGhPsV5pgUw4i203O...,match_id.CNrwP4AK2XcuLAUwGjRcPsYz7LHzsHWQx9rbX...,1375,83,champion_id.39,team_key.BLUE,position.TOP,trinket_item.3364,item_0.1055,item_1.3091,...,1,7,3,8,4,3,13,13239,11841,result.WIN
824672,summoner_id.ULAwYO4uGJy-JsnKTRGhPsV5pgUw4i203O...,match_id.CNrwP4AK2XcJUumytI7EkP5v8HhShIWuoki3r...,2419,83,champion_id.777,team_key.RED,position.MID,trinket_item.3364,item_0.3031,item_1.6665,...,6,7,1,15,9,5,38,20759,21222,result.WIN
824673,summoner_id.ULAwYO4uGJy-JsnKTRGhPsV5pgUw4i203O...,match_id.CNrwP4AK2XdMX2UtgEPveuUii4UNF1jpcsl8i...,1001,83,champion_id.134,team_key.RED,position.MID,trinket_item.3363,item_0.1056,item_1.3113,...,2,7,0,1,2,2,0,6358,401,result.WIN
824674,summoner_id.ULAwYO4uGJy-JsnKTRGhPsV5pgUw4i203O...,match_id.CNrwP4AK2XdJ2fmOdyFbYc5dz4Ae6E7FoDzlk...,1681,83,champion_id.39,team_key.BLUE,position.TOP,trinket_item.3364,item_0.6610,item_1.3123,...,2,9,1,7,5,1,4,14105,13533,result.LOSE


In [14]:
# Categorical columns grouped by the vocabulary they are indexed in:
# match ids and summoner ids each get their own, everything else shares one.
cate_col = {
    'match_id': ['match_id'],
    'summoner_id': ['summoner_id'],
    'other': [
        'champion_id', 'team_key', 'position', 'trinket_item',
        'item_0', 'item_1', 'item_2', 'item_3', 'item_4', 'item_5',
        'rune_0', 'rune_1', 'spell_0', 'spell_1', 'result',
    ],
}

In [15]:
### Collect the unique values per group: match_id and summoner_id are kept
### separately, the remaining columns are concatenated into one array.
unique = {
    key: np.concatenate(
        # The leading empty array keeps the result an ndarray even for a
        # group without columns.
        [np.array([])] + [match_by_summoner_df[col].unique() for col in cols]
    )
    for key, cols in cate_col.items()
}

In [16]:
# Per-group lookup from prefixed value to integer index.
### Numbering starts at 1, which leaves index 0 free for the mask.
to_index = {
    key: {value: idx for idx, value in enumerate(unique[key], start=1)}
    for key in cate_col
}

### Persist to_index so the same indices can be reused later.
with open(os.path.join(target_dir, f'{tier}_to_index.pkl'), 'wb') as file:
    pickle.dump(to_index, file)

In [17]:
### Mapping: replace every prefixed categorical value by its integer index.
for key, cols in cate_col.items():
    lookup = to_index[key]
    for col in cols:
        match_by_summoner_df[col] = match_by_summoner_df[col].map(lookup)

# Order rows by summoner, then by match index.
match_by_summoner_df = match_by_summoner_df.sort_values(by=['summoner_id', 'match_id'])

Unnamed: 0,summoner_id,match_id,game_length_second,summoner_level,champion_id,team_key,position,trinket_item,item_0,item_1,...,ward_kill,ward_place,turret_kill,kill,death,assist,neutral_minion_kill,gold_earned,total_heal,result
0,1,1,1560,266,1,168,170,176,182,450,...,0,9,1,1,4,3,12,8334,2751,1782
1,1,2,1320,266,2,168,170,176,183,451,...,1,3,0,2,0,2,0,8892,2896,1782
2,1,3,1418,266,2,169,170,176,184,451,...,0,6,0,2,5,0,0,7683,57,1782
3,1,4,1558,266,2,169,170,176,185,452,...,2,9,1,6,1,6,0,10677,1816,1783
4,1,5,1740,266,3,168,170,177,186,453,...,0,13,0,2,6,11,0,8925,2265,1782
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
824671,82573,461376,1375,83,14,168,172,178,194,476,...,1,7,3,8,4,3,13,13239,11841,1783
824672,82573,119956,2419,83,26,169,170,178,207,530,...,6,7,1,15,9,5,38,20759,21222,1783
824673,82573,402337,1001,83,118,169,170,176,189,558,...,2,7,0,1,2,2,0,6358,401,1783
824674,82573,41765,1681,83,14,168,172,178,261,551,...,2,9,1,7,5,1,4,14105,13533,1782


In [18]:
# Every column is cast to int before writing.
match_by_summoner_df = match_by_summoner_df.astype(int)

# NOTE: the file keeps a ".csv" name but its content is gzip-compressed.
match_by_summoner_df.to_csv(
    os.path.join(target_dir, f'{tier}_match_by_summoner_mod.csv'),
    mode='w',
    index=False,
    compression='gzip',
)

# match_by_match_df 전처리

In [20]:
# Load the per-match participant rows produced by the extraction step.
match_by_match_df = pd.read_csv(
    os.path.join(target_dir, f'{tier}_match_by_match.csv'),
)

In [21]:
# Drop duplicated (match, summoner) rows first, then keep only rows whose
# result is WIN or LOSE.
match_by_match_df = (
    match_by_match_df
    .drop_duplicates(subset=['match_id', 'summoner_id'])
    .loc[lambda frame: frame['result'].isin(['WIN', 'LOSE'])]
)

In [22]:
# Columns treated as categorical in the per-match table.
cate_col = [
    'match_id',
    'summoner_id',
    'team_key',
    'result',
]

In [23]:
# Prefix every categorical value with its column name ("<column>.<value>")
# so it matches the key format stored in to_index.
for col in cate_col:
    # `name=col` binds the current column name into the callback.
    match_by_match_df[col] = match_by_match_df[col].map(
        lambda value, name=col: name + '.' + str(value)
    )

In [24]:
# Per-match categorical columns grouped by the vocabulary used to index them.
cate_col = {
    'match_id': ['match_id'],
    'summoner_id': ['summoner_id'],
    'other': ['team_key', 'result'],
}

In [25]:
### The to_index built from match_by_summoner must be reused here.
# NOTE(review): pickle.load executes arbitrary code from the file; only load
# a to_index file that this notebook wrote itself.
with open(os.path.join(target_dir, f'{tier}_to_index.pkl'), 'rb') as file:
    to_index = pickle.load(file)

### The index given to match_by_summoner's 'result' column is used for training,
### so it is indexed together with the other columns. The index given to
### match_by_match's 'result' column is not used for training, only when
### predicting the outcome, so it is indexed separately (LOSE -> 0, WIN -> 1).
to_index['other'].update({'result.LOSE': 0, 'result.WIN': 1})

for key, cols in cate_col.items():
    lookup = to_index[key]
    for col in cols:
        match_by_match_df[col] = match_by_match_df[col].map(lookup)

In [26]:
### Values that never appeared in match_by_summoner have no index and became
### NaN when match_by_match was mapped, so those rows are removed.
notna = ~match_by_match_df.isna().any(axis=1)
match_by_match_df = match_by_match_df.loc[notna]

In [27]:
### Keep only rows whose team (within a match) still has at least 2 summoners.
team_keys = ['match_id', 'team_key']

# Number of remaining rows per (match, team).
temp = match_by_match_df.groupby(team_keys).size()
valid_indices = temp.loc[temp.gt(1)].index

# Flag each row by whether its (match, team) pair is among the valid ones.
in_valid_team = match_by_match_df.set_index(team_keys).index.isin(valid_indices)
match_by_match_df = match_by_match_df[in_valid_team].sort_values(by=team_keys)

In [28]:
# Every column is cast to int before writing.
match_by_match_df = match_by_match_df.astype(int)

# NOTE: the file keeps a ".csv" name but its content is gzip-compressed.
match_by_match_df.to_csv(
    os.path.join(target_dir, f'{tier}_match_by_match_mod.csv'),
    mode='w',
    index=False,
    compression='gzip',
)