In [None]:
%pip install aiofiles numpy pandas orjson tqdm

In [1]:
from tqdm import tqdm
import pandas as pd
import orjson
import os
import numpy as np
import pickle
import asyncio

In [2]:
# Tier whose match dumps are processed; it also prefixes every CSV/pickle written below.
tier = 'diamond'
# Directory scanned for the raw match JSON files of that tier.
stored_dir = os.path.join("/home/piddle/hdd/matches", tier)

# json파일에 필요한 column 뽑아서 csv로 저장

In [None]:
# Keys copied verbatim from the top level of each participant record.
prepro_cols1 = ['champion_id', 'team_key', 'position', 'trinket_item']

# Keys copied verbatim from participant['stats'].
# ('time_ccing_others' used to be listed twice; the duplicate only re-assigned the same value.)
prepro_cols2 = ['champion_level', 'damage_self_mitigated', 'damage_dealt_to_objectives', 'damage_dealt_to_turrets',
                'total_damage_taken', 'total_damage_dealt', 'total_damage_dealt_to_champions', 'time_ccing_others',
                'vision_wards_bought_in_game', 'sight_wards_bought_in_game', 'ward_kill', 'ward_place',
                'turret_kill', 'kill', 'death', 'assist', 'neutral_minion_kill', 'gold_earned', 'total_heal']

# Column order of the per-summoner CSV.
recent_10_game_by_summoner_col = ['summoner_id', 'match_id', 'team_key', 'position']

# Column order of the per-match CSV.
recent_10_game_by_match_col = [ 'match_id', 'team_key', 'position', 'game_length_second', 'summoner_level', 'champion_id',
                                'trinket_item', 'item_0', 'item_1', 'item_2', 'item_3', 'item_4', 'item_5', 'rune_0', 'rune_1', 'spell_0', 'spell_1',
                                'champion_level', 'damage_self_mitigated', 'damage_dealt_to_objectives', 'damage_dealt_to_turrets',
                                'total_damage_taken', 'total_damage_dealt', 'total_damage_dealt_to_champions', 'time_ccing_others',
                                'vision_score', 'vision_wards_bought_in_game', 'sight_wards_bought_in_game', 'ward_kill', 'ward_place',
                                'turret_kill', 'kill', 'death', 'assist', 'neutral_minion_kill', 'gold_earned', 'total_heal', 'result']

In [None]:
def parse_match_by_summoner(participant) -> dict:
    """Build the summoner-level row for one participant.

    Args:
        participant: Mapping with a nested ``'summoner'`` mapping (holding
            ``'summoner_id'``) plus ``'team_key'`` and ``'position'`` entries.

    Returns:
        A dict with the keys ``summoner_id``, ``team_key`` and ``position``.

    Raises:
        KeyError: If any of the keys read above is missing.
    """
    return {
        'summoner_id': participant['summoner']['summoner_id'],
        'team_key': participant['team_key'],
        'position': participant['position'],
    }


def parse_match_by_match(participant) -> dict:
    """Flatten one participant record into the match-level row.

    The row collects, in this order: the summoner level, every key listed in
    ``prepro_cols1``, one ``item_<i>`` entry per element of
    ``participant['items']``, the two rune ids, the first two spells, every
    ``participant['stats']`` key listed in ``prepro_cols2``, and finally
    ``vision_score`` and ``result`` from the stats.

    Args:
        participant: Mapping describing one participant of a match.

    Returns:
        A flat dict keyed by column name.

    Raises:
        KeyError / IndexError: If an expected key or spell slot is missing.
    """
    row = {'summoner_level': participant['summoner']['level']}

    row.update((col, participant[col]) for col in prepro_cols1)
    row.update((f'item_{slot}', item) for slot, item in enumerate(participant['items']))

    rune = participant['rune']
    row['rune_0'] = rune["primary_rune_id"]
    row['rune_1'] = rune["secondary_page_id"]

    spells = participant['spells']
    row['spell_0'] = spells[0]
    row['spell_1'] = spells[1]

    stats = participant['stats']
    row.update((col, stats[col]) for col in prepro_cols2)
    row['vision_score'] = stats['vision_score']
    row['result'] = stats['result']

    return row


def gen_json(file_gen):
    """Yield the parsed JSON content of every readable entry in ``file_gen``.

    Args:
        file_gen: Iterable of objects exposing a ``path`` attribute
            (e.g. the entries produced by ``os.scandir``).

    Yields:
        The object returned by ``orjson.loads`` for each entry.

    Entries that cannot be opened or parsed are reported on stdout and
    skipped. ``open()`` is inside the ``try`` on purpose: previously an
    unopenable entry (for example a sub-directory) raised outside the handler
    and aborted the whole run.
    """
    for file in tqdm(file_gen, mininterval=2):
        try:
            with open(file.path, 'rb') as f:
                json_data = orjson.loads(f.read())
        except Exception as e:
            # Best effort: one bad file must not stop the remaining ones.
            print('Error reading {}: {}'.format(file.path, e))
            continue

        yield json_data


### Read the match payloads and emit, per participant, one row for the
### per-match table and one row for the per-summoner table.
async def gen_parse_json(json_data_gen):
    """Asynchronously yield ``(match_row, summoner_row)`` for every participant.

    Args:
        json_data_gen: Iterable of parsed JSON payloads; a usable payload is a
            non-empty list of match mappings.

    Yields:
        Tuples ``(match_by_match_id, match_by_summoner_id)``; both dicts carry
        the match ``id`` under ``'match_id'`` and the first one also carries
        ``'game_length_second'``.

    Payloads that are not a non-empty list, or whose first match has no
    ``'participants'`` entry, are skipped. Empty lists and dict payloads used
    to raise ``IndexError`` / ``KeyError`` on ``json_data[0]``.
    """
    for json_data in json_data_gen:
        ### Skip anything that is not a non-empty list of matches
        if not isinstance(json_data, list) or not json_data:
            continue
        if 'participants' not in json_data[0]:
            continue

        for match in json_data:
            for participant in match['participants']:

                match_by_summoner_id = parse_match_by_summoner(participant)
                match_by_summoner_id['match_id'] = match['id']

                match_by_match_id = parse_match_by_match(participant)
                match_by_match_id['match_id'] = match['id']
                match_by_match_id['game_length_second'] = match['game_length_second']

                yield match_by_match_id, match_by_summoner_id


def save_to_csv(dump: dict):
    """Append the buffered rows to the two per-tier CSV files.

    Args:
        dump: Dict whose ``'match'`` and ``'summoner_id'`` entries hold the
            row dicts for the per-match and per-summoner tables.

    Both frames are built before anything is written. Rows are appended
    without a header and without the index; columns follow
    ``recent_10_game_by_match_col`` / ``recent_10_game_by_summoner_col``.
    """
    frames = {
        f'{tier}_match_by_match.csv': pd.DataFrame(dump['match'], columns=recent_10_game_by_match_col),
        f'{tier}_match_by_summoner.csv': pd.DataFrame(dump['summoner_id'], columns=recent_10_game_by_summoner_col),
    }

    for csv_path, frame in frames.items():
        frame.to_csv(csv_path, mode='a', index=False, header=False)


async def collect_data(parsed_json_data_gen, dump: dict):
    """Drain the async row generator, flushing to CSV every 20000 rows.

    Args:
        parsed_json_data_gen: Async iterable of
            ``(match_row, summoner_row)`` tuples.
        dump: Dict used as the row buffer; its ``'match'`` and
            ``'summoner_id'`` entries are reset at the start and after every
            flush.
    """
    dump['match'] = []
    dump['summoner_id'] = []

    async for match_row, summoner_row in parsed_json_data_gen:
        dump['match'].append(match_row)
        dump['summoner_id'].append(summoner_row)

        ### Keep buffering until a full chunk of 20000 rows is collected
        if len(dump['match']) % 20000 != 0:
            continue

        save_to_csv(dump)
        dump['match'] = []
        dump['summoner_id'] = []

    ### Write whatever is left in the buffer
    save_to_csv(dump)

In [None]:
async def parse():
    """Convert every JSON file under ``stored_dir`` into the two per-tier CSVs.

    The output files are first (re)created with only their header line, then
    all rows are streamed into them through ``collect_data``.

    A single consumer is used. The previous version started two tasks over the
    same generator and the same buffer dict; the second task could only see an
    exhausted generator, and it reset the shared buffer on start-up.
    """
    # The context manager releases the directory handle when the scan is done.
    with os.scandir(stored_dir) as file_gen:
        ### Create the output files containing only the header
        pd.DataFrame(columns=recent_10_game_by_match_col).to_csv(f'{tier}_match_by_match.csv', mode='w', index=False)
        pd.DataFrame(columns=recent_10_game_by_summoner_col).to_csv(f'{tier}_match_by_summoner.csv', mode='w', index=False)

        json_data_gen = gen_json(file_gen)

        dump = {}
        await collect_data(gen_parse_json(json_data_gen), dump)


if __name__ == '__main__':
    try:
        asyncio.get_running_loop()
    except RuntimeError:
        # No loop is running (plain script): asyncio.run can own one.
        asyncio.run(parse())
    else:
        # A loop is already running in this thread (e.g. a notebook kernel),
        # where asyncio.run() raises RuntimeError. Run the pipeline on its own
        # loop in a worker thread and block until it finishes; .result()
        # re-raises any exception from parse().
        from concurrent.futures import ThreadPoolExecutor

        with ThreadPoolExecutor(max_workers=1) as pool:
            pool.submit(asyncio.run, parse()).result()

# 전처리

In [3]:
def load_dfs():
    """Read the two per-tier CSV files back into DataFrames.

    Returns:
        Tuple ``(match_by_match_df, match_by_summoner_df)``.
    """
    by_summoner = pd.read_csv(f'{tier}_match_by_summoner.csv')
    by_match = pd.read_csv(f'{tier}_match_by_match.csv')
    return by_match, by_summoner


def prepro_match_by_match_df(match_by_match_df: pd.DataFrame):
    """Return a cleaned copy of the per-match table.

    Steps:
        1. Sort by and de-duplicate on ``(match_id, team_key, position)``.
        2. Drop rows whose ``game_length_second`` is 0 or whose ``result`` is
           ``'UNKNOWN'``.
        3. Keep only matches that still have exactly 10 rows.

    Row-level filters now run before the 10-participant check, so every
    surviving match really has 10 rows. Previously UNKNOWN rows were removed
    afterwards and could leave incomplete matches. The input frame is no
    longer modified in place.

    Args:
        match_by_match_df: Frame with at least the columns ``match_id``,
            ``team_key``, ``position``, ``game_length_second`` and ``result``.

    Returns:
        A new DataFrame with a fresh 0..n-1 index.
    """
    key_cols = ['match_id', 'team_key', 'position']
    cleaned = match_by_match_df.sort_values(by=key_cols).drop_duplicates(key_cols)

    ### Drop rows with a zero match length or an UNKNOWN result
    valid_row = (cleaned['game_length_second'] != 0) & (cleaned['result'] != 'UNKNOWN')
    cleaned = cleaned[valid_row]

    ### Drop matches that do not have exactly 10 participants
    ### (vectorised count instead of one Python call per match)
    rows_per_match = cleaned['match_id'].map(cleaned['match_id'].value_counts())
    cleaned = cleaned[rows_per_match == 10]

    return cleaned.reset_index(drop=True)


def prepro_match_by_summoner_df(match_by_summoner_df: pd.DataFrame):
    """Return a cleaned copy of the per-summoner table.

    Duplicated ``(match_id, summoner_id)`` pairs are removed (first occurrence
    kept), only the four relevant columns are retained, and rows with a
    missing value in any of them are dropped. The input frame is left
    untouched; it used to be de-duplicated in place.

    Args:
        match_by_summoner_df: Frame with at least the columns
            ``summoner_id``, ``match_id``, ``team_key`` and ``position``.

    Returns:
        A new DataFrame with a fresh 0..n-1 index.
    """
    kept_cols = ['summoner_id', 'match_id', 'team_key', 'position']
    return (
        match_by_summoner_df
        .drop_duplicates(['match_id', 'summoner_id'])[kept_cols]
        .dropna()
        .reset_index(drop=True)
    )

In [4]:
# Load both CSV tables, then clean each one with its own preprocessing function.
match_by_match_df, match_by_summoner_df = load_dfs()
match_by_match_df = prepro_match_by_match_df(match_by_match_df)
match_by_summoner_df = prepro_match_by_summoner_df(match_by_summoner_df)

In [5]:
def set_intersection(match_by_match_df: pd.DataFrame, match_by_summoner_df: pd.DataFrame):
    """Restrict both tables to the match ids they have in common.

    Args:
        match_by_match_df: Per-match table with a ``match_id`` column.
        match_by_summoner_df: Per-summoner table with a ``match_id`` column.

    Returns:
        Tuple ``(match_by_match_df, match_by_summoner_df, intersec)`` where
        both frames are filtered and re-indexed from 0, and ``intersec`` is
        the sorted array of shared match ids.
    """
    shared_ids = np.intersect1d(
        match_by_summoner_df['match_id'].unique(),
        match_by_match_df['match_id'].unique(),
    )

    summoner_rows = match_by_summoner_df.loc[match_by_summoner_df['match_id'].isin(shared_ids)]
    summoner_rows = summoner_rows.reset_index(drop=True)

    match_rows = match_by_match_df.loc[match_by_match_df['match_id'].isin(shared_ids)]
    match_rows = match_rows.reset_index(drop=True)

    return match_rows, summoner_rows, shared_ids


def set_num_summoner_id_equal_to_10(df: pd.DataFrame):
    """Keep exactly 10 rows for every summoner that has at least 10.

    Summoners with fewer than 10 rows are removed entirely; for the others
    only their first 10 rows (in the frame's current order) are kept.

    Args:
        df: Frame with a ``summoner_id`` column.

    Returns:
        The filtered frame, re-indexed from 0.
    """
    rows_per_summoner = df.groupby('summoner_id').size()
    eligible = rows_per_summoner.index[rows_per_summoner >= 10]

    kept = df[df['summoner_id'].isin(eligible)]

    ### Summoners with more than 10 rows keep only their first 10
    return kept.groupby('summoner_id').head(10).reset_index(drop=True)


def interpro_match(match_by_match_df: pd.DataFrame, match_by_summoner_df: pd.DataFrame):
    """Alternate the two filters until neither table shrinks any more.

    Each round restricts both tables to their shared match ids and then
    reduces the summoner table to summoners with exactly 10 rows. The current
    row counts are printed at the start of every round.

    Args:
        match_by_match_df: Per-match table.
        match_by_summoner_df: Per-summoner table.

    Returns:
        Tuple ``(match_by_match_df, match_by_summoner_df, match_intersec)``
        with ``match_intersec`` being the shared match ids of the last round.
    """
    last_sizes = (len(match_by_match_df), len(match_by_summoner_df))

    while True:
        print(len(match_by_match_df), len(match_by_summoner_df))
        ### Keep only rows whose match_id appears in both tables
        match_by_match_df, match_by_summoner_df, match_intersec = set_intersection(match_by_match_df, match_by_summoner_df)

        ### Keep only summoners with at least 10 matches, trimmed to 10 rows each
        match_by_summoner_df = set_num_summoner_id_equal_to_10(match_by_summoner_df)

        ### Stop once a round leaves both row counts unchanged
        sizes = (len(match_by_match_df), len(match_by_summoner_df))
        if sizes == last_sizes:
            return match_by_match_df, match_by_summoner_df, match_intersec
        last_sizes = sizes


def postpro_match_by_match_df(match_by_match_df: pd.DataFrame):
    """Post-processing hook for the per-match table; currently returns the input unchanged."""
    return match_by_match_df


def postpro_match_by_summoner_df(match_by_summoner_df: pd.DataFrame):
    """Replace ``team_key``/``position`` by a single ``position_index`` column.

    ``position_index`` is the position code (ADC=0, JUNGLE=1, MID=2,
    SUPPORT=3, TOP=4) plus 0 for team ``'BLUE'`` and 5 for any other team.

    The computation is vectorised and the input frame is not modified. The
    previous version added the new column to the caller's frame and evaluated
    a Python function per row.

    Args:
        match_by_summoner_df: Frame with the columns ``summoner_id``,
            ``match_id``, ``team_key`` and ``position``.

    Returns:
        A new DataFrame with the columns ``summoner_id``, ``match_id`` and
        ``position_index``; the row index of the input is preserved.

    Raises:
        KeyError: If a ``position`` value is not one of the five known ones.
    """
    position_mapping = {'ADC': 0, 'JUNGLE': 1, 'MID': 2, 'SUPPORT': 3, 'TOP': 4}

    position_index = match_by_summoner_df['position'].map(position_mapping)
    unmapped = position_index.isna()
    if unmapped.any():
        # Same exception type the former dict lookup raised for unknown positions.
        raise KeyError(match_by_summoner_df.loc[unmapped, 'position'].iloc[0])

    team_offset = match_by_summoner_df['team_key'].ne('BLUE').astype('int64') * 5

    result = match_by_summoner_df[['summoner_id', 'match_id']].copy()
    result['position_index'] = position_index.astype('int64') + team_offset

    return result


In [6]:
# Reconcile the two tables (prints the row counts of both tables once per round),
# then apply the per-table post-processing steps.
match_by_match_df, match_by_summoner_df, match_intersec = interpro_match(match_by_match_df, match_by_summoner_df)
match_by_match_df = postpro_match_by_match_df(match_by_match_df)
match_by_summoner_df = postpro_match_by_summoner_df(match_by_summoner_df)


4779570 4824920
4779570 1275820
3608570 1275820


In [7]:
def get_cate_to_idx(cate_col: dict, df: pd.DataFrame):
    """Build a value -> integer index vocabulary for every column group.

    Args:
        cate_col: Maps a group name to the list of ``df`` columns sharing one
            vocabulary.
        df: Frame holding those columns.

    Returns:
        Dict mapping each group name to ``{value: index}``. Indices start at 1
        (0 is left free as the mask index) and are contiguous: a value seen in
        several columns of the same group keeps its first index. Values keep
        their original type.

    The previous version concatenated the per-column uniques onto an empty
    float array. Repeated values then had their index overwritten, leaving
    gaps, and integer values were coerced to float.
    """
    cate_to_index = {}
    for key, cols in cate_col.items():
        vocabulary = {}
        for col in cols:
            for value in df[col].unique():
                ### Index 0 is reserved for the mask, so numbering starts at 1.
                vocabulary.setdefault(value, len(vocabulary) + 1)
        cate_to_index[key] = vocabulary

    return cate_to_index


def map_cate_to_idx(cate_col: dict, to_idx: dict, df: pd.DataFrame):
    """Replace the values of every grouped column by their integer index.

    Args:
        cate_col: Maps a group name to the list of ``df`` columns in it.
        to_idx: Maps a group name to its ``{value: index}`` vocabulary.
        df: Frame to convert; its columns are overwritten in place.

    Returns:
        The same ``df`` object, with values absent from the vocabulary
        turned into NaN by ``Series.map``.
    """
    grouped_columns = ((group, column) for group, columns in cate_col.items() for column in columns)
    for group, column in grouped_columns:
        df[column] = df[column].map(to_idx[group])

    return df
        

def indexing(cate_col: list, match_id_to_index: dict, df: pd.DataFrame):
    """Convert the categorical columns of ``df`` to integer indices.

    Every value of a categorical column is first rewritten as
    ``"<column>.<value>"``. Vocabularies are then built for the
    ``'summoner_id'`` group and for the ``'other'`` group (all remaining
    columns except ``match_id``); ``match_id`` uses the vocabulary passed in.

    Args:
        cate_col: Names of the categorical columns of ``df``.
        match_id_to_index: Pre-built ``{"match_id.<id>": index}`` vocabulary.
        df: Frame to convert; its columns are overwritten in place.

    Returns:
        Tuple ``(df, cate_to_index)`` where ``cate_to_index`` has the keys
        ``'summoner_id'``, ``'other'`` and ``'match_id'``.
    """
    for col in cate_col:
        prefix = col + '.'
        df[col] = df[col].apply(lambda value, prefix=prefix: prefix + str(value))

    groups = {
        'summoner_id': [name for name in cate_col if name == 'summoner_id'],
        'other': [name for name in cate_col if name not in ('summoner_id', 'match_id')],
    }

    cate_to_index = get_cate_to_idx(groups, df)
    cate_to_index['match_id'] = match_id_to_index
    groups['match_id'] = ['match_id']

    return map_cate_to_idx(groups, cate_to_index, df), cate_to_index


def indexing_dfs(match_by_match_df: pd.DataFrame, match_by_summoner_df: pd.DataFrame, match_intersec: np.ndarray):
    """Index the categorical columns of both tables with a shared match vocabulary.

    Args:
        match_by_match_df: Per-match table.
        match_by_summoner_df: Per-summoner table.
        match_intersec: Match ids present in both tables.

    Returns:
        Tuple ``(match_by_match_df, match_by_summoner_df,
        match_by_summoner_cate_to_index, match_by_match_cate_to_index)``.

    The match vocabulary is built with a plain comprehension. The former
    ``np.vectorize`` call (without ``otypes``) raised ``ValueError`` when
    ``match_intersec`` was empty.
    """
    ### Build the match_id vocabulary; indices start at 1
    match_id_to_index = {
        'match_id' + '.' + str(match_id): idx
        for idx, match_id in enumerate(np.asarray(match_intersec).tolist(), start=1)
    }

    ### match_by_summoner_df
    cate_col = ['summoner_id', 'match_id']
    match_by_summoner_df, match_by_summoner_cate_to_index = indexing(cate_col, match_id_to_index, match_by_summoner_df)

    ### match_by_match_df
    cate_col = ['match_id', 'team_key', 'position', 'champion_id', 'trinket_item', 'item_0', 'item_1', 'item_2', 'item_3', 'item_4', 'item_5', 'rune_0', 'rune_1', 'spell_0', 'spell_1', 'result']
    match_by_match_df, match_by_match_cate_to_index = indexing(cate_col, match_id_to_index, match_by_match_df)

    return match_by_match_df, match_by_summoner_df, match_by_summoner_cate_to_index, match_by_match_cate_to_index


In [8]:
# Convert the categorical columns of both tables to integer indices and keep the vocabularies.
match_by_match_df, match_by_summoner_df, match_by_summoner_cate_to_index, match_by_match_cate_to_index = indexing_dfs(match_by_match_df, match_by_summoner_df, match_intersec)

In [9]:
def save_dfs(match_by_match_df: pd.DataFrame, match_by_summoner_df: pd.DataFrame, match_by_summoner_cate_to_index: dict, match_by_match_cate_to_index: dict):
    """Write the indexed tables and their vocabularies to disk.

    Each table is cast to ``int``, sorted, re-indexed and written as a
    gzip-compressed CSV; each vocabulary dict is pickled. All file names are
    prefixed with ``tier``.

    Args:
        match_by_match_df: Indexed per-match table.
        match_by_summoner_df: Indexed per-summoner table.
        match_by_summoner_cate_to_index: Vocabularies of the summoner table.
        match_by_match_cate_to_index: Vocabularies of the match table.
    """
    summoner_out = (
        match_by_summoner_df
        .astype(int)
        .sort_values(by=['summoner_id', 'match_id', 'position_index'])
        .reset_index(drop=True)
    )
    summoner_out.to_csv(f'{tier}_match_by_summoner_mod.csv', mode='w', index=False, compression='gzip')

    match_out = (
        match_by_match_df
        .astype(int)
        .sort_values(by=['match_id', 'team_key', 'position'])
        .reset_index(drop=True)
    )
    match_out.to_csv(f'{tier}_match_by_match_mod.csv', mode='w', index=False, compression='gzip')

    vocabularies = (
        (f'{tier}_summoner_to_index.pkl', match_by_summoner_cate_to_index),
        (f'{tier}_match_to_index.pkl', match_by_match_cate_to_index),
    )
    for pkl_path, vocabulary in vocabularies:
        with open(pkl_path, 'wb') as file:
            pickle.dump(vocabulary, file)

In [10]:
# Persist the indexed tables (gzip CSV) and both vocabularies (pickle).
save_dfs(match_by_match_df, match_by_summoner_df, match_by_summoner_cate_to_index, match_by_match_cate_to_index)