In [None]:
# Install the MongoDB driver used by the `pymongo` import below.
# NOTE(review): the version is not pinned, so a fresh environment may resolve a different release.
%pip install pymongo

In [6]:
from pymongo import MongoClient
from tqdm import tqdm
import pandas as pd
import orjson
import os
import numpy as np
import pickle
import asyncio


In [2]:
# Select the database.
# Client for the MongoDB instance addressed by the URI below (localhost, port 27017).
client = MongoClient("mongodb://localhost:27017/")
# Database and collection handles are looked up by name on the client.
db = client["loldb"]
# NOTE(review): none of the cells visible here read `collection_match`; the parsing
# pipeline below works from JSON files on disk — confirm whether this handle is still needed.
collection_match = db['diamond_match']

In [7]:
# Rank tier being processed; it also prefixes the CSV file names written by save_to_csv.
tier = 'diamond'
# Directory holding the raw match JSON files for that tier: <root>/<tier>.
stored_dir = os.path.join("/home/piddle/hdd/matches", tier)

In [8]:
# Participant-level fields that parse_match_by_match copies straight from the participant dict.
prepro_cols1 = ['champion_id', 'team_key', 'position', 'trinket_item']

# Fields that parse_match_by_match copies from participant['stats'].
# The list previously contained 'time_ccing_others' twice; the second copy sat exactly where
# recent_10_game_by_match_col has 'vision_score', so it is replaced by that name. The order now
# mirrors the stats section of recent_10_game_by_match_col (everything before 'result').
prepro_cols2 = ['champion_level', 'damage_self_mitigated', 'damage_dealt_to_objectives', 'damage_dealt_to_turrets',
                'total_damage_taken', 'total_damage_dealt', 'total_damage_dealt_to_champions', 'time_ccing_others',
                'vision_score', 'vision_wards_bought_in_game', 'sight_wards_bought_in_game', 'ward_kill', 'ward_place',
                'turret_kill', 'kill', 'death', 'assist', 'neutral_minion_kill', 'gold_earned', 'total_heal']

# Column order of the per-summoner CSV written by save_to_csv.
recent_10_game_by_summoner_col = ['summoner_id', 'match_id', 'team_key', 'position']

# Column order of the per-match CSV written by save_to_csv.
recent_10_game_by_match_col = [ 'match_id', 'team_key', 'position', 'game_length_second', 'summoner_level', 'champion_id',
                                'trinket_item', 'item_0', 'item_1', 'item_2', 'item_3', 'item_4', 'item_5', 'rune_0', 'rune_1', 'spell_0', 'spell_1',
                                'champion_level', 'damage_self_mitigated', 'damage_dealt_to_objectives', 'damage_dealt_to_turrets',
                                'total_damage_taken', 'total_damage_dealt', 'total_damage_dealt_to_champions', 'time_ccing_others',
                                'vision_score', 'vision_wards_bought_in_game', 'sight_wards_bought_in_game', 'ward_kill', 'ward_place',
                                'turret_kill', 'kill', 'death', 'assist', 'neutral_minion_kill', 'gold_earned', 'total_heal', 'result']

In [None]:
def parse_match_by_summoner(participant) -> dict:
    """Build the per-summoner row for one match participant.

    Args:
        participant: Mapping with a nested ``summoner`` mapping (providing
            ``summoner_id``) plus ``team_key`` and ``position`` entries.

    Returns:
        A dict with the keys ``summoner_id``, ``team_key`` and ``position``.

    Raises:
        KeyError: If any of the fields above is missing.
    """
    return {
        'summoner_id': participant['summoner']['summoner_id'],
        'team_key': participant['team_key'],
        'position': participant['position'],
    }


def parse_match_by_match(participant) -> dict:
    """Flatten one match participant into a single per-match row.

    Args:
        participant: Mapping with ``summoner`` (providing ``level``), the
            fields named in ``prepro_cols1``, an ``items`` sequence, a ``rune``
            mapping, a ``spells`` sequence with at least two entries, and a
            ``stats`` mapping holding the fields named in ``prepro_cols2``
            plus ``vision_score`` and ``result``.

    Returns:
        A flat dict: ``summoner_level``, the ``prepro_cols1`` fields, one
        ``item_<n>`` entry per element of ``items``, ``rune_0``/``rune_1``,
        ``spell_0``/``spell_1``, the ``prepro_cols2`` stats, ``vision_score``
        and ``result``.

    Raises:
        KeyError / IndexError: If an expected field is missing.
    """
    row = {'summoner_level': participant['summoner']['level']}

    # Fields copied as-is from the participant.
    row.update((name, participant[name]) for name in prepro_cols1)

    # One column per item slot, numbered in the order the items are listed.
    row.update((f'item_{slot}', item_id) for slot, item_id in enumerate(participant['items']))

    rune = participant['rune']
    row['rune_0'] = rune["primary_rune_id"]
    row['rune_1'] = rune["secondary_page_id"]

    spells = participant['spells']
    row['spell_0'], row['spell_1'] = spells[0], spells[1]

    # Numeric stats, followed by the two fields that are always read explicitly.
    stats = participant['stats']
    row.update((name, stats[name]) for name in prepro_cols2)
    row['vision_score'] = stats['vision_score']
    row['result'] = stats['result']

    return row


def gen_json(file_gen):
    """Yield the decoded JSON payload of every readable file in ``file_gen``.

    Args:
        file_gen: Iterable of objects exposing a ``path`` attribute (``parse``
            passes an ``os.scandir`` iterator).

    Yields:
        Whatever ``orjson.loads`` returns for the bytes of each file.

    Entries that cannot be opened, read or decoded are reported on stdout and
    skipped, so one bad entry does not abort the whole run.
    """
    for entry in tqdm(file_gen, mininterval=2):
        # open() is inside the try as well: a sub-directory or an unreadable
        # file is reported and skipped exactly like a malformed JSON file,
        # instead of raising out of the generator.
        try:
            with open(entry.path, 'rb') as fh:
                payload = orjson.loads(fh.read())
        except Exception as exc:
            print('Error reading {}: {}'.format(entry.path, exc))
            continue

        # Yield only after the file has been closed, so no handle stays open
        # while the consumer processes the payload.
        yield payload


async def gen_parse_json(json_data_gen):
    """Turn decoded match files into per-participant rows.

    Reads match data and, for every participant, builds the row for the
    per-match table (``recent_10_game_by_match_col``) and the row for the
    per-summoner table (``recent_10_game_by_summoner_col``).

    Args:
        json_data_gen: Iterable of decoded file payloads. A usable payload is
            a non-empty list of match dicts carrying ``id``,
            ``game_length_second`` and ``participants``.

    Yields:
        ``(match_by_match_id, match_by_summoner_id)`` tuples, one per
        participant, both tagged with the match's ``id``.
    """
    for json_data in json_data_gen:
        # Skip payloads that are not valid match data. Empty lists and
        # non-list payloads are skipped too, rather than failing on the
        # json_data[0] lookup.
        if not isinstance(json_data, list) or not json_data:
            continue
        first_match = json_data[0]
        if not isinstance(first_match, dict) or 'participants' not in first_match:
            continue

        for match in json_data:
            for participant in match['participants']:

                match_by_summoner_id = parse_match_by_summoner(participant)
                match_by_summoner_id['match_id'] = match['id']

                match_by_match_id = parse_match_by_match(participant)
                match_by_match_id['match_id'] = match['id']
                match_by_match_id['game_length_second'] = match['game_length_second']

                yield match_by_match_id, match_by_summoner_id


def save_to_csv(dump: dict):
    """Append the buffered rows to the two per-tier CSV files.

    Args:
        dump: Mapping whose ``'match'`` and ``'summoner_id'`` entries are the
            row collections for the per-match and per-summoner tables.

    Rows are appended (``mode='a'``) without index or header to
    ``<tier>_match_by_match.csv`` and ``<tier>_match_by_summoner.csv`` in the
    current working directory, using the module-level column orders.
    """
    layout = (
        ('match', recent_10_game_by_match_col, f'{tier}_match_by_match.csv'),
        ('summoner_id', recent_10_game_by_summoner_col, f'{tier}_match_by_summoner.csv'),
    )

    # Build every frame before writing anything, so a missing key fails
    # before either file is touched.
    pending = [
        (pd.DataFrame(dump[key], columns=columns), csv_name)
        for key, columns, csv_name in layout
    ]

    for frame, csv_name in pending:
        frame.to_csv(csv_name, mode='a', index=False, header=False)


async def collect_data(parsed_json_data_gen, dump: dict, chunk_size: int = 20000):
    """Buffer parsed rows in ``dump`` and flush them to CSV in chunks.

    Args:
        parsed_json_data_gen: Async iterable yielding
            ``(match_by_match_id, match_by_summoner_id)`` pairs.
        dump: Buffer dict. Its ``'match'`` and ``'summoner_id'`` entries are
            reset on entry and after every flush, so each caller should pass
            its own dict.
        chunk_size: Number of buffered rows that triggers a flush. Defaults to
            20000.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError('chunk_size must be a positive integer')

    dump['match'], dump['summoner_id'] = [], []
    async for match_by_match_id, match_by_summoner_id in parsed_json_data_gen:
        dump['match'].append(match_by_match_id)
        dump['summoner_id'].append(match_by_summoner_id)

        # Flush every `chunk_size` rows so the buffer stays bounded.
        if len(dump['match']) >= chunk_size:
            save_to_csv(dump)
            dump['match'], dump['summoner_id'] = [], []

    # Flush whatever is left after the last full chunk.
    save_to_csv(dump)


async def parse():
    """Parse every match file under ``stored_dir`` and append the rows to CSV.

    Builds the pipeline ``os.scandir`` -> ``gen_json`` -> ``gen_parse_json`` ->
    ``collect_data`` and waits for two collector tasks to finish. Both tasks
    draw from the same ``json_data_gen``, so each file is consumed by only one
    of them.
    """
    # The scandir iterator holds an OS directory handle; the with-block
    # releases it once all files have been consumed (or on error).
    with os.scandir(stored_dir) as file_gen:
        json_data_gen = gen_json(file_gen)

        # Every collector gets its own buffer dict. collect_data resets and
        # flushes the dict it receives, so a shared dict would let one task
        # wipe rows buffered by the other.
        workers = [
            asyncio.create_task(collect_data(gen_parse_json(json_data_gen), {}))
            for _ in range(2)
        ]
        await asyncio.gather(*workers)


In [None]:
if __name__ == '__main__':
    # asyncio.run() raises RuntimeError when the current thread already has a
    # running event loop, which is the case inside a Jupyter/IPython kernel.
    # Detect that situation and run the pipeline on a worker thread with its
    # own loop; as a plain script the usual asyncio.run() path is taken.
    try:
        asyncio.get_running_loop()
    except RuntimeError:
        asyncio.run(parse())
    else:
        from concurrent.futures import ThreadPoolExecutor

        with ThreadPoolExecutor(max_workers=1) as pool:
            # .result() blocks until parsing is done and re-raises any error
            # from the worker thread in this cell.
            pool.submit(asyncio.run, parse()).result()