In [1]:
import orjson
import os
import pandas as pd
from tqdm import tqdm
import orjson

In [2]:
tier = 'diamond'
output_dir = "/home/piddle/hdd/matches"
target_dir = os.path.join(output_dir, tier)

In [None]:
path = os.path.join(target_dir, os.listdir(target_dir)[6])
file = orjson.loads(open(path, 'rb').read())
file

In [4]:
all_column=['match_id', 'game_length_second', 'summoner_id', 'summoner_level', 'champion_id', 'team_key', 'position',
             'trinket_item', 'item_0', 'item_1', 'item_2', 'item_3', 'item_4', 'rune_0', 'rune_1', 'spell_0', 'spell_1',
             'champion_level', 'damage_self_mitigated', 'damage_dealt_to_objectives', 'damage_dealt_to_turrets',
             'total_damage_taken', 'total_damage_dealt', 'total_damage_dealt_to_champions', 'time_ccing_others',
             'vision_score', 'vision_wards_bought_in_game', 'sight_wards_bought_in_game', 'ward_kill', 'ward_place',
             'turret_kill', 'kill', 'death', 'assist', 'neutral_minion_kill', 'gold_earned', 'total_heal', 'result']
    
cols1 = ['champion_id', 'team_key', 'position', 'trinket_item']

stat_cols = ['champion_level', 'damage_self_mitigated', 'damage_dealt_to_objectives', 'damage_dealt_to_turrets',
            'total_damage_taken', 'total_damage_dealt', 'total_damage_dealt_to_champions', 'time_ccing_others',
            'time_ccing_others', 'vision_wards_bought_in_game', 'sight_wards_bought_in_game', 'ward_kill', 'ward_place',
            'turret_kill', 'kill', 'death', 'assist', 'neutral_minion_kill', 'gold_earned', 'total_heal']

In [5]:
def parse_match(file_iter) -> pd.DataFrame:
    data_chunk = []
    for n, file in enumerate(tqdm(file_iter)):
        with open(file.path, 'rb') as f:
            try:
                json_data = orjson.loads(f.read())
            except Exception as e:
                print('Error reading {}: {}'.format(file.path, e))
                continue

        for match in json_data:
            for participant in match['participants']:
                data= {}
                data['match_id'] = match['id']
                data['game_length_second'] = match['game_length_second']

                data['summoner_id'] = participant['summoner']['summoner_id']
                data['summoner_level'] = participant['summoner']['level']

                for col in cols1:
                    data[col] = participant[col]

                for i, item in enumerate(participant['items']):
                    data[f'item_{i}'] = item

                data['rune_0'] = participant['rune']["primary_rune_id"]
                data['rune_1'] = participant['rune']["secondary_page_id"]
                data['spell_0'] = participant['spells'][0]
                data['spell_1'] = participant['spells'][1]

                stats = participant['stats']
                for col in stat_cols:
                    data[col] = stats[col] / match['game_length_second']

                data['vision_score'] = stats['vision_score']
                data['result'] = stats['result']

                data_chunk.append(data)

        if n % 20000 == 0:
            df = pd.DataFrame(data_chunk, columns=all_column)
            data_chunk = []
            yield df

    else:
        df = pd.DataFrame(data_chunk, columns=all_column)
        yield df

In [None]:
file_iter = os.scandir(target_dir)
participants = pd.DataFrame(columns=all_column)
participants.to_csv(f'../data/{tier}_matches.csv', mode='w', index=False)

for df in parse_match(file_iter):
    pass
    df.to_csv(f'../data/{tier}_matches.csv', mode='a', index=False, header=False)

In [None]:
df = pd.read_csv(f'../data/{tier}_matches.csv')
df

In [23]:
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torch.nn.functional as F
from torch import nn

import itertools

In [24]:
class MyDataset(Dataset):
    def __init__(self, df):
        self.df = df
        self.combinations = [list(element) for element in itertools.combinations(range(5), 2)]
        self.combinations_len = len(self.combinations)

    def __getitem__(self, idx):
        df_idx = idx // self.combinations_len
        combination_idx = idx % self.combinations_len

        x = self.df.iloc[df_idx, self.combinations[combination_idx]]
        x = torch.tensor(x, dtype=torch.int32, device='cuda')

        y = self.df.iloc[df_idx, -1]
        y = torch.tensor(y, dtype=torch.float64, device='cuda')

        return x, y
        
    def __len__(self):
        return len(self.combinations) * len(self.df)

In [26]:
class SimilarityModel(nn.Module):
    def __init__(self, config):
        super(SimilarityModel, self).__init__()
        self.embedding = nn.Embedding(config['n_layers'], config['emb_size'])


    def forward(self, input):
        input = input.transpose_(0, 1)
        embedded = self.embedding(input)
        A = embedded[0]
        B = embedded[1]
        output = F.cosine_similarity(A, B, dim=1)     

        return output

In [31]:
df = pd.read_csv(f'../data/{tier}_matches.csv')

config = {}
config['n_layers'] = int(df.drop(columns='result').max().max() + 1)
config['emb_size'] = 3

dataset = MyDataset(df)
loader = DataLoader(dataset, batch_size=5, shuffle=True)

model = SimilarityModel(config).to('cuda')

optimizer = torch.optim.Adam(params=model.parameters(), lr=0.001)
loss_fun = nn.CrossEntropyLoss()

model.train()
for x, y in tqdm(loader):
    optimizer.zero_grad()
    output = model(x)

    # print(output, y)
    loss = loss_fun(output, y)
    loss.backward()
    optimizer.step()

  x = torch.tensor(x, dtype=torch.int32, device='cuda')
100%|██████████| 2521284/2521284 [2:06:43<00:00, 331.59it/s]  


In [33]:
import joblib

joblib.dump(model, 'cossim.joblib')

['cossim.joblib']