In [1]:
from tqdm import tqdm
import asyncio
from pymongo import MongoClient
from src.MongoDB_Controller import MongoDBController
import pandas as pd
import numpy as np

In [2]:
# Chunk size handed to the Mongo controller; the export cell at the end of the
# notebook reuses it as the number of ids per `$in` query.
batch_size = 100_000

# Connect to the local MongoDB server and wrap the client in the project's controller.
client = MongoClient("mongodb://localhost:27017/")
mongo = MongoDBController(client, batch_size)

In [3]:
# Fetch only the identifying fields of every document; the full documents are
# read again, batch by batch, when the splits are written out.
id_fields = {'_id': 1, 'summonerId': 1, 'matchId': 1}
cursor = mongo['riot_match_modv1'].find({}, id_fields)

# Materialise the cursor before handing the records to pandas.
df = pd.DataFrame(list(cursor))
df

Unnamed: 0,_id,summonerId,matchId
0,6600207cceb0e0fccd73c0a8,194651,91774
1,6600207cceb0e0fccd73c0a9,284964,91774
2,6600207cceb0e0fccd73c0aa,33276,68435
3,6600207cceb0e0fccd73c0ab,196080,106328
4,6600207cceb0e0fccd73c0ac,139435,88103
...,...,...,...
1320225,660020efceb0e0fccd87e5c9,51880,106974
1320226,660020efceb0e0fccd87e5ca,115715,106657
1320227,660020efceb0e0fccd87e5cb,275194,106657
1320228,660020efceb0e0fccd87e5cc,267620,106657


In [4]:
# Keep only summoners that appear in more than one row.
rows_per_summoner = df['summonerId'].value_counts()
repeat_summoners = rows_per_summoner[rows_per_summoner > 1].index

df = df[df['summonerId'].isin(repeat_summoners)].reset_index(drop=True)

df

Unnamed: 0,_id,summonerId,matchId
0,6600207cceb0e0fccd73c0a8,194651,91774
1,6600207cceb0e0fccd73c0a9,284964,91774
2,6600207cceb0e0fccd73c0aa,33276,68435
3,6600207cceb0e0fccd73c0ab,196080,106328
4,6600207cceb0e0fccd73c0ac,139435,88103
...,...,...,...
1163185,660020efceb0e0fccd87e5c9,51880,106974
1163186,660020efceb0e0fccd87e5ca,115715,106657
1163187,660020efceb0e0fccd87e5cb,275194,106657
1163188,660020efceb0e0fccd87e5cc,267620,106657


In [5]:
SPLIT_SEED = 42          # fixed so Restart & Run All reproduces the same split
TRAIN_SEED_DIVISOR = 6   # the first 1/6 of the shuffled rows initialises train_df

# Shuffle the row labels with a dedicated, seeded generator instead of the
# unseeded global NumPy state, so the split is reproducible.
rng = np.random.default_rng(SPLIT_SEED)
shuffled_indices = rng.permutation(df.index)
shuffled_df = df.loc[shuffled_indices].reset_index(drop=True)

# Only an initial slice goes to train here; everything else is provisionally
# valid. The following cells move rows from valid_df into train_df.
point = len(shuffled_df) // TRAIN_SEED_DIVISOR
train_df = shuffled_df.iloc[:point]
valid_df = shuffled_df.iloc[point:]

### 데이터 leakage를 막기 위해 matchId와 summonerId가 모두 train과 valid에 겹치지 않도록 나누려 했지만, 두 조건을 동시에 만족시키지는 못했다. 아래에서는 summonerId 기준으로만 분리한다.

In [6]:
# Match-level variant, currently disabled (kept for reference):
# train_match_unique = train_df['matchId'].unique()
# valid_match_bool = valid_df['matchId'].isin(train_match_unique)
# train_df = pd.concat([train_df, valid_df[valid_match_bool]], axis=0)

# Every validation row whose summoner already occurs in train is copied into train.
seen_summoners = train_df['summonerId'].unique()
rows_to_absorb = valid_df[valid_df['summonerId'].isin(seen_summoners)]

# drop_duplicates on `_id` keeps the cell safe to re-run: rows absorbed by an
# earlier execution are not added a second time.
train_df = (
    pd.concat([train_df, rows_to_absorb], axis=0)
    .drop_duplicates(['_id'])
    .sort_values(by=['summonerId', 'matchId'])
    .reset_index(drop=True)
)

train_df

Unnamed: 0,_id,summonerId,matchId
0,660020c3ceb0e0fccd80741b,3,3434
1,660020deceb0e0fccd855091,3,4892
2,660020c4ceb0e0fccd815fe5,3,40094
3,66002085ceb0e0fccd7631fc,3,99986
4,660020deceb0e0fccd85c790,4,7051
...,...,...,...
908724,660020a8ceb0e0fccd7c09b3,323690,2838
908725,660020a8ceb0e0fccd7c1680,323690,55635
908726,660020c3ceb0e0fccd80d82a,323690,110457
908727,6600207cceb0e0fccd752016,323691,40017


### 위 셀과 아래 셀을 데이터 수에 변화가 없을 때까지 반복 실행해야 leakage가 발생하지 않는다.

In [7]:
# Rows that were copied into train_df must leave the validation split.
already_in_train = valid_df['_id'].isin(train_df['_id'])

# Chained, non-inplace calls: the filtered frame is a slice of the previous
# valid_df, and mutating such a slice in place is what pandas warns about.
valid_df = (
    valid_df[~already_in_train]
    .sort_values(by=['summonerId', 'matchId'])
    .reset_index(drop=True)
)

# Verify the property this step exists for instead of relying on eyeballing
# row counts: no summoner may be present in both splits.
leaked_rows = int(valid_df['summonerId'].isin(train_df['summonerId']).sum())
assert leaked_rows == 0, (
    f"{leaked_rows} validation rows share a summonerId with train; "
    "re-run the previous cell and this one"
)

valid_df

Unnamed: 0,_id,summonerId,matchId
0,660020a9ceb0e0fccd7ccc42,1,41519
1,660020b2ceb0e0fccd7e4718,1,63037
2,660020d5ceb0e0fccd836ada,6,40203
3,6600208eceb0e0fccd7737c5,6,113504
4,660020b1ceb0e0fccd7d0d29,13,26820
...,...,...,...
254456,660020ccceb0e0fccd821804,323692,25152
254457,66002085ceb0e0fccd76a58e,323692,61983
254458,660020baceb0e0fccd7e8b45,323692,112752
254459,6600208eceb0e0fccd77e83f,323694,9758


In [8]:
ids = []
for i in tqdm(train_df['_id']):
    ids.append(i)

    if len(ids) == batch_size:
        datas = mongo['riot_match_modv1'].find({'_id': {'$in': ids}}, {'_id': 0})
        await mongo.save_to_mongo('riot_match_modv1_SAS_train', datas)
        ids = []

datas = mongo['riot_match_modv1'].find({'_id': {'$in': ids}}, {'_id': 0})
await mongo.save_to_mongo('riot_match_modv1_SAS_train', datas)


ids = []
for i in tqdm(valid_df['_id']):
    ids.append(i)

    if len(ids) == batch_size:
        datas = mongo['riot_match_modv1'].find({'_id': {'$in': ids}}, {'_id': 0})
        await mongo.save_to_mongo('riot_match_modv1_SAS_valid', datas)
        ids = []

datas = mongo['riot_match_modv1'].find({'_id': {'$in': ids}}, {'_id': 0})
await mongo.save_to_mongo('riot_match_modv1_SAS_valid', datas)


  0%|          | 0/908729 [00:00<?, ?it/s]

100%|██████████| 908729/908729 [01:13<00:00, 12427.06it/s]


InvalidOperation: No operations to execute