In [1]:
import json, pickle
import warnings
from collections import Counter
from datetime import datetime

import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as pyplot

from tqdm import tqdm

# Silence every warning for the rest of the session: no category, module or
# message filter is passed, so the "ignore" action applies to all of them.
warnings.filterwarnings("ignore")

In [63]:
# Load data from pickle

def _load_pickle(path):
    """Deserialize the object stored in the pickle file at ``path``.

    The file is opened inside a ``with`` block so the handle is closed as soon
    as loading finishes, instead of being left open for the garbage collector.
    """
    # NOTE: unpickling can execute arbitrary code; only load files you trust.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


tournaments = _load_pickle('data/tournaments.pkl')
results = _load_pickle('data/results.pkl')
players = _load_pickle('data/players.pkl')

In [71]:
def hasValidProperties(tournament_info):
    """Return True when every team entry has a mask and at least one member.

    ``tournament_info`` is an iterable of team dicts. For each team the value
    under ``'mask'`` is looked up (a missing key counts as ``None``) and the
    ids found at ``player['player']['id']`` for each item of
    ``team['teamMembers']`` are collected. The first team whose mask is
    ``None`` or whose member list is empty makes the result False; an empty
    iterable yields True.
    """
    for entry in tournament_info:
        mask = entry.get('mask')
        member_ids = []
        for member in entry['teamMembers']:
            member_ids.append(member['player']['id'])
        has_mask = mask is not None
        if not (has_mask and member_ids):
            return False
    return True

In [72]:
# Bucket tournaments by the first four characters of "dateStart":
# "2019" goes to train, "2020" goes to test, anything else is dropped.
# Tournaments whose results fail hasValidProperties are skipped entirely.
train_tournaments = {}
test_tournaments = {}
for tournament_id, tournament_info in tqdm(tournaments.items()):
    if not hasValidProperties(results[tournament_id]):
        continue
    start_year = tournament_info["dateStart"][:4]
    if start_year == "2019":
        train_tournaments[tournament_id] = tournament_info
    elif start_year == "2020":
        test_tournaments[tournament_id] = tournament_info

100%|██████████| 5528/5528 [00:00<00:00, 10114.83it/s]


In [73]:
# Report how many tournaments ended up in each split (train first, then test).
for split in (train_tournaments, test_tournaments):
    print(len(split))

657
386


# 2

In [None]:
# Accumulators filled while walking the training tournaments.
player_ids = set()
player_ratings = {}
total_questions = 0
train_data = []

# Iterating a dict directly yields only its keys, so unpacking each item into
# (tournament_id, tournament_info) requires .items().
for tournament_id, tournament_info in tqdm(train_tournaments.items()):
    tournament_data = {'id': tournament_id}
    # Keep the record; otherwise the dict built above is discarded on the
    # next iteration and train_data stays empty.
    train_data.append(tournament_data)

total_players = len(player_ids)

In [54]:
# NOTE(review): `tournaments_train` is not assigned in any cell visible here
# (the split cell builds a dict named `train_tournaments`) — confirm where this
# object is created so the notebook survives Restart & Run All.
tournaments_train

Unnamed: 0,id,name,dateStart,dateEnd,type,season,orgcommittee,synchData,questionQty
3921,4772,Синхрон северных стран. Зимний выпуск,2019-01-05 19:00:00+03:00,2019-01-09T19:00:00+03:00,"{'id': 3, 'name': 'Синхрон'}",/seasons/52,"[{'id': 28379, 'name': 'Константин', 'patronym...",{'dateRequestsAllowedTo': '2019-01-09T23:59:59...,"{'1': 12, '2': 12, '3': 12}"
4115,4973,Балтийский Берег. 3 игра,2019-01-25 19:05:00+03:00,2019-01-29T19:00:00+03:00,"{'id': 3, 'name': 'Синхрон'}",/seasons/52,"[{'id': 23030, 'name': 'Марина', 'patronymic':...",{'dateRequestsAllowedTo': '2019-01-28T23:59:59...,"{'1': 12, '2': 12, '3': 12}"
4116,4974,Балтийский Берег. 4 игра,2019-03-01 19:05:00+03:00,2019-03-05T19:00:00+03:00,"{'id': 3, 'name': 'Синхрон'}",/seasons/52,"[{'id': 23030, 'name': 'Марина', 'patronymic':...",{'dateRequestsAllowedTo': '2019-03-04T23:59:59...,"{'1': 12, '2': 12, '3': 12}"
4117,4975,Балтийский Берег. 5 игра,2019-04-05 19:05:00+03:00,2019-04-09T19:00:00+03:00,"{'id': 3, 'name': 'Синхрон'}",/seasons/52,"[{'id': 23030, 'name': 'Марина', 'patronymic':...",{'dateRequestsAllowedTo': '2019-04-08T23:59:59...,"{'1': 12, '2': 12, '3': 12}"
4128,4986,ОВСЧ. 6 этап,2019-02-15 20:00:00+03:00,2019-02-19T20:00:00+03:00,"{'id': 3, 'name': 'Синхрон'}",/seasons/52,"[{'id': 59140, 'name': 'Борис', 'patronymic': ...",{'dateRequestsAllowedTo': '2019-02-19T23:59:59...,"{'1': 12, '2': 12, '3': 12}"
...,...,...,...,...,...,...,...,...,...
5271,6191,Всеармянский Интеллектуальный Фестиваль,2019-12-22 12:00:00+03:00,2019-12-22T16:00:00+03:00,"{'id': 2, 'name': 'Обычный'}",/seasons/53,"[{'id': 19981, 'name': 'Сейран', 'patronymic':...",,"{'1': 12, '2': 12, '3': 12}"
5326,6249,Школьный синхрон-lite. Сезон 3,2019-09-01 00:05:00+03:00,2020-04-30T23:55:00+03:00,"{'id': 5, 'name': 'Общий зачёт'}",/seasons/53,"[{'id': 23740, 'name': 'Владимир', 'patronymic...",,"{'1': 36, '2': 36, '3': 36, '4': 36, '5': 36, ..."
5331,6254,Школьная лига,2019-10-04 19:00:00+03:00,2020-03-22T19:00:00+03:00,"{'id': 5, 'name': 'Общий зачёт'}",/seasons/53,"[{'id': 39218, 'name': 'Владислав', 'patronymi...",,"{'1': 36, '2': 36, '3': 36, '4': 36, '5': 36, ..."
5332,6255,ОВСЧ,2019-09-20 20:00:00+03:00,2020-02-19T23:59:00+03:00,"{'id': 5, 'name': 'Общий зачёт'}",/seasons/53,"[{'id': 32901, 'name': 'Наиль', 'patronymic': ...",,"{'1': 36, '2': 36, '3': 36, '4': 36, '5': 36, ..."


In [6]:
valid_tournament_ids = set()
# A tournament id is collected as soon as one of its teams has a mask and at
# least one listed member; tournaments not in `tournament_ids` are skipped.
# TODO: masks can contain 'X' or '?' entries that still need handling. No '0'
# substitution is done here because nothing in this cell reads the mask
# afterwards; normalise it at the point where the mask is actually consumed.
for tournament_id, tournament_info in tqdm(results.items()):
    if tournament_id not in tournament_ids:
        continue
    for team in tournament_info:
        if team.get('mask') is None or len(team['teamMembers']) == 0:
            continue
        valid_tournament_ids.add(tournament_id)
        # One qualifying team is enough; adding the same id again is a no-op.
        break

        



100%|██████████| 5528/5528 [00:00<00:00, 13775.65it/s]


In [26]:
# Print the per-column summary (non-null counts and dtypes) of tournaments_train.
tournaments_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 687 entries, 4772 to 6370
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   name          687 non-null    object
 1   dateStart     687 non-null    object
 2   dateEnd       687 non-null    object
 3   type          687 non-null    object
 4   season        687 non-null    object
 5   orgcommittee  687 non-null    object
 6   synchData     428 non-null    object
 7   questionQty   687 non-null    object
dtypes: object(8)
memory usage: 48.3+ KB
