In [1]:
import os
import pickle

from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from scipy.sparse import vstack, hstack
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

# Task 1

1. Прочитайте и проанализируйте данные, выберите турниры, в которых есть данные о составах команд и повопросных результатах (поле mask в results.pkl). Для унификации предлагаю:
* взять в тренировочный набор турниры с dateStart из 2019 года;
* в тестовый — турниры с dateStart из 2020 года.

In [2]:
DATA_PLAYERS_FPATH = "./data/players.pkl"
DATA_RESULTS_FPATH = "./data/results.pkl"
DATA_TOURNAMENTS_FPATH = "./data/tournaments.pkl"
TRAIN_YEAR = 2019
TEST_YEAR = 2020

In [3]:
with open(DATA_PLAYERS_FPATH, "rb") as f:
    data_players = pickle.load(f)
    
with open(DATA_RESULTS_FPATH, "rb") as f:
    data_results = pickle.load(f)
    
with open(DATA_TOURNAMENTS_FPATH, "rb") as f:
    data_tournaments = pickle.load(f)

In [4]:
data_players[1]

{'id': 1, 'name': 'Алексей', 'patronymic': None, 'surname': 'Абабилов'}

In [5]:
data_results[1][0]

{'team': {'id': 242,
  'name': 'Команда Азимова',
  'town': {'id': 21, 'name': 'Баку'}},
 'mask': None,
 'current': {'name': 'Команда Азимова', 'town': {'id': 21, 'name': 'Баку'}},
 'questionsTotal': 0,
 'synchRequest': None,
 'position': 1,
 'controversials': [],
 'flags': [],
 'teamMembers': [{'flag': None,
   'usedRating': 0,
   'rating': 0,
   'player': {'id': 476,
    'name': 'Анар',
    'patronymic': 'Беюкага оглы',
    'surname': 'Азимов'}},
  {'flag': None,
   'usedRating': 0,
   'rating': 0,
   'player': {'id': 878,
    'name': 'Фариз',
    'patronymic': 'Наим оглы',
    'surname': 'Аликишибеков'}},
  {'flag': None,
   'usedRating': 0,
   'rating': 0,
   'player': {'id': 1872,
    'name': 'Аднан',
    'patronymic': 'Фариз оглы',
    'surname': 'Ахундов'}},
  {'flag': None,
   'usedRating': 0,
   'rating': 0,
   'player': {'id': 13721,
    'name': 'Балаш',
    'patronymic': 'Алекпер оглы',
    'surname': 'Касумов'}},
  {'flag': None,
   'usedRating': 0,
   'rating': 0,
   'play

In [6]:
data_tournaments[1]

{'id': 1,
 'name': 'Чемпионат Южного Кавказа',
 'dateStart': '2003-07-25T00:00:00+04:00',
 'dateEnd': '2003-07-27T00:00:00+04:00',
 'type': {'id': 2, 'name': 'Обычный'},
 'season': '/seasons/1',
 'orgcommittee': [],
 'synchData': None,
 'questionQty': None}

In [7]:
s = set()
for key in data_results.keys():
    for result in data_results[key]:
        mask = result.get("mask")
        if mask is not None:
            s.update(set(mask))
print(s)

{'?', '1', 'X', '0'}


In [48]:
def collect_data_result(tournament_id):
    data_results_collection = []
    collecting_result = dict()
    question_ids = []
    player_ids = []
    for result in data_results[tournament_id]:
        mask = result.get("mask")
        team_members = result.get("teamMembers")
        if team_members is None or len(team_members) < 1:
            continue
        if (mask is not None) and ("?" not in mask) and ("X" not in mask):
            collecting_result["tournament_id"] = tournament_id
            collecting_result["player_ids"] = [team_member["player"]["id"] for team_member in team_members]
            collecting_result["mask"] = list(map(int, mask))
            collecting_result["team_id"] = result["team"]["id"]
            data_results_collection.append(collecting_result)
            question_ids.extend([str(tournament_id) + "." + str(i) for i in range(len(mask))])
            player_ids.extend(collecting_result["player_ids"])
    return data_results_collection, question_ids, player_ids

In [50]:
train_data_results = []
test_data_results = []
train_question_ids = []
train_player_ids = []

train_data_tournaments = dict()
test_data_tournaments = dict()

for tournament_id in data_tournaments.keys():
    date_start = data_tournaments[tournament_id].get("dateStart")
    if date_start:
        year_start = pd.to_datetime(date_start).year
        if TRAIN_YEAR == year_start:
            data_results_collection, question_ids, player_ids = collect_data_result(tournament_id)
            train_data_results.extend(data_results_collection)
            train_data_tournaments[tournament_id] = data_tournaments[tournament_id]
            train_question_ids.extend(question_ids)
            train_player_ids.extend(player_ids)
                
        if TEST_YEAR == year_start:
            data_results_collection, _, _ = collect_data_result(tournament_id)
            test_data_results.extend(data_results_collection)
            test_data_tournaments[tournament_id] = data_tournaments[tournament_id]

train_question_ids = np.array(train_question_ids)
train_player_ids = np.unique(train_player_ids)

In [51]:
print("Train size:", len(train_data_tournaments))
print("Test size:", len(test_data_tournaments))

Train size: 687
Test size: 418


# Task 2

2. Постройте baseline-модель на основе линейной или логистической регрессии, которая будет обучать рейтинг-лист игроков. Замечания и подсказки:
* повопросные результаты — это фактически результаты броска монетки, и их предсказание скорее всего имеет отношение к бинарной классификации;
* в разных турнирах вопросы совсем разного уровня сложности, поэтому модель должна это учитывать; скорее всего, модель должна будет явно обучать не только силу каждого игрока, но и сложность каждого вопроса;
* для baseline-модели можно забыть о командах и считать, что повопросные результаты команды просто относятся к каждому из её игроков.

In [52]:
train_data_tournaments[4772]

{'id': 4772,
 'name': 'Синхрон северных стран. Зимний выпуск',
 'dateStart': '2019-01-05T19:00:00+03:00',
 'dateEnd': '2019-01-09T19:00:00+03:00',
 'type': {'id': 3, 'name': 'Синхрон'},
 'season': '/seasons/52',
 'orgcommittee': [{'id': 28379,
   'name': 'Константин',
   'patronymic': 'Владимирович',
   'surname': 'Сахаров'}],
 'synchData': {'dateRequestsAllowedTo': '2019-01-09T23:59:59+03:00',
  'resultFixesTo': '2019-01-19T23:59:59+03:00',
  'resultsRecapsTo': '2019-01-11T23:59:59+03:00',
  'allowAppealCancel': True,
  'allowNarratorErrorAppeal': False,
  'dateArchivedAt': '2019-01-26T23:59:59+03:00',
  'dateDownloadQuestionsFrom': '2019-01-04T00:00:00+03:00',
  'dateDownloadQuestionsTo': '2019-01-09T19:00:00+03:00',
  'hideQuestionsTo': '2019-01-09T23:59:59+03:00',
  'hideResultsTo': '2019-01-09T23:59:59+03:00',
  'allVerdictsDone': None,
  'instantControversial': True},
 'questionQty': {'1': 12, '2': 12, '3': 12}}

In [55]:
train_data_results[0]

{'tournament_id': 4772,
 'player_ids': [106582, 174472, 185941, 188948],
 'mask': [1,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 'team_id': 68319}

In [56]:
players_ohe = OneHotEncoder().fit(train_player_ids.reshape(-1, 1))
questions_ohe = OneHotEncoder().fit(train_question_ids.reshape(-1, 1))

In [57]:
x_train = []
y_train = []

for data_result in tqdm(train_data_results, total=len(train_data_results)):
    tournament_id = data_result["tournament_id"]
    mask = data_result["mask"]
    player_ids = data_result["player_ids"]
    question_ids = [str(tournament_id) + "." + str(i) for i in range(len(mask))]
    x_players_train = np.array([np.full((len(mask), ), _id) for _id in player_ids]).reshape(-1, 1)
    x_players_train_ohe = players_ohe.transform(x_players_train)
    x_questions_train = np.tile(question_ids, len(player_ids)).reshape(-1, 1)
    x_questions_train_ohe = questions_ohe.transform(x_questions_train)
    y_train.append(np.tile(mask, len(player_ids)).reshape(-1, 1))
    x_train.append(hstack([x_players_train_ohe, x_questions_train_ohe]))

  0%|          | 0/73399 [00:00<?, ?it/s]

KeyboardInterrupt: 