## 1. Подготовка данных

In [None]:
!wget https://www.dropbox.com/s/s4qj0fpsn378m2i/chgk.zip
!unzip chgk.zip

--2020-06-15 05:18:30--  https://www.dropbox.com/s/s4qj0fpsn378m2i/chgk.zip
Resolving www.dropbox.com (www.dropbox.com)... 162.125.82.1, 2620:100:6032:1::a27d:5201
Connecting to www.dropbox.com (www.dropbox.com)|162.125.82.1|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: /s/raw/s4qj0fpsn378m2i/chgk.zip [following]
--2020-06-15 05:18:30--  https://www.dropbox.com/s/raw/s4qj0fpsn378m2i/chgk.zip
Reusing existing connection to www.dropbox.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://uc21be2206ad7b6ec94be7c9ea35.dl.dropboxusercontent.com/cd/0/inline/A5pUQsXt8oBziMRTJKGgpwFp6VPUTILxnOuzJbaf1qBGKU5SxhfxYKTvyJHTS7YLKOJbSdkCsEomA9zrSB06Mo8WErO3yoPlWAHRZTNudU7IGuZ_-f3lCXkAZBbO9HsLWso/file# [following]
--2020-06-15 05:18:30--  https://uc21be2206ad7b6ec94be7c9ea35.dl.dropboxusercontent.com/cd/0/inline/A5pUQsXt8oBziMRTJKGgpwFp6VPUTILxnOuzJbaf1qBGKU5SxhfxYKTvyJHTS7YLKOJbSdkCsEomA9zrSB06Mo8WErO3yoPlWAHRZTNudU7IGuZ_-f3lCXkAZBb

In [None]:
import os
import pickle

import numpy as np
import pandas as pd
from scipy.special import logit, expit
from sklearn.linear_model import LogisticRegression, Ridge

from collections import defaultdict, Counter
from scipy.sparse import coo_matrix

In [None]:
with open("results.pkl", "rb") as f:
  results = pickle.load(f)
with open("tournaments.pkl", "rb") as f:
  tournaments = pickle.load(f)

In [None]:
len(results), len(tournaments)

(5528, 5528)

In [None]:
# Выбираем игры от 2019 года с информацией по ответам и игрокам

results_upd = defaultdict(list)
tournaments_upd = {}

for id, value in results.items():
  y = int(tournaments[id]["dateStart"][0:4])
  if y in [2019, 2020]:
    for team in value:
      mask = team.get("mask", None)
      team_members = team.get("teamMembers", [])
      if mask and team_members:
        results_upd[id].append(team)
  else:
    continue

tournaments_upd = {k: tournaments[k] for k, v in results_upd.items()}
results, tournaments = dict(results_upd), tournaments_upd

In [None]:
len(results), len(tournaments)

(848, 848)

In [None]:
# Делаем разбивку по годам на тест и трейн

train_data, test_data = {}, {}
for id, tournament in results.items():
  y = int(tournaments[id]["dateStart"][0:4])
  
  if y == 2019:
    train_data[id] = {"tournament_name": tournaments[id]["name"]}
    team_results = []
    for team_result in tournament:
      team_info = {"team_id": team_result["team"]["id"], "mask": team_result["mask"], "position": team_result["position"],
        "teamMembers": [team_member["player"]["id"] for team_member in team_result['teamMembers']]}
      team_results.append(team_info)
    train_data[id]["tournament_result"] = team_results

  elif y == 2020:
    test_data[id] = {"tournament_name": tournaments[id]["name"]}
    team_results = []
    for team_result in tournament:
      team_info = {"team_id": team_result["team"]["id"], "mask": team_result["mask"], "position": team_result["position"],
        "teamMembers": [team_member["player"]["id"] for team_member in team_result['teamMembers']]}
      team_results.append(team_info)
    test_data[id]["tournament_result"] = team_results

In [None]:
len(train_data), len(test_data)

(675, 173)

In [None]:
# Заменяем id игроков, отсутствующих в тесте,
# и игроков, переживших мало игр

games_by_player = Counter()
for tournament in train_data.values():
  for team in tournament["tournament_result"]:
    games_by_player += Counter({member: 1 for member in team["teamMembers"]})

low_exp_players = set()
for member, games in games_by_player.most_common():
  if games <= 5:
    low_exp_players.add(member)

y2019_players = set()
for tournament in train_data.values():
  for team in tournament["tournament_result"]:
    team["teamMembers"] = set([-1 if member in low_exp_players else member for member in team["teamMembers"]])
    y2019_players.update(team["teamMembers"])
        
for tournament in test_data.values():
  for team in tournament["tournament_result"]:
    team["teamMembers"] = set([-1 if member not in y2019_players else member for member in team["teamMembers"]])

In [None]:
len(train_data), len(test_data)

(675, 173)

In [None]:
# Подготавливаем признаки и классы

members_and_questions = set()
for id, tournament in train_data.items():
  for team in tournament["tournament_result"]:
    members_and_questions.update(team["teamMembers"])
    questions_ids = (f"{id}_{question_num}" for question_num in range(len(team["mask"])))
    members_and_questions.update(questions_ids)
members_and_questions = {v: i for i, v in enumerate(members_and_questions)}

In [None]:
def prepare_data(data, train):
  global members_and_questions
  rows = []
  cols = []
  y = []
  current_row = 0
  for id, tournament in data.items():
    for team in tournament["tournament_result"]:
      for quest_numb, mask in enumerate(team["mask"]):
        try:
          y.extend([int(mask)] * len(team["teamMembers"]))
        except ValueError:
          continue
        for member in team["teamMembers"]:
          rows.append(current_row)
          cols.append(members_and_questions[member])
          if train:    
            rows.append(current_row)
            cols.append(members_and_questions[f"{id}_{quest_numb}"])
          current_row += 1
              
  rows = np.asarray(rows, dtype=np.int32)
  cols = np.asarray(cols, dtype=np.int32)
  data = np.ones(len(rows))
  y = np.asarray(y, dtype=np.int8)
      
  X = coo_matrix((data, (rows, cols)), shape=(len(y), len(members_and_questions)))
  return X, y

X_train, y_train = prepare_data(train_data, True)
X_test, y_test = prepare_data(test_data, False)

In [None]:
X_train.shape, len(y_train), X_test.shape, len(y_test)

((18000070, 49836), 18000070, (3795338, 49836), 3795338)

## 2. Построение baseline-модели

In [None]:
# С помощью модели логистической регресии будем предсказывать вероятность правильного ответа
# для отдельного игрока на конкретный вопрос, при этом на основе данных 2019 года считаем,
# что если команда ответила на вопрос верно, то и все игроки ответили верно на тот же вопрос

model = LogisticRegression()
model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [None]:
from sklearn.metrics import log_loss

predict_train = model.predict_proba(X_train)[:, 1]
predict_test = model.predict_proba(X_test)[:, 1]

print (log_loss(y_train, predict_train))
print (log_loss(y_test, predict_test))

0.5058418323613872
0.696304075274913


In [None]:
len(predict_train), len(predict_test)

(18000070, 3795338)

## 3. Качество рейтинг-системы

In [None]:
# Для построения рейтинг-системы необходимо отсортировать команды в зависимости от
# того, как много игроков в них правильно отвечают на вопросы.
# Будем считать, что команда дает верный ответ при еврном ответе хотя бы одного игрока

# Готовим датафрейм с местами команд для тестовых данных

tournament_ids = []
team_ids = []
positions = []
for id, tournament in test_data.items():
  for team in tournament["tournament_result"]:
    tournament_ids.append(id)
    team_ids.append(team["team_id"])
    positions.append(team["position"])

true_positions = pd.DataFrame.from_dict({
  'tournament_id': tournament_ids,
  'team_id': team_ids,
  'position_true': positions,
})

In [None]:
# Готовим массивы с id игр и id команд с тем же размером, что и у предсказаний выше

is_answered = []
tournament_ids_test = []
team_ids_test = []
for id, tournament in test_data.items():
  for team in tournament["tournament_result"]:
    team_id = team["team_id"]
    for answer in team["mask"]:
      try:
        is_answered.extend([int(answer)] * len(team["teamMembers"]))
        tournament_ids_test.extend([id] * len(team["teamMembers"]))
        team_ids_test.extend([team_id] * len(team["teamMembers"]))
      except ValueError:
        continue

tournament_ids_test = np.asarray(tournament_ids_test, dtype=np.int32)
team_ids_test = np.asarray(team_ids_test, dtype=np.int32)

In [None]:
# Готовим датафрейм с предсказанными местами команд для тестовых данных

predicted_positions = pd.DataFrame.from_dict({
  'tournament_id': tournament_ids_test,
  'team_id': team_ids_test,
  '1-predict': 1 - predict_test,
})
    
predicted_positions = predicted_positions.groupby(["tournament_id", "team_id"]).agg("prod").reset_index()
predicted_positions["position_pred"] = predicted_positions.groupby("tournament_id")["1-predict"].rank("dense")

In [None]:
# Считаем корреляции

from scipy.stats import kendalltau, spearmanr

merge = pd.merge(predicted_positions, true_positions, on=["tournament_id", "team_id"])

kendall = []
spearman = []
tournam_ids = merge['tournament_id'].unique()

for tournam_id in tournam_ids:
  kendall += [kendalltau(merge[merge['tournament_id']==tournam_id]["position_pred"], merge[merge['tournament_id']==tournam_id]["position_true"]).correlation]
  spearman += [spearmanr(merge[merge['tournament_id']==tournam_id]["position_pred"], merge[merge['tournament_id']==tournam_id]["position_true"]).correlation] 

kendall = np.asarray(kendall)
spearman = np.asarray(spearman)
kendall[np.isnan(kendall)] = 0.0
spearman[np.isnan(spearman)] = 0.0

kendall_corr = np.mean(kendall)
spearman_corr = np.mean(spearman)
print ('Ранговые корреляции Спирмена и Кендалла равны', spearman_corr, 'и', kendall_corr)

Ранговые корреляции Спирмена и Кендалла равны 0.7509334072505259 и 0.598887798571282


## 4. EM-схема для обучения модели

In [None]:
# На Е шаге будем оценивать мат ожидание скрытой переменной z для ответов игроков.
# Z будет определять событие ответа игрока при условии ответа команды.

# На М шаге оцениваем ответы игроков при имеющихся сложностях вопросов, опираясь на z

# Готовим массивы с id игр, id команд, id игроков с тем же размером, что и X_train
# на каждого участника

is_answered = []
tournament_ids_train = []
team_ids_train = []
player_ids_train = []
questions_train = []
for id, tournament in train_data.items():
  for team in tournament["tournament_result"]:
    team_id = team["team_id"]
    for question, answer in enumerate(team["mask"]):
      try:
          is_answered.extend([int(answer)] * len(team["teamMembers"]))
          tournament_ids_train.extend([id] * len(team["teamMembers"]))
          team_ids_train.extend([team_id] * len(team["teamMembers"]))
          questions_train.extend([question] * len(team["teamMembers"]))
      except ValueError:
          continue
      for member in team["teamMembers"]:
          player_ids_train.append(member)

tournament_ids_train = np.asarray(tournament_ids_train, dtype=np.int32)
team_ids_train = np.asarray(team_ids_train, dtype=np.int32)
questions_train = np.asarray(questions_train, dtype=np.int32)

In [None]:
import time

for s in range(1, 11):

  start = time.time()

  # E-шаг

  predicts_and_other_info = pd.DataFrame.from_dict({
    'tournament_id': tournament_ids_train,
    'team_id': team_ids_train,
    'player_id': player_ids_train,
    'questions': questions_train,
    'predict': predict_train,
  })
  predicts_and_other_info["1-predict"] = 1 - predicts_and_other_info["predict"]

  predicts_by_teams = predicts_and_other_info.drop(columns=["player_id", "predict"]).groupby(["tournament_id", "team_id", "questions"]).agg("prod").reset_index()
  predicts_by_teams["team_predict"] = 1 - predicts_by_teams["1-predict"]

  predicts_and_other_info = pd.merge(predicts_and_other_info.drop(columns="1-predict"), predicts_by_teams.drop(columns="1-predict"), on=["tournament_id", "team_id", "questions"])
  predicts_and_other_info["z"] = predicts_and_other_info["predict"] / predicts_and_other_info["team_predict"]

  z = predicts_and_other_info["z"].values
  z = np.where(y_train == 0, 0, z)
  z = np.clip(z, 1e-6, 1 - 1e-6)
  
  # M-шаг

  model = Ridge(alpha=5, solver="auto", tol=0.0001)
  model.fit(X_train, logit(z))
  predict_train = expit(model.predict(X_train))
  predict_test = expit(model.predict(X_test))
  
  # оценка качества
  predicted_positions = pd.DataFrame.from_dict({
    'tournament_id': tournament_ids_test,
    'team_id': team_ids_test,
    '1-predict': 1 - predict_test,
    })
      
  predicted_positions = predicted_positions.groupby(["tournament_id", "team_id"]).agg("prod").reset_index()
  predicted_positions["position_pred"] = predicted_positions.groupby("tournament_id")["1-predict"].rank("dense")

  merge = pd.merge(predicted_positions, true_positions, on=["tournament_id", "team_id"])

  kendall = []
  spearman = []
  tournam_ids = merge['tournament_id'].unique()

  for tournam_id in tournam_ids:
    kendall += [kendalltau(merge[merge['tournament_id']==tournam_id]["position_pred"], merge[merge['tournament_id']==tournam_id]["position_true"]).correlation]
    spearman += [spearmanr(merge[merge['tournament_id']==tournam_id]["position_pred"], merge[merge['tournament_id']==tournam_id]["position_true"]).correlation] 

  kendall = np.asarray(kendall)
  spearman = np.asarray(spearman)
  kendall[np.isnan(kendall)] = 0.0
  spearman[np.isnan(spearman)] = 0.0

  kendall_corr = np.mean(kendall)
  spearman_corr = np.mean(spearman)

  print(f"На {s} шаге корреляция Кендалла - {kendall_corr}, корреляция Спирмена: {spearman_corr}\n  время работы - {time.time() - start}")

На 1 шаге корреляция Кендалла - 0.6160538667075308, корреляция Спирмена: 0.7683078057498857
  время работы - 48.904988527297974
На 2 шаге корреляция Кендалла - 0.6207475765802938, корреляция Спирмена: 0.7724053339737121
  время работы - 47.77802872657776
На 3 шаге корреляция Кендалла - 0.6228419180377236, корреляция Спирмена: 0.7743692626011769
  время работы - 49.92509722709656
На 4 шаге корреляция Кендалла - 0.6245305114212129, корреляция Спирмена: 0.7756077552858928
  время работы - 50.63430905342102
На 5 шаге корреляция Кендалла - 0.6247279585776323, корреляция Спирмена: 0.7753385660512798
  время работы - 49.42472243309021
На 6 шаге корреляция Кендалла - 0.6233069069729232, корреляция Спирмена: 0.7745741825497473
  время работы - 48.65809512138367
На 7 шаге корреляция Кендалла - 0.623302541565511, корреляция Спирмена: 0.7745480613230324
  время работы - 48.47633123397827
На 8 шаге корреляция Кендалла - 0.623283416845366, корреляция Спирмена: 0.7745305690550128
  время работы - 48.

## 5. Рейтинг турниров по сложности

In [None]:
tournaments_weights = set()
for id, tournament in train_data.items():
  for team in tournament["tournament_result"]:
    questions_ids = (f"{id}_{question_num}" for question_num in range(len(team["mask"])))
    tournaments_weights.update(questions_ids)
tournaments_weights = {i: int(v.split("_")[0]) for i, v in enumerate(tournaments_weights) if isinstance(v, str)}

In [None]:
# Отсортируем игры по сложности вопросов
# Сложность вопросо будем находить через отношение суммы весов модели на количество вопросов
# для каждого турнира

import heapq
from urllib.parse import quote

tournaments_level = defaultdict(lambda: [0, 0, 0]) # сумма весов, число вопросов, сложность вопросов

for weight_index, weight in enumerate(model.coef_):
    try:
        tournament_id = tournaments_weights[weight_index]
    except KeyError:
        continue
    tournaments_level[tournament_id][0] += weight
    tournaments_level[tournament_id][1] += 1

for _, difficulty in tournaments_level.items():
    difficulty[2] = difficulty[0] / difficulty[1]

hard_tournaments = [
    train_data[key]['tournament_name'] for key, value in tournaments_level.items()
    if -value[2] in heapq.nlargest(10, [-diff[2] for diff in tournaments_level.values()])
]
easy_tournaments = [
    train_data[key]['tournament_name'] for key, value in tournaments_level.items()
    if value[2] in heapq.nlargest(10, [diff[2] for diff in tournaments_level.values()])
]

In [None]:
print('Турниры со сложными вопросами:')
for t in hard_tournaments:
  print (t, 'https://db.chgk.info/search/questions/' + quote(t))

Турниры со сложными вопросами:
sin(CHRON) III https://db.chgk.info/search/questions/sin%28CHRON%29%20III
Чемпионат Выборга https://db.chgk.info/search/questions/%D0%A7%D0%B5%D0%BC%D0%BF%D0%B8%D0%BE%D0%BD%D0%B0%D1%82%20%D0%92%D1%8B%D0%B1%D0%BE%D1%80%D0%B3%D0%B0
Открытый Кубок Серпухова https://db.chgk.info/search/questions/%D0%9E%D1%82%D0%BA%D1%80%D1%8B%D1%82%D1%8B%D0%B9%20%D0%9A%D1%83%D0%B1%D0%BE%D0%BA%20%D0%A1%D0%B5%D1%80%D0%BF%D1%83%D1%85%D0%BE%D0%B2%D0%B0
Славянка без раздаток. 1 этап https://db.chgk.info/search/questions/%D0%A1%D0%BB%D0%B0%D0%B2%D1%8F%D0%BD%D0%BA%D0%B0%20%D0%B1%D0%B5%D0%B7%20%D1%80%D0%B0%D0%B7%D0%B4%D0%B0%D1%82%D0%BE%D0%BA.%201%20%D1%8D%D1%82%D0%B0%D0%BF
Гарри Поттер и 3 по 12 https://db.chgk.info/search/questions/%D0%93%D0%B0%D1%80%D1%80%D0%B8%20%D0%9F%D0%BE%D1%82%D1%82%D0%B5%D1%80%20%D0%B8%203%20%D0%BF%D0%BE%2012
Чемпионат Минска. Лига А. Тур пятый https://db.chgk.info/search/questions/%D0%A7%D0%B5%D0%BC%D0%BF%D0%B8%D0%BE%D0%BD%D0%B0%D1%82%20%D0%9C%D0%B8%D0%BD%D1

In [None]:
print('Турниры с простыми вопросами:')
for t in easy_tournaments:
  print (t, 'https://db.chgk.info/search/questions/' + quote(t))

Турниры с простыми вопросами:
Кубок МТС https://db.chgk.info/search/questions/%D0%9A%D1%83%D0%B1%D0%BE%D0%BA%20%D0%9C%D0%A2%D0%A1
На несколько вопросов больше https://db.chgk.info/search/questions/%D0%9D%D0%B0%20%D0%BD%D0%B5%D1%81%D0%BA%D0%BE%D0%BB%D1%8C%D0%BA%D0%BE%20%D0%B2%D0%BE%D0%BF%D1%80%D0%BE%D1%81%D0%BE%D0%B2%20%D0%B1%D0%BE%D0%BB%D1%8C%D1%88%D0%B5
Серия Premier. Красно-жёлтые дни https://db.chgk.info/search/questions/%D0%A1%D0%B5%D1%80%D0%B8%D1%8F%20Premier.%20%D0%9A%D1%80%D0%B0%D1%81%D0%BD%D0%BE-%D0%B6%D1%91%D0%BB%D1%82%D1%8B%D0%B5%20%D0%B4%D0%BD%D0%B8
Синхронный студенческий Кубок Беларуси https://db.chgk.info/search/questions/%D0%A1%D0%B8%D0%BD%D1%85%D1%80%D0%BE%D0%BD%D0%BD%D1%8B%D0%B9%20%D1%81%D1%82%D1%83%D0%B4%D0%B5%D0%BD%D1%87%D0%B5%D1%81%D0%BA%D0%B8%D0%B9%20%D0%9A%D1%83%D0%B1%D0%BE%D0%BA%20%D0%91%D0%B5%D0%BB%D0%B0%D1%80%D1%83%D1%81%D0%B8
ОВСЧ. 4 этап https://db.chgk.info/search/questions/%D0%9E%D0%92%D0%A1%D0%A7.%204%20%D1%8D%D1%82%D0%B0%D0%BF
Летний гусарский кубок https

## 6. Топ игроков

In [None]:
is_answered = []
player_ids_test = []
questions_test = []
for id, tournament in test_data.items():
  for team in tournament["tournament_result"]:
    for question, answer in enumerate(team["mask"]):
      try:
          is_answered.extend([int(answer)] * len(team["teamMembers"]))
          questions_test.extend([question] * len(team["teamMembers"]))
      except ValueError:
          continue
      for member in team["teamMembers"]:
          player_ids_test.append(member)

questions_test = np.asarray(questions_test, dtype=np.int32)

In [None]:
df_test_players = pd.DataFrame.from_dict({
  'tournament_id': tournament_ids_test,
  'team_id': team_ids_test,
  'player_id': player_ids_test,
  'questions': questions_test,
  'predict': predict_test,
})
df_test_players["1-predict"] = 1 - df_test_players["predict"]
df_test_players["predict10"] = df_test_players["predict"]*10

In [None]:
# Перемножим значения предиктов для каждого игрока
# Значит игроки с более высокики вероятностями ответов и будут иметь высокие позиции

df_test_players_ratio = df_test_players[["player_id", 'predict10']].groupby(["player_id"]).agg("prod").reset_index()
df_test_players_quest_numb = df_test_players[["player_id", 'questions']].groupby(["player_id"]).agg("count").reset_index()
df_test_players_games = df_test_players[["player_id", 'tournament_id']].drop_duplicates().reset_index()
df_test_players_games = df_test_players_games.drop(columns="index").groupby(["player_id"]).agg("count").reset_index()
merge = pd.merge(df_test_players_ratio, df_test_players_quest_numb, on=["player_id"])
merge = pd.merge(merge, df_test_players_games, on=["player_id"])
merge.columns = ['player_id',	'ratio',	'questions',	'tournaments']
merge = merge.sort_values(by = 'ratio', ascending=False)
merge

Unnamed: 0,player_id,ratio,questions,tournaments
548,7008,inf,1033,23
2421,30152,inf,1669,43
2192,27403,4.299292e+146,517,13
1595,19915,5.704903e+137,1047,25
1482,18332,3.230422e+124,1295,33
...,...,...,...,...
6460,116481,0.000000e+00,373,10
6462,116490,0.000000e+00,224,6
6463,116497,0.000000e+00,449,12
6465,116519,0.000000e+00,143,4


In [None]:
best_players = list(merge['player_id'][0:5].values)
best_members = {}
for id, tournament in results.items():
  y = int(tournaments[id]["dateStart"][0:4])
  if y in [2020]:
    for team_results in tournament:
      for member in team_results['teamMembers']:
        member_id = member['player']['id']
        if member_id in best_players:
          best_members[best_players.index(member_id)+1] = [member['player']['name'] + ' ' + member['player']['surname'], member_id]


In [None]:
import requests

def get_real_pos(player_id):
    url = f"https://rating.chgk.info/api/players/{player_id}/rating/last"
    try:
        pos = requests.get(url).json()['rating_position']
    except:
        pos = 0
    return pos

In [None]:
# За счет того, что в разделе подготовки данных исключали игроков с числом игр до 5
# в топ попали игроки, отыгравшие много вопросов

# Реальные рейтинги этих игроков не на самых последних местах, модель справилась неплохо

from sortedcontainers import SortedDict
best_members = SortedDict(best_members)
for k, v in best_members.items():
  print(f"{k}: {v[0]}, отыграно вопросов {merge[merge.player_id==v[1]].questions.values[0]}, отыграно турниров {merge[merge.player_id==v[1]].tournaments.values[0]},",
        f"реальный рейтинг в базе ЧГК {get_real_pos(v[1])}")

1: Алексей Гилёв, отыграно вопросов 1033, отыграно турниров 23, реальный рейтинг в базе ЧГК 30
2: Артём Сорожкин, отыграно вопросов 1669, отыграно турниров 43, реальный рейтинг в базе ЧГК 1
3: Максим Руссо, отыграно вопросов 517, отыграно турниров 13, реальный рейтинг в базе ЧГК 5
4: Александр Марков, отыграно вопросов 1047, отыграно турниров 25, реальный рейтинг в базе ЧГК 51
5: Александр Либер, отыграно вопросов 1295, отыграно турниров 33, реальный рейтинг в базе ЧГК 7


## 7. Обучение на всех данных до 2019 года

In [None]:
# Тут всё скопировано из пунктов 1 и 2, но в трейн берутся не только игры 2019 года

with open("results.pkl", "rb") as f:
  results = pickle.load(f)
with open("tournaments.pkl", "rb") as f:
  tournaments = pickle.load(f)
  
# Выбираем игры от 2019 года с информацией по ответам и игрокам

results_upd = defaultdict(list)
tournaments_upd = {}

for id, value in results.items():
  for team in value:
    mask = team.get("mask", None)
    team_members = team.get("teamMembers", [])
    if mask and team_members:
      results_upd[id].append(team)

tournaments_upd = {k: tournaments[k] for k, v in results_upd.items()}
results, tournaments = dict(results_upd), tournaments_upd

In [None]:
# Делаем разбивку по годам на тест и трейн

train_data, test_data = {}, {}
for id, tournament in results.items():
  y = int(tournaments[id]["dateStart"][0:4])
  
  if y < 2020 and y >= 2015:
    train_data[id] = {"tournament_name": tournaments[id]["name"]}
    team_results = []
    for team_result in tournament:
      team_info = {"team_id": team_result["team"]["id"], "mask": team_result["mask"], "position": team_result["position"],
        "teamMembers": [team_member["player"]["id"] for team_member in team_result['teamMembers']]}
      team_results.append(team_info)
    train_data[id]["tournament_result"] = team_results

  elif y == 2020:
    test_data[id] = {"tournament_name": tournaments[id]["name"]}
    team_results = []
    for team_result in tournament:
      team_info = {"team_id": team_result["team"]["id"], "mask": team_result["mask"], "position": team_result["position"],
        "teamMembers": [team_member["player"]["id"] for team_member in team_result['teamMembers']]}
      team_results.append(team_info)
    test_data[id]["tournament_result"] = team_results

In [None]:
# Заменяем id игроков, отсутствующих в тесте,
# и игроков, переживших мало игр

import tqdm

games_by_player = Counter()
for tournament in tqdm.tqdm_notebook(train_data.values()):
  for team in tournament["tournament_result"]:
    games_by_player += Counter({member: 1 for member in team["teamMembers"]})

low_exp_players = set()
for member, games in tqdm.tqdm_notebook(games_by_player.most_common()):
  if games <= 5:
    low_exp_players.add(member)

y2019_players = set()
for tournament in tqdm.tqdm_notebook(train_data.values()):
  for team in tournament["tournament_result"]:
    team["teamMembers"] = set([-1 if member in low_exp_players else member for member in team["teamMembers"]])
    y2019_players.update(team["teamMembers"])
        
for tournament in tqdm.tqdm_notebook(test_data.values()):
  for team in tournament["tournament_result"]:
    team["teamMembers"] = set([-1 if member not in y2019_players else member for member in team["teamMembers"]])

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  import sys


HBox(children=(FloatProgress(value=0.0, max=2661.0), HTML(value='')))




Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  if sys.path[0] == '':


HBox(children=(FloatProgress(value=0.0, max=126304.0), HTML(value='')))




Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=2661.0), HTML(value='')))




Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=173.0), HTML(value='')))




In [None]:
import gc

gc.collect()

0

In [None]:
# Подготавливаем признаки и классы

members_and_questions = set()
for id, tournament in train_data.items():
  for team in tournament["tournament_result"]:
    members_and_questions.update(team["teamMembers"])
    questions_ids = (f"{id}_{question_num}" for question_num in range(len(team["mask"])))
    members_and_questions.update(questions_ids)
members_and_questions = {v: i for i, v in enumerate(members_and_questions)}

In [None]:
def prepare_data(data, train):
  global members_and_questions
  rows = []
  cols = []
  y = []
  current_row = 0
  for id, tournament in data.items():
    for team in tournament["tournament_result"]:
      for quest_numb, mask in enumerate(team["mask"]):
        try:
          y.extend([int(mask)] * len(team["teamMembers"]))
        except ValueError:
          continue
        for member in team["teamMembers"]:
          rows.append(current_row)
          cols.append(members_and_questions[member])
          if train:    
            rows.append(current_row)
            cols.append(members_and_questions[f"{id}_{quest_numb}"])
          current_row += 1
              
  rows = np.asarray(rows, dtype=np.int32)
  cols = np.asarray(cols, dtype=np.int32)
  data = np.ones(len(rows))
  y = np.asarray(y, dtype=np.int8)
      
  X = coo_matrix((data, (rows, cols)), shape=(len(y), len(members_and_questions)))
  return X, y

X_train, y_train = prepare_data(train_data, True)
X_test, y_test = prepare_data(test_data, False)

In [None]:
# С помощью модели логистической регресии будем предсказывать вероятность правильного ответа
# для отдельного игрока на конкретный вопрос, при этом на основе данных 2019 года считаем,
# что если команда ответила на вопрос верно, то и все игроки ответили верно на тот же вопрос

model = LogisticRegression()
model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [None]:
from sklearn.metrics import log_loss

predict_train = model.predict_proba(X_train)[:, 1]
predict_test = model.predict_proba(X_test)[:, 1]

print (log_loss(y_train, predict_train))
print (log_loss(y_test, predict_test))

0.5123950572617051
0.7177309563600245


In [None]:
# Сформировать трейн за все время до 2020 года не вышло, так как не хватало памяти,
# но при расширении с 2015 по 2019 года результаты улучшились на пару десятых для обычной модели

In [None]:
# Для построения рейтинг-системы необходимо отсортировать команды в зависимости от
# того, как много игроков в них правильно отвечают на вопросы.
# Будем считать, что команда дает верный ответ при еврном ответе хотя бы одного игрока

# Готовим датафрейм с местами команд для тестовых данных

tournament_ids = []
team_ids = []
positions = []
for id, tournament in test_data.items():
  for team in tournament["tournament_result"]:
    tournament_ids.append(id)
    team_ids.append(team["team_id"])
    positions.append(team["position"])

true_positions = pd.DataFrame.from_dict({
  'tournament_id': tournament_ids,
  'team_id': team_ids,
  'position_true': positions,
})

In [None]:
# Готовим массивы с id игр и id команд с тем же размером, что и у предсказаний выше

is_answered = []
tournament_ids_test = []
team_ids_test = []
for id, tournament in test_data.items():
  for team in tournament["tournament_result"]:
    team_id = team["team_id"]
    for answer in team["mask"]:
      try:
        is_answered.extend([int(answer)] * len(team["teamMembers"]))
        tournament_ids_test.extend([id] * len(team["teamMembers"]))
        team_ids_test.extend([team_id] * len(team["teamMembers"]))
      except ValueError:
        continue

tournament_ids_test = np.asarray(tournament_ids_test, dtype=np.int32)
team_ids_test = np.asarray(team_ids_test, dtype=np.int32)

In [None]:
# Готовим датафрейм с предсказанными местами команд для тестовых данных

predicted_positions = pd.DataFrame.from_dict({
  'tournament_id': tournament_ids_test,
  'team_id': team_ids_test,
  '1-predict': 1 - predict_test,
})
    
predicted_positions = predicted_positions.groupby(["tournament_id", "team_id"]).agg("prod").reset_index()
predicted_positions["position_pred"] = predicted_positions.groupby("tournament_id")["1-predict"].rank("dense")

In [None]:
# Считаем корреляции

from scipy.stats import kendalltau, spearmanr

merge = pd.merge(predicted_positions, true_positions, on=["tournament_id", "team_id"])

kendall = []
spearman = []
tournam_ids = merge['tournament_id'].unique()

for tournam_id in tournam_ids:
  kendall += [kendalltau(merge[merge['tournament_id']==tournam_id]["position_pred"], merge[merge['tournament_id']==tournam_id]["position_true"]).correlation]
  spearman += [spearmanr(merge[merge['tournament_id']==tournam_id]["position_pred"], merge[merge['tournament_id']==tournam_id]["position_true"]).correlation] 

kendall = np.asarray(kendall)
spearman = np.asarray(spearman)
kendall[np.isnan(kendall)] = 0.0
spearman[np.isnan(spearman)] = 0.0

kendall_corr = np.mean(kendall)
spearman_corr = np.mean(spearman)
print ('Ранговые корреляции Спирмена и Кендалла равны', spearman_corr, 'и', kendall_corr)

Ранговые корреляции Спирмена и Кендалла равны 0.7605705586778932 и 0.606400632339615


In [None]:
# Прирост корреляции Спирмена — одна десятая, для Кендалла — одна сотая