In [None]:
import numpy as np
from tqdm import tqdm
from datetime import datetime
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.linear_model import LogisticRegression
import pandas as pd
import pickle
import scipy
from pathlib import Path

In [None]:
!pip install prettytable
from prettytable import PrettyTable

In [None]:
# download pretrained baseline models

# !pip install gdown
## 26 MB
# !gdown https://drive.google.com/uc?id=19w7yO0-14U5BoVu0cNBj2UvQjYuszCgP
## 7 MB
# !gdown https://drive.google.com/uc?id=1yQcsfiOb8v2gfH2EwcFxEzEOq-NQaShg

# EDA

In [None]:
def read_all_data(verbose=True):
    """Load the players, tournaments and results pickles from the ``chgk`` folder.

    Args:
        verbose: when true (default), print a progress line and the number of
            entries in each loaded object; when false, load silently.

    Returns:
        Tuple ``(players, tournaments, results)`` exactly as stored in
        ``chgk/players.pkl``, ``chgk/tournaments.pkl`` and ``chgk/results.pkl``.
    """
    def log(message):
        # Single switch for all progress output of this function.
        if verbose:
            print(message)

    log('read data')
    players = pd.read_pickle('chgk/players.pkl')
    log(f'players: {len(players.items())}')
    tournaments = pd.read_pickle('chgk/tournaments.pkl')
    log(f'tournaments: {len(tournaments.items())}')
    results = pd.read_pickle('chgk/results.pkl')
    log(f'results: {len(results.items())}')
    return players, tournaments, results

In [None]:
# Load the three raw objects once; later cells read them as notebook globals.
players, tournaments, results = read_all_data()
# Second name bound to the same object (not a copy); not referenced in the cells shown here.
players_ = players

In [None]:
# tournaments[5000]

In [None]:
def get_keys(src):
    """Return the set of all inner keys found across the entries of ``src``.

    ``src`` is iterated, and for every key ``k`` the container ``src[k]`` is
    iterated in turn; every item seen there ends up in the returned set.
    """
    return {field for entry_id in src for field in src[entry_id]}

In [None]:
# Show which field names occur across player entries and across tournament entries.
print(get_keys(players))
print(get_keys(tournaments))

# Подготовка данных

In [None]:
# By default: tournaments started in 2019 -> train, in 2020 -> test
def split_train_test(src=None, train_year=2019, test_year=2020):
    """Split tournament ids into train and test lists by the year of ``dateStart``.

    Args:
        src: mapping ``tournament_id -> record`` where ``record['dateStart']``
            is an ISO-8601 string. Defaults to the notebook-level
            ``tournaments`` object, which keeps the original no-argument call working.
        train_year: start year whose tournaments go to the train list.
        test_year: start year whose tournaments go to the test list.

    Returns:
        Tuple ``(train_ids, test_ids)`` of lists, in iteration order of ``src``.
        Tournaments from any other year are dropped.
    """
    if src is None:
        src = tournaments
    train_ids = []
    test_ids = []
    for t_id in src:
        start_year = datetime.fromisoformat(src[t_id]['dateStart']).year
        if start_year == train_year:
            train_ids.append(t_id)
        if start_year == test_year:
            test_ids.append(t_id)
    return train_ids, test_ids

In [None]:
# Tournament ids split by start year (see split_train_test).
train_ids, test_ids = split_train_test()
len(train_ids), len(test_ids)  # shown as the cell output

In [None]:
def filter_by_id(src, ids):
    """Build a new dict holding ``src[i]`` for every ``i`` produced by iterating ``ids``.

    Raises whatever ``src[i]`` raises (``KeyError`` for a dict) when an id is absent.
    """
    selected = {}
    for wanted in ids:
        selected[wanted] = src[wanted]
    return selected

In [None]:
# Restrict tournament records and result records to the train / test ids.
train_tournaments = filter_by_id(tournaments, train_ids)
test_tournaments = filter_by_id(tournaments, test_ids)
train_results = filter_by_id(results, train_ids)
test_results = filter_by_id(results, test_ids)

In [None]:
# keep tournaments with constant mask len

def filter_results_by_mask_len(src):
    """Keep only tournaments in which all present masks share one length.

    For each tournament the lengths of every non-``None`` ``'mask'`` value are
    collected; entries without a ``'mask'`` key or with ``mask is None`` are
    ignored. A tournament is kept only when exactly one distinct length was
    seen (so tournaments with no usable mask at all are dropped too).

    Returns:
        Dict mapping each kept tournament id to itself.
    """
    kept = {}
    for tournament_id in tqdm(src):
        distinct_lengths = {
            len(entry['mask'])
            for entry in src[tournament_id]
            if 'mask' in entry and entry['mask'] is not None
        }
        if len(distinct_lengths) == 1:
            kept[tournament_id] = tournament_id
    return kept

In [None]:
# Keep only tournaments whose masks all have the same length.
# NOTE: train_ids / test_ids are rebound here from lists to dicts ({t_id: t_id}).
train_ids = filter_results_by_mask_len(train_results)
test_ids = filter_results_by_mask_len(test_results)
len(train_ids), len(test_ids)  # shown as the cell output

In [None]:
# Rebuild the per-split dicts from the mask-filtered id collections.
train_tournaments = filter_by_id(tournaments, train_ids)
test_tournaments = filter_by_id(tournaments, test_ids)
train_results = filter_by_id(results, train_ids)
test_results = filter_by_id(results, test_ids)

In [None]:
# train_results[5000]

In [None]:
def get_player_ids(src):
    """Collect the ids of all players listed in the team rosters of ``src``.

    ``src`` maps a tournament id to an iterable of team entries. Entries
    without ``'teamMembers'``, members without ``'player'`` and players without
    ``'id'`` are reported with ``print`` and skipped.

    Returns:
        Set of player ids.
    """
    found = set()
    for tournament_id in tqdm(src):
        for team in src[tournament_id]:
            if 'teamMembers' not in team:
                print(f'missed teamMembers in tournament: {tournament_id}\nteam: {team}')
                continue
            for member in team['teamMembers']:
                if 'player' in member:
                    player = member['player']
                    if 'id' in player:
                        found.add(player['id'])
                    else:
                        print(f'missed id in tournament: {tournament_id}, team: {team}, member: {member}')
                else:
                    team_label = team["team"] if "team" in team else "Unknown"
                    print(f'missed player in tournament: {tournament_id}')
                    print(f'team: {team_label}')
                    print(f'member: {member}')
    return found

In [None]:
# Sets of player ids that appear in train and in test results.
train_players = get_player_ids(train_results)
test_players = get_player_ids(test_results)

# Overlap statistics between the two player sets.
print(f'{len(train_players)} players in train,\t{len(test_players)} players in test')
print(f'number of players involved either in train or in test:  {len(train_players.union(test_players))}')
print(f'number of players in train, which are not in test:      {len(train_players - test_players)}')
print(f'number of players in test, which are not in train:      {len(test_players - train_players)}')
print(f'number of players, which are both in train and in test: {len(test_players.intersection(train_players))}')


In [None]:
def get_number_of_questions(src):
    """Sum the number of questions over all tournaments in ``src``.

    The number of questions of a tournament is taken as the length of the
    first non-``None`` ``'mask'`` found among its team entries. Looking past
    the first entry means an empty tournament no longer raises ``IndexError``
    and a tournament whose first team lacks a mask is still counted.

    Args:
        src: mapping ``tournament_id -> list of team entries (dicts)``.

    Returns:
        Total number of questions as an int. Tournaments in which no team has
        a mask are reported with ``print`` and contribute nothing.
    """
    total = 0
    for t_id in src:
        mask = next(
            (team['mask'] for team in src[t_id] if team.get('mask') is not None),
            None,
        )
        if mask is None:
            print(f'No mask in any team of tournament {t_id}')
            continue
        total += len(mask)
    return total

In [None]:
# Total question counts per split.
number_of_questions_train = get_number_of_questions(train_results)
number_of_questions_test = get_number_of_questions(test_results)
number_of_questions_train, number_of_questions_test  # shown as the cell output

# Baseline

Используем предположение, что каждый игрок в команде отвечает так же, как и вся команда (т.е. соотносим игроков с вопросами, забыв про команды).  
Будем обучать логистическую регрессию  
$$
y = \sigma(W \cdot x + b)
$$  
$x$ — пары (игрок, вопрос)  
$y$ — правильно или нет ответил игрок на вопрос

Таблица "Игрок"-"Вопрос"-"Ответ"

In [None]:
def get_players_qa(src_players, src_results):
    """Attribute every team answer to each member of that team.

    Args:
        src_players: iterable of player ids; each becomes a key of both
            returned dicts. A roster member whose id is not among them raises
            ``KeyError``.
        src_results: mapping ``tournament_id -> list of team entries``. A team
            entry may carry ``'mask'`` (a sequence of per-question marks, where
            ``'1'`` counts as a correct answer) and ``'teamMembers'``.

    Returns:
        Tuple ``(answers, questions)`` of dicts keyed by player id: parallel
        lists of 0/1 answers and of question ids.

    Teams without a mask (missing key or ``None``) are skipped. Members
    without a ``'player'`` entry or without a player ``'id'`` are skipped,
    matching what ``get_player_ids`` tolerates, instead of raising ``KeyError``.

    NOTE(review): the question id advances for every mask position of every
    team and is never reset, so two teams of the same tournament get different
    ids for the same mask position. Confirm this is intended before changing
    it; any cached model was fitted on ids produced this way.
    """
    answers = {k: [] for k in src_players}
    questions = {k: [] for k in src_players}
    q_id = 0
    for t_id in tqdm(src_results):
        for team in src_results[t_id]:
            mask = team.get('mask')
            if mask is None:
                continue
            # Resolve the roster once per team rather than once per question.
            member_ids = [
                member['player']['id']
                for member in team.get('teamMembers', [])
                if 'id' in member.get('player', {})
            ]
            for a in mask:
                answer = 1 if a == '1' else 0
                for p_id in member_ids:
                    answers[p_id].append(answer)
                    questions[p_id].append(q_id)
                q_id += 1
    return answers, questions

In [None]:
# Per-player answer lists and question-id lists, built separately for each split.
train_players_answers, train_players_questions = get_players_qa(train_players, train_results)
test_players_answers, test_players_questions = get_players_qa(test_players, test_results)

In [None]:
def get_qa_table(questions, answers):
    """Flatten per-player question/answer lists into ``[player, question, answer]`` rows.

    The two dicts are walked in parallel (item by item, in their own order).
    A pair is skipped with a printed message when the keys differ or when the
    two lists have different lengths. A player id missing from the global
    ``players`` is only reported; its rows are still emitted.

    Returns:
        List of three-element lists ``[player_id, question_id, answer]``.
    """
    rows = []
    for q_item, a_item in tqdm(zip(questions.items(), answers.items())):
        q_player, q_ids = q_item
        a_player, a_values = a_item
        if a_player != q_player:
            print(f'error in question {q_item}, and aswer {a_item}')
            continue
        if a_player not in players:
            print(f'no {a_player} in players')
        if len(a_values) != len(q_ids):
            print(f'error in question {q_item}: len {len(q_ids)}, and aswer {a_item}, len {len(a_values)}')
            continue
        rows.extend([a_player, q, a] for q, a in zip(q_ids, a_values))
    return rows

In [None]:
# Flat [player, question, answer] rows for each split.
train_qa_table = get_qa_table(train_players_questions, train_players_answers)
test_qa_table = get_qa_table(test_players_questions, test_players_answers)

In [None]:
# Number of (player, question, answer) rows in each split.
len(train_qa_table), len(test_qa_table)

In [None]:
# One row per (player, question) pair with the 0/1 answer as the target column.
train_df = pd.DataFrame(train_qa_table, columns=['players', 'questions', 'answers'])
test_df = pd.DataFrame(test_qa_table, columns=['players', 'questions', 'answers'])

In [None]:
# Features are the (player id, question id) pair; the target is the answer column.
train_df_x, train_df_y = train_df[['players', 'questions']], train_df['answers']
test_df_x, test_df_y = test_df[['players', 'questions']], test_df['answers']
# train_df_x, train_df_y

In [None]:
# Persist both tables next to the notebook (paths are relative to the working directory).
train_df.to_csv('train_df.csv')
test_df.to_csv('test_df.csv')

In [None]:
# Distinct player ids and question ids of the train split;
# get_rating_players_questions_df sorts them before use.
train_unique_players = pd.unique(train_df_x['players'])
train_unique_questions = pd.unique(train_df_x['questions'])
# train_unique_players, train_unique_questions

In [None]:
# Distinct player ids and question ids of the test split;
# get_rating_players_questions_df sorts them before use.
test_unique_players = pd.unique(test_df_x['players'])
test_unique_questions = pd.unique(test_df_x['questions'])
# test_unique_players, test_unique_questions

In [None]:
# how do we train model

def fit_model(df_x, df_y):
    """Fit the baseline logistic regression on one-hot encoded features.

    Every column of ``df_x`` is one-hot encoded and the result is used to fit
    ``LogisticRegression(verbose=True, max_iter=5000, n_jobs=6)`` against ``df_y``.

    Returns:
        The fitted ``LogisticRegression``. The encoder itself is not returned.
    """
    encoder = OneHotEncoder()
    encoder.fit(df_x)
    features_onehot = encoder.transform(df_x)
    classifier = LogisticRegression(verbose=True, max_iter=5000, n_jobs=6)
    return classifier.fit(features_onehot, df_y)

In [None]:
# Training takes about 30 minutes; when a model file already exists
# (e.g. a downloaded pretrained model) it is loaded instead of being refitted.

train_baseline_model_filename = 'train_baseline_model.dat'
test_baseline_model_filename = 'test_baseline_model.dat'


def _load_or_fit_baseline(model_filename, df_x, df_y, split_name):
    """Return the baseline model cached in ``model_filename``, fitting and saving it if absent.

    Args:
        model_filename: path of the pickle file used as the cache.
        df_x: feature frame handed to ``fit_model`` when the cache is missing.
        df_y: target series handed to ``fit_model`` when the cache is missing.
        split_name: label used in the progress message ('train' or 'test').

    Returns:
        The loaded or freshly fitted model.
    """
    if Path(model_filename).exists():
        print('load trained baseline model')
        # NOTE: unpickling can execute arbitrary code; only load model files from a trusted source.
        with open(model_filename, 'rb') as f:
            return pickle.load(f)
    print(f'train logistic regression on {split_name} data')
    model = fit_model(df_x, df_y)
    # Report the file that is actually written (the train branch used to print the test filename).
    print(f'save trained model to {model_filename}')
    with open(model_filename, 'wb') as f:
        pickle.dump(model, f)
    return model


train_rating_model = _load_or_fit_baseline(train_baseline_model_filename, train_df_x, train_df_y, 'train')
test_rating_model = _load_or_fit_baseline(test_baseline_model_filename, test_df_x, test_df_y, 'test')

In [None]:
# Fitted coefficients of the train model and their array shape.
train_rating_model.coef_, train_rating_model.coef_.shape

In [None]:
# Fitted coefficients of the test model and their array shape.
test_rating_model.coef_, test_rating_model.coef_.shape

In [None]:
# Distinct player ids of both splits, for inspection.
train_unique_players, test_unique_players

In [None]:
def get_rating_players_questions_df(rating_model, unique_players, unique_questions):
    """Turn the model coefficients into a player rating table and a question rating table.

    The first ``unique_players.size`` entries of ``rating_model.coef_[0]`` are
    paired with the sorted player ids, the remaining entries with the sorted
    question ids.

    Returns:
        Tuple ``(players_df, questions_df)``:
        ``players_df`` has columns ``player_id``, ``rating`` and ``norm_rating``
        (rating min-max scaled so the lowest is 0 and the highest is 1);
        ``questions_df`` has columns ``question_id`` and ``rating``.
    """
    weights = rating_model.coef_[0]
    n_players = unique_players.size
    player_weights = weights[:n_players]
    question_weights = weights[n_players:]

    # Min-max scaling of the player coefficients.
    lowest = player_weights.min()
    spread = player_weights.max() - lowest

    players_frame = pd.DataFrame({
        'player_id': np.sort(unique_players),
        'rating': player_weights,
        'norm_rating': (player_weights - lowest) / spread,
    })
    questions_frame = pd.DataFrame({
        'question_id': np.sort(unique_questions),
        'rating': question_weights,
    })
    return players_frame, questions_frame

In [None]:
# Rating tables derived from the model fitted on the train split.
train_rating_players_df, train_rating_questions_df = get_rating_players_questions_df(train_rating_model,
                                                                                     train_unique_players,
                                                                                     train_unique_questions)
print('train')
print('players')
print(train_rating_players_df)
print('questions')
print(train_rating_questions_df)

# Rating tables derived from the model fitted on the test split.
test_rating_players_df, test_rating_questions_df = get_rating_players_questions_df(test_rating_model,
                                                                                   test_unique_players,
                                                                                   test_unique_questions)
print('test')
print('players')
print(test_rating_players_df)
print('questions')
print(test_rating_questions_df)

In [None]:
def print_players_rating(rating_df, topn=100, player_surname=None, player_name=None):
    """Print a table of the ``topn`` highest-rated players.

    Args:
        rating_df: frame with exactly three columns, in order: player id,
            rating, normalised rating. Rows are ranked by the ``'rating'`` column.
        topn: maximum number of rows to print.
        player_surname: accepted for interface compatibility; currently unused.
        player_name: accepted for interface compatibility; currently unused.

    The table columns are the zero-based position, the player id, the
    normalised rating with four decimals and "surname name patronymic" taken
    from the global ``players``.

    Raises:
        KeyError: if one of the printed players is missing from ``players``
            or lacks one of the name fields.
    """
    p_table = PrettyTable()
    p_table.field_names = ["pos", "id", "rating", "player name"]

    ranked = rating_df.sort_values(by='rating', ascending=False)
    for i, (_, row) in enumerate(ranked.iterrows()):
        # Check the cutoff first so that the first row past it is never looked
        # up in `players` (it used to be, and could raise KeyError).
        if i >= topn:
            break
        p_id, _p_rating, p_norm_rating = row
        player = players[int(p_id)]
        p_name = f'{player["surname"]} {player["name"]} {player["patronymic"]}'
        p_table.add_row([i, int(p_id), f'{p_norm_rating:.4f}', p_name])
    print(p_table)

In [None]:
# Top-10 players according to each model.
print('2019 (train)')
print_players_rating(train_rating_players_df, topn=10)
print('2020 (test)')
print_players_rating(test_rating_players_df, topn=10)

In [None]:
def get_player_rating_by_name(rating_df, surname, name=None):
    """Print and return the rating rows of players matching a surname (and optionally a name).

    Args:
        rating_df: frame with exactly three columns, in order: player id,
            rating, normalised rating. Rows are ranked by the ``'rating'`` column.
        surname: a single surname or a list of surnames to match exactly.
        name: when given, the player's first name must equal it as well.

    Returns:
        List of ``[position, player_id, normalised rating as 'x.xxxx', full name]``
        rows, where ``position`` is the zero-based rank among all players.
        The same rows are printed as a table.
    """
    wanted_surnames = surname if isinstance(surname, list) else [surname]
    table = PrettyTable()
    table.field_names = ["pos", "id", "rating", "player name"]
    matches = []
    ranked = rating_df.sort_values(by='rating', ascending=False)
    for position, (_, record) in tqdm(enumerate(ranked.iterrows())):
        p_id, _p_rating, p_norm_rating = record
        person = players[int(p_id)]
        s, n, pat = person["surname"], person["name"], person["patronymic"]
        if s not in wanted_surnames:
            continue
        if name is not None and not (name == n):
            continue
        matches.append([position, int(p_id), f'{p_norm_rating:.4f}', f'{s} {n} {pat}'])
        table.add_row(matches[-1])
    print(table)
    return matches

In [None]:
# Look up the positions of players with the given surname in both ratings;
# the returned lists are discarded (assigned to `_`) because the function already prints a table.
probe = ["Иванов"]
print(2019)
_ = get_player_rating_by_name(train_rating_players_df, probe)
print(2020)
_ = get_player_rating_by_name(test_rating_players_df, probe)

In [None]:
# 
# !pip uninstall prettytable
# !pip uninstall gdown