In [2]:
import numpy as np
import pandas as pd
import math
import re
import scipy.stats as sps
from tqdm import tqdm
from datetime import datetime, date
import pickle
from sklearn.linear_model import LogisticRegression

import warnings
warnings.filterwarnings("ignore")

In [3]:
!ls data

players.pkl  results.pkl  tournaments.pkl  working


In [4]:
DIR = "data/"

In [5]:
%%time
players = pd.read_pickle(DIR + "players.pkl") 
results = pd.read_pickle(DIR + "results.pkl")
tournaments = pd.read_pickle(DIR + "tournaments.pkl")

CPU times: user 12.8 s, sys: 2.71 s, total: 15.5 s
Wall time: 15.5 s


In [6]:
# split tournaments to train/test
tournaments_df = pd.DataFrame.from_dict(
    tournaments, orient="index", columns=["id", "name", "dateStart"]
)
tournaments_df["dateStart"] = pd.to_datetime(tournaments_df["dateStart"]).apply(lambda x: x.date())
train_tournaments_df = tournaments_df[
    (tournaments_df["dateStart"] >= date(2019, 1, 1))
    & (tournaments_df["dateStart"] <= date(2019, 12, 31))
]
test_tournaments_df = tournaments_df[
    (tournaments_df["dateStart"] >= date(2020, 1, 1))
    & (tournaments_df["dateStart"] <= date(2020, 12, 31))
]

In [7]:
# now, we need to filter data with results
def filter(tour_id: int):
    result = results[tour_id]
    masks = []
    team_ids = []
    team_names = []
    team_position = []
    player_ids = []
    player_names = []
    for team in result:
        questionsTotal = team.get("questionsTotal")
        if questionsTotal is None:
            continue
        if questionsTotal <= 0:
            continue
        position = team.get("position")
        if position is None:
            continue
        masks.append(team.get("mask").replace("X", "0").replace("?", "0"))
        team_ids.append(team["team"]["id"])
        team_names.append(team["team"]["name"])
        team_position.append(position)
        player_ids.append([player["player"]["id"] for player in team["teamMembers"]])
        player_names.append(
            [
                "{} {} {}".format(
                    player["player"]["surname"],
                    player["player"]["name"],
                    player["player"]["patronymic"],
                )
                for player in team["teamMembers"]
            ]
        )

    return masks, team_ids, team_names, team_position, player_ids, player_names


def update_dataframe(dataframe: pd.DataFrame):
    (
        dataframe["mask"],
        dataframe["team_id"],
        dataframe["team_name"],
        dataframe["team_position"],
        dataframe["player_id"],
        dataframe["player_name"],
    ) = zip(*dataframe["id"].apply(filter))
    dataframe = (
        dataframe.explode(
            [
                "mask",
                "team_id",
                "team_name",
                "team_position",
                "player_id",
                "player_name",
            ]
        )
        .explode(["player_id", "player_name"])
        .reset_index(drop=True)
    )
    dataframe.dropna(inplace=True)
    dataframe["mask_len"] = dataframe["mask"].apply(len)
    dataframe = dataframe.merge(
        dataframe.groupby("id")["mask_len"].max().rename("max_len"),
        left_on="id",
        right_index=True,
    )
    dataframe = dataframe[dataframe["mask_len"] == dataframe["max_len"]].reset_index(
        drop=True
    )
    dataframe = dataframe[
        [
            "id",
            "name",
            "team_id",
            "team_name",
            "team_position",
            "player_id",
            "player_name",
            "mask",
            "mask_len",
        ]
    ]
    return dataframe

In [8]:
%%time
train_tournaments_df2 = update_dataframe(train_tournaments_df)
test_tournaments_df2 = update_dataframe(test_tournaments_df)
train_tournaments_df2.to_csv("data/working/train_tournaments_df2.csv", index=False)
test_tournaments_df2.to_csv("data/working/test_tournaments_df2.csv", index=False)
# train_tournaments_df2 = pd.read_csv("data/working/train_tournaments_df2.csv")
# test_tournaments_df2 = pd.read_csv("data/working/test_tournaments_df2.csv")

CPU times: user 1.35 s, sys: 127 ms, total: 1.48 s
Wall time: 1.48 s


#### Now, we need to create raw data about player/question stats (After all we add question rating to their indexes)

In [9]:
def questions_dataframe(dataframe: pd.DataFrame) -> pd.DataFrame:
    tmp_df = dataframe[['id', 'mask', 'mask_len', 'player_id']]
    tmp_df['points'] = tmp_df['mask'].apply(lambda x: [int(i) for i in x])
    
    tmp_df['question_id'] = tmp_df.apply(lambda x: [(x.id * 10 ** 3 + i) for i in range(x.mask_len)], axis=1)
    tmp_df2 = tmp_df.explode(['points', 'question_id'])
    return tmp_df2

In [10]:
%%time
train_questions_raw = questions_dataframe(train_tournaments_df2)
test_questions_raw = questions_dataframe(test_tournaments_df2)
train_questions_raw.to_csv("data/working/train_questions_raw.csv", index=False)
test_questions_raw.to_csv("data/working/test_questions_raw.csv", index=False)
# train_questions_raw = pd.read_csv("data/working/train_questions_raw.csv")
# test_questions_raw = pd.read_csv("data/working/test_questions_raw.csv")

CPU times: user 14.9 s, sys: 2.2 s, total: 17.1 s
Wall time: 17.1 s


#### As question complexity we can use share of correct answers:

In [11]:
def complexity_estimation(dataframe: pd.DataFrame) -> pd.DataFrame:
    tmp_df = dataframe[['player_id', 'points', 'question_id']]
    tmp_df['correct'] = tmp_df.groupby('question_id')['points'].transform('sum')
    tmp_df['total'] = tmp_df.groupby('question_id')['points'].transform('count')
    tmp_df['complexity'] = tmp_df['correct'] / tmp_df['total']  
    
    return tmp_df[['player_id', 'question_id', 'complexity', 'points']]

In [12]:
%%time
train_questions = complexity_estimation(train_questions_raw)
test_questions = complexity_estimation(test_questions_raw)
train_questions.to_csv("data/working/train_questions.csv", index=False)
test_questions.to_csv("data/working/test_questions.csv", index=False)
# train_questions = pd.read_csv("data/working/train_questions.csv")
# test_questions = pd.read_csv("data/working/train_questions.csv")

CPU times: user 11.3 s, sys: 2.95 s, total: 14.2 s
Wall time: 14.2 s


#### We can start to work with first baseline - analyse how player work with question depends on its complexity
About realisation - we can use personal LogisticRegression for each player or total, but player_id will be a part of features. Above, we create datasets to train and predict. 
1. Train - pairs player_id - complexity in train dataset;
2. Test - pairs player_id and range complexity to predict individual ability of player to work with tasks with different complexity.

In [13]:
X_train = train_questions[['player_id', 'complexity']].values
y_train = train_questions['points'].astype('int').values

In [14]:
%%time
players_model_df = pd.DataFrame(players.keys(), columns=['player_id'])
players_model_df['complexity'] = [np.arange(0, 1, 0.05).tolist()] * players_model_df.shape[0]
players_model_df = players_model_df.explode('complexity')
players_model_df = players_model_df[players_model_df['player_id'].isin(train_questions['player_id'])]
X_predict = players_model_df[['player_id', 'complexity']].values

CPU times: user 729 ms, sys: 291 ms, total: 1.02 s
Wall time: 1.02 s


In [15]:
players_list = players_model_df['player_id'].drop_duplicates().to_list()

Firstly, try to user player_id and complexity as features value:

In [16]:
players_model_df['rates1'] = LogisticRegression(random_state = 42).fit(X_train, y_train).predict_proba(X_predict)[:, 1]
players_model_df['rates1'] = players_model_df['rates1'].apply(lambda x: round(x, 3))

In [17]:
assert players_model_df.groupby(['player_id'])['rates1'].nunique().sum() == players_model_df['player_id'].nunique()

As we can see, way to user player_id in features dataset is not good, because final label depends only on player_id and fill same values to points. I think, as a variant use individual logistic regression for every person.

In [18]:
def train(players_list: list, train_questions: pd.DataFrame, players_model_df: pd.DataFrame) -> pd.DataFrame:
    for player_id in tqdm(players_list):
        X_tmp_train = train_questions.loc[
            train_questions["player_id"] == player_id, "complexity"
        ].values
        y_tmp_train = (
            train_questions.loc[train_questions["player_id"] == player_id, "points"]
            .astype("int")
            .values
        )
        X_tmp_predict = players_model_df.loc[
            players_model_df["player_id"] == player_id, "complexity"
        ].values
        players_model_df.loc[players_model_df["player_id"] == player_id, "rates2"] = (
            LogisticRegression(random_state=42)
            .fit(X_tmp_train.reshape(-1, 1), y_tmp_train)
            .predict_proba(X_tmp_predict.reshape(-1, 1))[:, 1]
        )
    return players_model_df

players_model_df = train(players_list, train_questions, players_model_df)

In [19]:
players_model_df.to_csv("data/working/players_model_df.csv", index=False)
# players_model_df = pd.read_csv("data/working/players_model_df.csv")

Final rate of player - mean rate for every task complexity. It's will decsribe how he will work will average task. Now, we can display list of top-players:

In [20]:
players_model_df2 = players_model_df.groupby("player_id")["rates2"].mean().reset_index()
players_model_df2["player_name"] = players_model_df2["player_id"].apply(
    lambda x: " ".join([i for i in list(players[x].values())[1:] if i is not None])
)
players_model_df2.sort_values(by="rates2", ascending=False).reset_index(drop=True).head(
    10
)

Unnamed: 0,player_id,rates2,player_name
0,121433,0.91594,София Романовна Савенко
1,186002,0.907916,Инга Андрисовна Лоренц
2,202410,0.903794,Валентина Подюкова
3,133504,0.892291,София Евгеньевна Лебедева
4,170977,0.890857,Давид Сергеевич Кан
5,171845,0.890857,Михаил Владимирович Завьялов
6,215496,0.884946,Наталья Артемьева
7,215497,0.884946,Екатерина Горелова
8,215495,0.884946,Юлия Крюкова
9,199963,0.874346,Елена Борисовна Бровченко


As team rate let's take probability of correct prediction by one player in team: $P(team=1) = 1 - \prod P(player=0)$

In [21]:
team_rating_df = test_tournaments_df2.merge(
    players_model_df2, on="player_id", how="left"
)
team_rating_df["inv_rates2"] = 1 - team_rating_df["rates2"]
team_rating_df["team_rate"] = 1 - team_rating_df.groupby(
    ["id", "name", "team_id", "team_name", "team_position"]
)["inv_rates2"].transform("prod")
team_rating_df2 = team_rating_df[['id', 'team_id', 'team_position', 'team_rate']].drop_duplicates()
team_rating_df2['team_rate'] = - team_rating_df2['team_rate']

In [22]:
def estimate_correlation(team_rating_df: pd.DataFrame):
    spearman = 0
    kendall = 0
    tours_list = team_rating_df['id'].drop_duplicates().to_list()
    tours_num = len(tours_list)
    for tour_id in tours_list:
        tmp_df = team_rating_df[team_rating_df['id'] == tour_id]
        tmp_spearman = sps.spearmanr(tmp_df['team_rate'].to_list(), tmp_df['team_position'].to_list())[0]
        tmp_kendall = sps.kendalltau(tmp_df['team_rate'].to_list(), tmp_df['team_position'].to_list())[0]
        if not (np.isnan(tmp_spearman) or np.isnan(tmp_kendall)):
            spearman += tmp_spearman
            kendall += tmp_kendall
    return spearman / tours_num, kendall / tours_num

In [23]:
results = estimate_correlation(team_rating_df2)
print("Spearman correlation on test dataset: ", results[0])
print("Kendall correlation on test dataset: ", results[1])

Spearman correlation on test dataset:  0.6933628862304252
Kendall correlation on test dataset:  0.5396450864708623
