In [1]:
import pandas as pd
import pickle
import numpy as np

In [2]:
import pickle

def dump_pickle(file_path, obj):
    with open(file_path, "wb") as dump_file:
        pickle.dump(obj, dump_file)

def load_pickle(file_path):
    with open(file_path, 'rb') as load_file:
        return pickle.load(load_file)

## 1 Анализ данных

Раскроем данные так, чтобы они приняли формат player -> ответ на вопрос

In [3]:
df_tournaments = pd.DataFrame(pd.read_pickle('tournaments.pkl')).transpose()
#отфильтруем по дате
tournaments_ids_all = set(df_tournaments[df_tournaments.dateStart >= '2019-01-01']['id'])
tournaments_ids_test = set(df_tournaments[df_tournaments.dateStart >= '2020-01-01']['id'])
tournaments_ids_train = tournaments_ids_all.difference(tournaments_ids_test)
len(tournaments_ids_all), len(tournaments_ids_train), len(tournaments_ids_test)

(1109, 687, 422)

In [4]:
def filter_df(tournaments_ids):
    df_results = pd.read_pickle('results.pkl')
    print("full df: ", len(df_results))
    results_all = {}
    for key, value in df_results.items():
        # игнорируем турниры до 2019 года, а также пустые записи
        if key in tournaments_ids and len(value) > 0:
            valid = True
            # игнорируем турниры, где нет нужных нам валидных полей
            for team_data in value:
                if 'team' not in team_data or 'mask' not in team_data or 'teamMembers' not in team_data:
                    valid = False
                    continue
                if team_data['mask'] is None or team_data['team'] is None or team_data['teamMembers'] is None:
                    valid = False
                    continue
            if valid:
                results_all[key] = value
    print("filtered df: ", len(results_all))
    return results_all

df_train = filter_df(tournaments_ids_train)
df_test = filter_df(tournaments_ids_test)
dump_pickle('test.pkl', df_test)

def unpivot_players(df):
    df_results_cleaned = []
    for key, value in df.items():
        for team_data in value:
            team = team_data['team']
            mask = str(team_data['mask']).replace('X', '0').replace('?', '0')
            players = team_data['teamMembers']
            for player in players:
                df_results_cleaned.append([key, team['id'], player['player']['id'], mask])
    df = pd.DataFrame(df_results_cleaned)
    df.columns = ['tournament_id', 'team_id', 'player_id', 'mask']
    return df

df_train = unpivot_players(df_train)

def unpivot_questions(df):
    df_results_cleaned = []
    for _, row in df.iterrows():
        tournament_id = row['tournament_id']
        team_id = row['team_id']
        player_id = row['player_id']
        mask = row['mask']
        for idx in range(len(mask)):
            df_results_cleaned.append([tournament_id, team_id, player_id, idx, mask[idx]])
    df = pd.DataFrame(df_results_cleaned)
    df.columns = ['tournament_id', 'team_id', 'player_id', 'question_local_id', 'target']
    return df

df_train = unpivot_questions(df_train)
compression_opts = dict(method='zip', archive_name='train.csv')
df_train.to_csv('train.zip', index=False, compression=compression_opts)

full df:  5528
filtered df:  671
full df:  5528
filtered df:  169


## 2 Baseline model

Построим модель на one-hot векторах из игроков и турниров

In [5]:
df_train = pd.read_csv('train.zip', 
                       dtype={'tournament_id':np.int16, 'team_id':np.int32,
                              'player_id':np.int32, 'question_local_id':np.int32, 'target':np.int8})

In [6]:
df_train.head()

Unnamed: 0,tournament_id,team_id,player_id,question_local_id,target
0,4772,45556,6212,0,1
1,4772,45556,6212,1,1
2,4772,45556,6212,2,1
3,4772,45556,6212,3,1
4,4772,45556,6212,4,1


In [10]:
from sklearn.preprocessing import OneHotEncoder, Normalizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression, LinearRegression

In [11]:
def sigmoid(x):
    ex = np.exp(x)
    return ex / (1 + ex)

In [12]:
feature_generation = ColumnTransformer(
    transformers=[
        ('OneHot', OneHotEncoder(), ['player_id', 'question_local_id'])
    ],
    remainder='drop',
    sparse_threshold=1
)

soft_pipe = Pipeline(
    steps=[
        ('feature_generation', feature_generation),
        #('scaler', StandardScaler()),
        #('classifier', LogisticRegression(solver='liblinear', max_iter=100))
        ('classifier', LinearRegression())
    ]
)

In [13]:
df_train.shape

(20968351, 5)

In [93]:
%%time
# logistic regression on soft labels
# https://stackoverflow.com/questions/42800769/scikit-learn-classification-on-soft-labels/60969923#60969923

y = np.clip(df_train['target'], 1e-8, 1 - 1e-8)   # numerical stability
inv_sig_y = np.log(y / (1 - y))  # transform to log-odds-ratio space

soft_pipe.fit(df_train, inv_sig_y)

Pipeline(steps=[('feature_generation',
                 ColumnTransformer(sparse_threshold=1,
                                   transformers=[('OneHot', OneHotEncoder(),
                                                  ['player_id',
                                                   'question_local_id'])])),
                ('classifier', LinearRegression())])

## 3 Качество рейтинг-системы

In [15]:
import dill

def dump_dill(file_path, obj):
    with open(file_path, "wb") as dump_file:
        dill.dump(obj, dump_file)
        
def load_dill(file_path):
    with open(file_path, "rb") as dump_file:
        return dill.load(dump_file)
    
dump_dill('logreg_model.dll', pipe)

In [16]:
soft_pipe = load_dill('logreg_model.dll')

In [22]:
# добудем веса из модели для каждого игрока
player_to_weight = {int(name.split('_')[-1]): rank for name, rank in zip(
    soft_pipe['feature_generation'].get_feature_names(),
    soft_pipe['classifier'].coef_) if name.split('_')[2] == 'x0'}

Посмотрим насколько совпадают топ 100 и наш рейтинг

Официальный рейтинг: https://rating.chgk.info/players.php

In [23]:
official_top_100_ids = pd.read_csv('player_off_top.csv')[:100]
official_top_100_ids = set(official_top_100_ids[' ИД'])

player_to_weight_sorted = sorted(player_to_weight.items(), key=lambda kv: kv[1], reverse=True)
predicted_top_100_ids = set(k for k, v in player_to_weight_sorted[:100])

len(official_top_100_ids.intersection(predicted_top_100_ids))

6

In [24]:
from scipy import stats

In [25]:
def get_positions_label(tournament):
    return [team['position'] for team in tournament]

def get_position_prediction(tournament):
    """
    ранжируем команды по весу = (среднее от весов участников),
    есть игрока не было в train -- берем средний вес игрока в трейне
    """
    avg_weight = np.mean(list(player_to_weight.values()))
    team_rating = []
    for idx, team in enumerate(tournament):
        weight = 0
        cnt = 0
        for player_info in team['teamMembers']:
            p_id = player_info['player']['id']
            try:
                weight += player_to_weight[p_id]
            except:
                weight += avg_weight
            cnt += 1
        try:
            mean = weight/cnt
        except:
            mean = 0
        team_rating.append((idx + 1, mean))
    team_rating = sorted(team_rating, key=lambda kv: kv[1], reverse=True)
    return [pos for pos, weight in team_rating]

In [26]:
def get_score(df, corr):
    x = [corr(get_positions_label(t), get_position_prediction(t)).correlation for t in df.values()]
    return np.nanmean(x)

for corr in [('Spearman', stats.spearmanr), ('Kendall ', stats.kendalltau)]:
    print(f'Avg {corr[0]} corr value for df = {get_score(df_test, corr[1])}')

Avg Spearman corr value for df = 0.7044886291365373
Avg Kendall  corr value for df = 0.5481479079052269


## 4 EM схема

Скрытыми переменными $z_{i,q}$ являются ответил ли $i$-ый игрок в команде на вопрос $q$. Пусть $y_{tq}$ - ответ команды $T$ на вопрос $q$. Если команда не ответила на вопрос, то ни один игрок команды не ответил на вопрос, если ответила - то значит хотя бы один игрок ответил.

Величины $z_{i,q}$ будем моделировать логистическими регрессиями от *player_id* и *question_local_id*.

$$p\left(z_{i,q} \vert \overrightarrow{x_i}\right) \sim \sigma\left(\overrightarrow{x_i}\right)$$

Тогда *Expectation* шаг будет рассчитываться по формуле

$$ 
\mathbb{E} \left[ z^{(m+1)}_{i,q} \right] = 
\begin{cases}
0, \text{если } y_{tq} = 0,\\
p\left( z^{(m)}_{i,q} = 1 \vert \exists j \in t : z^{(m)}_{j,q} = 1\right), \text{ если } y_{tq} = 1. 
\end{cases} 
$$

Где
$$
p\left( z^{(m)}_{i,q} = 1 \vert \exists j \in t : z^{(m)}_{j,q} = 1\right) = \frac{\sigma \left(\overrightarrow{x^{(m)}_i}\right)}{1-\prod\limits_{k \in T} \left(1 - \sigma\left(\overrightarrow{x^{(m)}_k}\right)\right)}
$$

На *Maximization* шаге будем бучать модель

$$\mathbb{E} \left[ z^{(m+1)}_{i,q} \right] \sim \sigma\left(\overrightarrow{x^{(m+1)}_i}\right)$$

В качестве начальных параметров возьмем параметры модели из бейзлайна

In [83]:
from functools import reduce

def pred_by_team(arr):
    arr = 1-arr
    return 1 - reduce(lambda x, y: x*y, arr)

In [84]:
df_train['new_target'] = df_train['target']
number_iter = 5

for i in range(number_iter):
    print(f"Iteration number: {i}")
    #if i == 0:
    #    df_train['pred'] = pipe.predict_proba(df_train)[:, 1]
    #if i > 0:
    df_train['pred'] = sigmoid(soft_pipe.predict(df_train))
        
    pred_by_group = (df_train[df_train.target == 1]
             .groupby(['tournament_id', 'team_id', 'question_local_id'], as_index=False)['pred']
             .apply(pred_by_team)
             .rename(columns={'pred':'pred_by_team'}))
    df_train = pd.merge(df_train, pred_by_group, how='left', on=['tournament_id', 'team_id', 'question_local_id'])
    df_train['new_target'] = df_train['pred']/df_train['pred_by_team']
    df_train.loc[df_train['target'] == 0, 'new_target'] = 0

    df_train = df_train.drop(columns=['pred', 'pred_by_team'])

    y = np.clip(df_train['new_target'], 1e-8, 1 - 1e-8)   # numerical stability
    inv_sig_y = np.log(y / (1 - y))  # transform to log-odds-ratio space

    soft_pipe.fit(df_train, inv_sig_y)
    
    # добудем веса из модели для каждого игрока
    player_to_weight = {int(name.split('_')[-1]): rank for name, rank in zip(
        soft_pipe['feature_generation'].get_feature_names(),
        soft_pipe['classifier'].coef_) if name.split('_')[2] == 'x0'}
    
    for corr in [('Spearman', stats.spearmanr), ('Kendall ', stats.kendalltau)]:
        print(f'Avg {corr[0]} corr value for df = {get_score(df_test, corr[1])}')

Iteration number: 0
Avg Spearman corr value for df = 0.6964100942248623
Avg Kendall  corr value for df = 0.5424484626801616
Iteration number: 1
Avg Spearman corr value for df = 0.6902681536903513
Avg Kendall  corr value for df = 0.5326779106657819
Iteration number: 2
Avg Spearman corr value for df = 0.6906155050046141
Avg Kendall  corr value for df = 0.5348640796597474
Iteration number: 3
Avg Spearman corr value for df = 0.6873074547250665
Avg Kendall  corr value for df = 0.5308639726982605
Iteration number: 4
Avg Spearman corr value for df = 0.6886751033625452
Avg Kendall  corr value for df = 0.5331223097260975
