In [2]:
import json, pickle
import warnings
from collections import Counter
from datetime import datetime

import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as pyplot

from tqdm import tqdm

warnings.filterwarnings("ignore")

In [3]:
# Load the tournament, result and player dumps from pickle.
# NOTE: pickle.load can execute arbitrary code while deserialising;
# only point these paths at files from a trusted source.

# Context managers close each file handle as soon as its payload is read.
with open('data/tournaments.pkl', 'rb') as tournaments_file:
    tournaments = pickle.load(tournaments_file)
with open('data/results.pkl', 'rb') as results_file:
    results = pickle.load(results_file)
with open('data/players.pkl', 'rb') as players_file:
    players = pickle.load(players_file)

In [4]:
def hasValidProperties(tournament_info):
    """Check whether a tournament's team results are usable.

    Parameters
    ----------
    tournament_info : sequence of dict
        Per-team result records. Each record is looked up for a ``'mask'``
        entry and a ``'teamMembers'`` collection.

    Returns
    -------
    bool
        True only if there is at least one team and every team has a
        non-``None`` mask and a non-empty ``'teamMembers'`` collection.
        A record that lacks either key counts as invalid rather than
        raising ``KeyError``.
    """
    if not tournament_info:
        # Without this guard all() below is vacuously True for a tournament
        # that has no team records at all.
        return False
    return all(
        team.get('mask') is not None and bool(team.get('teamMembers'))
        for team in tournament_info
    )

In [14]:
# Split tournaments with usable results by the year prefix of "dateStart":
# 2019 goes to the training set, 2020 to the test set, anything else is dropped.
train_tournaments = {}
test_tournaments = {}
for tournament_id, tournament_info in tqdm(tournaments.items()):
    tournament_results = results[tournament_id]
    if not hasValidProperties(tournament_results):
        continue
    start_year = tournament_info["dateStart"][:4]
    if start_year == "2019":
        train_tournaments[tournament_id] = tournament_results
    elif start_year == "2020":
        test_tournaments[tournament_id] = tournament_results

100%|██████████| 5528/5528 [00:00<00:00, 10041.38it/s]


In [15]:
# Number of tournaments in the train and test splits, one per line.
print(len(train_tournaments), len(test_tournaments), sep="\n")

657
386


# 2. Baseline: logistic regression on one-hot player and question features

In [26]:
def processTournamentData(tournaments):
    """Reduce raw tournament results to per-team masks and player ids.

    Parameters
    ----------
    tournaments : dict
        Maps a tournament id to an iterable of team records. Each record is
        read for ``['team']['id']``, ``'mask'`` and ``'teamMembers'``.

    Returns
    -------
    dict
        ``{tournament_id: {team_id: {'mask': str, 'players': list}}}`` where
        every ``'?'`` and ``'X'`` in the mask has been replaced by ``'0'`` and
        ``'players'`` holds each member's ``['player']['id']``.
    """
    processed = {}
    for tournament_id, team_records in tqdm(tournaments.items()):
        per_team = {}
        for record in team_records:
            team_id = record['team']['id']
            # '?' and 'X' are both rewritten to '0'.
            cleaned_mask = record.get('mask').replace('?', '0').replace('X', '0')
            roster = [member['player']['id'] for member in record['teamMembers']]
            per_team[team_id] = {'mask': cleaned_mask, 'players': roster}
        processed[tournament_id] = per_team
    return processed



In [27]:
# Run both splits through the same routine (train first, then test).
train_data, test_data = (
    processTournamentData(split)
    for split in (train_tournaments, test_tournaments)
)

100%|██████████| 657/657 [00:00<00:00, 3867.27it/s]
100%|██████████| 386/386 [00:00<00:00, 8855.28it/s]


In [44]:
# Build the long-format training table: one row per (team member, question) pair.
# The 'player_id' and 'question_id' columns are what gets one-hot encoded into X.

temp = []

question_shift = 0

for tournament_id, teams in tqdm(train_data.items()):
    # Width of the question-id range reserved for this tournament. It is tracked
    # explicitly: reading `mask` after the inner loop is stale (or undefined) for
    # a tournament without teams, and too short whenever the last team's mask is
    # not the longest one, which would let question ids of consecutive
    # tournaments overlap.
    tournament_questions = 0
    for team_id, team in teams.items():
        mask = np.array([np.int32(answer) for answer in team['mask']])
        players = team['players']
        n_questions, n_players = len(mask), len(players)
        tournament_questions = max(tournament_questions, n_questions)

        question_ids = np.arange(question_shift, question_shift + n_questions)
        n_rows = n_questions * n_players
        # Player-major layout: each player is repeated once per question while
        # the mask and the question ids are cycled once per player.
        answers = np.column_stack([
            np.repeat(tournament_id, n_rows),
            np.repeat(team_id, n_rows),
            np.repeat(players, n_questions),
            np.tile(mask, n_players),
            np.tile(question_ids, n_players),
        ])
        temp.append(answers)

    question_shift += tournament_questions

train_df = pd.DataFrame(np.vstack(temp).astype(np.int32),
                        columns=['tournament_id', 'team_id', 'player_id', 'answer', 'question_id'])

100%|██████████| 657/657 [00:06<00:00, 96.94it/s] 


In [45]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the player and question ids; categories not seen during
# fitting are ignored at transform time (handle_unknown='ignore').
feature_columns = ['player_id', 'question_id']

ohe = OneHotEncoder(handle_unknown='ignore')
X_train = ohe.fit(train_df[feature_columns]).transform(train_df[feature_columns])

# Target: the per-row 'answer' column of the training table.
y_train = train_df['answer']

In [46]:
%%timeit

from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(solver = 'saga', n_jobs=-1)

lr.fit(X_train, y_train)

In [42]:
# Player rating: pair every player id with the model weight of its one-hot column.
unique_players = np.unique(train_df['player_id'])
unique_questions = np.unique(train_df['question_id'])

# Assumes the first len(unique_players) coefficients belong to the player
# columns, in sorted id order (features were encoded as ['player_id', 'question_id']).
player_weights = lr.coef_[0][:len(unique_players)]
rating = pd.DataFrame({'player_id': unique_players,
                       'strength': player_weights})

# Map each player id to a "name surname" label.
players_fio = {info['id']: f"{info['name']} {info['surname']}" for info in players.values()}
rating['name'] = rating['player_id'].map(players_fio)

AttributeError: 'LogisticRegression' object has no attribute 'coef_'