In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from data_apps_aws.sql import get_db_engine, get_db_data
from data_apps_aws.utils import make_outside_legend
from data_apps_aws.sql import *

In [None]:
# Default figure size (width, height) for every plot in this notebook.
plt.rcParams['figure.figsize'] = (14, 6)
# Activate seaborn's default styling for all following matplotlib figures.
sns.set()

## Get player data

In [None]:
# Database handle for 'bfv_data'; passed to every get_db_data call in this notebook.
db_con = get_db_engine('bfv_data')

In [None]:
# Select every column of the nameprism_player_info table.
# NOTE(review): the name `query` is reassigned by later cells, so re-run this
# cell before re-running the fetch cell that follows it.
query = """
SELECT *
FROM nameprism_player_info
"""

In [None]:
# Run the query defined in the previous cell. Later cells rely on the columns
# player_name, nationality and prob of this frame.
all_nationality_info_df = get_db_data(query, db_con)
all_nationality_info_df.head(3)

In [None]:
# Select every column of the match_participants table.
query = """
SELECT *
FROM match_participants
"""

In [None]:
# Only the identifying columns of each participant are needed downstream.
participant_columns = ['player_name', 'team', 'type', 'team_id', 'person_id']
all_match_participants = get_db_data(query, db_con).loc[:, participant_columns]

In [None]:
# Quick look at the reduced participant table.
all_match_participants.head(3)

## Classifier

In [None]:
# Sort ascending by probability inside each player so that the last row per
# player is the nationality guess with the highest `prob`.
nationality_sorted = all_nationality_info_df.sort_values(['player_name', 'prob'])
player_nationalities = nationality_sorted.groupby('player_name').tail(1)
player_nationalities.head(3)

In [None]:
# Keep only the "European,German" probability row of every player; copy so the
# column assignment below does not touch all_nationality_info_df.
player_german_or_not = all_nationality_info_df.query('nationality == "European,German"').copy()

# Binary label: a German probability above 0.5 counts as 'german', anything else as 'foreign'.
xx_inds_german_players = player_german_or_not['prob'] > 0.5
player_german_or_not['nationality'] = xx_inds_german_players.map({True: 'german', False: 'foreign'})

# After the filter, `prob` is specifically the German probability, so name it that way.
player_german_or_not = player_german_or_not.rename(columns={'prob': 'prob_german'})
player_german_or_not.head(4)

Investigate individual players

In [None]:
# Show the three most likely nationality guesses for one hand-picked player.
this_player_name = 'Ümüt Tursun'
is_this_player = all_nationality_info_df['player_name'] == this_player_name
all_nationality_info_df.loc[is_this_player].sort_values('prob', ascending=False).head(3)

In [None]:
# One row per (team, person), then attach the german/foreign label.
# NOTE(review): merge() without `on=` joins on every column name the two frames
# share - presumably only player_name; verify against the table schemas.
unique_team_players = all_match_participants.drop_duplicates(subset=['team_id', 'person_id'])
players_german_or_foreign = unique_team_players.merge(player_german_or_not)

In [None]:
# Display the merged participant / nationality-label frame for a visual check.
players_german_or_foreign

In [None]:
# Number of distinct person_ids that survived the merge with the labels.
players_german_or_foreign['person_id'].nunique()

In [None]:
# Count labelled persons per team and nationality label.
nationality_counts = (
    players_german_or_foreign
    .groupby(['team_id', 'team', 'nationality'])['person_id']
    .count()
    .reset_index()
)

# Wide layout: one row per team, one column per label; a missing label means zero persons.
team_german_ratio = (
    nationality_counts
    .pivot_table(index=['team_id', 'team'], columns='nationality', values='person_id')
    .fillna(0)
    .assign(
        n_all_persons=lambda counts: counts.sum(axis=1),
        german_ratio=lambda counts: counts['german'] / counts['n_all_persons'],
    )
    .sort_values('german_ratio')
)
team_german_ratio.head(3)

In [None]:
# Distribution of the count-based German ratio across teams.
ax = team_german_ratio['german_ratio'].hist()
ax.set_xlabel('Ratio of German players')
ax.set_ylabel('Number of teams')
plt.show()

In [None]:
# Alternative ratio: the mean German probability of a team's players instead of
# the share of players labelled 'german'. This REPLACES the count-based
# team_german_ratio from the cell above; all later cells use this version.
team_german_ratio = (
    players_german_or_foreign
    .groupby(['team_id', 'team'])['prob_german']
    .mean()
    .rename('german_ratio')
    .to_frame()
)
team_german_ratio.head(2)

In [None]:
# Distribution of the probability-based German ratio across teams.
ax = team_german_ratio['german_ratio'].hist()
ax.set(xlabel='Ratio of German players', ylabel='Number of teams')
plt.show()

## Championship / team lookup

Create a look-up for match-ids and their respective championship:

In [None]:
# Load all match-day link rows.
query = """
SELECT *
FROM match_day_links
"""

match_day_links_raw = get_db_data(query, db_con)

# Keep only rows that carry a link; copy because a match_id column is added later.
has_link = match_day_links_raw['link'].notna()
championship_match_affil = match_day_links_raw.loc[has_link].copy()

In [None]:
# The match id is whatever follows the fixed URL prefix of each link.
# A link without this prefix raises IndexError, which surfaces unexpected URLs.
match_url_prefix = 'https://www.bfv.de/spiele/'
all_match_ids = [
    link.split(match_url_prefix)[1]
    for link in championship_match_affil['link'].values
]
championship_match_affil['match_id'] = all_match_ids
championship_match_affil.tail(3)

In [None]:
# Select only the match/team identifiers from match_participants.
query = """
SELECT match_id, team_id, team
FROM match_participants
"""

In [None]:
# match_participants is queried per participant here, so collapse the result to
# one row per distinct (match_id, team_id, team) combination.
all_match_teams = get_db_data(query, db_con).drop_duplicates()

In [None]:
# Link championships to teams via the matches they share; match_id is only the
# join key and is dropped afterwards.
championship_matches = championship_match_affil[['championship_id', 'match_id']]
championship_team_affil = championship_matches.merge(all_match_teams).drop(columns='match_id')
championship_team_affil.head(3)

## Migration vs fairness

In [None]:
# Select every column of the fairness_tables table.
query = """
SELECT *
FROM fairness_tables
"""

# Raw fairness rows; the columns championship_id, quote and team are used below.
fairness_raw = get_db_data(query, db_con)

In [None]:
# Attach team ids to the fairness scores. merge() joins on all shared column
# names, and duplicate rows produced by the join are removed.
fairness_columns = fairness_raw.loc[:, ['championship_id', 'quote', 'team']]
fairness_info = fairness_columns.merge(championship_team_affil).drop_duplicates()

In [None]:
# Preview the fairness scores with their team ids.
fairness_info.head(3)

In [None]:
# Reminder of the per-team German ratio that is joined onto the fairness data next.
team_german_ratio.head(3)

In [None]:
# Move the (team_id, team) index back into columns so it can serve as the merge
# key, then combine fairness score and German ratio per team.
german_ratio_per_team = team_german_ratio['german_ratio'].reset_index()
club_fairness_nations = fairness_info.merge(german_ratio_per_team)

In [None]:
# Raw relationship between a team's German ratio and its fairness score.
fig, ax = plt.subplots()
ax.scatter(club_fairness_nations['german_ratio'], club_fairness_nations['quote'])
ax.set_xlabel('Ratio of German players')
ax.set_ylabel('Fairness score (lower means more fair)')
plt.show()

In [None]:
import statsmodels.api as sm

In [None]:
# Design matrix: add_constant prepends the intercept column, so column 0 is the
# constant and column 1 is german_ratio.
X = sm.add_constant(club_fairness_nations['german_ratio'].values)

# Response: the fairness score of each team.
Y = club_fairness_nations['quote'].values

In [None]:
# Ordinary least squares of fairness score on German ratio.
model = sm.OLS(endog=Y, exog=X)
results = model.fit()
# params[0] is the intercept, params[1] the slope on german_ratio.
results.params

In [None]:
# sm.add_constant prepends the constant, so coefficient 0 is the intercept and
# coefficient 1 is the slope on german_ratio. The original contrast [1, 0] only
# tested the intercept against zero; the slope is the quantity this section is
# about, so both are reported: c0 = intercept, c1 = german_ratio slope.
print(results.t_test([[1, 0], [0, 1]]))

In [None]:
# In-sample fitted values; predict() is called without new data, so the values
# line up row by row with club_fairness_nations.
y_vals = results.predict()

In [None]:
# Same scatter as before, with the fitted regression line drawn on top.
fig, ax = plt.subplots()
ax.scatter(club_fairness_nations['german_ratio'], club_fairness_nations['quote'])
ax.plot(club_fairness_nations['german_ratio'].values, y_vals, c='red')
ax.set_xlabel('Ratio of German players')
ax.set_ylabel('Fairness score (lower means more fair)')
plt.show()

## Quick and dirty analysis

In [None]:
# Most likely nationality per player. The original referenced
# `all_player_info_df`, which is never defined in this notebook (NameError on a
# fresh kernel); the frame holding player_name / nationality / prob is
# all_nationality_info_df.
max_inds = all_nationality_info_df.groupby('player_name')['prob'].idxmax()
best_single_guesses = all_nationality_info_df.loc[max_inds].sort_values('prob')

In [None]:
# Number of players per most-likely nationality, smallest group first.
best_single_guesses.groupby('nationality')['player_name'].count().sort_values()

In [None]:
# NOTE(review): 14737 is a hard-coded count, presumably copied from the output
# of the previous cell (one nationality group) - TODO confirm and derive it from
# the data so the ratio stays correct when the tables change.
14737 / len(best_single_guesses)

In [None]:
# Top three nationality guesses per player. The original referenced the
# undefined `all_player_info_df` (NameError on a fresh kernel); the data lives
# in all_nationality_info_df.
highest_three_likelihoods = (
    all_nationality_info_df
    .sort_values(['player_name', 'prob'], ascending=False)
    .groupby('player_name')
    .head(3)
)
# Players whose three best guesses together cover the least probability mass,
# i.e. the names the classifier is least certain about.
highest_three_likelihoods.groupby('player_name')['prob'].sum().sort_values().head(20)