In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from data_apps_aws.sql import get_db_engine, get_db_data
from data_apps_aws.utils import make_outside_legend
from data_apps_aws.sql import *

In [None]:
# Notebook-wide plotting defaults: wide figures, seaborn styling.
plt.rcParams.update({'figure.figsize': (14, 6)})
sns.set()

In [None]:
# Handle returned by get_db_engine for 'bfv_data'; it is passed to every get_db_data call below.
db_con = get_db_engine('bfv_data')

Noteworthy data aspects:
- Trainers: first and last names appear to be swapped, so almost none of them are found and their names are misclassified.
- "SV Bernried II" exists twice.

## Names not found

In [None]:
# Load the complete match_participants table (no column or row filter).
query = """
SELECT *
FROM match_participants
"""

match_participants_raw = get_db_data(query, db_con)

In [None]:
# Preview the first four rows to check which columns are available.
match_participants_raw.head(4)

In [None]:
# Load the complete page_scan_logging_forebears table and preview it.
# The cells below use its `name` and `success` columns.
query = """
SELECT *
FROM page_scan_logging_forebears
"""

name_scans = get_db_data(query, db_con)
name_scans.head(8)

In [None]:
# Attach the scan outcome (`success`) to every participant row.
# The participant's first name is renamed to `name`, which is the join key.
player_scan_results = (
    match_participants_raw[['team', 'player_name', 'first_name', 'type', 'person_id']]
    .rename(columns={'first_name': 'name'})
    .merge(name_scans[['name', 'success']], how='left', on='name')
)
player_scan_results.head(4)

In [None]:
# Exclude trainers from the evaluation.
is_trainer = player_scan_results['type'] == 'trainer'
player_scan_results_no_trainer = player_scan_results.loc[~is_trainer, :]

# Per (team, person): summed `success`, capped at 1 so that a person
# appearing in several rows counts only once.
hits_per_person = (
    player_scan_results_no_trainer
    .drop(columns='type')
    .groupby(['team', 'person_id'])['success']
    .sum()
    .to_frame()
)
hits_per_person.loc[hits_per_person['success'] > 1, 'success'] = 1

# Per team: number of persons for each success value, one column per value.
team_scan_results = (
    hits_per_person
    .reset_index()
    .groupby(['team', 'success'])
    .count()
    .reset_index()
    .pivot(index='team', columns='success', values='person_id')
    .fillna(0)
)
team_scan_results['total'] = team_scan_results.sum(axis=1)
# Share of a team's persons with a successful scan; worst teams first.
team_scan_results['name_info_exists'] = team_scan_results[1.0] / team_scan_results['total']
team_scan_results.sort_values('name_info_exists').head(10)

In [None]:
# Spot-check one team: distinct non-trainer rows for "FC Espanol".
player_scan_results_no_trainer.query('team == "FC Espanol"').drop_duplicates()

In [None]:
# Ten teams with the most distinct person_ids.
# This uses the unfiltered frame, so trainers are included.
(
    player_scan_results
    .groupby(['team'])['person_id']
    .nunique()
    .sort_values()
    .to_frame()
    .tail(10)
)

In [None]:
# Distribution of the `success` flag over all rows of the scan log.
name_scans['success'].value_counts()

In [None]:
# Distribution of `success` over distinct (name, success) pairs of non-trainers.
(
    player_scan_results_no_trainer[['name', 'success']]
    .drop_duplicates()['success']
    .value_counts()
)

In [None]:
# First 20 distinct non-trainer names whose scan has success == 0.
(
    player_scan_results_no_trainer
    .loc[lambda frame: frame['success'] == 0, ['name', 'success']]
    .drop_duplicates()
    .head(20)
)

## DEV

In [None]:
# The match_participants table is already held in `match_participants_raw`,
# loaded by the identical `SELECT * FROM match_participants` query in the
# "Names not found" section. Reuse it instead of reading the whole table from
# the database a second time; the copy keeps the two frames independent.
all_players = match_participants_raw.copy()

In [None]:
# One row per distinct (person_id, first_name) combination.
unique_players = all_players[['person_id', 'first_name']].drop_duplicates()

## Name distribution

In [None]:
# Count how often each first name occurs among the unique players.
first_name_counts = unique_players['first_name'].value_counts()
bfv_most_freq_players = first_name_counts.to_frame().reset_index()
bfv_most_freq_players.columns = ['first_name', 'sample_count']

# Lower-cased variant of the first name, used as lookup key further down.
# NOTE(review): lower-casing happens after counting, so spellings that differ
# only by case remain separate rows with the same `name` - confirm intended.
bfv_most_freq_players['name'] = bfv_most_freq_players['first_name'].map(str.lower)

# Bar chart of the 40 most frequent names.
(
    bfv_most_freq_players[['name', 'sample_count']]
    .set_index('name')
    .head(40)
    .plot(kind='bar')
)
plt.show()

## Name origin

In [None]:
# Load the complete forebears_name_lookup table.
# The cells below use its `name`, `country` and `name_count` columns.
query = """
SELECT *
FROM forebears_name_lookup
"""

name_origin_data_raw = get_db_data(query, db_con)

In [None]:
# Display the raw lookup table (the notebook renders the whole frame here).
name_origin_data_raw

In [None]:
# Sum name_count over all countries per name and plot the 30 largest totals.
global_name_totals = name_origin_data_raw.groupby('name')['name_count'].sum()
(
    global_name_totals
    .sort_values(ascending=False)
    .head(30)
    .to_frame()
    .plot(kind='bar')
)
plt.title('Global name occurrences')
plt.show()

In [None]:
# German rows only, indexed by name and ordered by descending name_count.
most_freq_sample_occurs = (
    name_origin_data_raw
    .query('country == "Germany"')
    .loc[:, ['name', 'name_count']]
    .sort_values('name_count', ascending=False)
    .set_index('name')
)

# Bar chart of the top 20.
most_freq_sample_occurs.head(20).plot(kind='bar')
plt.title('Most frequent occurrences in Germany')
plt.show()

In [None]:
# compute German frequency per name

In [None]:
# Occurrences per name in Germany.
german_occurs = (
    name_origin_data_raw
    .query('country == "Germany"')[['name', 'name_count']]
    .rename(columns={'name_count': 'count_germany'})
)

# Occurrences per name summed over all countries.
global_occurs = (
    name_origin_data_raw[['name', 'name_count']]
    .groupby('name')
    .sum()
    .reset_index()
    .rename(columns={'name_count': 'count_global'})
)

# Share of each name's total occurrences that fall in Germany.
# Names without a German row get count_germany = 0 through the fillna.
german_occur_freq = global_occurs.merge(german_occurs, how='left', on='name').fillna(0)
german_occur_freq['german_freq'] = (
    german_occur_freq['count_germany'] / german_occur_freq['count_global']
)
german_occur_freq = german_occur_freq.sort_values('german_freq', ascending=False)
german_occur_freq.head(30)

In [None]:
# German share for the names in the order of `most_freq_sample_occurs`
# (most frequent in Germany first); first 30 rows.
(
    german_occur_freq
    .set_index('name')
    .loc[most_freq_sample_occurs.index]
    .head(30)
)

In [None]:
# Pick, for every name, the row of the country with the largest name_count.
max_idx = name_origin_data_raw.groupby('name')['name_count'].idxmax()
most_freq_occur_countries = (
    name_origin_data_raw
    .loc[max_idx, ['name', 'country', 'name_count']]
    # sort_values returns a new frame rather than sorting in place, so its
    # result must be kept for the ordering by name to take effect.
    .sort_values('name')
    .reset_index(drop=True)
)

In [None]:
# Sanity check: exactly one "most frequent country" row per distinct name in the raw lookup.
assert most_freq_occur_countries.shape[0] == len(name_origin_data_raw['name'].unique())

In [None]:
# Display the per-name German share table (sorted by german_freq, descending).
german_occur_freq

In [None]:
# `name_country_lookup` is only created in the following cell, so referencing
# it here raises NameError on a fresh kernel. Preview the second input of that
# merge instead; the merged result is shown right after it is built.
most_freq_occur_countries.head()

In [None]:
# Combine the most frequent country and the German share per name, then
# align the rows with the player name ranking (most frequent player names first).
# Player names missing from the lookup become all-NaN rows through the reindex.
name_country_lookup = (
    most_freq_occur_countries
    .merge(german_occur_freq, on='name')
    .set_index('name')
    .reindex(bfv_most_freq_players['name'])
)
name_country_lookup.head(40)

In [None]:
# German share for the first 80 names of the lookup.
name_country_lookup['german_freq'].head(80).plot.bar()
plt.show()

In [None]:
# Names with complete lookup data whose German share exceeds 0.1
# (the same value as `classification_threshold` in the following cell).
names_above_freq_threshold = name_country_lookup.dropna()
names_above_freq_threshold[names_above_freq_threshold['german_freq'] > 0.1]

In [None]:
# German share above which a name is labelled 'german'.
classification_threshold = 0.1

lookup_flat = name_country_lookup.reset_index()
german_share = lookup_flat['german_freq']

# Start from 'unknown'; names whose share is NaN match neither comparison
# below and therefore keep that label.
country_classification = lookup_flat[['name']].copy()
country_classification['country'] = 'unknown'
country_classification.loc[german_share > classification_threshold, 'country'] = 'german'
country_classification.loc[german_share <= classification_threshold, 'country'] = 'foreign'

In [None]:
# Attach the german / foreign / unknown label to every unique player.
mapped_players = unique_players.copy()

# Vectorised lower-casing: unlike calling `x.lower()` on each element, it
# leaves a missing first name as NaN instead of raising AttributeError.
mapped_players['name'] = mapped_players['first_name'].str.lower()

# `country_classification` is built from names that were counted before
# lower-casing, so the same lower-cased name can be listed more than once.
# Such repeats would multiply player rows in the merge, so de-duplicate on the
# key first (repeats stem from the same lookup row and carry the same label).
mapped_players = mapped_players.merge(
    country_classification.drop_duplicates(subset='name'),
    how='left',
    on='name',
)

# A left join against unique keys must keep exactly one row per player.
assert len(mapped_players) == len(unique_players)

In [None]:
# Number of unique players per classification label.
mapped_players['country'].value_counts()

## Per team

In [None]:
# Preview the classified players before joining them back to the match data.
mapped_players.head(3)

In [None]:
# (rows, columns) of the classified player table.
mapped_players.shape

In [None]:
# Preview the participant rows that the classification is merged onto.
all_players.head(2)

In [None]:
# Inspect the merge result; the same merge is assigned to `club_players` in the next cell.
# No `on=` is given, so pandas joins on all columns the two frames share.
all_players.merge(mapped_players)

In [None]:
# Participant rows enriched with the per-player classification.
# No join key is given, so pandas joins on all columns the two frames share.
club_players = pd.merge(all_players, mapped_players)

In [None]:
# Per team: number of distinct match_ids and person_ids, fewest persons first.
(
    club_players[['team', 'match_id', 'person_id']]
    .groupby(['team'])
    .nunique()
    .sort_values('person_id')
)

In [None]:
# Distinct persons per (team, classification label), one column per label.
club_nations = (
    club_players[['team', 'country', 'person_id']]
    .groupby(['team', 'country'])['person_id']
    .nunique()
    .reset_index()
    .pivot(index='team', columns='country', values='person_id')
    .fillna(0)
)

# Team size over all labels and the share labelled 'german'; lowest share first.
club_nations['total'] = club_nations.sum(axis=1)
club_nations['german_ratio'] = club_nations['german'] / club_nations['total']
club_nations = club_nations.sort_values('german_ratio')
club_nations.head(20)

In [None]:
# The 20 teams at the end of the german_ratio ordering (ascending sort, so the highest ratios).
club_nations.tail(20)

In [None]:
# Column totals over all teams (one per label plus 'total'); the ratio column is left out of the sum.
club_nations.drop(columns='german_ratio').sum()

In [None]:
# Histogram of the per-team german_ratio.
club_nations['german_ratio'].plot.hist()
plt.show()

In [None]:
# Per team: number of distinct match_ids, person_ids and classification labels,
# fewest persons first.
(
    all_players
    .merge(mapped_players)[['team', 'match_id', 'person_id', 'country']]
    .groupby(['team'])
    .nunique()
    .sort_values('person_id')
)

In [None]:
# (rows, columns) of the participant table.
all_players.shape

In [None]:
# Load the complete fairness_tables table.
query = """
SELECT *
FROM fairness_tables
"""

fairness_raw = get_db_data(query, db_con)

In [None]:
# Display the raw fairness table (the notebook renders the whole frame here).
fairness_raw

## Dev

In [None]:
# Per-player variant of the lookup: every unique player joined to the most
# frequent country of their lower-cased first name.
# NOTE: this rebinds `name_country_lookup`, which the "Name origin" section
# defines as a name-indexed frame; re-run that section before re-using its cells.
name_country_lookup = unique_players.copy()
# Vectorised lower-casing; a missing first name stays NaN instead of raising.
name_country_lookup['name'] = name_country_lookup['first_name'].str.lower()
# Inner join on the lower-cased name: players without a lookup entry are dropped.
name_country_lookup = name_country_lookup.merge(most_freq_occur_countries, on='name')

In [None]:
# Number of players per (name, most frequent country); 40 largest groups.
(
    name_country_lookup
    .groupby(['name', 'country'])['person_id']
    .count()
    .sort_values(ascending=False)
    .head(40)
)

In [None]:
# Starting point for a per-player Germany lookup; at this stage it is only a copy of unique_players.
germany_prob_lookup = unique_players.copy()

In [None]:
# Players whose first name is most frequent in Germany (first rows only).
# The expression must not end in a dangling attribute access `.`, which is
# a SyntaxError and stops the cell from running.
name_country_lookup.query('country == "Germany"').head()

In [None]:
# Display the lookup (still identical to unique_players; the notebook renders the whole frame here).
germany_prob_lookup