In [37]:
import pandas as pd
import numpy as np
import os

import statsmodels.formula.api as smf
from statsmodels.formula.api import ols, mixedlm
from statsmodels.stats.anova import anova_lm

from scipy.stats import shapiro, ttest_ind, mannwhitneyu

In [41]:
base = os.path.dirname(os.getcwd())
stats_path = os.path.join(base, 'csv files', 'For_Stats.csv')

stats_df = pd.read_csv(stats_path)

In [42]:
same = stats_df[stats_df['is_same'] == 1]['judge_goe']
not_same = stats_df[stats_df['is_same'] == 0]['judge_goe']

In [43]:
print(f"All scores: {len(stats_df)}\n"
      f"Scores equal to the panel median: {len(stats_df[stats_df['goe_dist'] == 0])}\n"
      f"Scores higher or lower than the panel median: {len(stats_df[stats_df['goe_dist'] != 0])}")

All scores: 209619
Scores equal to the panel median: 124272
Scores higher or lower than the panel median: 85347


In [44]:
nx, ny = len(same), len(not_same)
print(f"Sample sizes: {nx}, {ny}")

Sample sizes: 16088, 193531


In [45]:
sample_0 = same.sample(5000, random_state = 42)
sample_1 = not_same.sample(5000, random_state = 42)

In [46]:
stat_0, p_0 = shapiro(sample_0)
stat_1, p_1 = shapiro(sample_1)

In [47]:
print(f"Group is_same=0: p={p_0:.5f}, {'Normal Distribution' if p_0 > 0.05 else 'Not Normal Distribution'}")
print(f"Group is_same=1: p={p_1:.5f}, {'Normal Distribution' if p_0 > 0.05 else 'Not Normal Distribution'}")

Group is_same=0: p=0.00000, Not Normal Distribution
Group is_same=1: p=0.00000, Not Normal Distribution


In [48]:
U1, p = mannwhitneyu(same, not_same, method='asymptotic', alternative="two-sided")
U2 = nx * ny - U1

In [49]:
print(f"\nMann-Whitney U-Test: U1_stat={U1:.4f}, p={p}")
print(f"\nMann-Whitney U-Test: U2_stat={U2:.4f}")


Mann-Whitney U-Test: U1_stat=1706749269.0000, p=7.594038389084582e-95

Mann-Whitney U-Test: U2_stat=1406777459.0000


In [50]:
mi_U1 = nx * ny / 2
sigma_U1 = np.sqrt(nx * ny * (nx + ny + 1) / 12)
Z = (U1 - mi_U1) / sigma_U1

r = Z / np.sqrt(nx + ny)

print(r)

0.04442046403786992


In [51]:
stats_df = stats_df[stats_df['base_value'] > 1.87]

In [52]:
all_country_combinations = []

for sc in stats_df['nation'].unique():
    for jc in stats_df['judge_nation'].unique():
        all_country_combinations.append((sc, jc))

len(all_country_combinations), len(set(all_country_combinations))

(2744, 2744)

In [53]:
skater_judge_com = stats_df.apply(lambda row: row['nation'] + row['judge_nation'], axis = 1)
print(len(skater_judge_com.unique()))
print(skater_judge_com.value_counts().head(43))

1819
JPNUSA    3081
JPNJPN    3041
JPNCAN    2643
USAUSA    2289
JPNKOR    2270
USAJPN    2193
USACAN    2041
JPNFRA    1770
USAKOR    1603
JPNITA    1442
CANCAN    1429
CANUSA    1271
KORJPN    1240
CANJPN    1215
USAFRA    1199
JPNFIN    1187
KORUSA    1170
KORKOR    1165
RUSRUS    1141
JPNCHN    1133
KORCAN    1127
JPNEST    1066
FRAFRA    1027
JPNGER    1027
USAITA    1010
RUSCAN    1007
RUSJPN     994
RUSUSA     972
JPNBEL     961
JPNRUS     917
JPNSUI     893
CANKOR     826
FRAJPN     802
USAEST     794
JPNAUS     783
FRAUSA     780
JPNCZE     765
CHNJPN     744
CHNCHN     725
USAFIN     724
USACHN     718
ITAITA     707
USARUS     706
Name: count, dtype: int64


In [54]:
stats_df['skater_judge'] = stats_df.apply(lambda row: row['nation'] + row['judge_nation'], axis = 1)

In [55]:
keep = set(stats_df['skater_judge'].value_counts().index[:43])
stats_trim = stats_df[stats_df['skater_judge'].isin(keep)]
stats_trim = stats_trim[stats_trim['judge_goe'] > 1.]

In [56]:
stats_trim['uid'] = stats_trim.apply(lambda row: row['element'] + row['name'], axis=1)
stats_trim

Unnamed: 0,rank,name,nation,startnr,total,tech,pcs,deductions,competition,element,...,judge_goe,judge_nation,is_same,judge_name,goe_dist,higher,lower,pt_bias,skater_judge,uid
2404,1,Kao MIURA,JPN,19,91.90,51.10,40.80,0.0,fc2023SEG001OF,3A,...,3.0,KOR,0,Na Young AHN,0.0,0,0,0,JPNKOR,3AKao MIURA
2405,1,Kao MIURA,JPN,19,91.90,51.10,40.80,0.0,fc2023SEG001OF,FCSp4,...,2.0,KOR,0,Na Young AHN,0.0,0,0,0,JPNKOR,FCSp4Kao MIURA
2406,1,Kao MIURA,JPN,19,91.90,51.10,40.80,0.0,fc2023SEG001OF,4T+3T,...,4.0,KOR,0,Na Young AHN,-1.0,1,0,0,JPNKOR,4T+3TKao MIURA
2407,1,Kao MIURA,JPN,19,91.90,51.10,40.80,0.0,fc2023SEG001OF,CSSp4,...,3.0,KOR,0,Na Young AHN,-1.0,1,0,0,JPNKOR,CSSp4Kao MIURA
2408,1,Kao MIURA,JPN,19,91.90,51.10,40.80,0.0,fc2023SEG001OF,StSq3,...,3.0,KOR,0,Na Young AHN,0.0,0,0,0,JPNKOR,StSq3Kao MIURA
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
209487,14,Lorine SCHILD,FRA,10,117.31,63.42,53.89,0.0,wc2025SEG004OF,3Lz+3T,...,2.0,FRA,1,Florence VUYLSTEKER,0.0,0,0,0,FRAFRA,3Lz+3TLorine SCHILD
209493,14,Lorine SCHILD,FRA,10,117.31,63.42,53.89,0.0,wc2025SEG004OF,StSq3,...,2.0,FRA,1,Florence VUYLSTEKER,-2.0,1,0,0,FRAFRA,StSq3Lorine SCHILD
209495,14,Lorine SCHILD,FRA,10,117.31,63.42,53.89,0.0,wc2025SEG004OF,ChSq1,...,3.0,FRA,1,Florence VUYLSTEKER,-1.0,1,0,0,FRAFRA,ChSq1Lorine SCHILD
209496,14,Lorine SCHILD,FRA,10,117.31,63.42,53.89,0.0,wc2025SEG004OF,3S,...,2.0,FRA,1,Florence VUYLSTEKER,-1.0,1,0,0,FRAFRA,3SLorine SCHILD


In [57]:
quality = ols('judge_goe ~ C(uid, Treatment) + C(is_same, Treatment)', data=stats_trim).fit()
quality.summary()

0,1,2,3
Dep. Variable:,judge_goe,R-squared:,0.425
Model:,OLS,Adj. R-squared:,0.374
Method:,Least Squares,F-statistic:,8.412
Date:,"pt., 25 kwi 2025",Prob (F-statistic):,0.0
Time:,17:16:51,Log-Likelihood:,-29047.0
No. Observations:,33120,AIC:,63440.0
Df Residuals:,30448,BIC:,85900.0
Df Model:,2671,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,2.0000,0.607,3.297,0.001,0.811,3.189
"C(uid, Treatment)[T.2A+1Eu+2SRika KIHIRA]",-4.849e-12,0.649,-7.48e-12,1.000,-1.271,1.271
"C(uid, Treatment)[T.2A+1Eu+3FYuna AOKI]",-4.491e-12,0.858,-5.23e-12,1.000,-1.682,1.682
"C(uid, Treatment)[T.2A+1Eu+3SElizaveta TUKTAMYSHEVA]",0.4743,0.655,0.724,0.469,-0.810,1.759
"C(uid, Treatment)[T.2A+1Eu+3SHongyi CHEN]",-0.0386,0.743,-0.052,0.959,-1.495,1.418
"C(uid, Treatment)[T.2A+1Eu+3SKaori SAKAMOTO]",1.9038,0.631,3.015,0.003,0.666,3.141
"C(uid, Treatment)[T.2A+1Eu+3SKoshiro SHIMADA]",-4.464e-12,0.858,-5.2e-12,1.000,-1.682,1.682
"C(uid, Treatment)[T.2A+1Eu+3SLiam KAPEIKIS]",-0.0257,0.700,-0.037,0.971,-1.399,1.347
"C(uid, Treatment)[T.2A+1Eu+3SRika KIHIRA]",0.9279,0.625,1.484,0.138,-0.298,2.153

0,1,2,3
Omnibus:,679.86,Durbin-Watson:,1.615
Prob(Omnibus):,0.0,Jarque-Bera (JB):,741.279
Skew:,0.334,Prob(JB):,1.0800000000000001e-161
Kurtosis:,3.304,Cond. No.,9660.0


In [58]:
quality = ols('judge_goe ~ C(uid, Treatment) + C(judge_name, Treatment)', data=stats_trim).fit()
quality.summary()

0,1,2,3
Dep. Variable:,judge_goe,R-squared:,0.459
Model:,OLS,Adj. R-squared:,0.408
Method:,Least Squares,F-statistic:,9.059
Date:,"pt., 25 kwi 2025",Prob (F-statistic):,0.0
Time:,17:17:27,Log-Likelihood:,-28024.0
No. Observations:,33120,AIC:,61720.0
Df Residuals:,30282,BIC:,85590.0
Df Model:,2837,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,1.7183,0.593,2.897,0.004,0.556,2.881
"C(uid, Treatment)[T.2A+1Eu+2SRika KIHIRA]",0.2053,0.633,0.324,0.746,-1.035,1.446
"C(uid, Treatment)[T.2A+1Eu+3FYuna AOKI]",0.2959,0.836,0.354,0.723,-1.343,1.935
"C(uid, Treatment)[T.2A+1Eu+3SElizaveta TUKTAMYSHEVA]",0.6177,0.640,0.966,0.334,-0.636,1.871
"C(uid, Treatment)[T.2A+1Eu+3SHongyi CHEN]",0.3304,0.725,0.456,0.648,-1.090,1.751
"C(uid, Treatment)[T.2A+1Eu+3SKaori SAKAMOTO]",2.1469,0.616,3.483,0.000,0.939,3.355
"C(uid, Treatment)[T.2A+1Eu+3SKoshiro SHIMADA]",-0.3512,0.842,-0.417,0.677,-2.001,1.299
"C(uid, Treatment)[T.2A+1Eu+3SLiam KAPEIKIS]",0.2453,0.684,0.359,0.720,-1.095,1.586
"C(uid, Treatment)[T.2A+1Eu+3SRika KIHIRA]",1.2124,0.610,1.986,0.047,0.016,2.409

0,1,2,3
Omnibus:,489.013,Durbin-Watson:,1.706
Prob(Omnibus):,0.0,Jarque-Bera (JB):,530.043
Skew:,0.274,Prob(JB):,7.990000000000001e-116
Kurtosis:,3.29,Cond. No.,9500.0


In [59]:
model = smf.ols("judge_goe ~ is_same + prestige + s_progression + panel_median", data=stats_df).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:              judge_goe   R-squared:                       0.892
Model:                            OLS   Adj. R-squared:                  0.892
Method:                 Least Squares   F-statistic:                 4.212e+05
Date:                pt., 25 kwi 2025   Prob (F-statistic):               0.00
Time:                        17:17:29   Log-Likelihood:            -2.3252e+05
No. Observations:              204687   AIC:                         4.650e+05
Df Residuals:                  204682   BIC:                         4.651e+05
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
Intercept        -0.0198      0.006     -3.363

In [None]:
model = mixedlm("judge_goe ~ 1", data=stats_trim, groups=stats_trim["is_same", ""])
result = model.fit()
print(result.summary())

KeyError: ('is_same', 'prestige')