### SIGN TEST - Check whether two samples have equal size

#### Check if Heung-Min Son shoots with both feet the same number of times.

##### The sign test is a statistical method to test for consistent differences between pairs of observations, such as the weight of subjects before and after treatment. Given pairs of observations (such as weight pre- and post-treatment) for each subject, the sign test determines if one member of the pair (such as pre-treatment) tends to be greater than (or less than) the other member of the pair (such as post-treatment). In this notebook each shot is coded as +1 (left foot) or -1 (right foot), and the test asks whether the two signs occur equally often.

In [1]:
import pandas as pd
import numpy as np
import json
# plotting
import matplotlib.pyplot as plt
#opening data
import os
import pathlib
import warnings

# Turn off pandas' chained-assignment check (the option is set to None).
pd.set_option('mode.chained_assignment', None)
# Install a blanket "ignore" filter so no Python warnings appear in cell output.
warnings.simplefilter('ignore')

In [4]:
# Locations of the event log (England) and the player registry, relative to
# the notebook's working directory.
pl_events_path = '../wyscout_data/events/events_England.json'
players_path = '../wyscout_data/players.json'

# JSON text is UTF-8; pass the encoding explicitly so decoding does not depend
# on the platform's locale default.
with open(pl_events_path, encoding='utf-8') as f:
    data = json.load(f)

# One row per event record.
pl_events_df = pd.DataFrame(data)

with open(players_path, encoding='utf-8') as f:
    data = json.load(f)

# One row per player record.
players_df = pd.DataFrame(data)

In [7]:
# Summarise the events frame: column names, non-null counts, dtypes and memory use.
pl_events_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 643150 entries, 0 to 643149
Data columns (total 12 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   eventId       643150 non-null  int64  
 1   subEventName  643150 non-null  object 
 2   tags          643150 non-null  object 
 3   playerId      643150 non-null  int64  
 4   positions     643150 non-null  object 
 5   matchId       643150 non-null  int64  
 6   eventName     643150 non-null  object 
 7   teamId        643150 non-null  int64  
 8   matchPeriod   643150 non-null  object 
 9   eventSec      643150 non-null  float64
 10  subEventId    643150 non-null  object 
 11  id            643150 non-null  int64  
dtypes: float64(1), int64(5), object(6)
memory usage: 58.9+ MB


In [8]:
# List the distinct sub-event labels; the shot filter further down matches on one of them.
pl_events_df['subEventName'].unique()

array(['Simple pass', 'High pass', 'Head pass', 'Air duel',
       'Ground loose ball duel', 'Smart pass', 'Launch',
       'Ground defending duel', 'Ground attacking duel', 'Foul',
       'Free Kick', 'Cross', 'Shot', 'Reflexes', 'Touch', 'Clearance',
       'Ball out of the field', 'Throw in', 'Goal kick', 'Corner',
       'Goalkeeper leaving line', 'Hand pass', 'Acceleration',
       'Save attempt', '', 'Free kick cross', 'Free kick shot',
       'Hand foul', 'Violent Foul', 'Protest', 'Whistle',
       'Late card foul', 'Out of game foul', 'Penalty', 'Time lost foul',
       'Simulation'], dtype=object)

In [12]:
# Summarise the players frame: column names, non-null counts, dtypes and memory use.
players_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3603 entries, 0 to 3602
Data columns (total 14 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   passportArea           3603 non-null   object
 1   weight                 3603 non-null   int64 
 2   firstName              3603 non-null   object
 3   middleName             3603 non-null   object
 4   lastName               3603 non-null   object
 5   currentTeamId          3512 non-null   object
 6   birthDate              3603 non-null   object
 7   height                 3603 non-null   int64 
 8   role                   3603 non-null   object
 9   birthArea              3603 non-null   object
 10  wyId                   3603 non-null   int64 
 11  foot                   3603 non-null   object
 12  shortName              3603 non-null   object
 13  currentNationalTeamId  3603 non-null   object
dtypes: int64(3), object(11)
memory usage: 394.2+ KB


In [15]:
# Show Son's row in the player registry. Its `wyId` column holds the player id
# that is matched against `playerId` in the event data.
players_df[players_df['lastName'] == 'Son']

Unnamed: 0,passportArea,weight,firstName,middleName,lastName,currentTeamId,birthDate,height,role,birthArea,wyId,foot,shortName,currentNationalTeamId
1061,"{'name': 'Korea Republic', 'id': '410', 'alpha...",77,Heung-Min,,Son,1624,1992-07-08,183,"{'code2': 'FW', 'code3': 'FWD', 'name': 'Forwa...","{'name': 'Korea Republic', 'id': '410', 'alpha...",14911,right,Son Heung-Min,14855


In [16]:
# Keep only the events whose sub-event label is exactly 'Shot'.
is_shot = pl_events_df['subEventName'].eq('Shot')
shots = pl_events_df.loc[is_shot]
shots

Unnamed: 0,eventId,subEventName,tags,playerId,positions,matchId,eventName,teamId,matchPeriod,eventSec,subEventId,id
46,10,Shot,"[{'id': 101}, {'id': 402}, {'id': 201}, {'id':...",25413,"[{'y': 41, 'x': 88}, {'y': 0, 'x': 0}]",2499719,Shot,1609,1H,94.595788,100,177959212
62,10,Shot,"[{'id': 401}, {'id': 201}, {'id': 1211}, {'id'...",26150,"[{'y': 52, 'x': 85}, {'y': 100, 'x': 100}]",2499719,Shot,1631,1H,179.854785,100,177959247
91,10,Shot,"[{'id': 101}, {'id': 403}, {'id': 201}, {'id':...",14763,"[{'y': 52, 'x': 96}, {'y': 100, 'x': 100}]",2499719,Shot,1631,1H,254.745027,100,177959280
128,10,Shot,"[{'id': 401}, {'id': 201}, {'id': 1215}, {'id'...",7868,"[{'y': 33, 'x': 81}, {'y': 0, 'x': 0}]",2499719,Shot,1609,1H,425.824035,100,177959289
249,10,Shot,"[{'id': 402}, {'id': 201}, {'id': 1205}, {'id'...",7868,"[{'y': 30, 'x': 75}, {'y': 0, 'x': 0}]",2499719,Shot,1609,1H,815.462015,100,177959429
...,...,...,...,...,...,...,...,...,...,...,...,...
642945,10,Shot,"[{'id': 401}, {'id': 1212}, {'id': 1802}]",8561,"[{'y': 45, 'x': 72}, {'y': 0, 'x': 0}]",2500098,Shot,1633,2H,1972.969422,100,251596053
643023,10,Shot,"[{'id': 101}, {'id': 402}, {'id': 201}, {'id':...",41174,"[{'y': 33, 'x': 86}, {'y': 0, 'x': 0}]",2500098,Shot,1633,2H,2193.887080,100,251596096
643051,10,Shot,"[{'id': 402}, {'id': 201}, {'id': 1201}, {'id'...",7879,"[{'y': 62, 'x': 88}, {'y': 100, 'x': 100}]",2500098,Shot,1623,2H,2377.197700,100,251596357
643055,10,Shot,"[{'id': 402}, {'id': 201}, {'id': 1206}, {'id'...",145692,"[{'y': 38, 'x': 92}, {'y': 100, 'x': 100}]",2500098,Shot,1623,2H,2381.481625,100,251596359


In [25]:
# Resolve Son's player id (`wyId`) from the player registry.
# The lookup is by last name only, so check that it is unambiguous instead of
# silently taking whichever matching row happens to come first.
son_matches = players_df.loc[players_df['lastName'] == 'Son', 'wyId']
assert len(son_matches) == 1, (
    f"expected exactly one player with lastName 'Son', found {len(son_matches)}"
)
son_id = son_matches.values[0]

In [26]:
# Restrict the shots table to the rows whose playerId equals Son's id.
by_son = shots['playerId'].eq(son_id)
son_shots = shots.loc[by_son]
son_shots

Unnamed: 0,eventId,subEventName,tags,playerId,positions,matchId,eventName,teamId,matchPeriod,eventSec,subEventId,id
44012,10,Shot,"[{'id': 402}, {'id': 1216}, {'id': 1802}]",14911,"[{'y': 25, 'x': 87}, {'y': 0, 'x': 0}]",2499746,Shot,1624,1H,41.025320,100,182699784
76728,10,Shot,"[{'id': 1901}, {'id': 401}, {'id': 201}, {'id'...",14911,"[{'y': 28, 'x': 93}, {'y': 0, 'x': 0}]",2499766,Shot,1624,1H,628.104553,100,187932110
76860,10,Shot,"[{'id': 402}, {'id': 201}, {'id': 1216}, {'id'...",14911,"[{'y': 50, 'x': 79}, {'y': 0, 'x': 0}]",2499766,Shot,1624,1H,1048.499703,100,187932252
77625,10,Shot,"[{'id': 402}, {'id': 201}, {'id': 1201}, {'id'...",14911,"[{'y': 40, 'x': 81}, {'y': 0, 'x': 0}]",2499766,Shot,1624,2H,668.087020,100,187933112
77642,10,Shot,"[{'id': 402}, {'id': 201}, {'id': 1206}, {'id'...",14911,"[{'y': 65, 'x': 96}, {'y': 0, 'x': 0}]",2499766,Shot,1624,2H,731.945802,100,187933131
...,...,...,...,...,...,...,...,...,...,...,...,...
554115,10,Shot,"[{'id': 1901}, {'id': 402}, {'id': 201}, {'id'...",14911,"[{'y': 52, 'x': 89}, {'y': 100, 'x': 100}]",2500046,Shot,1624,1H,1375.758010,100,240758480
573280,10,Shot,"[{'id': 401}, {'id': 2101}, {'id': 1802}]",14911,"[{'y': 74, 'x': 79}, {'y': 0, 'x': 0}]",2500057,Shot,1624,2H,2371.454628,100,242913924
578970,10,Shot,"[{'id': 401}, {'id': 201}, {'id': 1205}, {'id'...",14911,"[{'y': 28, 'x': 91}, {'y': 100, 'x': 100}]",2500061,Shot,1624,1H,1345.409728,100,244031718
579497,10,Shot,"[{'id': 402}, {'id': 201}, {'id': 1202}, {'id'...",14911,"[{'y': 45, 'x': 88}, {'y': 100, 'x': 100}]",2500061,Shot,1624,1H,2935.835299,100,244032276


In [39]:
# Inspect the raw tag lists attached to Son's shots; each tag is a dict of the
# form {'id': <int>}.
son_shots.tags.values
# Tag ids used in the next step to split the shots by foot:
# 401	Left foot
# 402	Right foot

array([list([{'id': 402}, {'id': 1216}, {'id': 1802}]),
       list([{'id': 1901}, {'id': 401}, {'id': 201}, {'id': 1208}, {'id': 1801}]),
       list([{'id': 402}, {'id': 201}, {'id': 1216}, {'id': 1802}]),
       list([{'id': 402}, {'id': 201}, {'id': 1201}, {'id': 1801}]),
       list([{'id': 402}, {'id': 201}, {'id': 1206}, {'id': 1801}]),
       list([{'id': 401}, {'id': 1215}, {'id': 1802}]),
       list([{'id': 402}, {'id': 2101}, {'id': 1802}]),
       list([{'id': 401}, {'id': 2101}, {'id': 1802}]),
       list([{'id': 101}, {'id': 401}, {'id': 201}, {'id': 1205}, {'id': 1801}]),
       list([{'id': 402}, {'id': 201}, {'id': 1221}, {'id': 1802}]),
       list([{'id': 402}, {'id': 201}, {'id': 1201}, {'id': 1801}]),
       list([{'id': 401}, {'id': 2101}, {'id': 1802}]),
       list([{'id': 101}, {'id': 401}, {'id': 201}, {'id': 1204}, {'id': 1801}]),
       list([{'id': 1901}, {'id': 402}, {'id': 201}, {'id': 1212}, {'id': 1802}]),
       list([{'id': 402}, {'id': 1214}, {'id'

In [44]:
# Tag ids that mark which foot a shot was taken with.
LEFT_FOOT_TAG = 401
RIGHT_FOOT_TAG = 402


def shots_with_tag(shots_df, tag_id):
    """Return the rows of ``shots_df`` whose ``tags`` list contains ``{'id': tag_id}``.

    Parameters
    ----------
    shots_df : pandas.DataFrame
        Frame with a ``tags`` column holding lists of ``{'id': <int>}`` dicts.
    tag_id : int
        Tag id to look for.

    Returns
    -------
    pandas.DataFrame
        The matching rows, with their original index labels.
    """
    # Work on the column rather than row-wise over the whole frame, and force a
    # boolean dtype so the mask is also valid when there are no rows at all.
    has_tag = shots_df['tags'].apply(lambda tags: {'id': tag_id} in tags).astype(bool)
    return shots_df.loc[has_tag]


# Shots carrying neither foot tag end up in neither frame.
left_footed_shots = shots_with_tag(son_shots, LEFT_FOOT_TAG)
right_footed_shots = shots_with_tag(son_shots, RIGHT_FOOT_TAG)

In [46]:
# Number of left-footed and right-footed shots, shown as a (left, right) tuple.
len(left_footed_shots), len(right_footed_shots)

(27, 40)

#### Sign test

In [48]:
# Encode every shot as a sign: +1 per left-footed shot, -1 per right-footed shot.
l = [1] * len(left_footed_shots) + [-1] * len(right_footed_shots)
l

[1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1]

In [52]:
pip install statsmodels

Collecting statsmodels
  Downloading statsmodels-0.14.1-cp312-cp312-macosx_11_0_arm64.whl.metadata (9.5 kB)
Collecting patsy>=0.5.4 (from statsmodels)
  Downloading patsy-0.5.6-py2.py3-none-any.whl.metadata (3.5 kB)
Downloading statsmodels-0.14.1-cp312-cp312-macosx_11_0_arm64.whl (10.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.1/10.1 MB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[?25hDownloading patsy-0.5.6-py2.py3-none-any.whl (233 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.9/233.9 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0mm
[?25hInstalling collected packages: patsy, statsmodels
Successfully installed patsy-0.5.6 statsmodels-0.14.1

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m23.3.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49m/opt/homebrew/Cellar/jupyterlab/4.0.10/lib

In [53]:
from statsmodels.stats.descriptivestats import sign_test

# Sign test of the +1/-1 coded shots against mu0=0. The call returns the test
# statistic and the p-value (see the statsmodels docs for the exact definition).
m, pvalue = sign_test(l, mu0=0)

# Report the decision at the 5% significance level.
print(
    'Heung-Min Son is not ambidextrous'
    if pvalue < 0.05
    else 'Heung-Min Son is ambidextrous'
)

Heung-Min Son is ambidextrous
