In [2]:
import numpy as np
import pandas as pd

%matplotlib inline

In [3]:
def sigmoid(x):
    """Logistic function 1 / (1 + e^-x), applied element-wise.

    :param x: scalar or numpy array
    :return: the logistic of ``x``; arrays keep their shape
    """
    decay = np.exp(-x)
    return 1 / (1 + decay)


def nn_prob(shot_data,
            theta1_path='./Neural Net/xG_theta1.npy',
            theta2_path='./Neural Net/xG_theta2.npy'):
    '''
    runs the saved xG neural network forward over a table of shots

    :param shot_data: DataFrame with 'x', 'y', 'Header', 'Distance' and
        'Angle' columns (fed to the network in that order)
    :param theta1_path: .npy file with the first-layer weights; its first
        column multiplies the bias term that is prepended to the features
    :param theta2_path: .npy file with the second-layer weights
    :return: numpy array with one row per shot holding the network output
    '''
    theta1 = np.load(theta1_path)
    theta2 = np.load(theta2_path)

    feature_columns = ['x', 'y', 'Header', 'Distance', 'Angle']
    # .values replaces DataFrame.as_matrix(), which was removed in pandas 1.0.
    # The float cast stops mixed column dtypes (e.g. a boolean 'Header')
    # from producing an object array that np.exp cannot handle.
    X_cv = shot_data[feature_columns].values.astype(float)

    # prepend a column of ones as the bias input
    X_cv = np.c_[np.ones(X_cv.shape[0]), X_cv]

    # forward pass, one column per shot
    a_1 = X_cv.T
    a_2 = sigmoid(np.dot(theta1, a_1))
    # the hidden layer has no bias unit yet (WILL ADD A BIAS UNIT SOON)
    a_3 = sigmoid(np.dot(theta2, a_2))

    # transpose back so the result lines up with the rows of shot_data
    return a_3.T

def tidy_and_format_data(shot_data, pitch_length=480, pitch_width=366):
    '''
    takes shot data, adds distance, angle, colour and a probability

    The frame is modified in place and also returned, so running this twice
    on the same frame re-centres and flips 'y' a second time.

    :param shot_data: DataFrame with 'x', 'y' and 'Scored' columns, plus the
        'Header' column that nn_prob reads
    :param pitch_length: extent of the x axis; shots with x beyond half of it
        are mirrored back (default 480, as before)
    :param pitch_width: extent of the y axis; y is re-centred on half of it
        (default 366, as before)
    :return: shot_data with 'Angle', 'Distance', 'Colour', 'ScoredBinary' and
        'Proba_exp' columns added
    '''
    # mirror shots from the far half of the pitch into the near half
    # (vectorised; DataFrame.set_value was removed in pandas 1.0)
    far_half = shot_data['x'] > pitch_length / 2.0
    shot_data['x'] = np.where(far_half,
                              pitch_length - shot_data['x'],
                              shot_data['x'])

    # re-centre y on the middle of the pitch and flip its sign
    shot_data['y'] = (shot_data['y'] - pitch_width / 2.0) * -1

    # calculate the angle and the distance from the goal
    shot_data['Angle'] = np.arctan(np.absolute(shot_data['y']) / shot_data['x'])
    shot_data['Distance'] = np.sqrt(shot_data['y'] ** 2 + shot_data['x'] ** 2)

    # assign colours and numbers based on whether the shots were scored or missed
    scored = shot_data['Scored'] == 'Scored'
    shot_data['Colour'] = np.where(scored, 'magenta', 'white')
    shot_data['ScoredBinary'] = scored.astype(float)

    # nn_prob returns one row per shot; flatten it into a plain column
    shot_data['Proba_exp'] = np.ravel(nn_prob(shot_data))

    return shot_data

In [4]:
# Shot-level data: one row per shot, indexed by the first CSV column.
data = pd.read_csv('./All shots from 17-18/E0/shots.csv', index_col=0)
data['Date'] = pd.to_datetime(data['Date'], format='%d-%m-%Y')

# mappings.csv has no header row; its second column becomes the index and
# column 0 holds the replacement values, so mappings[0] works as a lookup.
# NOTE(review): the replace is applied to every column of `data` - confirm
# that no other column can contain one of the mapping keys.
mappings = pd.read_csv('./All shots from 17-18/E0/mappings.csv', index_col=1, header=None)
data = data.replace(mappings[0])

# Match-level data from football-data.co.uk (dates use two-digit years).
football_data = pd.read_csv('./Football-data.co.uk/E0/17-18.csv')
football_data['Date'] = pd.to_datetime(football_data['Date'], format='%d/%m/%y')

# add xG values to each shot and a colour etc
data = tidy_and_format_data(data)

In [29]:
# Per-player totals: summed xG ('Proba_exp') next to goals scored ('ScoredBinary').
# The columns are selected with a list; indexing a groupby with a bare tuple
# of names was deprecated in pandas 0.25 and removed in 1.0.
new = data.groupby(['PlayerID'], as_index=False)[['Proba_exp', 'ScoredBinary']].sum()

In [30]:
# Preview the first rows of the per-player totals.
new.head()

Unnamed: 0,PlayerID,Proba_exp,ScoredBinary
0,61,1.412531,2.0
1,73,1.134248,2.0
2,77,1.983791,1.0
3,81,0.042842,0.0
4,84,0.329336,0.0


In [31]:
# Rank players by goals scored, highest first, then preview the top rows.
new = new.sort_values('ScoredBinary', ascending=False)
new.head()

Unnamed: 0,PlayerID,Proba_exp,ScoredBinary
90,897,5.572367,7.0
118,1847,2.487162,6.0
79,856,5.176293,6.0
239,10705,3.6212,5.0
49,564,3.836699,5.0


In [35]:
# Lookup frame PlayerID -> PlayerName (the first name seen for each ID wins).
# Built as one chain so every step returns a new frame; calling
# drop_duplicates(inplace=True) on a column slice of `data` is what triggers
# pandas' SettingWithCopyWarning.
player_map = (
    data[['PlayerID', 'PlayerName']]
    .drop_duplicates(subset='PlayerID', keep='first')
    .set_index('PlayerID', drop=True)
)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [36]:
# Nested dict of the form {'PlayerName': {PlayerID: name, ...}}.
player_dict = player_map.to_dict()
player_dict

{'PlayerName': {61: 'Eriksen',
  73: 'Wayne Rooney',
  77: 'Defoe',
  81: 'Ashley Young',
  84: 'Milner',
  86: 'Jordan Hende',
  87: 'Andy Carroll',
  88: 'Welbeck',
  89: 'Phil Jones',
  90: 'Oxlade-Chamb',
  91: 'Martin Kelly',
  111: 'Cabaye',
  113: 'Koscielny',
  115: 'Giroud',
  123: '\xc3\x96zil',
  136: 'Ilkay G\xc3\xbcndog',
  160: 'Holebas',
  206: '206',
  263: 'Shane Long',
  271: 'Stephen Ward',
  272: 'Jonathan Wal',
  275: 'McClean',
  306: 'F\xc3\xa0bregas',
  308: 'David Silva',
  313: 'Juan Mata',
  320: 'Rodr\xc3\xadguez Le',
  342: 'Martin Olsso',
  374: 'Kieran Gibbs',
  377: 'Ramsey',
  380: 'Coquelin',
  382: 'Kyle Bartley',
  411: 'Ciaran Clark',
  413: 'Albrighton',
  414: 'Delph',
  429: 'Lowton',
  437: 'Bertrand',
  441: 'Sturridge',
  446: 'Chalobah',
  448: 'Luiz Moreira',
  449: 'Romeu Vidal',
  454: 'Eden Hazard',
  466: 'Fellaini',
  471: 'Shane Duffy',
  493: 'Mousa Demb\xc3\xa9l',
  523: 'Shelvey',
  534: 'Raheem Sterl',
  540: 'Joe Allen',
  543: 'K

In [37]:
# Swap the numeric IDs for player names in the PlayerID column only.
# A frame-wide replace would also rewrite any xG or goal total that happens
# to equal a player ID.
new['PlayerID'] = new['PlayerID'].replace(player_dict['PlayerName'])

In [26]:
# Make PlayerID the row index. Run once only: a second run raises KeyError
# because PlayerID is then no longer a column.
new = new.set_index('PlayerID', drop=True)

In [38]:
# Write the per-player table to players.csv in the current working directory.
new.to_csv('players.csv')

Unnamed: 0,PlayerID,Proba_exp,ScoredBinary
90,Romelu Lukak,5.572367,7.0
118,Morata,2.487162,6.0
79,Harry Kane,5.176293,6.0
239,Vardy,3.6212,5.0
49,Agüero,3.836699,5.0
45,Raheem Sterl,2.944009,5.0
99,Alexandre La,3.124712,4.0
276,Gabriel Jesu,2.555697,4.0
201,Mohamed Sala,3.331386,4.0
147,Sadio Mané,1.934727,3.0


In [6]:
# Average xG per shot for each player, highest average first.
meanxg = (
    data.groupby(['PlayerID'], as_index=False)['Proba_exp']
    .mean()
    .sort_values('Proba_exp', ascending=False)
)

In [7]:
# Display the players ranked by mean xG per shot.
meanxg

Unnamed: 0,PlayerID,Proba_exp
8,89,0.478478
94,922,0.459438
41,466,0.443808
190,7818,0.370478
273,16605,0.365728
127,2178,0.354473
51,574,0.343033
299,22827,0.331167
281,18466,0.305082
151,3655,0.293693
