In [1]:
from statsbombpy import sb
import pandas as pd
from tqdm import tqdm
from math import radians, cos, sin, asin, sqrt

In [15]:

stadiums = pd.read_csv("data/stadiums-with-GPS-coordinates.csv")
matches = pd.read_csv("data/premier-league-matches.csv")

# Renaming Columns so it is the same as in the stadiums dataset
matches.rename(columns={"Home":"Team"}, inplace = True)
# Trimming whitespaces in the stadium dataset
stadiums = stadiums.applymap(lambda x: x[:-1] if type(x) == str and x[-1] == " " else x)

# para ser aplicado no stadiums
team_name_map = {
    'Manchester United':'Manchester Utd',
    'Brighton & Hove Albion':'Brighton',
    'Newcastle United': 'Newcastle Utd',
    'West Bromwich Albion' : 'West Brom',
    'Tottenham Hotspur' : 'Tottenham',
    'West Ham United' : 'West Ham',
    'Wolverhampton Wanderers' : 'Wolves'
}

stadiums.Team =  stadiums.Team.replace(team_name_map)

# Contains coordinates and date for each game
display(matches.head())
display(stadiums.head())

matches = matches.merge(stadiums, left_on="Team", right_on='Team')
print(matches.Team.nuniqube())



Unnamed: 0,Season_End_Year,Wk,Date,Team,HomeGoals,AwayGoals,Away,FTR
0,1993,1,1992-08-15,Coventry City,2,1,Middlesbrough,H
1,1993,1,1992-08-15,Leeds United,2,1,Wimbledon,H
2,1993,1,1992-08-15,Sheffield Utd,2,1,Manchester Utd,H
3,1993,1,1992-08-15,Crystal Palace,3,3,Blackburn,D
4,1993,1,1992-08-15,Arsenal,2,4,Norwich City,A


Unnamed: 0,Team,FDCOUK,City,Stadium,Capacity,Latitude,Longitude,Country
0,Arsenal,Arsenal,London,Emirates Stadium,60361,51.555,-0.108611,England
1,Aston Villa,Aston Villa,Birmingham,Villa Park,42785,52.509167,-1.884722,England
2,Blackburn Rovers,Blackburn,Blackburn,Ewood Park,31154,53.728611,-2.489167,England
3,Bolton Wanderers,Bolton,Bolton,Reebok Stadium,28100,53.580556,-2.535556,England
4,Chelsea,Chelsea,London,Stamford Bridge,42449,51.481667,-0.191111,England


AttributeError: 'Series' object has no attribute 'nuniqube'

In [16]:
def haversine(lon1, lat1, lon2, lat2):
    """
    Calculate the great circle distance in kilometers between two points 
    on the earth (specified in decimal degrees)
    """
    # convert decimal degrees to radians 
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])

    # haversine formula 
    dlon = lon2 - lon1 
    dlat = lat2 - lat1 
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * asin(sqrt(a)) 
    r = 6371 # Radius of earth in kilometers. Use 3956 for miles. Determines return value units.
    return c * r

In [26]:
df = matches.sort_values(by="Date")
times_distancia_viajada = {}
times_ultima_coordenada = {}

for ind in df.index:
    time_casa = df['Team'][ind]
    time_fora = df['Away'][ind]
    ano = df['Season_End_Year'][ind]
    if time_casa not in times_distancia_viajada:
        times_distancia_viajada[time_casa] = list()
    if time_casa not in times_ultima_coordenada:
        lat = df['Latitude'][ind]
        lng = df['Longitude'][ind]
        times_ultima_coordenada[time_casa] = [lat, lng, ano]
    if time_fora not in times_distancia_viajada:
        times_distancia_viajada[time_fora] = list()
    if time_fora not in times_ultima_coordenada:
        lat = df['Latitude'][ind]
        lng = df['Longitude'][ind]
        times_ultima_coordenada[time_fora] = [lat, lng, ano]

col1 = list()
col2 = list()
index = list()
for ind in df.index:
    time_casa = df['Team'][ind]
    time_fora = df['Away'][ind]
    
    if df['Season_End_Year'][ind] != times_ultima_coordenada[time_casa][2]:
        times_ultima_coordenada[time_casa][0] = df['Latitude'][ind]
        times_ultima_coordenada[time_casa][0] = df['Longitude'][ind]
    if df['Season_End_Year'][ind] != times_ultima_coordenada[time_fora][2]:
        times_ultima_coordenada[time_fora][0] = df['Latitude'][ind]
        times_ultima_coordenada[time_fora][0] = df['Longitude'][ind]
        
    lat1 = df['Latitude'][ind]
    lng1 = df['Longitude'][ind]
    home_travel_distance = haversine(lng1,lat1,times_ultima_coordenada[time_casa][1],times_ultima_coordenada[time_casa][0])
    away_travel_distance = haversine(lng1,lat1,times_ultima_coordenada[time_fora][1],times_ultima_coordenada[time_fora][0])
    col1.append(home_travel_distance)
    col2.append(away_travel_distance)
    index.append(ind)
    ano = df['Season_End_Year'][ind]
    times_ultima_coordenada[time_casa] = [lat1, lng1, ano]
    times_ultima_coordenada[time_fora] = [lat1, lng1, ano]
    
d = {'home_travel_distance': pd.Series(col1, index=index), 'away_travel_distance': pd.Series(col2, index=index)}

In [27]:
df_distancias = pd.DataFrame(data=d).sort_index()

In [29]:
dataframe_final = pd.concat([matches, df_distancias], axis=1)

In [30]:
dataframe_final

Unnamed: 0,Season_End_Year,Wk,Date,Team,HomeGoals,AwayGoals,Away,FTR,FDCOUK,City,Stadium,Capacity,Latitude,Longitude,Country,home_travel_distance,away_travel_distance
0,1993,1,1992-08-15,Coventry City,2,1,Middlesbrough,H,Coventry,Coventry,Ricoh Arena,32609,52.448056,-1.495556,England,0.000000,0.000000
1,1993,4,1992-08-26,Coventry City,0,1,QPR,A,Coventry,Coventry,Ricoh Arena,32609,52.448056,-1.495556,England,135.610239,124.380354
2,1993,5,1992-08-29,Coventry City,0,2,Blackburn,A,Coventry,Coventry,Ricoh Arena,32609,52.448056,-1.495556,England,0.000000,139.759980
3,1993,8,1992-09-14,Coventry City,1,0,Tottenham,H,Coventry,Coventry,Ricoh Arena,32609,52.448056,-1.495556,England,0.000000,135.610239
4,1993,10,1992-09-26,Coventry City,1,1,Norwich City,D,Coventry,Coventry,Ricoh Arena,32609,52.448056,-1.495556,England,0.000000,190.641456
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10311,2023,34,2023-04-29,Brighton,6,0,Wolves,H,Brighton,Brighton,American Express Community Stadium,22374,50.861822,-0.083278,England,69.332394,238.342911
10312,2023,28,2023-05-04,Brighton,1,0,Manchester Utd,H,Brighton,Brighton,American Express Community Stadium,22374,50.861822,-0.083278,England,0.000000,326.070330
10313,2023,35,2023-05-08,Brighton,1,5,Everton,A,Brighton,Brighton,American Express Community Stadium,22374,50.861822,-0.083278,England,0.000000,208.677255
10314,2023,37,2023-05-21,Brighton,3,1,Southampton,H,Brighton,Brighton,American Express Community Stadium,22374,50.861822,-0.083278,England,468.879430,91.876714


In [31]:
dataframe_final.to_pickle('bkp_travel_distances.pkl')