In [51]:
# Import dependencies
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os
import geopy
import requests
import json
import citipy

In [52]:
# Reading initial csv file and data cleaning
path = os.path.join('results.csv')
soccerDF =pd.read_csv(path)
soccerDF['total goals'] = soccerDF['home_score'] + soccerDF['away_score']
soccerDF['date'] = soccerDF['date'].str[:4]
soccerDF['score_dif'] = soccerDF['home_score'] - soccerDF['away_score']

#Adding a result column
result = []
for x in range(len(soccerDF['date'])):
    if soccerDF.iloc[x]['score_dif'] == 0:
        result.append('Draw')
    elif soccerDF.iloc[x]['score_dif'] > 0:
        result.append(soccerDF.iloc[x]['home_team'])
    else:
        result.append(soccerDF.iloc[x]['away_team'])
soccerDF['Result'] = result

In [53]:
soccerDF.head()

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral,total goals,score_dif,Result
0,1872,Scotland,England,0,0,Friendly,Glasgow,Scotland,False,0,0,Draw
1,1873,England,Scotland,4,2,Friendly,London,England,False,6,2,England
2,1874,Scotland,England,2,1,Friendly,Glasgow,Scotland,False,3,1,Scotland
3,1875,England,Scotland,2,2,Friendly,London,England,False,4,0,Draw
4,1876,Scotland,England,3,0,Friendly,Glasgow,Scotland,False,3,3,Scotland


In [54]:
soccerDF.to_csv('soccer_dataframe.csv')

In [72]:
# Getting all countries using rest country api
base_url = 'https://restcountries.eu/rest/v2/name/'

home = soccerDF['home_team']
home = set(home)

away = soccerDF['away_team']
away = set(away)
combined = away | home

nation = []
capital = []
capital_coordinates = []
for country in combined:
    try:
        response = requests.get(base_url + '{}'.format(country) + '?format=json').json()
        capital_coordinates.append(response[0]['latlng'])
        capital.append(response[0]['capital'])
        nation.append(country)
    except KeyError:
        pass

In [73]:
countryDF = pd.DataFrame({
    'Country': nation,
    'Capital': capital,
    'Coordinates': capital_coordinates,
})
countryDF = countryDF.set_index('Country')

# United kingdom countries had to be hard coded because they were all grouped together
countryDF.loc['United States','Capital'] = 'Washington D.C'
countryDF.loc['United States','Coordinates'] = '[38.9072, 77.0369]'
countryDF.loc['Scotland','Coordinates'] = '[55.9533, 3.1883]'
countryDF.loc['Scotland','Capital'] = 'Edinburgh'
countryDF.loc['England','Coordinates'] = '[51.5074, 0.1278]'
countryDF.loc['England','Capital'] = 'London'
countryDF.loc['Wales','Coordinates'] = '[51.4816, 3.1791]'
countryDF.loc['Wales','Capital'] = 'Cardiff'
countryDF.loc['Ireland','Coordinates'] = '[53.3498, 6.2603]'
countryDF.loc['Ireland','Capital'] = 'Dublin'
countryDF.to_csv('countries.csv')

In [284]:
# Getting coordinates of match location
match_city = list(zip(soccerDF['city'], soccerDF['country']))
match_city = list(set(match_city))

gkey = 'Google key goes here'
target_url = ('https://maps.googleapis.com/maps/api/geocode/json?'
    'key={}&').format(gkey)

match_city_coordinates = []
for each in range(len(match_city)):
    try:
        response = requests.get(target_url + 'address={}, {}'.format(match_city[each][0], match_city[each][1])).json()
        match_city_coordinates.append([response['results'][0]['geometry']['location']['lat'], response['results'][0]['geometry']['location']['lng']])
    except:
        match_city_coordinates.append('NaN')

In [292]:
locationsDF = pd.DataFrame({
    'Match Location': match_city,
    'Match coordinates': match_city_coordinates,
})
locationsDF.head()
locationsDF.to_csv('MatchLocationsCoords.csv', index=False)

In [293]:
from geopy import distance
dist = geopy.distance.distance((21.8852562, -102.2915677), (-15.826691, -47.92182039999999)).miles
dist

4505.322545548147

In [294]:
countryDF2 = countryDF.reset_index()
countryDF2.head()
found_cities = list(countryDF2['Country'])

In [295]:
soccerDFCleaned = soccerDF.copy()
is_found = soccerDFCleaned['home_team'].isin(found_cities)  
soccerDFCleaned = soccerDFCleaned[is_found]
soccerDFCleaned = soccerDFCleaned.reset_index()
is_found = soccerDFCleaned['away_team'].isin(found_cities)  
soccerDFCleaned = soccerDFCleaned[is_found]
soccerDFCleaned = soccerDFCleaned.reset_index()
soccerDFCleaned
soccerDFCleaned.to_csv('cleanedSoccerDF.csv')

In [296]:
home_team_coord = []
away_team_coord = []
for x in range(len(soccerDFCleaned['date'])):
    cityH = soccerDFCleaned.loc[x, 'home_team']
    home_team_coord.append(countryDF.loc[cityH, 'Coordinates'])
    cityA = soccerDFCleaned.loc[x, 'away_team']
    away_team_coord.append(countryDF.loc[cityA, 'Coordinates'])
soccerDFCleaned['Home Team Coord'] = home_team_coord
soccerDFCleaned['Away Team Coord'] = away_team_coord

for x in range(len(home_team_coord)):
    home_team_coord[x] = str(home_team_coord[x]).strip('[]').split(', ')
for x in range(len(away_team_coord)):
    away_team_coord[x] = str(away_team_coord[x]).strip('[]').split(', ')    
soccerDFCleaned['Home Team Coord'] = home_team_coord
soccerDFCleaned['Away Team Coord'] = away_team_coord
soccerDFCleaned.to_csv('Updated_cleaned_DF.csv')

In [297]:
# try to add match location coordinates to main DF
locationsDF.set_index('Match Location')
match_coords = []
for x in range(1, len(soccerDFCleaned['date'])):
    city = ('{}'.format(soccerDFCleaned.loc[x,'city'], '{}'.format(soccerDFCleaned.loc[x,'country']))
    match_coords.append(locationsDF.loc[locationsDF.index(city),'Match coordinates'])
match_coords


SyntaxError: invalid syntax (<ipython-input-297-0aa94300819a>, line 6)

In [1]:
match_city_coordinates
countDF = pd.DataFrame(soccerDFCleaned['city'].value_counts())
countDF.sort_values()

NameError: name 'match_city_coordinates' is not defined