In [1]:
import requests
import json
import pandas as pd

In [2]:
# Define the API key and the base URL
api_key = ""

In [3]:
import requests

class APIClient:
    """Football data API client"""

    BASE_URL = 'https://api.football-data-api.com'

    def __init__(self, key):
        self.key = key

    def _make_request(self, endpoint, params=None):
        """General method for making requests"""
        params = params or {}
        params['key'] = self.key
        response = requests.get(f'{self.BASE_URL}/{endpoint}', params=params)

        response.raise_for_status()

        return response.json()['data']

class LeagueDataClient(APIClient):
    """API client for getting League data"""

    def get_league_list(self, **kwargs):
        return self._make_request('league-list', params=kwargs)

    def get_country_list(self):
        return self._make_request('country-list')

    def get_todays_matches(self, **kwargs):
        return self._make_request('todays-matches', params=kwargs)

    def get_league_matches(self, season_id, **kwargs):
        kwargs.update({'season_id': season_id})
        return self._make_request('league-matches', params=kwargs)

    def get_league_season(self, season_id, **kwargs):
        kwargs.update({'season_id': season_id})
        return self._make_request('league-season', params=kwargs)

    def get_league_teams(self, season_id, **kwargs):
        kwargs.update({'season_id': season_id})
        return self._make_request('league-teams', params=kwargs)

    def get_league_players(self, season_id, **kwargs):
        kwargs.update({'season_id': season_id})
        return self._make_request('league-players', params=kwargs)

    def get_league_referees(self, season_id, **kwargs):
        kwargs.update({'season_id': season_id})
        return self._make_request('league-referees', params=kwargs)

    def get_team(self, team_id):
        return self._make_request('team', {'team_id': team_id})

    def get_match_stats(self, match_id):
        return self._make_request('match', {'match_id': match_id})

In [4]:
client = LeagueDataClient(api_key)

In [5]:
league = pd.DataFrame(client.get_league_list()).query("league_name == 'Premier League'")
league

Unnamed: 0,name,country,league_name,image,season
5,England Premier League,England,Premier League,https://cdn.footystats.org/img/competitions/en...,"[{'id': 9, 'year': 20162017}, {'id': 10, 'year..."


In [6]:
def convert_year_format(data):
    """
    Convert the year from integer format to a string separated by a -

    Parameters:
    data: List of dictionaries containing id and year

    Returns:
    Data with the year converted to the new format
    """
    for item in data:
        year_str = str(item['year'])
        formatted_year = f"{year_str[:4]}-{year_str[4:]}"
        item['year'] = formatted_year

    return data

In [7]:
pml_seasons = sorted(league["season"].to_list()[0], key=lambda x: x["year"])
convert_year_format(pml_seasons)

[{'id': 3137, 'year': '2007-2008'},
 {'id': 3131, 'year': '2008-2009'},
 {'id': 3125, 'year': '2009-2010'},
 {'id': 3121, 'year': '2010-2011'},
 {'id': 3119, 'year': '2011-2012'},
 {'id': 246, 'year': '2012-2013'},
 {'id': 12, 'year': '2013-2014'},
 {'id': 11, 'year': '2014-2015'},
 {'id': 10, 'year': '2015-2016'},
 {'id': 9, 'year': '2016-2017'},
 {'id': 161, 'year': '2017-2018'},
 {'id': 1625, 'year': '2018-2019'},
 {'id': 2012, 'year': '2019-2020'},
 {'id': 4759, 'year': '2020-2021'},
 {'id': 6135, 'year': '2021-2022'},
 {'id': 7704, 'year': '2022-2023'},
 {'id': 9660, 'year': '2023-2024'}]

In [8]:
MATCH_FEATURES = ["homeID", "awayID", "date_unix", "winningTeam", "homeGoalCount", "awayGoalCount", "team_a_shotsOnTarget", "team_b_shotsOnTarget", "team_a_shots", "team_b_shots", "team_a_possession", "team_b_possession",
                  "pre_match_home_ppg", "pre_match_away_ppg", "pre_match_teamA_overall_ppg", "pre_match_teamB_overall_ppg", "o15_potential", "o25_potential", "o35_potential", "o45_potential"]

#### TODO: FIilter status completed and sort by date_unix 

In [27]:
raw_matches = pd.DataFrame(client.get_league_matches(4759))
raw_matches.head(3)

Unnamed: 0,id,homeID,awayID,season,status,roundID,game_week,revised_game_week,homeGoals,awayGoals,...,match_url,competition_id,matches_completed_minimum,over05,over15,over25,over35,over45,over55,btts
0,1056406,145,149,2020/2021,complete,58864,1,-1,[],[71],...,/england/burnley-fc-vs-manchester-united-fc-h2...,4759,38,True,False,False,False,False,False,False
1,1056407,143,146,2020/2021,complete,58864,1,-1,[13],[],...,/england/crystal-palace-fc-vs-southampton-fc-h...,4759,38,True,False,False,False,False,False,False
2,1056408,162,59,2020/2021,complete,58864,1,-1,[],"[8, 49, 57]",...,/england/arsenal-fc-vs-fulham-fc-h2h-stats#105...,4759,38,True,True,True,False,False,False,False


In [28]:
sorted(raw_matches["away_name"].unique())

['Arsenal',
 'Aston Villa',
 'Brighton & Hove Albion',
 'Burnley',
 'Chelsea',
 'Crystal Palace',
 'Everton',
 'Fulham',
 'Leeds United',
 'Leicester City',
 'Liverpool',
 'Manchester City',
 'Manchester United',
 'Newcastle United',
 'Sheffield United',
 'Southampton',
 'Tottenham Hotspur',
 'West Bromwich Albion',
 'West Ham United',
 'Wolverhampton Wanderers']

In [26]:
sorted(raw_matches["away_name"].unique())

['Arsenal',
 'Aston Villa',
 'Chelsea',
 'Everton',
 'Fulham',
 'Liverpool',
 'Manchester City',
 'Manchester United',
 'Newcastle United',
 'Norwich City',
 'Queens Park Rangers',
 'Reading',
 'Southampton',
 'Stoke City',
 'Sunderland',
 'Swansea City',
 'Tottenham Hotspur',
 'West Bromwich Albion',
 'West Ham United',
 'Wigan Athletic']

In [12]:
raw_matches.to_csv('/Users/charaka/Desktop/raw_matches.csv', index=False)

In [82]:
rename_cols = {
    'homeID': 'HomeTeam',
    'awayID': 'AwayTeam',
    'homeGoalCount': 'FTHG',
    'awayGoalCount': 'FTAG',
    'team_a_shotsOnTarget': 'HST',
    'team_b_shotsOnTarget': 'AST',
    'team_a_shots': 'HS',
    'team_b_shots': 'AS',
    'team_a_possession': 'HPOSS',
    'team_b_possession': 'APOSS',
}

In [83]:
matches = raw_matches[MATCH_FEATURES].sort_values('date_unix')
matches['Date'] = pd.to_datetime(matches['date_unix'], unit='s')
matches['FTR'] = matches.apply(lambda row: 'H' if row['winningTeam'] == row['homeID'] else ('A' if row['winningTeam'] == row['awayID'] else 'D'), axis=1)
cols_to_drop = ['date_unix', 'winningTeam']
matches = matches.drop(cols_to_drop, axis=1)
matches = matches.rename(columns=rename_cols)

In [84]:
matches.tail(5)

Unnamed: 0,HomeTeam,AwayTeam,FTHG,FTAG,HST,AST,HS,AS,HPOSS,APOSS,pre_match_home_ppg,pre_match_away_ppg,pre_match_teamA_overall_ppg,pre_match_teamB_overall_ppg,o15_potential,o25_potential,o35_potential,o45_potential,Date,FTR
375,154,162,0,3,12,7,20,9,48,52,1.44,0.89,1.24,1.08,73,59,36,14,2013-05-19 15:00:00,A
376,92,156,1,0,21,4,25,6,48,52,1.94,0.89,1.86,1.05,78,64,31,9,2013-05-19 15:00:00,H
377,142,149,5,5,7,7,22,17,32,68,1.67,2.22,1.3,2.38,78,56,28,20,2013-05-19 15:00:00,D
378,153,219,4,2,13,8,22,18,55,45,1.67,0.44,1.16,0.76,73,59,39,20,2013-05-19 15:00:00,H
379,221,158,2,2,8,6,14,8,57,43,0.94,1.11,0.95,1.08,86,73,50,20,2013-05-19 15:00:00,D


In [13]:
MATCH_FEATURES = ["homeID", "awayID", "home_name", "away_name", "date_unix", "winningTeam", "homeGoalCount", "awayGoalCount", "team_a_shotsOnTarget", "team_b_shotsOnTarget", "team_a_shots", "team_b_shots", "team_a_possession", "team_b_possession",
                  "pre_match_home_ppg", "pre_match_away_ppg", "pre_match_teamA_overall_ppg", "pre_match_teamB_overall_ppg", "o15_potential", "o25_potential", "o35_potential", "o45_potential"]

In [14]:
rename_cols = {
    'home_name': 'HomeTeam',
    'away_name': 'AwayTeam',
    'homeGoalCount': 'FTHG',
    'awayGoalCount': 'FTAG',
    'team_a_shotsOnTarget': 'HST',
    'team_b_shotsOnTarget': 'AST',
    'team_a_shots': 'HS',
    'team_b_shots': 'AS',
    'team_a_possession': 'HPOSS',
    'team_b_possession': 'APOSS',
}

cols_to_drop = ['date_unix', 'winningTeam']

In [37]:
import os

# Root folder path
folder_path = "/Users/charaka/Desktop/University/Msc Machine Learning & Data Science/Masters Project/footystats-data"

teams = []
# Loop through the list
for season in pml_seasons[3:]:
    # Create a new folder for the year
    year = season['year']
    id = season['id']

    current_teams = pd.DataFrame(client.get_league_teams(id))
    teams += current_teams['cleanName'].unique().tolist()

In [44]:
len(pd.unique(teams))

40

In [36]:
pd.unique(pd.unique(teams["cleanName"]).tolist() + pd.unique(teams["cleanName"]).tolist())

array(['Arsenal', 'Tottenham Hotspur', 'Manchester City', 'Stoke City',
       'West Bromwich Albion', 'Everton', 'Southampton',
       'Manchester United', 'Liverpool', 'Chelsea', 'West Ham United',
       'Swansea City', 'Sunderland', 'Newcastle United', 'Aston Villa',
       'Norwich City', 'Queens Park Rangers', 'Fulham', 'Reading',
       'Wigan Athletic'], dtype=object)

In [15]:
import os

# Root folder path
folder_path = "/Users/charaka/Desktop/University/Msc Machine Learning & Data Science/Masters Project/footystats-data"

# Loop through the list
for season in pml_seasons[3:]:
    # Create a new folder for the year
    year = season['year']
    id = season['id']

    new_folder_path = os.path.join(folder_path, season['year'])
    os.makedirs(new_folder_path, exist_ok=True)

    raw_matches = pd.DataFrame(client.get_league_matches(id))
    raw_matches = raw_matches[raw_matches['status'] == 'complete']

    matches = raw_matches[MATCH_FEATURES].sort_values('date_unix')
    
    matches['Date'] = pd.to_datetime(matches['date_unix'], unit='s')
    matches['FTR'] = matches.apply(lambda row: 'H' if row['winningTeam'] == row['homeID'] else ('A' if row['winningTeam'] == row['awayID'] else 'D'), axis=1)
    
    matches = matches.drop(cols_to_drop, axis=1)
    matches = matches.rename(columns=rename_cols)

    # Save the DataFrame as a CSV file in the new folder
    matches.to_csv(os.path.join(new_folder_path, "E0.csv"), index=False)
