In [501]:
import pandas as pd
import requests
import re
from tqdm import tqdm #progress bar library -- not necessary, but helpful since some of this is slower

In [169]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 150)
sns.set()
%matplotlib inline

In [170]:
# I spent a long time cleaning & concatenating data from different seasons until I stumbled upon this:
# https://github.com/solpaul/fpl-prediction/blob/master/data/train_v5.csv
# this df has data from all seasons since 16-17 by gameweek, with the 19-20 season labeled as 1920, for example, in the "season" col
# thanks solpaul!

# the only thing this dataset is missing is player value on fpl, but from previous analyis, player value wasn't
# an effictive predicting feature, so I'm not worried about it for now. Can always merge it later

def get_previous_seasons_data():
    previous_seasons_data_url = '/Users/andrewpeters/GitHub/fpl/data/external/solpaul-train_v5-311220.csv' #date-code for last time I downloaded this df
    previous_seasons_data = pd.read_csv(previous_seasons_data_url, index_col = 0)
    
    #this dataset has some data from the current (20-21) season, which we don't want
    previous_seasons_data = previous_seasons_data[previous_seasons_data.season != 2021]
    return previous_seasons_data

In [328]:
def get_latest_data():
    #first, pull the ids for all players
    url = 'https://fantasy.premierleague.com/api/bootstrap-static/'
    r = requests.get(url)
    json = r.json()
    elements_df = pd.DataFrame(json['elements']) #probably a more efficient way to this than a df
    
    #using the player ids from elements_df, pull in detailed player info
    # I'm going compile two dataframes -- one that shows gw by gw history for each player (this season), and 
    # another that shows games to be played
    history_df = pd.DataFrame()
    fixtures_df = pd.DataFrame()

    for player in tqdm(elements_df.id):
        url = f'https://fantasy.premierleague.com/api/element-summary/{player}/'
        r = requests.get(url)
        json = r.json()
        player_history_df = pd.DataFrame(json['history'])
        player_fixtures_df = pd.DataFrame(json['fixtures'])
        player_fixtures_df.loc[:, 'element'] = player
        history_df = history_df.append(player_history_df)
        fixtures_df = fixtures_df.append(player_fixtures_df)
    
    return history_df, fixtures_df

In [329]:
previous_season_df = get_previous_seasons_data()

In [466]:
current_season_df, unplayed_fixtures_df = get_latest_data()

100%|██████████| 670/670 [01:52<00:00,  5.95it/s]


In [331]:
#dump data to /raw folder for safekeeping if api breaks
current_season_df.to_pickle('/Users/andrewpeters/GitHub/fpl/data/raw/current_season_df.pkl')
unplayed_fixtures_df.to_pickle('/Users/andrewpeters/GitHub/fpl/data/raw/unplayed_fixtures_df.pkl')

In [332]:
def get_player_info():
    url = 'https://fantasy.premierleague.com/api/bootstrap-static/'
    r = requests.get(url)
    json = r.json()

    #pull player info
    elements_df = pd.DataFrame(json['elements'])
    elements_df['player'] = elements_df['first_name'] + '_' + elements_df['second_name']
    elements_df = elements_df.loc[:, ['id', 'element_type', 'team_code', 'player', 'chance_of_playing_this_round']]
    
    #merge in team names
    teams_df = pd.DataFrame(json['teams'])
    elements_df = elements_df.merge(teams_df[['code', 'name']], left_on = 'team_code', right_on='code')
    
    return elements_df

In [446]:
def append_historical_current_df():

    # merge in some of the basic player info from the elements df
    merged_current_season_df = current_season_df.merge(elements_df, left_on = 'element', right_on = 'id')
    merged_current_season_df = merged_current_season_df.rename(columns={'element_type': 'position',
                                     'round': 'gw',
                                      'name': 'team',
                                      'chance_of_playing_this_round': 'play_proba'
                                     })
    merged_current_season_df['season'] = 2021
    
    #add in opp team names
    merged_current_season_df = merged_current_season_df.merge(teams_df[['id', 'name']], left_on = 'opponent_team', right_on = 'id')
    merged_current_season_df['opponent_team'] = merged_current_season_df['name']

    #append the newly merged current season df with the previous season
    df = merged_current_season_df.append(previous_season_df)
    
    #drop columns that only were in one df or the other prior to appending, and that we're not going to use
    df = df.drop(columns=['fixture', 'value', 'team_code', 'code', 'relative_market_value_team',
            'relative_market_value_opponent_team', 'relative_market_value_team_season',
            'relative_market_value_opponent_team_season', 'name'])

    return df

In [334]:
elements_df = get_player_info()

In [507]:
df = append_historical_current_df()

In [537]:
def prep_unplayed_fixtures():
    #only need to keep some columns
    fixtures = unplayed_fixtures_df[['element', 'team_h', 'team_a', 'event_name', 'kickoff_time', 'finished', 'is_home']]
    
    #games far enough in the future don't have a date assigned, not useful
    fixtures = fixtures.dropna()
    
    #helper function for stripping letters from gameweek column
    def strip_letters(string):
        return re.sub('\D', '', string)
    
    # convert 'Gameweek 25' to just '25' (as an int)
    fixtures.loc[:, 'gw'] = fixtures.apply(lambda x: strip_letters(x['event_name']), axis=1)
    
    #bring in player-level info to each unplayed fixture row
    fixtures = fixtures.merge(df[['element', 'player', 'position', 'team']], how='left', on='element')
    
    #the merge causes many duplicates, because each player has many rows in df
    fixtures = fixtures.drop_duplicates(subset=['element', 'kickoff_time'], keep = 'last')
    
    #make a dictionary using the team ids and team names, and then use that dictionary to fill in the 'team_opponent' col
    # with an intelligible name

    team_dict = dict(zip(teams_df.id,teams_df.name))
    fixtures.loc[fixtures.is_home == True, 'opponent_team'] = fixtures.loc[fixtures.is_home == True, 'team_a'].replace(team_dict)
    fixtures.loc[fixtures.is_home == False, 'opponent_team'] = fixtures.loc[fixtures.is_home == False, 'team_h'].replace(team_dict)
    
    #drop columns we no longer need
    fixtures = fixtures.drop(columns=['team_h', 'team_a', 'event_name'])
    
    return fixtures

In [538]:
unplayed_fixtures = prep_unplayed_fixtures()

In [530]:
def remove_unplayed_fixtures(df):
    #add col to df column to indicate these games have been played -- will be helpful for when i merge unplayed games
    df['finished'] = True
    
    #if a team has completed one game of a double gameweek, the api is pulling both games as having been played, which 
    #reset the 'finished' labels to make them correct
    df['kickoff_time'] = pd.to_datetime(df['kickoff_time'])
    df['finished'] = df.kickoff_time < pd.Timestamp.utcnow()
    
    df = df[df['finished'] == True]
    
    return df

In [531]:
df = remove_unplayed_fixtures(df)

In [532]:
def clean_redundant_cols(df):
    df = df.drop(columns = ['id_x', 'id_y'])
    return df

In [533]:
df = clean_redundant_cols(df)

KeyError: "['id_x' 'id_y'] not found in axis"

In [534]:
df = df.reset_index(drop=True) #reset indices -- some duplicates occur when appending
df.to_pickle('/Users/andrewpeters/GitHub/fpl/data/interim/df.pkl')

In [539]:
unplayed_fixtures = unplayed_fixtures.reset_index(drop=True)
unplayed_fixtures.to_pickle('/Users/andrewpeters/GitHub/fpl/data/interim/unplayed.pkl')