## Create 'moving', merge fixture difficulty ratings, and test forecast models
* Create moving avgs, std deviations, etc for key variables
* Merge FDR onto the dataframe
* Test LSTM, XGBoost, and other models

Bullet points 1&2

In [14]:
import pandas as pd
import numpy as np

In [15]:
def get_latest_gw ():
    '''
    Get the latest GW info and update 'merged.csv' for prediction use 
    '''
    latest = pd.read_csv (r"https://raw.githubusercontent.com/vaastav/Fantasy-Premier-League/master/data/2022-23/gws/merged_gw.csv")
    fpl_raw = pd.read_csv (r"https://raw.githubusercontent.com/vaastav/Fantasy-Premier-League/master/data/cleaned_merged_seasons.csv")
    fpl_raw.drop (fpl_raw.columns[[0]], axis=1, inplace=True)
    latest ['season_x'] = "2022-23"
    latest = latest.rename(columns={"team": "team_x"})
    latest = latest.drop(['xP'], axis=1)
    fpl_raw = fpl_raw.drop(['opp_team_name'], axis=1)
    merged = pd.concat ([fpl_raw, latest])
    merged.to_csv ("merged.csv", index=False)
    return 

In [16]:
get_latest_gw ()
merged = pd.read_csv ("merged.csv")
pd.set_option('display.max_columns', None)

  fpl_raw = pd.read_csv (r"https://raw.githubusercontent.com/vaastav/Fantasy-Premier-League/master/data/cleaned_merged_seasons.csv")
  merged = pd.read_csv ("merged.csv")


### FIRST STEPS

In [19]:
def fixture_dict (urls):
    '''
    create a dictionary of fixture id -> FDR; used for merging FDR onto the main dataset
    '''
    ret = dict ()
    season = 18
    for url in urls:
        df = pd.read_csv (url)
        df = df [df ['finished_provisional'] == True].reset_index (drop=True)
        for i in range (len(df)):
            id_home = "20"+str(season)+"-"+str(season+1)+"_home"+str(df['id'][i])
            id_away = "20"+str(season)+"-"+str(season+1)+"_away"+str(df['id'][i])
            val_home = df['team_h_difficulty'][i]
            val_away = df['team_a_difficulty'][i]
            if id_home not in ret:
                ret [id_home] = val_home
            if id_away not in ret:
                ret [id_away] = val_away
        season += 1
    return ret

urls = ['https://raw.githubusercontent.com/vaastav/Fantasy-Premier-League/master/data/2018-19/fixtures.csv', 
        'https://raw.githubusercontent.com/vaastav/Fantasy-Premier-League/master/data/2019-20/fixtures.csv',
        'https://raw.githubusercontent.com/vaastav/Fantasy-Premier-League/master/data/2020-21/fixtures.csv',
        'https://raw.githubusercontent.com/vaastav/Fantasy-Premier-League/master/data/2021-22/fixtures.csv',
        'https://raw.githubusercontent.com/vaastav/Fantasy-Premier-League/master/data/2022-23/fixtures.csv']

fixture_diff = fixture_dict (urls)

In [20]:
def clean_element (merged):
    ''' 
    change the element to include season data to become a unique identifier
    '''
    elements = []
    for i in range (len(merged)):
        season = merged['season_x'][i]
        id = merged['element'][i]
        elements.append (season+"_"+str(id))
    merged['element'] = elements
    return merged

merged = clean_element (merged)

In [21]:
trimmed = merged [merged ['season_x'] != '2016-17']
trimmed = trimmed [trimmed ['season_x'] != '2017-18']
trimmed = trimmed [trimmed ['season_x'] != '2022-23']
trimmed = trimmed.reset_index (drop = True)

def merge_difficulty (df, fixture_diff):
    difficulty = []
    for i in range (len(df)):
        id = ""
        if (df['was_home'][i]):
            id = df['season_x'][i]+"_home"+str(df['fixture'][i])
        else:
            id = df['season_x'][i]+"_away"+str(df['fixture'][i])
        difficulty.append (fixture_diff[id])

    df ['FDR'] = difficulty
    return df

trimmed= merge_difficulty (trimmed, fixture_diff)

In [22]:
def process (df):

    # clearning, shifting the target, and one-hot encode the position

    df ['target'] = df.groupby('element')['total_points'].shift(-1) # target is next week's score
    df ['upcoming_diff'] = df.groupby('element')['FDR'].shift(-1)
    df = df.drop (['name', 'team_x', 'kickoff_time', 'opponent_team', 'round', 'fixture', 'bonus', 'bps'], axis=1)
    df ['was_home'] = np.multiply (df ['was_home'], 1)

    # one hot encode positions
    df['GK'] = df['position'].apply(lambda x: 1 if x == 'GK' else 0)
    df['DEF'] = df['position'].apply(lambda x: 1 if x == 'DEF' else 0)
    df['MID'] = df['position'].apply(lambda x: 1 if x == 'MID' else 0)
    df['FWD'] = df['position'].apply(lambda x: 1 if x == 'FWD' else 0)

    #create some lagging features to give a sense of momentum
    def create_lags (varname, df):
        prev = varname+"_prev"
        avg3 = varname+"_avg3"
        std5 = varname+"_std5"
        df [prev] = df.groupby ('element')[varname].shift (1)
        df [avg3] = list (df.groupby ('element')[varname].rolling (3).mean ())
        df [std5] = list (df.groupby ('element')[varname].rolling (5).std ())
        return df 
    
    df = create_lags ("minutes", df)
    df = create_lags ("ict_index", df)
    df = create_lags ("total_points", df)
    df = create_lags ("influence", df)
    df = create_lags ("transfers_in", df)
    df = create_lags ("FDR", df)

    df = df.drop (['position'], axis=1)

    return df

In [23]:
trimmed = process (trimmed)

In [24]:
trimmed = trimmed.dropna ().reset_index (drop=True)
trimmed.head ()

Unnamed: 0,season_x,assists,clean_sheets,creativity,element,goals_conceded,goals_scored,ict_index,influence,minutes,own_goals,penalties_missed,penalties_saved,red_cards,saves,selected,team_a_score,team_h_score,threat,total_points,transfers_balance,transfers_in,transfers_out,value,was_home,yellow_cards,GW,FDR,target,upcoming_diff,GK,DEF,MID,FWD,minutes_prev,minutes_avg3,minutes_std5,ict_index_prev,ict_index_avg3,ict_index_std5,total_points_prev,total_points_avg3,total_points_std5,influence_prev,influence_avg3,influence_std5,transfers_in_prev,transfers_in_avg3,transfers_in_std5,FDR_prev,FDR_avg3,FDR_std5
0,2018-19,0,0,0.0,2018-19_402,0,0,0.0,0.0,0,0,0,0,0,0,91333,2.0,1.0,0.0,0,-14496,933,15429,55,1,0,2,2,0.0,4.0,0,1,0,0,0.0,60.0,40.249224,0.0,1.2,1.239758,0.0,1.666667,2.774887,0.0,11.733333,8.493056,0.0,2067.666667,2131.460884,5.0,3.666667,1.140175
1,2018-19,0,0,2.3,2018-19_83,3,0,1.4,5.6,90,0,0,0,0,0,21092,3.0,1.0,6.0,2,1983,4229,2246,50,1,0,2,3,2.0,2.0,0,0,1,0,90.0,90.0,40.249224,3.9,1.833333,1.133578,3.0,2.333333,2.792848,10.0,17.266667,8.600698,0.0,1905.333333,2177.915586,2.0,3.666667,1.30384
2,2018-19,0,0,0.0,2018-19_199,0,0,0.0,0.0,0,0,0,0,0,0,174495,1.0,6.0,0.0,0,-24018,1488,25506,55,0,0,2,5,3.0,2.0,0,0,1,0,90.0,90.0,40.249224,3.8,2.266667,1.321741,2.0,3.0,2.701851,20.2,11.533333,8.921883,0.0,3139.666667,1984.986322,4.0,3.0,1.30384
3,2018-19,0,0,4.8,2018-19_14,1,0,2.7,2.4,22,0,0,0,0,0,66721,2.0,3.0,20.0,1,-4393,3565,7958,75,0,0,2,4,2.0,3.0,0,0,1,0,53.0,90.0,0.0,2.9,2.333333,0.91214,1.0,1.0,2.701851,9.4,12.066667,6.244357,0.0,5181.0,2698.796083,4.0,2.666667,1.140175
4,2018-19,0,0,12.3,2018-19_145,2,0,2.9,12.4,74,0,0,0,1,0,1110097,2.0,0.0,4.0,-2,282492,296156,13664,41,1,0,2,4,0.0,3.0,0,1,0,0,90.0,90.0,0.0,6.0,2.3,0.914877,12.0,0.666667,2.345208,46.0,11.466667,4.437116,0.0,9943.666667,6766.483392,2.0,2.666667,0.83666


In [25]:
len (trimmed)

66209

In [26]:
trimmed.columns

Index(['season_x', 'assists', 'clean_sheets', 'creativity', 'element',
       'goals_conceded', 'goals_scored', 'ict_index', 'influence', 'minutes',
       'own_goals', 'penalties_missed', 'penalties_saved', 'red_cards',
       'saves', 'selected', 'team_a_score', 'team_h_score', 'threat',
       'total_points', 'transfers_balance', 'transfers_in', 'transfers_out',
       'value', 'was_home', 'yellow_cards', 'GW', 'FDR', 'target',
       'upcoming_diff', 'GK', 'DEF', 'MID', 'FWD', 'minutes_prev',
       'minutes_avg3', 'minutes_std5', 'ict_index_prev', 'ict_index_avg3',
       'ict_index_std5', 'total_points_prev', 'total_points_avg3',
       'total_points_std5', 'influence_prev', 'influence_avg3',
       'influence_std5', 'transfers_in_prev', 'transfers_in_avg3',
       'transfers_in_std5', 'FDR_prev', 'FDR_avg3', 'FDR_std5'],
      dtype='object')

---

## Models

In [27]:
# code copied from feature_const.ipynb for constructing the team given predictions

import copy
combos = []

def n_choose_c (n, c, cur, idx, length):
    copy_cur = copy.deepcopy (cur)
    if (length==c):
        combos.append (copy_cur)
    elif (length>c):
        return
    else:
        temp = copy.deepcopy (copy_cur)
        for j in range (idx, n):
            if (temp==[]):
                temp = [j]
            else:
                temp.append (j)
            n_choose_c (n, c, temp, j+1, len(temp)) 
            temp.pop ()

start = []

n_choose_c (10, 3, start, 0, 0)
combos3 = copy.deepcopy (combos)
combos = []
n_choose_c (10, 4, start, 0, 0)
combos4 = copy.deepcopy (combos)

def legal (attackers, midfielders, defenders, att_index, mid_index, def_index, budget):
    names_set = set ()
    team_count = dict ()
    total_val = 0
    point_total = 0
    names = []
    for idx in att_index:
        team = attackers['team_x'][idx]
        name = attackers['element'][idx]
        if name in names_set:
            return False, names
        else:
            names_set.add (name)
        cost = attackers['value'][idx]
        point_total += attackers['pred'][idx]
        names.append (name)
        total_val += cost
        if (team in team_count):
            team_count [team] +=1
        else:
            team_count [team] = 1
    for idx in mid_index:
        team = midfielders['team_x'][idx]
        name = midfielders['element'][idx]
        if name in names_set:
            return False, names
        else:
            names_set.add (name)
        cost = midfielders['value'][idx]
        point_total += midfielders['pred'][idx]
        names.append (name)
        total_val += cost
        if (team in team_count):
            team_count [team] +=1
        else:
            team_count [team] = 1
    for idx in def_index:
        team = defenders['team_x'][idx]
        name = defenders['element'][idx]
        if name in names_set:
            return False, names
        else:
            names_set.add (name)
        cost = defenders['value'][idx]
        point_total += defenders['pred'][idx]
        names.append (name)
        total_val += cost
        if (team in team_count):
            team_count [team] +=1
        else:
            team_count [team] = 1
    if (total_val>budget):
        return False, names
    else:
        max_players = sorted (list(team_count.values()))[-1]
        if (max_players>3): return False, names
    return point_total, names

def portfolio (df, cap=1000):
    max = -1000
    best_group = []
    # with df (names + preds + other info) -> return the best possible team
    forwards = df [df ['position'] == "FWD"].reset_index ()[:10]
    midfileders = df [df ['position'] == "MID"].reset_index ()[:10]
    defenders = df [df ['position'] == "DEF"][:10].reset_index ()[:10]  
    keepers = df [df ['position'] == "GK"][:10].reset_index ()[:10]
    cap -= keepers ['value'][0]
    count = 1
    for att_index in combos3:
        for mid_index in combos4:
            for def_index in combos3: 
                if (count%10000 == 0):
                    print (count, end = " ") 
                count += 1
                valid, names = legal (forwards, midfileders, defenders, att_index, mid_index, def_index, cap)
                if (valid != False):
                    if (valid>max):
                        max = valid
                        best_group = names
                        best_group.insert (0, keepers['element'][0])

    return max, best_group

In [28]:
def actual_pts (squad, df):
    total = 0
    max_exp = -1
    triple_cap = 0
    captain = None
    for player in squad:
        print (player)
        temp = df [df['element'] == player].reset_index ()
        total += temp ['target'][0]
        if (temp ['pred'][0]>max_exp): # triple cap the player with the maximum expected points
            max_exp = temp ['pred'][0]
            triple_cap = temp ['target'][0]*2
            captain = player
    return captain, total+triple_cap

In [29]:
temp = merged [merged ['season_x'] != '2016-17']
temp = temp [temp ['season_x'] != '2017-18']
temp = temp.reset_index (drop = True)

cur_szn = merge_difficulty (temp, fixture_diff)
cur_szn = process (cur_szn)
cur_szn = cur_szn [cur_szn ['season_x'] == "2022-23"].reset_index (drop = True) # feed in to get predictions
eval_target = cur_szn ['target']
eval = cur_szn.drop (columns = ['season_x', 'target', 'element'], axis=1)

1. RNN

2. Light GBM

In [30]:
train_target = trimmed ['target']
trimmed = trimmed.drop (columns = ['season_x', 'target', 'element'], axis=1)
train = trimmed 

In [31]:
from sklearn.model_selection import GridSearchCV
from lightgbm import LGBMRegressor

In [238]:
regressor = LGBMRegressor ()
params = {  'n_estimators': [500, 1000],
            'boosting_type': ['dart'],
            'max_depth': [-1, 20, 50],
            'num_leaves' : [20, 50],
            'learning_rate': [0.1],
            'colsample_bytree': [0.8],
            'subsample': [0.8],
            'reg_alpha': [0, 0.4],
            'metric': ['mse'],
            'random_state': [42]}
grid_search = GridSearchCV (regressor, params, cv=5, verbose=10)
grid_search.fit (train, train_target)
    

Fitting 5 folds for each of 24 candidates, totalling 120 fits
[CV 1/5; 1/24] START boosting_type=dart, colsample_bytree=0.8, learning_rate=0.1, max_depth=-1, metric=mse, n_estimators=500, num_leaves=20, random_state=42, reg_alpha=0, subsample=0.8
[CV 1/5; 1/24] END boosting_type=dart, colsample_bytree=0.8, learning_rate=0.1, max_depth=-1, metric=mse, n_estimators=500, num_leaves=20, random_state=42, reg_alpha=0, subsample=0.8;, score=0.245 total time=  15.0s
[CV 2/5; 1/24] START boosting_type=dart, colsample_bytree=0.8, learning_rate=0.1, max_depth=-1, metric=mse, n_estimators=500, num_leaves=20, random_state=42, reg_alpha=0, subsample=0.8
[CV 2/5; 1/24] END boosting_type=dart, colsample_bytree=0.8, learning_rate=0.1, max_depth=-1, metric=mse, n_estimators=500, num_leaves=20, random_state=42, reg_alpha=0, subsample=0.8;, score=0.256 total time=  17.7s
[CV 3/5; 1/24] START boosting_type=dart, colsample_bytree=0.8, learning_rate=0.1, max_depth=-1, metric=mse, n_estimators=500, num_leaves

In [247]:
grid_search.best_params_

{'boosting_type': 'dart',
 'colsample_bytree': 0.8,
 'learning_rate': 0.1,
 'max_depth': -1,
 'metric': 'mse',
 'n_estimators': 500,
 'num_leaves': 20,
 'random_state': 42,
 'reg_alpha': 0,
 'subsample': 0.8}

In [248]:
grid_search.best_score_

0.2790063719152295

In [32]:
best_params = {'boosting_type': 'dart',
                'colsample_bytree': 0.8,
                'learning_rate': 0.1,
                'max_depth': -1,
                'metric': 'mse',
                'n_estimators': 500,
                'num_leaves': 20,
                'random_state': 42,
                'reg_alpha': 0,
                'subsample': 0.8}

best_estimator = LGBMRegressor (**best_params)
best_estimator.fit (train, train_target)

In [34]:
predicted_pts = best_estimator.predict (eval)

In [35]:
merged ['target'] = merged.groupby ('element')['total_points'].shift (-1)
gw5 = merged [merged ['season_x'] == "2022-23"]
gw5 ['pred'] = predicted_pts
gw5 = gw5[gw5 ['GW'] == 5].reset_index (drop=True)
gw5 = gw5.sort_values (by='pred', ascending=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gw5 ['pred'] = predicted_pts


In [36]:
# deduction - pick four cheapest players
temp = gw5.sort_values (by='value')
deduct = temp[:4]['value'].sum ()
cap = 1000 - deduct

In [257]:
max_pred_pts, ids = portfolio (gw5, cap)

10000 20000 30000 40000 50000 60000 70000 80000 90000 100000 110000 120000 130000 140000 150000 160000 170000 180000 190000 200000 210000 220000 230000 240000 250000 260000 270000 280000 290000 300000 310000 320000 330000 340000 350000 360000 370000 380000 390000 400000 410000 420000 430000 440000 450000 460000 470000 480000 490000 500000 510000 520000 530000 540000 550000 560000 570000 580000 590000 600000 610000 620000 630000 640000 650000 660000 670000 680000 690000 700000 710000 720000 730000 740000 750000 760000 770000 780000 790000 800000 810000 820000 830000 840000 850000 860000 870000 880000 890000 900000 910000 920000 930000 940000 950000 960000 970000 980000 990000 1000000 1010000 1020000 1030000 1040000 1050000 1060000 1070000 1080000 1090000 1100000 1110000 1120000 1130000 1140000 1150000 1160000 1170000 1180000 1190000 1200000 1210000 1220000 1230000 1240000 1250000 1260000 1270000 1280000 1290000 1300000 1310000 1320000 1330000 1340000 1350000 1360000 1370000 1380000 1390

In [263]:
ids_names = pd.read_csv ('https://raw.githubusercontent.com/vaastav/Fantasy-Premier-League/master/data/2022-23/player_idlist.csv')
ids_dict = dict ()
for i in range (len(ids_names)):
    ids_dict ["2022-23_"+str(ids_names['id'][i])] = ids_names['first_name'][i] + " " + ids_names['second_name'][i]

In [352]:
for id in ids:
    print (ids_dict[id])

Ederson Santana de Moraes
Harry Kane
Roberto Firmino
Gabriel Fernando de Jesus
Mohamed Salah
Raheem Sterling
Bernardo Veiga de Carvalho e Silva
Pascal Groß
Rúben Gato Alves Dias
Kyle Walker
Ben Mee


In [354]:
def find_captain (ids, df):
    captain = None
    max = -1
    for player in ids:
        player_df = df [df['element']==player].reset_index (drop=True)
        pred = player_df['pred'][0]
        if (pred>max):
            max=pred
            captain=ids_dict [player]
    return captain, max

find_captain (ids, gw5)

('Mohamed Salah', 6.633658796854616)

In [355]:
2+9+1+9+1+2+3+5+2+2+1 # not terrible; low scoring GW

37