In [1]:
import glob
import os
from datetime import datetime as dt

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from helpers import build_season_mv


In [2]:
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 50)
pd.set_option('display.width', 100)
pd.set_option('display.max_colwidth', -1)


In [3]:
data_path = '/Users/calumthompson/Documents/Fantasy_football/GitHub_data/data/'
season_folders = glob.glob(data_path + '*')
season_names = [x.replace(data_path, '') for x in season_folders]
this_season = sorted(season_names)[-1]

# Summary

## Features 

- Player
- Position
- Team
- Opposition
- Home/away
- Gameweek
- Season
- Minutes
- Team market value at start of season (relative to all teams)
- Opposition market value at start of season (relative to all teams)

## Target

- Total_points


## More ideas

- Smooth data


# Update github data

In [4]:
! git -C /Users/calumthompson/Documents/Fantasy_football/GitHub_data/data pull

print('last updated: ')
! git -C /Users/calumthompson/Documents/Fantasy_football/GitHub_data/data show --no-patch --no-notes --pretty='%cd'


Already up to date.
last updated: 
Mon Jun 22 17:24:50 2020 -0700


# Get fixtures for each year

## From fixture table

In [5]:
fixtures_link = {'2019-20' : 'https://fixturedownload.com/download/epl-2019-GMTStandardTime.csv'
                ,'2018-19' : 'https://fixturedownload.com/download/epl-2018-GMTStandardTime.csv'
                ,'2017-18' : 'https://fixturedownload.com/download/epl-2017-GMTStandardTime.csv'
                ,'2016-17' : 'https://fixturedownload.com/download/epl-2016-GMTStandardTime.csv'}

In [6]:
# Open table

Date_format = lambda x: pd.datetime.strptime(x, "%d/%m/%Y %H:%M")

data_merge = []

for i in fixtures_link:
    
    link = fixtures_link[i]
    
    import_df = pd.read_csv(link,parse_dates = ['Date'], date_parser = Date_format)
    
    import_df['Season'] = i
    
    data_merge.append(import_df)
    
# Split scores into home and away columns 
fixtures_RAW = pd.concat(data_merge)
fixtures_RAW[['home_score' , 'away_score']] = fixtures_RAW['Result'].str.split(' - ', expand = True)

fixtures_RAW['home_score'] = pd.to_numeric(fixtures_RAW['home_score'])
fixtures_RAW['away_score'] = pd.to_numeric(fixtures_RAW['away_score'])

fixtures_RAW.sort_values('Date').tail()

Unnamed: 0,Round Number,Date,Location,Home Team,Away Team,Result,Season,home_score,away_score
371,38,2020-07-26 15:00:00,Turf Moor,Burnley,Brighton,,2019-20,,
370,38,2020-07-26 15:00:00,Emirates Stadium,Arsenal,Watford,,2019-20,,
378,38,2020-07-26 15:00:00,St. Mary's Stadium,Southampton,Sheffield Utd,,2019-20,,
372,38,2020-07-26 15:00:00,Stamford Bridge,Chelsea,Wolves,,2019-20,,
377,38,2020-07-26 15:00:00,St. James' Park,Newcastle,Liverpool,,2019-20,,


In [7]:
# Transform shape of table to order by team + date
merge = []

# Split into individual seasons
for season in fixtures_RAW['Season'].unique():
    
    season_df = fixtures_RAW.loc[fixtures_RAW['Season'] == season]
    

    for team in season_df['Home Team'].unique():
        
        team_record = season_df.loc[(season_df['Home Team'] == team) | (season_df['Away Team'] == team)]

        col_team = np.full(38, team)
        col_GWs = np.arange(1,39)
        col_date = team_record['Date'].dt.date
        col_Opponent = np.where(team_record['Home Team'] == team, team_record['Away Team'],team_record['Home Team'])
        col_home = np.where((team_record['Home Team'] == team), 1 , 0)
        col_goals_marg = np.where((team_record['Home Team'] == team), team_record['home_score'], team_record['away_score'])
        col_goals_cum  = np.cumsum(col_goals_marg)
        col_opp_goals = np.where((team_record['Home Team'] == team), team_record['away_score'], team_record['home_score'])
        col_points_marg = np.where(col_goals_marg > col_opp_goals, 3, 
                                     np.where(col_goals_marg == col_opp_goals, 1, col_goals_marg))
        col_points_cum = np.cumsum(col_points_marg)

        
        team_record = pd.DataFrame({'FIX_season' : season
                                   ,'FIX_team':col_team
                                   ,'FIX_GW':col_GWs
                                   ,'FIX_Fixture_date':col_date
                                   ,'FIX_Opponent':col_Opponent
                                   ,'FIX_Home?':col_home
                                   ,'FIX_goals_marginal' : col_goals_marg
                                   ,'FIX_goals_cumulative':col_goals_cum
                                   ,'FIX_opp_goals' : col_opp_goals
                                   ,'FIX_points_marginal' : col_points_marg
                                   ,'FIX_points_cumulative' : col_points_cum })

        merge.append(team_record)
    
fixtures_df = pd.concat(merge).reset_index(drop = True)    




## Compute cumulative points and goals

In [8]:
# Copy and merge to get cumulative points and goals for opponents

opponent_points_cumulative = fixtures_df[['FIX_team','FIX_Fixture_date','FIX_goals_cumulative','FIX_points_cumulative']].copy()


opponent_points_cumulative = opponent_points_cumulative.rename(columns = {'FIX_team' : 'FIX_Opponent',
                                                                          'FIX_goals_cumulative' : 'FIX_opponent_goals_cumulative',
                                                                          'FIX_points_cumulative' : 'FIX_opponent_points_cumulative'})
fixtures_df = fixtures_df.merge(opponent_points_cumulative)

In [9]:
# Shift by one week for incoming points total 
fixtures_df['INCOMING_team_points'] = fixtures_df['FIX_points_cumulative'].shift(1)
fixtures_df['INCOMING_opponent_points'] = fixtures_df['FIX_opponent_points_cumulative'].shift(1)

# Shift by one week for incoming goal totals
fixtures_df['INCOMING_team_goals'] = fixtures_df['FIX_goals_cumulative'].shift(1)
fixtures_df['INCOMING_opponent_goals'] = fixtures_df['FIX_opponent_goals_cumulative'].shift(1)

# Calculate ratio between team/opponent goals/points
fixtures_df['FIX_points_diff'] = fixtures_df['INCOMING_team_points']-fixtures_df['INCOMING_opponent_points']
fixtures_df['FIX_goals_diff'] = fixtures_df['INCOMING_team_goals']-fixtures_df['INCOMING_opponent_goals']

In [10]:
fixtures_df.head()

Unnamed: 0,FIX_season,FIX_team,FIX_GW,FIX_Fixture_date,FIX_Opponent,FIX_Home?,FIX_goals_marginal,FIX_goals_cumulative,FIX_opp_goals,FIX_points_marginal,FIX_points_cumulative,FIX_opponent_goals_cumulative,FIX_opponent_points_cumulative,INCOMING_team_points,INCOMING_opponent_points,INCOMING_team_goals,INCOMING_opponent_goals,FIX_points_diff,FIX_goals_diff
0,2019-20,Liverpool,1,2019-08-09,Norwich,1,4.0,4.0,1.0,3.0,3.0,1.0,1.0,,,,,,
1,2019-20,Liverpool,2,2019-08-17,Southampton,0,2.0,6.0,1.0,3.0,6.0,1.0,1.0,3.0,1.0,4.0,1.0,2.0,3.0
2,2019-20,Liverpool,3,2019-08-24,Arsenal,1,3.0,9.0,1.0,3.0,9.0,4.0,7.0,6.0,1.0,6.0,1.0,5.0,5.0
3,2019-20,Liverpool,4,2019-08-31,Burnley,0,3.0,12.0,0.0,3.0,12.0,5.0,5.0,9.0,7.0,9.0,4.0,2.0,5.0
4,2019-20,Liverpool,5,2019-09-14,Newcastle,1,3.0,15.0,1.0,3.0,15.0,4.0,6.0,12.0,5.0,12.0,5.0,7.0,7.0


## Map teams to Vaastav IDs

In [11]:
team_ids = pd.read_csv('/Users/calumthompson/Documents/Fantasy_football/Cals-FF-model/Soloman method/team_ids.csv'
                      ,usecols = ['team','team_1617','team_1718','team_1819','team_1920']  )

team_ids = team_ids.rename(columns = {'team_1617': '2016-17',
                                      'team_1718': '2017-18',
                                      'team_1819': '2018-19',
                                      'team_1920': '2019-20'})

team_ids = team_ids.melt(id_vars = 'team',
                         var_name = 'FIX_season',
                         value_name = 'FIX_team_code')

team_ids['FIX_team_code'] = pd.to_numeric(team_ids['FIX_team_code'])

team_ids = team_ids.rename(columns = {'team' : 'FIX_team'})

team_ids.loc[team_ids['FIX_team'] == 'West Ham United', 'FIX_team'] = 'West Ham'
team_ids.loc[team_ids['FIX_team'] == 'Tottenham Hotspur', 'FIX_team'] = 'Spurs'
team_ids.loc[team_ids['FIX_team'] == 'Leicester City', 'FIX_team'] = 'Leicester'
team_ids.loc[team_ids['FIX_team'] == 'Newcastle United', 'FIX_team'] = 'Newcastle'
team_ids.loc[team_ids['FIX_team'] == 'Manchester City', 'FIX_team'] = 'Man City'
team_ids.loc[team_ids['FIX_team'] == 'Manchester United', 'FIX_team'] = 'Man Utd'
team_ids.loc[team_ids['FIX_team'] == 'Brighton and Hove Albion', 'FIX_team'] = 'Brighton'
team_ids.loc[team_ids['FIX_team'] == 'Sheffield United', 'FIX_team'] = 'Sheffield Utd'
team_ids.loc[team_ids['FIX_team'] == 'Wolverhampton Wanderers', 'FIX_team'] = 'Wolves'

team_ids.loc[team_ids['FIX_team'] == 'Huddersfield Town', 'FIX_team'] = 'Huddersfield'
team_ids.loc[team_ids['FIX_team'] == 'Cardiff City', 'FIX_team'] = 'Cardiff'
team_ids.loc[team_ids['FIX_team'] == 'West Bromwich Albion', 'FIX_team'] = 'West Brom'
team_ids.loc[team_ids['FIX_team'] == 'Swansea City', 'FIX_team'] = 'Swansea'

team_ids.loc[team_ids['FIX_team'] == 'Stoke City', 'FIX_team'] = 'Stoke'
team_ids.loc[team_ids['FIX_team'] == 'Hull City', 'FIX_team'] = 'Hull'

team_ids.sort_values(['FIX_team', 'FIX_season']).head()

Unnamed: 0,FIX_team,FIX_season,FIX_team_code
0,Arsenal,2016-17,1.0
29,Arsenal,2017-18,1.0
58,Arsenal,2018-19,1.0
87,Arsenal,2019-20,1.0
26,Aston Villa,2016-17,


In [12]:
opponent_ids = team_ids.copy()
opponent_ids = opponent_ids.rename(columns = {'FIX_team' : 'FIX_opponent_team',
                                              'FIX_team_code' : 'FIX_opponent_code'}) 

In [13]:
opponent_ids.head()

Unnamed: 0,FIX_opponent_team,FIX_season,FIX_opponent_code
0,Arsenal,2016-17,1.0
1,Bournemouth,2016-17,2.0
2,Burnley,2016-17,3.0
3,Chelsea,2016-17,4.0
4,Crystal Palace,2016-17,5.0


In [14]:
fixtures_df = pd.merge(fixtures_df, team_ids, how = 'left',
         left_on = ['FIX_season','FIX_team'] , right_on = ['FIX_season', 'FIX_team'])

fixtures_df = pd.merge(fixtures_df, opponent_ids, how = 'left',
         left_on = ['FIX_season','FIX_Opponent'] , right_on = ['FIX_season', 'FIX_opponent_team'])

In [15]:
fixtures_df.head()

Unnamed: 0,FIX_season,FIX_team,FIX_GW,FIX_Fixture_date,FIX_Opponent,FIX_Home?,FIX_goals_marginal,FIX_goals_cumulative,FIX_opp_goals,FIX_points_marginal,FIX_points_cumulative,FIX_opponent_goals_cumulative,FIX_opponent_points_cumulative,INCOMING_team_points,INCOMING_opponent_points,INCOMING_team_goals,INCOMING_opponent_goals,FIX_points_diff,FIX_goals_diff,FIX_team_code,FIX_opponent_team,FIX_opponent_code
0,2019-20,Liverpool,1,2019-08-09,Norwich,1,4.0,4.0,1.0,3.0,3.0,1.0,1.0,,,,,,,10.0,Norwich,14.0
1,2019-20,Liverpool,2,2019-08-17,Southampton,0,2.0,6.0,1.0,3.0,6.0,1.0,1.0,3.0,1.0,4.0,1.0,2.0,3.0,10.0,Southampton,16.0
2,2019-20,Liverpool,3,2019-08-24,Arsenal,1,3.0,9.0,1.0,3.0,9.0,4.0,7.0,6.0,1.0,6.0,1.0,5.0,5.0,10.0,Arsenal,1.0
3,2019-20,Liverpool,4,2019-08-31,Burnley,0,3.0,12.0,0.0,3.0,12.0,5.0,5.0,9.0,7.0,9.0,4.0,2.0,5.0,10.0,Burnley,5.0
4,2019-20,Liverpool,5,2019-09-14,Newcastle,1,3.0,15.0,1.0,3.0,15.0,4.0,6.0,12.0,5.0,12.0,5.0,7.0,7.0,10.0,Newcastle,13.0


# Get player list for each year

## From GW data

In [16]:
GW_address = '/gws/*'
player_path =  dict(zip(season_names,[x + GW_address for x in season_folders]))

In [17]:
data_merge = []

for season in player_path:
    
    path = player_path[season]
    
    print("Load: ", path)
    
    for fname in glob.glob(path):
        
        if "merged_gw" not in fname:
            
            import_df = pd.read_csv(fname,encoding = "latin1", parse_dates = ['kickoff_time'])

            if import_df.shape[0] > 0 :
                
#                 print("Load: ", fname)
                import_df['FIX_Fixture_date'] = import_df['kickoff_time'].dt.date 

                import_df['file'] = fname
                import_df['Season'] = season
                import_df['name'] = import_df['name'].str.replace('_\d+', '')

                data_merge.append(import_df)

player_gws = pd.concat(data_merge).sort_values(['name','FIX_Fixture_date'])
player_gws.tail()

Load:  /Users/calumthompson/Documents/Fantasy_football/GitHub_data/data/2017-18/gws/*
Load:  /Users/calumthompson/Documents/Fantasy_football/GitHub_data/data/2016-17/gws/*
Load:  /Users/calumthompson/Documents/Fantasy_football/GitHub_data/data/2019-20/gws/*
Load:  /Users/calumthompson/Documents/Fantasy_football/GitHub_data/data/2018-19/gws/*


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




Unnamed: 0,FIX_Fixture_date,Season,assists,attempted_passes,big_chances_created,big_chances_missed,bonus,bps,clean_sheets,clearances_blocks_interceptions,completed_passes,creativity,dribbles,ea_index,element,errors_leading_to_goal,errors_leading_to_goal_attempt,file,fixture,fouls,goals_conceded,goals_scored,ict_index,id,influence,...,open_play_crosses,opponent_team,own_goals,penalties_conceded,penalties_missed,penalties_saved,recoveries,red_cards,round,saves,selected,tackled,tackles,target_missed,team_a_score,team_h_score,threat,total_points,transfers_balance,transfers_in,transfers_out,value,was_home,winning_goals,yellow_cards
672,2019-04-12,2018-19,0,0.0,0.0,0.0,0,0,0,0.0,0.0,0.0,0.0,0.0,515,0.0,0.0,/Users/calumthompson/Documents/Fantasy_football/GitHub_data/data/2018-19/gws/gw34.csv,335,0.0,0,0,0.0,18973.0,0.0,...,0.0,15,0,0.0,0,0,0.0,0,34,0,2384,0.0,0.0,0.0,1.0,0.0,0.0,0,27,43,16,48,True,0.0,0
857,2019-04-20,2018-19,0,0.0,0.0,0.0,0,0,0,0.0,0.0,0.0,0.0,0.0,515,0.0,0.0,/Users/calumthompson/Documents/Fantasy_football/GitHub_data/data/2018-19/gws/gw35.csv,349,0.0,0,0,0.0,19793.0,0.0,...,0.0,19,0,0.0,0,0,0.0,0,35,0,2383,0.0,0.0,0.0,2.0,2.0,0.0,0,-17,13,30,48,False,0.0,0
614,2019-04-28,2018-19,0,0.0,0.0,0.0,0,0,0,0.0,0.0,0.0,0.0,0.0,515,0.0,0.0,/Users/calumthompson/Documents/Fantasy_football/GitHub_data/data/2018-19/gws/gw36.csv,355,0.0,0,0,0.0,20451.0,0.0,...,0.0,1,0,0.0,0,0,0.0,0,36,0,2393,0.0,0.0,0.0,0.0,3.0,0.0,0,-3,10,13,48,True,0.0,0
615,2019-05-06,2018-19,0,0.0,0.0,0.0,0,0,0,0.0,0.0,0.0,0.0,0.0,515,0.0,0.0,/Users/calumthompson/Documents/Fantasy_football/GitHub_data/data/2018-19/gws/gw37.csv,367,0.0,0,0,0.0,21066.0,0.0,...,0.0,13,0,0.0,0,0,0.0,0,37,0,2391,0.0,0.0,0.0,0.0,1.0,0.0,0,-13,4,17,48,False,0.0,0
623,2019-05-12,2018-19,0,0.0,0.0,0.0,0,0,0,0.0,0.0,0.0,0.0,0.0,515,0.0,0.0,/Users/calumthompson/Documents/Fantasy_football/GitHub_data/data/2018-19/gws/gw38.csv,375,0.0,0,0,0.0,21682.0,0.0,...,0.0,6,0,0.0,0,0,0.0,0,38,0,2405,0.0,0.0,0.0,0.0,0.0,0.0,0,-1,11,12,48,True,0.0,0


## Add player positions
1 = GK 2 = DEF 3 = MID 4 = FWD

In [18]:
player_raw_address = '/players_raw.csv'
position_data =  dict(zip(season_names,[x + player_raw_address for x in season_folders]))

In [19]:
data_merge = []

for season in position_data:
    
    path = position_data[season]
    print(path)

    import_df = pd.read_csv(path,encoding = "ISO-8859-1", usecols = ['id','element_type'])
    import_df['Season'] = season

    data_merge.append(import_df)
    
positions_df = pd.concat(data_merge)

player_df = pd.merge(left = player_gws, right = positions_df , how = 'left',left_on = ['element','Season'], right_on = ['id', 'Season'])

player_df = player_df.rename(columns = {'element_type': 'position',
                                        'opponent_team' : 'FIX_opponent_code'})

player_df.head()

/Users/calumthompson/Documents/Fantasy_football/GitHub_data/data/2017-18/players_raw.csv
/Users/calumthompson/Documents/Fantasy_football/GitHub_data/data/2016-17/players_raw.csv
/Users/calumthompson/Documents/Fantasy_football/GitHub_data/data/2019-20/players_raw.csv
/Users/calumthompson/Documents/Fantasy_football/GitHub_data/data/2018-19/players_raw.csv


Unnamed: 0,FIX_Fixture_date,Season,assists,attempted_passes,big_chances_created,big_chances_missed,bonus,bps,clean_sheets,clearances_blocks_interceptions,completed_passes,creativity,dribbles,ea_index,element,errors_leading_to_goal,errors_leading_to_goal_attempt,file,fixture,fouls,goals_conceded,goals_scored,ict_index,id_x,influence,...,own_goals,penalties_conceded,penalties_missed,penalties_saved,recoveries,red_cards,round,saves,selected,tackled,tackles,target_missed,team_a_score,team_h_score,threat,total_points,transfers_balance,transfers_in,transfers_out,value,was_home,winning_goals,yellow_cards,position,id_y
0,2019-08-31,2019-20,0,,,,0,1,0,,,0.1,,,534,,,/Users/calumthompson/Documents/Fantasy_football/GitHub_data/data/2019-20/gws/gw4.csv,37,,1,0,0.0,,0.2,...,0,,0,0,,0,4,0,0,,,,0.0,4.0,0.0,1,0,0,0,45,False,,0,4,534
1,2019-09-14,2019-20,0,,,,0,1,0,,,0.3,,,534,,,/Users/calumthompson/Documents/Fantasy_football/GitHub_data/data/2019-20/gws/gw5.csv,43,,1,0,2.2,,1.0,...,0,,0,0,,0,5,0,14029,,,,1.0,1.0,21.0,1,10589,13500,2911,45,True,,0,4,534
2,2019-09-21,2019-20,0,,,,0,1,0,,,4.8,,,534,,,/Users/calumthompson/Documents/Fantasy_football/GitHub_data/data/2019-20/gws/gw6.csv,58,,0,0,2.5,,2.0,...,0,,0,0,,0,6,0,22804,,,,0.0,0.0,18.0,1,8090,11749,3659,45,False,,0,4,534
3,2019-09-28,2019-20,0,,,,0,2,0,,,0.6,,,534,,,/Users/calumthompson/Documents/Fantasy_football/GitHub_data/data/2019-20/gws/gw7.csv,63,,1,0,0.1,,0.2,...,0,,0,0,,0,7,0,32699,,,,0.0,2.0,0.0,1,8437,13439,5002,45,False,,0,4,534
4,2019-10-05,2019-20,0,,,,3,53,1,,,23.8,,,534,,,/Users/calumthompson/Documents/Fantasy_football/GitHub_data/data/2019-20/gws/gw8.csv,72,,0,2,20.1,,70.2,...,0,,0,0,,0,8,0,35026,,,,0.0,3.0,107.0,13,2156,5952,3796,45,True,,0,4,534


# Merge

In [20]:
dataset_df = pd.merge(fixtures_df, player_df
                    , on = ['FIX_opponent_code','FIX_Fixture_date'], how = 'left')

dataset_df['FIX_Fixture_date'] = pd.to_datetime(dataset_df['FIX_Fixture_date'])

# Downfill teams for matches that have yet to take place

In [21]:
# Identify which games need downfilling
future_games = dataset_df.loc[(dataset_df['name'].isna() == True) & (dataset_df['FIX_season'] == this_season)]
past_games   = dataset_df.loc[(dataset_df['name'].isna() == False)]

# Mark games that have already taken place
past_games['forecast'] = 0 

# Get most recent fixtures for each team
downfill_teams = past_games.groupby('FIX_team')['FIX_Fixture_date'].max()

# Get most recent lineups
merge = []
for team, fixture in downfill_teams.iteritems():
    merge.append(past_games.loc[(past_games['FIX_team'] == team) & (past_games['FIX_Fixture_date'] == fixture)])
    
# Only want columns not provided by fixtures_df
team_lineups = pd.concat(merge).drop(columns = fixtures_df.columns.drop('FIX_team'))

# Create df with team and gw to fill, matched with most recent team line up
downfill_df = pd.merge(future_games[fixtures_df.columns], team_lineups, on = 'FIX_team', how = 'left')

# Mark future games with 2....
downfill_df['forecast'] = 2

# ... and 1 for next game
downfill_df.loc[downfill_df.groupby(['name'])['forecast'].head(1).index,'forecast'] = 1


# Add to data 
dataset_df = pd.concat([past_games,downfill_df],sort=False)
dataset_df = dataset_df.sort_values(['name','FIX_Fixture_date'])

# Frontfill nulls
dataset_df = dataset_df.groupby(['name','FIX_season']).ffill()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [22]:
dataset_df.head()

Unnamed: 0,name,FIX_season,FIX_team,FIX_GW,FIX_Fixture_date,FIX_Opponent,FIX_Home?,FIX_goals_marginal,FIX_goals_cumulative,FIX_opp_goals,FIX_points_marginal,FIX_points_cumulative,FIX_opponent_goals_cumulative,FIX_opponent_points_cumulative,INCOMING_team_points,INCOMING_opponent_points,INCOMING_team_goals,INCOMING_opponent_goals,FIX_points_diff,FIX_goals_diff,FIX_team_code,FIX_opponent_team,FIX_opponent_code,Season,assists,...,penalties_conceded,penalties_missed,penalties_saved,recoveries,red_cards,round,saves,selected,tackled,tackles,target_missed,team_a_score,team_h_score,threat,total_points,transfers_balance,transfers_in,transfers_out,value,was_home,winning_goals,yellow_cards,position,id_y,forecast
10720,Aaron_Connolly,2019-20,Brighton,4,2019-08-31,Man City,0,0.0,4.0,4.0,0.0,4.0,14.0,10.0,4.0,4.0,4.0,3.0,0.0,1.0,4.0,Man City,11.0,2019-20,0.0,...,,0.0,0.0,,0.0,4.0,0.0,0.0,,,,0.0,4.0,0.0,1.0,0.0,0.0,0.0,45.0,False,,0.0,4.0,534.0,0
10747,Aaron_Connolly,2019-20,Brighton,5,2019-09-14,Burnley,1,1.0,5.0,1.0,1.0,5.0,6.0,6.0,4.0,10.0,4.0,14.0,-6.0,-10.0,4.0,Burnley,5.0,2019-20,0.0,...,,0.0,0.0,,0.0,5.0,0.0,14029.0,,,,1.0,1.0,21.0,1.0,10589.0,13500.0,2911.0,45.0,True,,0.0,4.0,534.0,0
10776,Aaron_Connolly,2019-20,Brighton,6,2019-09-21,Newcastle,0,0.0,5.0,0.0,1.0,6.0,4.0,7.0,5.0,6.0,5.0,6.0,-1.0,-1.0,4.0,Newcastle,13.0,2019-20,0.0,...,,0.0,0.0,,0.0,6.0,0.0,22804.0,,,,0.0,0.0,18.0,1.0,8090.0,11749.0,3659.0,45.0,False,,0.0,4.0,534.0,0
10805,Aaron_Connolly,2019-20,Brighton,7,2019-09-28,Chelsea,0,0.0,5.0,2.0,0.0,6.0,14.0,12.0,6.0,7.0,5.0,4.0,-1.0,1.0,4.0,Chelsea,6.0,2019-20,0.0,...,,0.0,0.0,,0.0,7.0,0.0,32699.0,,,,0.0,2.0,0.0,1.0,8437.0,13439.0,5002.0,45.0,False,,0.0,4.0,534.0,0
10835,Aaron_Connolly,2019-20,Brighton,8,2019-10-05,Spurs,1,3.0,8.0,0.0,3.0,9.0,14.0,12.0,6.0,12.0,5.0,14.0,-6.0,-9.0,4.0,Spurs,17.0,2019-20,0.0,...,,0.0,0.0,,0.0,8.0,0.0,35026.0,,,,0.0,3.0,107.0,13.0,2156.0,5952.0,3796.0,45.0,True,,0.0,4.0,534.0,0


# Calculate final variables

## Player value today

In [23]:
value_df = pd.DataFrame(dataset_df.sort_values(['FIX_Fixture_date']).groupby('name').last()['value']).reset_index()

value_df = value_df.rename(columns = {'value' : 'cost_today'})

dataset_df = pd.merge(dataset_df, value_df, how = 'left', on = 'name')

## Player recent performance


In [24]:
def generate_performance_col (df, field):
    df['LW_' + str(field)] = df.groupby(['FIX_season','name'])[field].shift(periods = 1 )
    df['L4W_' + str(field)] = df.groupby(['FIX_season','name'])['LW_' + str(field)].rolling(4, min_periods = 1).mean().reset_index(level=['FIX_season','name'], drop = True)
    df['TSS_' + str(field)] = df.groupby(['FIX_season','name'])['LW_' + str(field)].rolling(38, min_periods = 1).mean().reset_index(level=['FIX_season','name'], drop = True)
    df['EWMA_' + str(field)] = df.groupby(['FIX_season','name'])['LW_' + str(field)].transform(lambda x: x.ewm(halflife=10).mean())

    
    # Nulls for future games....
    df['L4W_' + str(field)] = np.where(df['forecast'] == 2, np.nan, df['L4W_' + str(field)])
    df['TSS_' + str(field)] = np.where(df['forecast'] == 2, np.nan, df['TSS_' + str(field)])
    df['EWMA_' + str(field)] = np.where(df['forecast'] == 2, np.nan, df['EWMA_' + str(field)])
    
    # then downfill
    df = df.groupby(['name','FIX_season']).ffill()
    
    return df

In [25]:
performance_cols = ['total_points',
                    'goals_scored',
                    'minutes',
                    'bps',
                    'ict_index',
                    'influence',
                    'creativity',
                    'threat',
                    'clean_sheets']

for col in performance_cols:
    print("Generating: ", col)
    dataset_df = generate_performance_col(dataset_df, col)
    
dataset_df.head()

Generating:  total_points
Generating:  goals_scored
Generating:  minutes
Generating:  bps
Generating:  ict_index
Generating:  influence
Generating:  creativity
Generating:  threat
Generating:  clean_sheets


Unnamed: 0,name,FIX_season,FIX_team,FIX_GW,FIX_Fixture_date,FIX_Opponent,FIX_Home?,FIX_goals_marginal,FIX_goals_cumulative,FIX_opp_goals,FIX_points_marginal,FIX_points_cumulative,FIX_opponent_goals_cumulative,FIX_opponent_points_cumulative,INCOMING_team_points,INCOMING_opponent_points,INCOMING_team_goals,INCOMING_opponent_goals,FIX_points_diff,FIX_goals_diff,FIX_team_code,FIX_opponent_team,FIX_opponent_code,Season,assists,...,EWMA_minutes,LW_bps,L4W_bps,TSS_bps,EWMA_bps,LW_ict_index,L4W_ict_index,TSS_ict_index,EWMA_ict_index,LW_influence,L4W_influence,TSS_influence,EWMA_influence,LW_creativity,L4W_creativity,TSS_creativity,EWMA_creativity,LW_threat,L4W_threat,TSS_threat,EWMA_threat,LW_clean_sheets,L4W_clean_sheets,TSS_clean_sheets,EWMA_clean_sheets
0,Aaron_Connolly,2019-20,Brighton,4,2019-08-31,Man City,0,0.0,4.0,4.0,0.0,4.0,14.0,10.0,4.0,4.0,4.0,3.0,0.0,1.0,4.0,Man City,11.0,2019-20,0.0,...,,,,,,,,,,,,,,,,,,,,,,,,,
1,Aaron_Connolly,2019-20,Brighton,5,2019-09-14,Burnley,1,1.0,5.0,1.0,1.0,5.0,6.0,6.0,4.0,10.0,4.0,14.0,-6.0,-10.0,4.0,Burnley,5.0,2019-20,0.0,...,24.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.2,0.2,0.2,0.2,0.1,0.1,0.1,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Aaron_Connolly,2019-20,Brighton,6,2019-09-21,Newcastle,0,0.0,5.0,0.0,1.0,6.0,4.0,7.0,5.0,6.0,5.0,6.0,-1.0,-1.0,4.0,Newcastle,13.0,2019-20,0.0,...,14.688209,1.0,1.0,1.0,1.0,2.2,1.1,1.1,1.138108,1.0,0.6,0.6,0.613857,0.3,0.2,0.2,0.203464,21.0,10.5,10.5,10.863757,0.0,0.0,0.0,0.0
3,Aaron_Connolly,2019-20,Brighton,7,2019-09-28,Chelsea,0,0.0,5.0,2.0,0.0,6.0,14.0,12.0,6.0,7.0,5.0,4.0,-1.0,1.0,4.0,Chelsea,6.0,2019-20,0.0,...,16.582852,1.0,1.0,1.0,1.0,2.5,1.566667,1.566667,1.623876,2.0,1.066667,1.066667,1.108276,4.8,1.733333,1.733333,1.842986,18.0,13.0,13.0,13.409157,0.0,0.0,0.0,0.0
4,Aaron_Connolly,2019-20,Brighton,8,2019-10-05,Spurs,1,3.0,8.0,0.0,3.0,9.0,14.0,12.0,6.0,12.0,5.0,14.0,-6.0,-9.0,4.0,Spurs,17.0,2019-20,0.0,...,18.910709,2.0,1.25,1.25,1.276561,0.1,1.2,1.2,1.202431,0.2,0.85,0.85,0.857082,0.6,1.45,1.45,1.499224,0.0,9.75,9.75,9.700704,0.0,0.0,0.0,0.0


# Export

In [26]:
dataset_df['FIX_season'].unique()

array(['2019-20', '2016-17', '2017-18', '2018-19'], dtype=object)

In [27]:
dataset_df.to_csv('Data/Input_data/' + str(dt.now()) + '.csv', index = False)