In [12]:
# import what we need
import pandas as pd
import numpy as np
import os

#  preprocessing
from sklearn.impute import SimpleImputer

# file name of the database snapshot this notebook reads
latest_db_file = "whole_db_26_09_2023.csv"

# raise pandas' display limits so wide / long tables are shown more fully
pd.options.display.max_rows = 1000
pd.options.display.max_columns = 500

In [13]:
# Columns kept for the modelling dataset, in the order they are written out.
gw_features = [
    # who / when / where
    "full_name", "season", "position", "was_home", "round",
    # points
    "gw_total_points", "total_points_last_season",
    # per-gameweek match statistics
    "assists", "bonus", "bps", "creativity", "clean_sheets", "saves",
    "goals_conceded", "goals_scored", "ict_index", "influence", "minutes",
    "threat",
    # expected-goals metrics
    "expected_assists", "expected_goals", "expected_goals_conceded",
]

In [14]:
# Read the database snapshot, then take a deep copy to work on so that
# `raw_db` stays as loaded.
raw_db = pd.read_csv(
    filepath_or_buffer=os.path.join("data/db_tables", latest_db_file),
)
raw_db_copy = raw_db.copy(deep=True)

In [15]:
# Preview the working copy; pandas elides the middle rows of a frame this long.
raw_db_copy

Unnamed: 0,full_name,element,season,position,player_team_name,round,kickoff_time,opponent_team_name,was_home,team_h_score,team_a_score,gw_total_points,total_points_last_season,expected_assists,expected_goals,expected_goals_conceded,assists,bonus,bps,creativity,clean_sheets,saves,goals_conceded,goals_scored,ict_index,influence,minutes,threat
0,aaron_cresswell,402,1819,DEF,West Ham,1,2018-08-12T12:30:00Z,Liverpool,False,4.0,0.0,0,34.0,,,,0,0,0,0.0,0,0,0,0,0.0,0.0,0,0.0
1,aaron_lennon,83,1819,MID,Burnley,1,2018-08-12T12:30:00Z,Southampton,False,0.0,0.0,3,41.0,,,,0,0,6,12.3,1,0,0,0,3.9,10.0,90,17.0
2,aaron_mooy,199,1819,MID,Huddersfield,1,2018-08-11T14:00:00Z,Chelsea,True,0.0,3.0,2,76.0,,,,0,0,24,18.2,0,0,3,0,3.8,20.2,90,0.0
3,aaron_ramsey,14,1819,MID,Arsenal,1,2018-08-12T15:00:00Z,Man City,True,0.0,2.0,1,91.0,,,,0,0,7,10.8,0,0,1,0,2.9,9.4,53,9.0
4,aaron_wan-bissaka,145,1819,DEF,Crystal Palace,1,2018-08-11T14:00:00Z,Fulham,False,0.0,2.0,12,120.0,,,,1,3,38,14.0,1,0,0,0,6.0,46.0,90,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
120624,oliver_skipp,441,2223,MID,Spurs,38,2023-05-28T15:30:00Z,Leeds,False,1.0,4.0,2,43.0,0.01,0.0,1.5,0,0,16,0.0,0,0,1,0,0.0,0.0,90,0.0
120625,ryan_sessegnon,436,2223,DEF,Spurs,38,2023-05-28T15:30:00Z,Leeds,False,1.0,4.0,0,35.0,0.00,0.0,0.0,0,0,0,0.0,0,0,0,0,0.0,0.0,0,0.0
120626,ashley_young,538,2223,DEF,Aston Villa,38,2023-05-28T15:30:00Z,Brighton,True,2.0,1.0,0,87.0,0.00,0.0,0.0,0,0,0,0.0,0,0,0,0,0.0,0.0,0,0.0
120627,jeremy_sarmiento morante,119,2223,MID,Brighton,38,2023-05-28T15:30:00Z,Aston Villa,False,2.0,1.0,0,16.0,0.00,0.0,0.0,0,0,0,0.0,0,0,0,0,0.0,0.0,0,0.0


## Feature engineering

In [16]:
# some helpful functions 
def player_team_points(was_home, h_score, a_score):
    """Return the league points the player's team earned in one fixture.

    Parameters
    ----------
    was_home : bool
        Truthy when the player's team was the home side.
    h_score, a_score : number
        Goals scored by the home side and by the away side.

    Returns
    -------
    int or float
        3 for a win, 1 for a draw, 0 for a defeat. NaN when either score is
        missing, because no result can be derived.
    """
    # Every comparison against NaN is False, so without this guard a fixture
    # with a missing score would be counted as a home defeat / an away win.
    if pd.isna(h_score) or pd.isna(a_score):
        return float("nan")

    if h_score == a_score:
        return 1

    home_won = h_score > a_score
    # The player's team won exactly when the winning side is the side it played on.
    return 3 if home_won == bool(was_home) else 0
        

def opponent_points(player_team_points):
    """Return the league points earned by the opposing team in the same fixture.

    Parameters
    ----------
    player_team_points : number
        Points earned by the player's team. Inside this body the parameter
        shadows the module-level function of the same name; the name is kept
        so existing keyword callers continue to work.

    Returns
    -------
    int or float
        1 when the input is 1, 0 when it is 3, and 3 for any other known
        value. NaN when the input is missing.
    """
    # A missing result must stay missing rather than fall through to the
    # final branch and be reported as 3 points for the opponent.
    if pd.isna(player_team_points):
        return float("nan")

    if player_team_points == 1:
        return 1
    return 0 if player_team_points == 3 else 3

In [17]:
def impute_xg(df):
    """Fill missing expected-goals values with each column's mean.

    FPL only collected xG data from the 2223 season onwards, so rows from
    previous seasons have no values in these columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``expected_assists``, ``expected_goals`` and
        ``expected_goals_conceded``.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in: the three columns are overwritten
        in place (as float64) and ``df`` is returned for convenience. A column
        with no observed values at all has no mean and is left as NaN.
    """
    features_to_impute = ["expected_assists", "expected_goals", "expected_goals_conceded"]

    # One vectorised pass: compute each column's mean over the observed values
    # and use it to fill that column's gaps.
    observed = df[features_to_impute].astype("float64")
    df[features_to_impute] = observed.fillna(observed.mean())

    return df

In [18]:
# points earned by the player's own team, computed row by row from the score line
raw_db_copy["player_team_points"] = raw_db_copy.apply(
    lambda row: player_team_points(row.was_home, row.team_h_score, row.team_a_score),
    axis=1,
)
# the opponent's points follow directly from the column computed above
raw_db_copy["opponent_team_points"] = raw_db_copy["player_team_points"].map(opponent_points)

# fill the gaps in the expected-goals columns
raw_db_copy = impute_xg(raw_db_copy)

## Save dataframe for modelling

In [19]:
# Keep only the modelling columns and write them out without the index.
# NOTE(review): "player_team_points" and "opponent_team_points" are not listed
# in gw_features, so they are dropped here - confirm that is intended.
raw_db_copy = raw_db_copy.loc[:, gw_features]
raw_db_copy.to_csv(path_or_buf="data/training_datasets/raw_gw_data.csv", index=False)

In [20]:
# Summary statistics for the numeric columns of the saved dataset.
# The 25% / 50% / 75% rows of the three expected_* columns all equal the column
# mean because the rows that had no xG data were mean-imputed earlier.
raw_db_copy.describe()

Unnamed: 0,season,round,gw_total_points,total_points_last_season,assists,bonus,bps,creativity,clean_sheets,saves,goals_conceded,goals_scored,ict_index,influence,minutes,threat,expected_assists,expected_goals,expected_goals_conceded
count,120629.0,120629.0,120629.0,120629.0,120629.0,120629.0,120629.0,120629.0,120629.0,120629.0,120629.0,120629.0,120629.0,120629.0,120629.0,120629.0,120629.0,120629.0,120629.0
mean,2031.65185,21.122839,1.299853,48.562891,0.03815,0.100979,5.699873,4.479074,0.098873,0.092432,0.48228,0.042295,1.629042,6.796117,31.091504,5.024132,0.016583,0.027582,0.300693
std,142.367168,11.63524,2.467575,50.396641,0.207792,0.475229,9.514795,10.388176,0.298493,0.619706,0.959225,0.224621,2.955313,12.519027,40.111422,13.031007,0.031474,0.056686,0.31154
min,1819.0,1.0,-7.0,-3.0,0.0,0.0,-21.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1920.0,11.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.016583,0.027582,0.300693
50%,2021.0,22.0,0.0,36.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.016583,0.027582,0.300693
75%,2122.0,30.0,2.0,80.0,0.0,0.0,9.0,2.1,0.0,0.0,1.0,0.0,2.3,9.8,90.0,2.0,0.016583,0.027582,0.300693
max,2223.0,47.0,26.0,272.0,4.0,3.0,128.0,136.2,1.0,12.0,9.0,4.0,35.8,163.4,90.0,181.0,1.28527,2.37,5.13
