In [1]:
import pandas as pd
import os
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
import torch
import numpy as np
import torch.nn.functional as F
import torch.nn as nn

In [2]:
# Record the notebook's current working directory and echo it for reference.
base_path = os.getcwd()
base_path

'/Users/bragehs/Documents/FPL helper/Fantasy-Premier-League/data/predictor'

In [3]:
# Load the merged, cleaned per-gameweek player data.
# NOTE(review): file_path is built from the working directory but is not used
# by the read below, which loads the CSV from a hard-coded absolute path.
file_path = os.path.join(base_path, "data/cleaned_merged_seasons.csv")
df = pd.read_csv("/Users/bragehs/Documents/FPL helper/Fantasy-Premier-League/data/cleaned_merged_seasons.csv")

  df = pd.read_csv("/Users/bragehs/Documents/FPL helper/Fantasy-Premier-League/data/cleaned_merged_seasons.csv")


In [4]:
# Show the column names of the loaded dataset.
df.columns

Index(['season_x', 'name', 'position', 'team_x', 'assists', 'bonus', 'bps',
       'clean_sheets', 'creativity', 'element', 'fixture', 'goals_conceded',
       'goals_scored', 'ict_index', 'influence', 'kickoff_time', 'minutes',
       'opponent_team', 'opp_team_name', 'own_goals', 'penalties_missed',
       'penalties_saved', 'red_cards', 'round', 'saves', 'selected',
       'team_a_score', 'team_h_score', 'threat', 'total_points',
       'transfers_balance', 'transfers_in', 'transfers_out', 'value',
       'was_home', 'yellow_cards', 'GW'],
      dtype='object')

# PREPARE DATA

In [5]:
# List the distinct seasons present in the data.
df['season_x'].unique()

array(['2016-17', '2017-18', '2020-21', '2021-22', '2022-23', '2023-24'],
      dtype=object)

In [6]:
# Print the number of missing values found in every column of df.
for column_name in df.columns:
    missing_count = df[column_name].isna().sum()
    print(f"{column_name}: {missing_count}")

season_x: 0
name: 0
position: 0
team_x: 20034
assists: 0
bonus: 0
bps: 0
clean_sheets: 0
creativity: 0
element: 0
fixture: 0
goals_conceded: 0
goals_scored: 0
ict_index: 0
influence: 0
kickoff_time: 0
minutes: 0
opponent_team: 0
opp_team_name: 126076
own_goals: 0
penalties_missed: 0
penalties_saved: 0
red_cards: 0
round: 0
saves: 0
selected: 0
team_a_score: 0
team_h_score: 0
threat: 0
total_points: 0
transfers_balance: 0
transfers_in: 0
transfers_out: 0
value: 0
was_home: 0
yellow_cards: 0
GW: 0


In [7]:
# Report, season by season, the percentage of rows in df whose 'position' is missing.
for season in sorted(df['season_x'].unique()):
    season_positions = df.loc[df['season_x'] == season, 'position']
    total_rows = len(season_positions)
    missing_rows = season_positions.isna().sum()
    if total_rows > 0:
        percent_nans = (missing_rows / total_rows) * 100
    else:
        percent_nans = 0
    print(f"{season}: {percent_nans:.2f}% NaN values in position column")

2016-17: 0.00% NaN values in position column
2017-18: 0.00% NaN values in position column
2020-21: 0.00% NaN values in position column
2021-22: 0.00% NaN values in position column
2022-23: 0.00% NaN values in position column
2023-24: 0.00% NaN values in position column


In [8]:
# Inspect one season's fixture file to see how the difficulty ratings are stored.
fixture_file = "/Users/bragehs/Documents/FPL helper/Fantasy-Premier-League/data/2023-24/fixtures.csv"

if not os.path.exists(fixture_file):
    print("Fixture file not found")
else:
    fixtures_df = pd.read_csv(fixture_file)
    preview_columns = ['id', 'team_h', 'team_a', 'team_h_difficulty', 'team_a_difficulty']
    home_difficulty = fixtures_df['team_h_difficulty']

    # Structure of the file.
    print("Fixtures file columns:")
    print(fixtures_df.columns.tolist())

    # A few example rows and the observed range of the home-side rating.
    print("\nSample fixture difficulty data:")
    print(fixtures_df[preview_columns].head())
    print(f"\nDifficulty ratings range: {home_difficulty.min()} to {home_difficulty.max()}")

Fixtures file columns:
['code', 'event', 'finished', 'finished_provisional', 'id', 'kickoff_time', 'minutes', 'provisional_start_time', 'started', 'team_a', 'team_a_score', 'team_h', 'team_h_score', 'stats', 'team_h_difficulty', 'team_a_difficulty', 'pulse_id']

Sample fixture difficulty data:
   id  team_h  team_a  team_h_difficulty  team_a_difficulty
0   1       6      13                  5                  2
1   2       1      16                  2                  5
2   3       3      19                  2                  2
3   4       5      12                  2                  3
4   5       9      10                  2                  2

Difficulty ratings range: 2 to 5


In [9]:
def add_fixture_difficulty_to_dataframe(df, data_dir=None):
    """
    Add fixture difficulty ratings to the main dataframe by merging with the
    ``fixtures.csv`` file found in each season's directory.

    Args:
        df: DataFrame with at least the columns 'season_x', 'fixture' and
            'was_home'. It is not modified.
        data_dir: Directory that contains one sub-directory per season
            (named like the values of 'season_x'), each holding a
            ``fixtures.csv``. Defaults to the project's original hard-coded
            data directory.

    Returns:
        A new DataFrame with the columns 'team_h_difficulty',
        'team_a_difficulty' and 'fixture_difficulty' added. Rows whose season
        has no fixture file (or whose fixture id is not found) get NaN in
        these columns. If no fixture file could be loaded at all, ``df`` is
        returned unchanged.
    """
    if data_dir is None:
        data_dir = "/Users/bragehs/Documents/FPL helper/Fantasy-Premier-League/data/"
    base_path = data_dir

    # Get all seasons from the dataframe
    seasons = df['season_x'].unique()
    print(f"Found seasons in data: {sorted(seasons)}")

    # Store all fixture data
    all_fixtures = []

    for season in seasons:
        fixture_file = os.path.join(base_path, season, "fixtures.csv")

        if os.path.exists(fixture_file):
            # Best effort: a season whose file cannot be read is reported and skipped.
            try:
                season_fixtures = pd.read_csv(fixture_file)

                # Add season identifier (fixture ids restart every season)
                season_fixtures['fixture_season'] = season

                # Select relevant columns and rename to avoid conflicts
                season_fixtures = season_fixtures[['id', 'team_h_difficulty', 'team_a_difficulty', 'fixture_season']]
                season_fixtures = season_fixtures.rename(columns={'id': 'fixture_id'})

                all_fixtures.append(season_fixtures)
                print(f"✓ Loaded {len(season_fixtures)} fixtures from {season}")

            except Exception as e:
                print(f"✗ Error loading fixtures from {season}: {e}")
        else:
            print(f"✗ Fixture file not found for {season}: {fixture_file}")

    if not all_fixtures:
        print("No fixture files found!")
        return df

    # Combine all fixture data
    combined_fixtures = pd.concat(all_fixtures, ignore_index=True)
    print(f"\nTotal fixtures loaded: {len(combined_fixtures)}")

    # Merge with main dataframe
    # The 'fixture' column in df corresponds to 'fixture_id' in fixtures
    df_with_difficulty = df.merge(
        combined_fixtures,
        left_on=['fixture', 'season_x'],
        right_on=['fixture_id', 'fixture_season'],
        how='left'
    )

    # Create player-specific difficulty rating:
    # home rows take team_h_difficulty, away rows take team_a_difficulty.
    # Vectorised selection instead of a Python-level row-by-row apply.
    is_home = df_with_difficulty['was_home'].astype(bool)
    df_with_difficulty['fixture_difficulty'] = df_with_difficulty['team_h_difficulty'].where(
        is_home, df_with_difficulty['team_a_difficulty']
    )

    # Drop temporary columns that were created during merge
    df_with_difficulty = df_with_difficulty.drop(
        columns=['fixture_id', 'fixture_season'], errors='ignore'
    )

    print(f"\nMerge results:")
    print(f"Original dataframe: {len(df)} rows")
    print(f"With difficulty: {len(df_with_difficulty)} rows")
    print(f"Missing difficulty values: {df_with_difficulty['fixture_difficulty'].isna().sum()}")

    return df_with_difficulty

# Enrich the raw data with a per-row fixture difficulty rating.
print("Adding fixture difficulty ratings...")
df_enhanced = add_fixture_difficulty_to_dataframe(df)

# How often each difficulty rating occurs, ordered by rating.
difficulty_counts = df_enhanced['fixture_difficulty'].value_counts().sort_index()
print("\nFixture difficulty distribution:")
print(difficulty_counts)

# First rows, restricted to the columns involved in the difficulty lookup.
difficulty_preview_columns = [
    'name', 'GW', 'season_x', 'was_home', 'opp_team_name',
    'team_h_difficulty', 'team_a_difficulty', 'fixture_difficulty',
]
sample_with_difficulty = df_enhanced[difficulty_preview_columns].head(10)
print("\nSample data with difficulty ratings:")
print(sample_with_difficulty)

Adding fixture difficulty ratings...
Found seasons in data: ['2016-17', '2017-18', '2020-21', '2021-22', '2022-23', '2023-24']
✗ Fixture file not found for 2016-17: /Users/bragehs/Documents/FPL helper/Fantasy-Premier-League/data/2016-17/fixtures.csv
✗ Fixture file not found for 2017-18: /Users/bragehs/Documents/FPL helper/Fantasy-Premier-League/data/2017-18/fixtures.csv
✓ Loaded 380 fixtures from 2020-21
✓ Loaded 380 fixtures from 2021-22
✓ Loaded 380 fixtures from 2022-23
✓ Loaded 380 fixtures from 2023-24

Total fixtures loaded: 1520

Merge results:
Original dataframe: 126076 rows
With difficulty: 126076 rows
Missing difficulty values: 20034

Fixture difficulty distribution:
fixture_difficulty
2.0    50664
3.0    26393
4.0    21365
5.0     7620
Name: count, dtype: int64

Sample data with difficulty ratings:
                             name   GW season_x  was_home  opp_team_name  \
0                 Aaron Cresswell  1.0  2016-17     False            NaN   
1                    Aaron 

In [10]:
# Preview the first rows of the frame with the difficulty columns added.
df_enhanced.head()

Unnamed: 0,season_x,name,position,team_x,assists,bonus,bps,clean_sheets,creativity,element,...,transfers_balance,transfers_in,transfers_out,value,was_home,yellow_cards,GW,team_h_difficulty,team_a_difficulty,fixture_difficulty
0,2016-17,Aaron Cresswell,DEF,,0.0,0.0,0.0,0.0,0.0,454.0,...,0.0,0.0,0.0,55.0,False,0.0,1.0,,,
1,2016-17,Aaron Lennon,MID,,0.0,0.0,6.0,0.0,0.3,142.0,...,0.0,0.0,0.0,60.0,True,0.0,1.0,,,
2,2016-17,Aaron Ramsey,MID,,0.0,0.0,5.0,0.0,4.9,16.0,...,0.0,0.0,0.0,80.0,True,0.0,1.0,,,
3,2016-17,Abdoulaye Doucouré,MID,,0.0,0.0,0.0,0.0,0.0,482.0,...,0.0,0.0,0.0,50.0,False,0.0,1.0,,,
4,2016-17,Abdul Rahman Baba,DEF,,0.0,0.0,0.0,0.0,0.0,80.0,...,0.0,0.0,0.0,55.0,True,0.0,1.0,,,


In [11]:
# Row count after the fixture merge.
print(len(df_enhanced))

126076


In [12]:
# Keep every season except 2016-17 and 2017-18, then report how many rows remain.
has_wanted_season = ~df_enhanced['season_x'].isin(['2017-18', '2016-17'])
remove_season = df_enhanced.loc[has_wanted_season].copy()
print(len(remove_season))

# Peek at the difficulty-related columns of the remaining rows.
difficulty_columns = ["name", "opp_team_name", "team_h_difficulty", "team_a_difficulty", "fixture_difficulty"]
testing = remove_season[difficulty_columns].head(20)
testing

106042


Unnamed: 0,name,opp_team_name,team_h_difficulty,team_a_difficulty,fixture_difficulty
20034,Aaron Connolly,,4.0,3.0,4.0
20035,Aaron Cresswell,,2.0,4.0,2.0
20036,Aaron Mooy,,4.0,3.0,4.0
20037,Aaron Ramsdale,,2.0,2.0,2.0
20038,Abdoulaye Doucouré,,4.0,4.0,4.0
20039,Aboubakar Kamara,,3.0,2.0,3.0
20040,Adama Traoré,,2.0,2.0,2.0
20041,Adam Forshaw,,3.0,3.0,3.0
20042,Adam Lallana,,4.0,3.0,4.0
20043,Adam Webster,,4.0,3.0,4.0


In [13]:
# Columns available after filtering the seasons.
remove_season.columns

Index(['season_x', 'name', 'position', 'team_x', 'assists', 'bonus', 'bps',
       'clean_sheets', 'creativity', 'element', 'fixture', 'goals_conceded',
       'goals_scored', 'ict_index', 'influence', 'kickoff_time', 'minutes',
       'opponent_team', 'opp_team_name', 'own_goals', 'penalties_missed',
       'penalties_saved', 'red_cards', 'round', 'saves', 'selected',
       'team_a_score', 'team_h_score', 'threat', 'total_points',
       'transfers_balance', 'transfers_in', 'transfers_out', 'value',
       'was_home', 'yellow_cards', 'GW', 'team_h_difficulty',
       'team_a_difficulty', 'fixture_difficulty'],
      dtype='object')

In [14]:
# Print the number of missing values in every column of the filtered frame.
for column_name in remove_season.columns:
    missing_count = remove_season[column_name].isna().sum()
    print(f"{column_name}: {missing_count}")

season_x: 0
name: 0
position: 0
team_x: 0
assists: 0
bonus: 0


bps: 0
clean_sheets: 0
creativity: 0
element: 0
fixture: 0
goals_conceded: 0
goals_scored: 0
ict_index: 0
influence: 0
kickoff_time: 0
minutes: 0
opponent_team: 0
opp_team_name: 106042
own_goals: 0
penalties_missed: 0
penalties_saved: 0
red_cards: 0
round: 0
saves: 0
selected: 0
team_a_score: 0
team_h_score: 0
threat: 0
total_points: 0
transfers_balance: 0
transfers_in: 0
transfers_out: 0
value: 0
was_home: 0
yellow_cards: 0
GW: 0
team_h_difficulty: 0
team_a_difficulty: 0
fixture_difficulty: 0


In [15]:
# Compare the distinct values of the 'GW' and 'round' columns.
print(remove_season['GW'].unique())
print(remove_season['round'].unique())

[ 1.  2.  3.  4.  5.  6.  7.  8.  9. 10. 11. 12. 13. 14. 15. 16. 17. 18.
 19. 20. 21. 22. 23. 24. 25. 26. 27. 28. 29. 30. 31. 32. 33. 34. 35. 36.
 37. 38.]
[ 1.  2.  3.  4.  5.  6.  7.  8.  9. 10. 11. 12. 13. 14. 15. 16. 17. 18.
 19. 20. 21. 22. 23. 24. 25. 26. 27. 28. 29. 30. 31. 32. 33. 34. 35. 36.
 37. 38.]


In [16]:
# Distinct opponent team ids present in the filtered data.
remove_season['opponent_team'].unique()

array([ 5., 14., 20., 17.,  1., 15., 11., 18., 10.,  9.,  8.,  6., 19.,
        3., 16.,  7.,  2., 12.,  4., 13.])

In [17]:
# Column list of the filtered frame, shown again before choosing feature groups.
remove_season.columns

Index(['season_x', 'name', 'position', 'team_x', 'assists', 'bonus', 'bps',
       'clean_sheets', 'creativity', 'element', 'fixture', 'goals_conceded',
       'goals_scored', 'ict_index', 'influence', 'kickoff_time', 'minutes',
       'opponent_team', 'opp_team_name', 'own_goals', 'penalties_missed',
       'penalties_saved', 'red_cards', 'round', 'saves', 'selected',
       'team_a_score', 'team_h_score', 'threat', 'total_points',
       'transfers_balance', 'transfers_in', 'transfers_out', 'value',
       'was_home', 'yellow_cards', 'GW', 'team_h_difficulty',
       'team_a_difficulty', 'fixture_difficulty'],
      dtype='object')

In [32]:
# Column groups used by the preprocessing step. Each list names the columns
# that receive one kind of treatment.

# One-hot encoded columns.
categorical_features = [
    'position',           # Player position (GK, DEF, MID, FWD)
]

# NOTE(review): binary_features is defined here but is not read by
# preprocess_data as shown in this notebook, so 'was_home' does not reach the
# processed frames — confirm whether that is intended.
binary_features = [
    'was_home',          # Home/Away (boolean)
]

# Columns scaled with MinMaxScaler. 'next_fixture_difficulty' and
# 'season_progress' do not exist in the raw data; they are created in the
# feature-engineering cell further down.
min_max_features = [
    'bonus', 'minutes','fixture_difficulty', 'next_fixture_difficulty', 'yellow_cards', 'red_cards', 
    'own_goals', 'saves', 'goals_conceded','clean_sheets', 'penalties_missed',
    'penalties_saved', 'assists', 'goals_scored', 'season_progress',

]

# Columns scaled with StandardScaler.
standard_features = [
    'team_a_score', 'team_h_score', 'creativity', 'influence', 'threat', 'ict_index',  'bps', 
]

# Columns carried through unscaled; used for grouping and analysis only.
metadata_features = [
    'name',                    # Player name (use for grouping/analysis)
    'element',                 # Player ID (use for grouping sequences)
    'team_x',                 
    'season_x', 
]
# NOTE(review): time_feature is not read by preprocess_data as shown, which
# uses the 'GW' column directly.
time_feature = 'round'
target = 'total_points'  # Target variable for prediction

In [33]:
# Check which 'element' ids a single player name is associated with.
remove_season[remove_season['name'] == 'Mohamed Salah']['element'].unique()

array([254., 233., 283., 308.])

In [34]:
# Seasons remaining after the filter.
remove_season['season_x'].unique()

array(['2020-21', '2021-22', '2022-23', '2023-24'], dtype=object)

In [35]:
# Distinct gameweek values in the filtered data.
remove_season.GW.unique()

array([ 1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12., 13.,
       14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25., 26.,
       27., 28., 29., 30., 31., 32., 33., 34., 35., 36., 37., 38.])

In [36]:
# Use a single goalkeeper label: rows marked 'GK' or 'GKP' all become 'GK'.
is_goalkeeper = remove_season['position'].isin(['GK', 'GKP'])
remove_season['position'] = remove_season['position'].mask(is_goalkeeper, 'GK')

In [37]:
# Replace a fixture difficulty of 1 with 2.
# NOTE(review): presumably folds a rarely used rating into its neighbour so the
# feature spans 2-5 — confirm the intent.
remove_season.loc[remove_season['fixture_difficulty'] == 1, 'fixture_difficulty'] = 2

In [38]:
# Verify the position labels after normalisation.
remove_season['position'].unique()

array(['FWD', 'DEF', 'GK', 'MID'], dtype=object)

In [39]:
# Verify the difficulty values after the remap.
remove_season['fixture_difficulty'].unique()

array([4., 2., 3., 5.])

In [40]:
# Keep only rows with more than zero minutes played.
has_minutes = remove_season['minutes'] > 0
remove_season = remove_season.loc[has_minutes].copy()

In [41]:
# Some minor feature engineering.

# season_progress: gameweek expressed as a fraction of the largest gameweek in the data.
remove_season['season_progress'] = remove_season['GW'] / remove_season['GW'].max()

# next_fixture_difficulty: fixture difficulty of the player's following row
# within the same season. shift(-1) depends on row order, so the rows are put
# in season/gameweek order before shifting instead of relying on the order the
# frame happens to have. The shifted values are assigned back by index, so the
# row order of remove_season itself is left untouched. The last row of each
# (element, season_x) group has no following row and stays NaN.
remove_season['next_fixture_difficulty'] = (
    remove_season
    .sort_values(['season_x', 'GW'])
    .groupby(['element', 'season_x'])['fixture_difficulty']
    .shift(-1)
)

In [42]:
# Order the rows chronologically (season first, then gameweek) and renumber
# the index from 0 so that positional slicing follows that order.
remove_season = remove_season.sort_values(['season_x', 'GW']).reset_index(drop=True)

In [43]:
# Inspect the last row after sorting.
remove_season.iloc[-1]

season_x                                2023-24
name                                Kyle Walker
position                                    DEF
team_x                                 Man City
assists                                     0.0
bonus                                       0.0
bps                                        15.0
clean_sheets                                0.0
creativity                                 11.6
element                                   369.0
fixture                                   379.0
goals_conceded                              1.0
goals_scored                                0.0
ict_index                                   3.2
influence                                  20.6
kickoff_time               2024-05-19T15:00:00Z
minutes                                    90.0
opponent_team                              19.0
opp_team_name                               NaN
own_goals                                   0.0
penalties_missed                        

In [44]:
# Positional 80/20 split of the sorted frame: the first 80% of rows become the
# training set and the remaining rows the validation set.
n = len(remove_season)
train_end = int(0.8 * n)

train_df, val_df = remove_season.iloc[:train_end], remove_season.iloc[train_end:]

print(f"Train: {len(train_df)}, Val: {len(val_df)}")

Train: 34885, Val: 8722


In [45]:
def preprocess_data(df, scalers=None, encoders=None, fit=False):
    """
    Scale the numeric feature groups and one-hot encode the categorical ones.

    The column groups are taken from the module-level lists
    ``min_max_features``, ``standard_features``, ``categorical_features`` and
    ``metadata_features``; the target column name comes from ``target``.

    Args:
        df: DataFrame to process. Must contain every column named in the
            lists above plus 'GW' and the target column.
        scalers: Dictionary of fitted scalers, required when ``fit=False``.
            Key 'continuous' holds the MinMaxScaler for ``min_max_features``
            and key 'ordinal' holds the StandardScaler for
            ``standard_features``.
        encoders: Dictionary of fitted encoders, required when ``fit=False``.
            Key 'categorical' holds the OneHotEncoder.
        fit: Whether to fit new scalers/encoders on ``df`` (True) or to apply
            the ones passed in (False).

    Returns:
        When ``fit=True``: the tuple ``(X_processed, scalers, encoders)``.
        When ``fit=False``: ``X_processed`` only.
        ``X_processed`` has a fresh 0..n-1 index (the index of ``df`` is not
        kept) and the columns, in order: 'GW', the min-max scaled features,
        the one-hot columns, the unscaled metadata columns, the standardised
        features and the target.

    Raises:
        ValueError: If ``fit`` is False and ``scalers`` or ``encoders`` is
            not provided.
    """
    if not fit and (scalers is None or encoders is None):
        raise ValueError("scalers and encoders must be provided when fit=False")

    # Separate features
    X_min_max = df[min_max_features]
    X_standard = df[standard_features]
    X_categorical = df[categorical_features]
    X_metadata = df[metadata_features]

    if fit:
        # Fit new scalers and encoders
        scalers = {}
        encoders = {}

        # Min-max scale the bounded/count features
        scalers['continuous'] = MinMaxScaler()
        X_minmax_scaled = scalers['continuous'].fit_transform(X_min_max)

        # Encode categorical features; categories unseen during fit are
        # encoded as all zeros instead of raising.
        encoders['categorical'] = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        X_categorical_encoded = encoders['categorical'].fit_transform(X_categorical)

        # Standardise the remaining numeric features (the key name 'ordinal'
        # is kept for compatibility with existing callers).
        scalers['ordinal'] = StandardScaler()
        X_standard_encoded = scalers['ordinal'].fit_transform(X_standard)
    else:
        # Use existing scalers and encoders
        X_minmax_scaled = scalers['continuous'].transform(X_min_max)
        X_categorical_encoded = encoders['categorical'].transform(X_categorical)
        X_standard_encoded = scalers['ordinal'].transform(X_standard)

    # Extract time and target
    X_time = df['GW']
    y = df[target]

    # Combine processed features. Every piece is rebuilt from raw values, so
    # all of them share the same default RangeIndex and concatenate row by row.
    X_time_df = pd.DataFrame(X_time.values, columns=['GW'])
    X_cont_df = pd.DataFrame(X_minmax_scaled, columns=min_max_features)
    X_cat_df = pd.DataFrame(X_categorical_encoded,
                           columns=encoders['categorical'].get_feature_names_out(categorical_features))
    X_metadata_df = pd.DataFrame(X_metadata.values, columns=metadata_features)
    X_ordinal_encoded_df = pd.DataFrame(X_standard_encoded, columns=standard_features)
    target_df = pd.DataFrame(y.values, columns=[target])

    X_processed = pd.concat([X_time_df, X_cont_df, X_cat_df, X_metadata_df,
                            X_ordinal_encoded_df, target_df], axis=1)

    if fit:
        return X_processed, scalers, encoders
    else:
        return X_processed

# Fit scalers and encoders on training data only
processed_train_df, scalers, encoders = preprocess_data(train_df, fit=True)

# Transform the validation data using the scalers/encoders fitted on the training data
processed_val_df = preprocess_data(val_df, scalers=scalers, encoders=encoders, fit=False)

In [46]:
# Count missing values per column in the processed validation set
# (the column names are taken from the training frame).
for i in processed_train_df.columns:
    print(f"{i}: {processed_val_df[i].isna().sum()}")

GW: 0
bonus: 0
minutes: 0
fixture_difficulty: 0
next_fixture_difficulty: 544
yellow_cards: 0
red_cards: 0
own_goals: 0
saves: 0
goals_conceded: 0
clean_sheets: 0
penalties_missed: 0
penalties_saved: 0
assists: 0
goals_scored: 0
season_progress: 0
position_DEF: 0
position_FWD: 0
position_GK: 0
position_MID: 0
name: 0
element: 0
team_x: 0
season_x: 0
team_a_score: 0
team_h_score: 0
creativity: 0
influence: 0
threat: 0
ict_index: 0
bps: 0
total_points: 0


In [47]:
# Columns of the processed training frame.
processed_train_df.columns

Index(['GW', 'bonus', 'minutes', 'fixture_difficulty',
       'next_fixture_difficulty', 'yellow_cards', 'red_cards', 'own_goals',
       'saves', 'goals_conceded', 'clean_sheets', 'penalties_missed',
       'penalties_saved', 'assists', 'goals_scored', 'season_progress',
       'position_DEF', 'position_FWD', 'position_GK', 'position_MID', 'name',
       'element', 'team_x', 'season_x', 'team_a_score', 'team_h_score',
       'creativity', 'influence', 'threat', 'ict_index', 'bps',
       'total_points'],
      dtype='object')

In [48]:
# Preview the processed training rows.
processed_train_df.head()

Unnamed: 0,GW,bonus,minutes,fixture_difficulty,next_fixture_difficulty,yellow_cards,red_cards,own_goals,saves,goals_conceded,...,team_x,season_x,team_a_score,team_h_score,creativity,influence,threat,ict_index,bps,total_points
0,1.0,0.0,0.494382,0.666667,0.0,0.0,0.0,0.0,0.0,0.222222,...,Brighton,2020-21,1.369206,-0.382483,-0.712655,-0.924288,1.239238,-0.049563,-1.48636,1.0
1,1.0,0.0,1.0,0.0,0.666667,0.0,0.0,0.0,0.0,0.222222,...,West Ham,2020-21,0.564015,-1.114806,0.093308,-0.303254,-0.639617,-0.398635,-0.161281,1.0
2,1.0,0.0,1.0,0.0,0.333333,0.0,0.0,0.0,0.181818,0.222222,...,Sheffield Utd,2020-21,0.564015,-1.114806,-0.734838,0.223274,-0.639617,-0.514992,-0.066632,1.0
3,1.0,0.0,1.0,0.666667,0.0,0.0,0.0,0.0,0.0,0.0,...,Everton,2020-21,-0.241177,-1.114806,2.562958,0.371782,-0.40476,0.968563,0.690556,3.0
4,1.0,0.0,0.685393,0.333333,0.333333,0.0,0.0,0.0,0.0,0.333333,...,Fulham,2020-21,1.369206,-1.114806,-0.601743,-1.005292,-0.346046,-0.980422,-1.391712,2.0


In [49]:
# Show which seasons ended up in the training and validation splits.
print(processed_train_df['season_x'].unique())
print(processed_val_df['season_x'].unique())

['2020-21' '2021-22' '2022-23' '2023-24']
['2023-24']


In [50]:
# Sanity check: within each season, report any element id that is attached to
# more than one player name.
element_ids = processed_train_df['element'].unique()
for season, season_rows in processed_train_df.groupby('season_x'):
    for element_id in element_ids:
        names = season_rows.loc[season_rows['element'] == element_id, 'name'].unique()
        if len(names) > 1:
            print(f"{season}: {names}")

# element is only a player ID within a season

2023-24: ['Michale Olakigbe' 'Michael Olakigbe']


In [51]:
# Number of rows in the processed training frame.
print(len(processed_train_df))

34885


In [52]:
# Columns of the processed training frame, shown again before building sequences.
processed_train_df.columns

Index(['GW', 'bonus', 'minutes', 'fixture_difficulty',
       'next_fixture_difficulty', 'yellow_cards', 'red_cards', 'own_goals',
       'saves', 'goals_conceded', 'clean_sheets', 'penalties_missed',
       'penalties_saved', 'assists', 'goals_scored', 'season_progress',
       'position_DEF', 'position_FWD', 'position_GK', 'position_MID', 'name',
       'element', 'team_x', 'season_x', 'team_a_score', 'team_h_score',
       'creativity', 'influence', 'threat', 'ict_index', 'bps',
       'total_points'],
      dtype='object')

In [53]:
# Summary statistics of the target column in the training set.
processed_train_df['total_points'].describe()

count    34885.000000
mean         2.916124
std          2.987929
min         -7.000000
25%          1.000000
50%          2.000000
75%          3.000000
max         24.000000
Name: total_points, dtype: float64

In [61]:
def create_sequences(df, past_sequences=5, future_sequences=3):
    """
    Build sliding-window sequences for every (element, season_x) group.

    Rows of each group are ordered by 'GW'. Every window consists of
    ``past_sequences`` consecutive rows used as model input, followed by
    ``future_sequences`` consecutive rows whose 'total_points' are the
    targets. Windows are consecutive *rows* of the group, not necessarily
    consecutive gameweeks. Groups with fewer than
    ``past_sequences + future_sequences`` rows produce no sequence.

    Args:
        df: DataFrame containing 'GW', 'element', 'name', 'season_x',
            'team_x', 'total_points' and 'fixture_difficulty'. Every other
            column is treated as a model feature.
        past_sequences: Number of rows in each input window.
        future_sequences: Number of rows predicted after each input window.

    Returns:
        X_tensor: float32 tensor (n_sequences, past_sequences, n_features);
            features are ordered by sorted column name.
        y_tensor: float32 tensor (n_sequences, future_sequences) with the
            'total_points' of the future rows.
        future_fd_tensor: float32 tensor (n_sequences, future_sequences) with
            the 'fixture_difficulty' values of the future rows, exactly as
            stored in ``df``.
        meta: list of ``(group_key, metadata)`` pairs, where ``group_key`` is
            the (element, season_x) key and ``metadata`` holds the 'GW',
            'element', 'name', 'season_x', 'team_x' values of the input rows.
    """
    meta_cols = ['GW', 'element', 'name', 'season_x', 'team_x']
    non_feature_cols = set(meta_cols) | {'total_points'}
    # Sort to ensure features are always in the same order
    feature_cols = sorted(col for col in df.columns if col not in non_feature_cols)
    X_seq, y_seq, meta, future_fixture_difficulty = [], [], [], []

    for player_id, stats in df.groupby(['element', 'season_x']):
        # Stable sort: rows that share a gameweek keep their existing order.
        group = stats.sort_values('GW', kind='stable')

        # Convert once per group; the windows below are cheap array slices.
        features = group[feature_cols].values
        points = group['total_points'].values
        difficulty = group['fixture_difficulty'].values
        metadata_rows = group[meta_cols].values
        n_rows = len(group)

        for start in range(n_rows - past_sequences):
            split = start + past_sequences
            end = split + future_sequences
            if end > n_rows:
                # Not enough future rows left; later windows are shorter still.
                break
            X_seq.append(features[start:split])
            y_seq.append(points[split:end])
            meta.append((player_id, metadata_rows[start:split]))
            future_fixture_difficulty.append(difficulty[split:end])

    if not X_seq:
        # No group was long enough: return correctly shaped empty tensors.
        X_empty = torch.empty((0, past_sequences, len(feature_cols)), dtype=torch.float32)
        y_empty = torch.empty((0, future_sequences), dtype=torch.float32)
        return X_empty, y_empty, y_empty.clone(), meta

    # Convert to numpy arrays
    X_np = np.array(X_seq)
    y_np = np.array(y_seq)
    future_fd_np = np.array(future_fixture_difficulty)

    # Convert to tensors
    X_tensor = torch.tensor(X_np, dtype=torch.float32)
    y_tensor = torch.tensor(y_np, dtype=torch.float32)
    future_fd_tensor = torch.tensor(future_fd_np, dtype=torch.float32)

    return X_tensor, y_tensor, future_fd_tensor, meta
        
    

In [62]:
# Build the model inputs, targets, future fixture difficulties and metadata
# for the training and validation splits, each from its own processed frame.
X_train, y_train, future_fd_train, meta_train = create_sequences(processed_train_df)
X_val, y_val, future_fd_val, meta_val = create_sequences(processed_val_df)

# Report the resulting tensor shapes.
print(f"Train sequences: {X_train.shape}, Targets: {y_train.shape}, Future FD: {future_fd_train.shape}")
print(f"Val sequences: {X_val.shape}, Targets: {y_val.shape}, Future FD: {future_fd_val.shape}")

Train sequences: torch.Size([22570, 5, 26]), Targets: torch.Size([22570, 3]), Future FD: torch.Size([22570, 3])
Val sequences: torch.Size([5364, 5, 26]), Targets: torch.Size([5364, 3]), Future FD: torch.Size([5364, 3])


In [63]:
# Persist both splits. Each file stores the tuple
# (X, y, future_fixture_difficulty, meta) belonging to that split.
torch.save((X_train, y_train, future_fd_train, meta_train), 'data/train_sequences.pt')
# The validation file must carry the validation split's own future fixture
# difficulties; saving future_fd_train here would pair X_val / y_val with a
# tensor that has a different number of rows.
torch.save((X_val, y_val, future_fd_val, meta_val), 'data/val_sequences.pt')

print("Data preparation complete. Sequences saved to disk.")

Data preparation complete. Sequences saved to disk.
