In [1]:
from FPLClient import FPLClient, exponential_decay
import pandas as pd
import numpy as np
from parquet.dropper import from_parquet, to_parquet

In [3]:
client = FPLClient()

In [None]:
# Load Players Base Data
player_list = pd.DataFrame(client.general.all()['elements'])['id'].to_list()
dataset = pd.DataFrame()
for player in player_list:
    _player = pd.DataFrame(client.details.all(player)['history'])
    if dataset.__len__() == 0:
        dataset = _player
    else:
        dataset = pd.concat([dataset, _player], ignore_index=True)

# Remove not need columns
dataset.drop(columns=['modified'], inplace=True)

# Save To Parquet
to_parquet(dataset, 'playersStats')

In [4]:
# Load Parquet Base File
dataset = from_parquet('playersStats')
dataset.head()

Unnamed: 0,element,fixture,opponent_team,total_points,was_home,kickoff_time,team_h_score,team_a_score,round,minutes,...,starts,expected_goals,expected_assists,expected_goal_involvements,expected_goals_conceded,value,transfers_balance,selected,transfers_in,transfers_out
0,1,2,20,0,True,2024-08-17T14:00:00Z,2,0,1,0,...,0,0.0,0.0,0.0,0.0,55,0,2923,0,0
1,1,11,2,0,False,2024-08-24T16:30:00Z,0,2,2,0,...,0,0.0,0.0,0.0,0.0,55,-790,2321,84,874
2,1,21,5,0,True,2024-08-31T11:30:00Z,1,1,3,0,...,0,0.0,0.0,0.0,0.0,54,-279,2397,355,634
3,1,39,18,0,False,2024-09-15T13:00:00Z,0,1,4,0,...,0,0.0,0.0,0.0,0.0,54,-747,1650,0,747
4,1,47,13,0,False,2024-09-22T15:30:00Z,2,2,5,0,...,0,0.0,0.0,0.0,0.0,54,-174,1494,0,174


In [5]:
# Players above minimum playing Threshold
minimum_minutes_percentage = 0.20
fixtures = client.general.all()['events']
fixtures_played = sum([int(fixture['finished']) for fixture in fixtures])
total_minutes = fixtures_played * 90
player_summary = pd.DataFrame(client.general.all()['elements'])[['id', 'minutes']]
player_summary['minutes_threshold'] = player_summary['minutes'] > total_minutes * minimum_minutes_percentage
valid_players = player_summary[player_summary['minutes_threshold']]['id'].to_list()

In [6]:
dataset_filtered = dataset[dataset['element'].isin(valid_players)]
dataset_filtered

Unnamed: 0,element,fixture,opponent_team,total_points,was_home,kickoff_time,team_h_score,team_a_score,round,minutes,...,starts,expected_goals,expected_assists,expected_goal_involvements,expected_goals_conceded,value,transfers_balance,selected,transfers_in,transfers_out
20,2,2,20,0,True,2024-08-17T14:00:00Z,2,0,1,5,...,0,0.00,0.00,0.00,0.15,70,0,199810,0,0
21,2,11,2,0,False,2024-08-24T16:30:00Z,0,2,2,0,...,0,0.00,0.00,0.00,0.00,69,-53975,176166,12170,66145
22,2,21,5,0,True,2024-08-31T11:30:00Z,1,1,3,0,...,0,0.00,0.00,0.00,0.00,68,-72583,106691,3238,75821
23,2,39,18,1,False,2024-09-15T13:00:00Z,0,1,4,10,...,0,0.00,0.00,0.00,0.13,68,-27513,82577,1997,29510
24,2,47,13,0,False,2024-09-22T15:30:00Z,2,2,5,3,...,0,0.00,0.00,0.00,0.69,68,1642,89599,7682,6040
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13407,642,160,10,1,True,2024-12-14T15:00:00Z,1,2,16,45,...,1,0.00,0.01,0.01,0.57,50,279,2888,515,236
13408,642,167,11,3,False,2024-12-22T14:00:00Z,0,3,17,90,...,1,0.00,0.01,0.01,0.81,50,-308,2621,99,407
13409,642,180,14,3,True,2024-12-26T17:30:00Z,2,0,18,90,...,1,0.00,0.02,0.02,0.44,50,-47,2657,155,202
13410,642,189,18,2,False,2024-12-29T15:00:00Z,2,2,19,68,...,1,0.00,0.01,0.01,1.35,50,131,2851,287,156


In [7]:
dataset_p2 = dataset_filtered[['element','was_home','round','minutes','total_points','opponent_team']].copy()
dataset_p2['was_home'] = dataset_p2['was_home'].astype(int)
dataset_p2

Unnamed: 0,element,was_home,round,minutes,total_points,opponent_team
20,2,1,1,5,0,20
21,2,0,2,0,0,2
22,2,1,3,0,0,5
23,2,0,4,10,1,18
24,2,0,5,3,0,13
...,...,...,...,...,...,...
13407,642,1,16,45,1,10
13408,642,0,17,90,3,11
13409,642,1,18,90,3,14
13410,642,0,19,68,2,18


In [8]:
fixture_decay = exponential_decay(dataset_p2['round'], 0.1)
dataset_p2['decay'] = fixture_decay
dataset_p2


Unnamed: 0,element,was_home,round,minutes,total_points,opponent_team,decay
20,2,1,1,5,0,20,0.135335
21,2,0,2,0,0,2,0.149569
22,2,1,3,0,0,5,0.165299
23,2,0,4,10,1,18,0.182684
24,2,0,5,3,0,13,0.201897
...,...,...,...,...,...,...,...
13407,642,1,16,45,1,10,0.606531
13408,642,0,17,90,3,11,0.670320
13409,642,1,18,90,3,14,0.740818
13410,642,0,19,68,2,18,0.818731


In [17]:
from sklearn.preprocessing import OneHotEncoder, TargetEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
import warnings

In [10]:
X_train,X_test,y_train,y_test = train_test_split(dataset_p2.drop(columns=['total_points']),dataset_p2['total_points'],test_size=0.2)
X_train

Unnamed: 0,element,was_home,round,minutes,opponent_team,decay
2298,101,1,1,90,7,0.135335
6461,274,0,11,90,18,0.367879
8494,361,1,16,90,14,0.606531
4640,205,1,5,90,14,0.201897
922,42,1,9,8,3,0.301194
...,...,...,...,...,...,...
12691,204,1,16,90,10,0.606531
9694,410,0,17,26,10,0.670320
11732,495,1,11,90,10,0.367879
5679,240,0,16,68,12,0.606531


In [None]:
ct = ColumnTransformer(
    transformers= [
        ('player_encoded', OneHotEncoder(), ['element'])
        ],
    remainder = 'passthrough'
)

ct.fit_transform(X_train).shape

(5244, 336)

In [18]:
# Define the hyperparameters and their search ranges
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'penalty': ['l2'],  # Only 'l2' penalty is compatible with 'lbfgs' solver
    'solver': ['liblinear', 'lbfgs']
    
}

# Create a Logistic Regression model
logistic_regression = LogisticRegression(max_iter=1000)


grid_search = GridSearchCV(logistic_regression, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Get the best hyperparameters
best_params = grid_search.best_params_

# Fit the model with the best hyperparameters on the entire dataset
best_model = grid_search.best_estimator_
best_model.fit(X_train, y_train)

# Evaluate the best model on the test set
accuracy = best_model.score(X_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Best Hyperparameters: {best_params}
Accuracy on test set: {accuracy:.2f}


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [21]:
print("Best Hyperparameters: {}".format(best_params))
print("Accuracy on test set: {}".format(accuracy))

Best Hyperparameters: {'C': 1, 'penalty': 'l2', 'solver': 'lbfgs'}
Accuracy on test set: 0.4771341463414634
