### Appearance predictor

In FPL, players receive an extra point if they play at least 60 minutes, and they only receive points for a clean sheet if they have played at least 60 minutes. With this in mind, we create a new variable 'over_60_minutes' that takes the value 1 if the player played 60 minutes or more and 0 if they did not. We train a Random Forest Classifier and a Gradient Boosting Classifier for each position with this new variable as the target.

In [1]:
import pandas as pd

# Load the previous-seasons dataset and add the binary target column
# 'over_60_minutes': 1 where 'minutes' is 60 or more, 0 otherwise.
df = pd.read_csv('data/previous_seasons_dataset.csv')
reached_60 = df['minutes'].ge(60)
df['over_60_minutes'] = reached_60.astype(int)

Goalkeepers:

In [2]:
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
import joblib

# Goalkeeper appearance models: classify whether a goalkeeper reaches the
# 60-minute mark ('over_60_minutes') in a fixture, using a Random Forest and a
# Gradient Boosting classifier trained on the same split.

# One fixed seed for the split and both models, so the metrics printed below and
# the fitted models are identical from run to run.
RANDOM_STATE = 42

# Model inputs. The order is significant: it fixes the column order the fitted
# models expect and the row indices shown in the feature-importance tables.
GK_feature_columns = [
    'team_market_value', 'opponent_market_value', 'value', 'was_home', 'points_last_game', 'total_points',
    'mins_last_game', 'total_mins', 'mean_points_last_3', 'mean_mins_last_3', 'mean_points_last_5',
    'mean_mins_last_5', 'mean_points_last_10', 'mean_mins_last_10', 'team_points_last_game',
    'total_team_points', 'mean_team_points_last_3', 'mean_team_points_last_5', 'mean_team_points_last_10',
    'team_conceded_last_game', 'total_team_conceded', 'mean_team_conceded_last_3',
    'mean_team_conceded_last_5', 'mean_team_conceded_last_10', 'total_opponent_points',
    'opponent_points_last_game', 'mean_opponent_points_last_3', 'mean_opponent_points_last_5',
    'mean_opponent_points_last_10', 'total_opponent_conceded', 'opponent_conceded_last_game',
    'mean_opponent_conceded_last_3', 'mean_opponent_conceded_last_5', 'mean_opponent_conceded_last_10',
    'total_points_last_season', 'total_mins_last_season', 'total_team_points_last_season',
    'total_team_conceded_last_season', 'total_opponent_points_last_season',
    'total_opponent_conceded_last_season',
]

GK_data = df[df['position'] == 'GK']
GK_mins_target = GK_data['over_60_minutes']
GK_mins_features = GK_data[GK_feature_columns]

# Hold out 20% of the goalkeeper rows for the final evaluation.
x_train, x_test, y_train, y_test = train_test_split(
    GK_mins_features, GK_mins_target, train_size=0.8, test_size=0.2, random_state=RANDOM_STATE
)

# Random Forest Classifier

GK_rf_clf = RandomForestClassifier(
    n_estimators=100,
    min_samples_split=2,
    max_depth=None,
    min_samples_leaf=1,
    max_features=None,
    random_state=RANDOM_STATE,
)

# Use 5-fold cross validation on the training split to check for overfitting
cv_scores = cross_val_score(GK_rf_clf, x_train, y_train, cv=5, n_jobs=-1)

GK_rf_clf.fit(x_train, y_train)
y_pred = GK_rf_clf.predict(x_test)

# Evaluate performance metrics on the held-out test split
accuracy = accuracy_score(y_test, y_pred)   # correct_predictions/total_predictions
precision = precision_score(y_test, y_pred) # correct_positive_predicted/total_positive_predicted
recall = recall_score(y_test, y_pred)   # correct_positive_predicted/total_positives_actual
confusion = confusion_matrix(y_test, y_pred)

# Show the ten features the fitted forest reports as most important
importances = GK_rf_clf.feature_importances_
feature_importance_df = pd.DataFrame({'Feature': GK_mins_features.columns, 'Importance': importances})
print(feature_importance_df.sort_values(by='Importance', ascending=False).head(10))


print(f'Random Forest cross validation scores: {cv_scores}')
print(f'mean cross validation scores: {np.mean(cv_scores)}')
print(f'Random Forest confusion matrix: {confusion}')
print(f'Random Forest accuracy : {accuracy}, Random Forest precision: {precision}, Random Forest recall: {recall}')
print()
print('-'*100)

# Gradient Boosting Classifier

GK_gb_clf = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    max_features=None,
    random_state=RANDOM_STATE,
)

# Same train/test split and the same metrics as the Random Forest above, so the
# two models can be compared directly. The metric variables are overwritten.
cv_scores = cross_val_score(GK_gb_clf, x_train, y_train, cv=5, n_jobs=-1)
GK_gb_clf.fit(x_train, y_train)
y_pred = GK_gb_clf.predict(x_test)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
confusion = confusion_matrix(y_test, y_pred)

importances = GK_gb_clf.feature_importances_
feature_importance_df = pd.DataFrame({'Feature': GK_mins_features.columns, 'Importance': importances})
print(feature_importance_df.sort_values(by='Importance', ascending=False).head(10))

print(f'Gradient Boosting cross validation scores: {cv_scores}')
print(f'mean cross validation scores: {np.mean(cv_scores)}')
print(f'Gradient Boosting confusion matrix: {confusion}')
print(f'Gradient Boosting accuracy : {accuracy}, Gradient Boosting precision: {precision}, Gradient Boosting recall: {recall}')

                                Feature  Importance
6                        mins_last_game    0.754266
34             total_points_last_season    0.016247
12                  mean_points_last_10    0.013318
11                     mean_mins_last_5    0.012108
2                                 value    0.009340
37      total_team_conceded_last_season    0.009213
35               total_mins_last_season    0.008644
13                    mean_mins_last_10    0.008391
39  total_opponent_conceded_last_season    0.007321
17              mean_team_points_last_5    0.007239
Random Forest cross validation scores: [0.95437666 0.9469496  0.95384615 0.95384615 0.95010616]
mean cross validation scores: 0.9518249449503567
Random Forest confusion matrix: [[1663   65]
 [  55  574]]
Random Forest accuracy : 0.9490878235044549, Random Forest precision: 0.8982785602503912, Random Forest recall: 0.9125596184419714

--------------------------------------------------------------------------------------------

Defenders: 

In [3]:
# Defender appearance models: classify whether a defender reaches the
# 60-minute mark ('over_60_minutes') in a fixture, using a Random Forest and a
# Gradient Boosting classifier trained on the same split.

# One fixed seed for the split and both models, so the metrics printed below and
# the fitted models are identical from run to run.
RANDOM_STATE = 42

# Model inputs. The order is significant: it fixes the column order the fitted
# models expect and the row indices shown in the feature-importance tables.
DEF_feature_columns = [
    'team_market_value', 'opponent_market_value', 'value', 'was_home', 'points_last_game', 'total_points',
    'mins_last_game', 'total_mins', 'mean_points_last_3', 'mean_mins_last_3', 'mean_points_last_5',
    'mean_mins_last_5', 'mean_points_last_10', 'mean_mins_last_10', 'team_points_last_game',
    'total_team_points', 'mean_team_points_last_3', 'mean_team_points_last_5', 'mean_team_points_last_10',
    'team_conceded_last_game', 'total_team_conceded', 'mean_team_conceded_last_3',
    'mean_team_conceded_last_5', 'mean_team_conceded_last_10', 'total_opponent_points',
    'opponent_points_last_game', 'mean_opponent_points_last_3', 'mean_opponent_points_last_5',
    'mean_opponent_points_last_10', 'total_opponent_conceded', 'opponent_conceded_last_game',
    'mean_opponent_conceded_last_3', 'mean_opponent_conceded_last_5', 'mean_opponent_conceded_last_10',
    'total_points_last_season', 'total_mins_last_season', 'total_team_points_last_season',
    'total_team_conceded_last_season', 'total_opponent_points_last_season',
    'total_opponent_conceded_last_season',
]

DEF_data = df[df['position'] == 'DEF']
DEF_mins_target = DEF_data['over_60_minutes']
DEF_mins_features = DEF_data[DEF_feature_columns]

# Hold out 20% of the defender rows for the final evaluation.
x_train, x_test, y_train, y_test = train_test_split(
    DEF_mins_features, DEF_mins_target, train_size=0.8, test_size=0.2, random_state=RANDOM_STATE
)

# Random Forest Classifier

DEF_rf_clf = RandomForestClassifier(
    n_estimators=100,
    min_samples_split=2,
    max_depth=None,
    min_samples_leaf=1,
    max_features=None,
    random_state=RANDOM_STATE,
)

# 5-fold cross validation on the training split to check for overfitting
cv_scores = cross_val_score(DEF_rf_clf, x_train, y_train, cv=5, n_jobs=-1)

DEF_rf_clf.fit(x_train, y_train)
y_pred = DEF_rf_clf.predict(x_test)

# Performance metrics on the held-out test split
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
confusion = confusion_matrix(y_test, y_pred)

# Show the ten features the fitted forest reports as most important
importances = DEF_rf_clf.feature_importances_
feature_importance_df = pd.DataFrame({'Feature': DEF_mins_features.columns, 'Importance': importances})
print(feature_importance_df.sort_values(by='Importance', ascending=False).head(10))

print(f'Random Forest cross validation scores: {cv_scores}')
print(f'mean cross validation scores: {np.mean(cv_scores)}')
print(f'Random Forest confusion matrix: {confusion}')
print(f'Random Forest accuracy : {accuracy}, Random Forest precision: {precision}, Random Forest recall: {recall}')
print('-'*100)

# Gradient Boosting Classifier

DEF_gb_clf = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    max_features=None,
    random_state=RANDOM_STATE,
)

# Same train/test split and the same metrics as the Random Forest above, so the
# two models can be compared directly. The metric variables are overwritten.
cv_scores = cross_val_score(DEF_gb_clf, x_train, y_train, cv=5, n_jobs=-1)
DEF_gb_clf.fit(x_train, y_train)
y_pred = DEF_gb_clf.predict(x_test)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
confusion = confusion_matrix(y_test, y_pred)

importances = DEF_gb_clf.feature_importances_
feature_importance_df = pd.DataFrame({'Feature': DEF_mins_features.columns, 'Importance': importances})
print(feature_importance_df.sort_values(by='Importance', ascending=False).head(10))

print(f'Gradient Boosting cross validation scores: {cv_scores}')
print(f'mean cross validation scores: {np.mean(cv_scores)}')
print(f'Gradient Boosting confusion matrix: {confusion}')
print(f'Gradient Boosting accuracy : {accuracy}, Gradient Boosting precision: {precision}, Gradient Boosting recall: {recall}')

                     Feature  Importance
6             mins_last_game    0.466295
9           mean_mins_last_3    0.056980
35    total_mins_last_season    0.025064
34  total_points_last_season    0.021555
13         mean_mins_last_10    0.020537
24     total_opponent_points    0.015773
14     team_points_last_game    0.014702
7                 total_mins    0.014507
2                      value    0.014166
29   total_opponent_conceded    0.013338
Random Forest cross validation scores: [0.85122247 0.85677128 0.85154353 0.84824835 0.85241068]
mean cross validation scores: 0.8520392648713984
Random Forest confusion matrix: [[4226  570]
 [ 535 1877]]
Random Forest accuracy : 0.8466981132075472, Random Forest precision: 0.7670617082141398, Random Forest recall: 0.7781923714759535
----------------------------------------------------------------------------------------------------
                     Feature  Importance
6             mins_last_game    0.767658
9           mean_mins_last_3   

Midfielders:

In [4]:
# Midfielder appearance models: classify whether a midfielder reaches the
# 60-minute mark ('over_60_minutes') in a fixture, using a Random Forest and a
# Gradient Boosting classifier trained on the same split.

# One fixed seed for the split and both models, so the metrics printed below and
# the fitted models are identical from run to run.
RANDOM_STATE = 42

# Model inputs. The order is significant: it fixes the column order the fitted
# models expect and the row indices shown in the feature-importance tables.
MID_feature_columns = [
    'team_market_value', 'opponent_market_value', 'value', 'was_home', 'points_last_game', 'total_points',
    'mins_last_game', 'total_mins', 'mean_points_last_3', 'mean_mins_last_3', 'mean_points_last_5',
    'mean_mins_last_5', 'mean_points_last_10', 'mean_mins_last_10', 'team_points_last_game',
    'total_team_points', 'mean_team_points_last_3', 'mean_team_points_last_5', 'mean_team_points_last_10',
    'team_conceded_last_game', 'total_team_conceded', 'mean_team_conceded_last_3',
    'mean_team_conceded_last_5', 'mean_team_conceded_last_10', 'total_opponent_points',
    'opponent_points_last_game', 'mean_opponent_points_last_3', 'mean_opponent_points_last_5',
    'mean_opponent_points_last_10', 'total_opponent_conceded', 'opponent_conceded_last_game',
    'mean_opponent_conceded_last_3', 'mean_opponent_conceded_last_5', 'mean_opponent_conceded_last_10',
    'total_points_last_season', 'total_mins_last_season', 'total_team_points_last_season',
    'total_team_conceded_last_season', 'total_opponent_points_last_season',
    'total_opponent_conceded_last_season',
]

MID_data = df[df['position'] == 'MID']
MID_mins_target = MID_data['over_60_minutes']
MID_mins_features = MID_data[MID_feature_columns]

# Hold out 20% of the midfielder rows for the final evaluation.
x_train, x_test, y_train, y_test = train_test_split(
    MID_mins_features, MID_mins_target, train_size=0.8, test_size=0.2, random_state=RANDOM_STATE
)

# Random Forest Classifier

MID_rf_clf = RandomForestClassifier(
    n_estimators=100,
    min_samples_split=2,
    max_depth=None,
    min_samples_leaf=1,
    max_features=None,
    random_state=RANDOM_STATE,
)

# 5-fold cross validation on the training split to check for overfitting
cv_scores = cross_val_score(MID_rf_clf, x_train, y_train, cv=5, n_jobs=-1)

MID_rf_clf.fit(x_train, y_train)
y_pred = MID_rf_clf.predict(x_test)

# Performance metrics on the held-out test split
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
confusion = confusion_matrix(y_test, y_pred)

# Show the ten features the fitted forest reports as most important
importances = MID_rf_clf.feature_importances_
feature_importance_df = pd.DataFrame({'Feature': MID_mins_features.columns, 'Importance': importances})
print(feature_importance_df.sort_values(by='Importance', ascending=False).head(10))

print(f'Random Forest cross validation scores: {cv_scores}')
print(f'mean cross validation scores: {np.mean(cv_scores)}')
print(f'Random Forest confusion matrix: {confusion}')
print(f'Random Forest accuracy : {accuracy}, Random Forest precision: {precision}, Random Forest recall: {recall}')
print('-'*100)

# Gradient Boosting Classifier

MID_gb_clf = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    random_state=RANDOM_STATE,
)

# Same train/test split and the same metrics as the Random Forest above, so the
# two models can be compared directly. The metric variables are overwritten.
cv_scores = cross_val_score(MID_gb_clf, x_train, y_train, cv=5, n_jobs=-1)
MID_gb_clf.fit(x_train, y_train)
y_pred = MID_gb_clf.predict(x_test)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
confusion = confusion_matrix(y_test, y_pred)

importances = MID_gb_clf.feature_importances_
feature_importance_df = pd.DataFrame({'Feature': MID_mins_features.columns, 'Importance': importances})
print(feature_importance_df.sort_values(by='Importance', ascending=False).head(10))

print(f'Gradient Boosting cross validation scores: {cv_scores}')
print(f'mean cross validation scores: {np.mean(cv_scores)}')
print(f'Gradient Boosting confusion matrix: {confusion}')
print(f'Gradient Boosting accuracy : {accuracy}, Gradient Boosting precision: {precision}, Gradient Boosting recall: {recall}')

                     Feature  Importance
6             mins_last_game    0.459096
9           mean_mins_last_3    0.040898
13         mean_mins_last_10    0.028670
35    total_mins_last_season    0.025350
11          mean_mins_last_5    0.024123
34  total_points_last_season    0.019249
12       mean_points_last_10    0.018937
29   total_opponent_conceded    0.016520
2                      value    0.015759
7                 total_mins    0.014343
Random Forest cross validation scores: [0.85370293 0.85216296 0.85594288 0.85634276 0.85032204]
mean cross validation scores: 0.8536947127201394
Random Forest confusion matrix: [[5681  658]
 [ 602 1988]]
Random Forest accuracy : 0.8588867734348751, Random Forest precision: 0.7513227513227513, Random Forest recall: 0.7675675675675676
----------------------------------------------------------------------------------------------------
                    Feature  Importance
6            mins_last_game    0.644306
9          mean_mins_last_3    0.

Forwards:

In [5]:
# Forward appearance models: classify whether a forward reaches the
# 60-minute mark ('over_60_minutes') in a fixture, using a Random Forest and a
# Gradient Boosting classifier trained on the same split.

# One fixed seed for the split and both models, so the metrics printed below and
# the fitted models are identical from run to run.
RANDOM_STATE = 42

# Model inputs. The order is significant: it fixes the column order the fitted
# models expect and the row indices shown in the feature-importance tables.
FWD_feature_columns = [
    'team_market_value', 'opponent_market_value', 'value', 'was_home', 'points_last_game', 'total_points',
    'mins_last_game', 'total_mins', 'mean_points_last_3', 'mean_mins_last_3', 'mean_points_last_5',
    'mean_mins_last_5', 'mean_points_last_10', 'mean_mins_last_10', 'team_points_last_game',
    'total_team_points', 'mean_team_points_last_3', 'mean_team_points_last_5', 'mean_team_points_last_10',
    'team_conceded_last_game', 'total_team_conceded', 'mean_team_conceded_last_3',
    'mean_team_conceded_last_5', 'mean_team_conceded_last_10', 'total_opponent_points',
    'opponent_points_last_game', 'mean_opponent_points_last_3', 'mean_opponent_points_last_5',
    'mean_opponent_points_last_10', 'total_opponent_conceded', 'opponent_conceded_last_game',
    'mean_opponent_conceded_last_3', 'mean_opponent_conceded_last_5', 'mean_opponent_conceded_last_10',
    'total_points_last_season', 'total_mins_last_season', 'total_team_points_last_season',
    'total_team_conceded_last_season', 'total_opponent_points_last_season',
    'total_opponent_conceded_last_season',
]

FWD_data = df[df['position'] == 'FWD']

FWD_mins_target = FWD_data['over_60_minutes']
FWD_mins_features = FWD_data[FWD_feature_columns]

# Hold out 20% of the forward rows for the final evaluation.
x_train, x_test, y_train, y_test = train_test_split(
    FWD_mins_features, FWD_mins_target, train_size=0.8, test_size=0.2, random_state=RANDOM_STATE
)

# Random Forest Classifier

FWD_rf_clf = RandomForestClassifier(
    n_estimators=100,
    min_samples_split=2,
    max_depth=None,
    min_samples_leaf=1,
    max_features=None,
    random_state=RANDOM_STATE,
)

# 5-fold cross validation on the training split to check for overfitting
cv_scores = cross_val_score(FWD_rf_clf, x_train, y_train, cv=5, n_jobs=-1)

FWD_rf_clf.fit(x_train, y_train)
y_pred = FWD_rf_clf.predict(x_test)

# Performance metrics on the held-out test split
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
confusion = confusion_matrix(y_test, y_pred)

# Show the ten features the fitted forest reports as most important
importances = FWD_rf_clf.feature_importances_
feature_importance_df = pd.DataFrame({'Feature': FWD_mins_features.columns, 'Importance': importances})
print(feature_importance_df.sort_values(by='Importance', ascending=False).head(10))

print(f'Random Forest cross validation scores: {cv_scores}')
print(f'mean cross validation scores: {np.mean(cv_scores)}')
print(f'Random Forest confusion matrix: {confusion}')
print(f'Random Forest accuracy : {accuracy}, Random Forest precision: {precision}, Random Forest recall: {recall}')
print('-'*100)

# Gradient Boosting Classifier

FWD_gb_clf = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    random_state=RANDOM_STATE,
)

# Same train/test split and the same metrics as the Random Forest above, so the
# two models can be compared directly. The metric variables are overwritten.
cv_scores = cross_val_score(FWD_gb_clf, x_train, y_train, cv=5, n_jobs=-1)
FWD_gb_clf.fit(x_train, y_train)
y_pred = FWD_gb_clf.predict(x_test)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
confusion = confusion_matrix(y_test, y_pred)

importances = FWD_gb_clf.feature_importances_
feature_importance_df = pd.DataFrame({'Feature': FWD_mins_features.columns, 'Importance': importances})
print(feature_importance_df.sort_values(by='Importance', ascending=False).head(10))

print(f'Gradient Boosting cross validation scores: {cv_scores}')
print(f'mean cross validation scores: {np.mean(cv_scores)}')
print(f'Gradient Boosting confusion matrix: {confusion}')
print(f'Gradient Boosting accuracy : {accuracy}, Gradient Boosting precision: {precision}, Gradient Boosting recall: {recall}')

                     Feature  Importance
6             mins_last_game    0.477035
9           mean_mins_last_3    0.033340
12       mean_points_last_10    0.022842
13         mean_mins_last_10    0.021418
35    total_mins_last_season    0.020156
11          mean_mins_last_5    0.019310
34  total_points_last_season    0.016050
19   team_conceded_last_game    0.015976
2                      value    0.014695
7                 total_mins    0.014620
Random Forest cross validation scores: [0.86323598 0.86787204 0.86270872 0.86595547 0.85482375]
mean cross validation scores: 0.8629191922058999
Random Forest confusion matrix: [[1843  167]
 [ 186  500]]
Random Forest accuracy : 0.8690652818991098, Random Forest precision: 0.7496251874062968, Random Forest recall: 0.7288629737609329
----------------------------------------------------------------------------------------------------
                          Feature  Importance
6                  mins_last_game    0.744003
9                mean

I varied the hyperparameters for both models (for each position) but found little difference in performance. The two models perform similarly for each position. For simplicity, I decided to use the Gradient Boosting Classifier for all positions.

In [6]:
import joblib

# Persist the fitted Gradient Boosting classifier for each position.
import os

# joblib.dump does not create missing parent directories, so make sure the
# output folder exists before writing; this is a no-op when it already does.
os.makedirs('models', exist_ok=True)

joblib.dump(GK_gb_clf, 'models/GK_appearance_classifier.pkl')
joblib.dump(DEF_gb_clf, 'models/DEF_appearance_classifier.pkl')
joblib.dump(MID_gb_clf, 'models/MID_appearance_classifier.pkl')
joblib.dump(FWD_gb_clf, 'models/FWD_appearance_classifier.pkl')

['FWD_appearance_classifier.pkl']