# Predict Football Player Performance Using Random Forest

In [21]:
import numpy as np
import sqlite3
import pandas as pd
pd.options.display.max_rows = 5
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor

In [22]:
# Location of the SQLite database that holds the Player and Player_Attributes
# tables. Kept in one named constant so the notebook can be pointed at another
# copy of the file by editing a single line.
DB_PATH = '/Users/luzi/Desktop/doppEx2/database.sqlite'

# Open the database read-only through a SQLite URI. With the default mode a
# mistyped path is not reported at connect time and the queries below fail with
# a misleading "no such table" error; with mode=ro a missing file fails
# immediately, and the notebook can never modify the source data.
# NOTE: characters such as '?', '#' or '%' in DB_PATH would have to be
# percent-encoded to be valid inside a URI.
# The connection is intentionally left open in case later cells query it.
connection = sqlite3.connect('file:{}?mode=ro'.format(DB_PATH), uri=True)

# Load both tables completely into memory as DataFrames.
players_df = pd.read_sql_query("SELECT * FROM Player", connection)
stats_df = pd.read_sql_query("SELECT * FROM Player_Attributes", connection)

In [23]:
# Preview the Player table (the display is truncated because
# pd.options.display.max_rows is set to 5 in the imports cell).
players_df

Unnamed: 0,id,player_api_id,player_name,player_fifa_api_id,birthday,height,weight
0,1,505942,Aaron Appindangoye,218353,1992-02-29 00:00:00,182.88,187
1,2,155782,Aaron Cresswell,189615,1989-12-15 00:00:00,170.18,146
...,...,...,...,...,...,...,...
11058,11074,35506,Zurab Khizanishvili,47058,1981-10-06 00:00:00,185.42,172
11059,11075,39902,Zvjezdan Misimovic,102359,1982-06-05 00:00:00,180.34,176


In [25]:
def onehot_encode(df, column):
    """Return a new frame in which ``column`` is replaced by indicator columns.

    One indicator column is produced per distinct value of ``df[column]``,
    named ``<column>_<value>``, and appended after the remaining columns.
    The input frame is left untouched.
    """
    indicator_columns = pd.get_dummies(df[column], prefix=column)
    remaining_columns = df.drop(columns=column)
    return pd.concat([remaining_columns, indicator_columns], axis=1)

In [36]:
def preprocess_inputs(players, stats, train_size=0.7, random_state=1):
    """Build scaled train/test sets for predicting ``overall_rating``.

    Processing steps:

    1. Drop identifier/text columns and expand ``birthday`` into
       ``birth_year`` / ``birth_month`` / ``birth_day``.
    2. Collapse ``stats`` to one row per ``player_api_id``: numeric columns are
       averaged over all of a player's rows, categorical columns are taken from
       the player's first row.
    3. Join with ``players``, encode the categorical columns, split into
       train/test and standardise the features using statistics computed on
       the training split only.

    Parameters
    ----------
    players : pandas.DataFrame
        Needs the columns ``id``, ``player_name``, ``player_fifa_api_id``,
        ``player_api_id`` and ``birthday``; every other column is used as a
        feature. Not modified.
    stats : pandas.DataFrame
        Needs the columns ``id``, ``player_fifa_api_id``, ``date``,
        ``player_api_id``, ``overall_rating``, ``preferred_foot``,
        ``attacking_work_rate`` and ``defensive_work_rate``; the remaining
        numeric columns become features. Not modified.
    train_size : float, default 0.7
        Fraction of players placed in the training split.
    random_state : int, default 1
        Seed for the shuffled train/test split.

    Returns
    -------
    X_train, X_test : pandas.DataFrame
        Standardised feature matrices (original row index preserved).
    y_train, y_test : pandas.Series
        Per-player mean ``overall_rating`` for the matching rows.
    """
    work_rate_columns = ['attacking_work_rate', 'defensive_work_rate']
    allowed_values = {
        'preferred_foot': ['left', 'right'],
        'attacking_work_rate': ['low', 'medium', 'high'],
        'defensive_work_rate': ['low', 'medium', 'high'],
    }

    # Drop unused columns. ``drop`` returns new frames, so the caller's inputs
    # are never modified by the assignments below.
    players = players.drop(['id', 'player_name', 'player_fifa_api_id'], axis=1)
    stats = stats.drop(['id', 'player_fifa_api_id', 'date'], axis=1)

    # Extract birthday date features with the vectorised ``.dt`` accessor.
    birthday = pd.to_datetime(players['birthday'])
    players = players.drop('birthday', axis=1).assign(
        birth_year=birthday.dt.year,
        birth_month=birthday.dt.month,
        birth_day=birthday.dt.day,
    )

    # Categorical attributes: keep each player's first row (in the order the
    # rows appear in ``stats``). ``.copy()`` makes the frame safely writable.
    categoricals = stats.drop_duplicates(subset='player_api_id')[
        ['player_api_id'] + list(allowed_values)
    ].copy()

    # Clean categorical columns: anything outside the allowed vocabulary
    # (including missing values) is replaced by the column's most frequent
    # valid value, so no NaN or stray string reaches the encoders below.
    for column, allowed in allowed_values.items():
        cleaned = categoricals[column].where(categoricals[column].isin(allowed))
        categoricals[column] = cleaned.fillna(cleaned.mode()[0])

    # Average the numeric stats per player. ``numeric_only=True`` is required
    # on pandas >= 2.0, where averaging the string columns raises TypeError;
    # older versions silently dropped those columns, so the result is the same.
    stats = stats.groupby(by='player_api_id', as_index=False).mean(numeric_only=True)

    # Fill numeric missing values with column means (every column is numeric
    # at this point), then attach the cleaned categorical columns.
    stats = stats.fillna(stats.mean())
    stats = stats.merge(categoricals, on='player_api_id')

    # Create a single frame; the join key is not a feature.
    df = players.merge(stats, on='player_api_id').drop('player_api_id', axis=1)

    # Binary encoding. ``map`` avoids the implicit object->int downcast that
    # ``replace`` performs (deprecated in pandas 2.2).
    df['preferred_foot'] = df['preferred_foot'].map({'left': 0, 'right': 1})

    # One-hot encoding
    for column in work_rate_columns:
        df = onehot_encode(df, column=column)

    # Split df into X and y
    y = df['overall_rating']
    X = df.drop('overall_rating', axis=1)

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=train_size, shuffle=True, random_state=random_state
    )

    # Scale X: fit on the training split only so that no information from the
    # test split leaks into the scaling parameters.
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = pd.DataFrame(scaler.transform(X_train), index=X_train.index, columns=X_train.columns)
    X_test = pd.DataFrame(scaler.transform(X_test), index=X_test.index, columns=X_test.columns)

    return X_train, X_test, y_train, y_test

In [37]:
# Build the scaled train/test features and targets from the two raw tables.
X_train, X_test, y_train, y_test = preprocess_inputs(players_df, stats_df)

In [33]:
# Inspect the standardised training features returned by preprocess_inputs.
X_train

Unnamed: 0,height,weight,birth_year,birth_month,birth_day,potential,crossing,finishing,heading_accuracy,short_passing,...,gk_kicking,gk_positioning,gk_reflexes,preferred_foot,attacking_work_rate_high,attacking_work_rate_low,attacking_work_rate_medium,defensive_work_rate_high,defensive_work_rate_low,defensive_work_rate_medium
5429,0.149907,-0.033384,0.076129,0.306169,1.310542,0.035981,-2.226408,-1.761449,-2.491015,-2.570139,...,2.500292,2.649907,3.479833,0.564172,-0.520873,-0.235574,0.601974,-0.40871,-0.323411,0.55860
5874,0.944722,1.230136,0.259888,-0.858476,-0.392766,1.308621,0.061943,1.711121,1.384466,0.416901,...,-0.231833,-0.429216,-0.042825,-1.772509,-0.520873,-0.235574,0.601974,-0.40871,3.092040,-1.79019
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5192,-1.042316,-1.695910,-1.577704,1.470814,-0.165659,0.772848,1.198443,1.294633,0.736790,0.455622,...,1.545649,0.127334,0.035040,-1.772509,-0.520873,-0.235574,0.601974,-0.40871,-0.323411,0.55860
235,0.944722,0.099618,-1.945222,1.470814,1.424096,0.699593,0.205221,-1.002945,1.217392,0.520157,...,1.014664,-0.129172,0.008428,0.564172,-0.520873,-0.235574,0.601974,-0.40871,-0.323411,0.55860


In [46]:
# Inspect the training targets (the overall_rating column split off in preprocess_inputs).
y_train

5429    68.450000
5874    75.433333
          ...    
5192    72.500000
235     74.200000
Name: overall_rating, Length: 7741, dtype: float64

# Training Results

In [47]:
# Fix the forest's seed: bootstrap sampling and feature selection are random,
# so without it the metrics printed below change on every run.
model = RandomForestRegressor(random_state=1)
model.fit(X_train, y_train)

# Evaluate on the held-out split only.
y_pred = model.predict(X_test)
rmse = np.sqrt(np.mean((y_test - y_pred)**2))  # root mean squared error
r2 = model.score(X_test, y_test)  # coefficient of determination (R^2)

print("     Test RMSE: {:.5f}".format(rmse))
print("Test R^2 Score: {:.5f}".format(r2))

     Test RMSE: 1.53138
Test R^2 Score: 0.94025
