## Soccer Player Performance Prediction

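Given *data about soccer players* (physical attributes and skill ratings), let's try to predict the **average performance** of a given player, measured as the mean of that player's `overall_rating` across all recorded snapshots.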

We will use a random forest regression model to make our predictions.

Data source: https://www.kaggle.com/datasets/hugomathien/soccer

### Importing Libraries

In [56]:
import numpy as np
import pandas as pd
import sqlite3
pd.set_option("display.max_columns", None)

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler

from sklearn.ensemble import RandomForestRegressor

In [57]:
# Load the two tables used in this notebook from the SQLite database file.
# The connection is only needed while the queries run, so it is closed in a
# `finally` block even when one of the reads raises.
connection = sqlite3.connect('database.sqlite')
try:
    # Player table: ids, name, birthday, height and weight
    players_df = pd.read_sql_query("SELECT * FROM Player", connection)
    # Player_Attributes table: dated snapshots of ratings and skill attributes
    stats_df = pd.read_sql_query("SELECT * FROM Player_Attributes", connection)
finally:
    connection.close()

In [58]:
players_df

Unnamed: 0,id,player_api_id,player_name,player_fifa_api_id,birthday,height,weight
0,1,505942,Aaron Appindangoye,218353,1992-02-29 00:00:00,182.88,187
1,2,155782,Aaron Cresswell,189615,1989-12-15 00:00:00,170.18,146
2,3,162549,Aaron Doran,186170,1991-05-13 00:00:00,170.18,163
3,4,30572,Aaron Galindo,140161,1982-05-08 00:00:00,182.88,198
4,5,23780,Aaron Hughes,17725,1979-11-08 00:00:00,182.88,154
...,...,...,...,...,...,...,...
11055,11071,26357,Zoumana Camara,2488,1979-04-03 00:00:00,182.88,168
11056,11072,111182,Zsolt Laczko,164680,1986-12-18 00:00:00,182.88,176
11057,11073,36491,Zsolt Low,111191,1979-04-29 00:00:00,180.34,154
11058,11074,35506,Zurab Khizanishvili,47058,1981-10-06 00:00:00,185.42,172


In [59]:
stats_df

Unnamed: 0,id,player_fifa_api_id,player_api_id,date,overall_rating,potential,preferred_foot,attacking_work_rate,defensive_work_rate,crossing,finishing,heading_accuracy,short_passing,volleys,dribbling,curve,free_kick_accuracy,long_passing,ball_control,acceleration,sprint_speed,agility,reactions,balance,shot_power,jumping,stamina,strength,long_shots,aggression,interceptions,positioning,vision,penalties,marking,standing_tackle,sliding_tackle,gk_diving,gk_handling,gk_kicking,gk_positioning,gk_reflexes
0,1,218353,505942,2016-02-18 00:00:00,67.0,71.0,right,medium,medium,49.0,44.0,71.0,61.0,44.0,51.0,45.0,39.0,64.0,49.0,60.0,64.0,59.0,47.0,65.0,55.0,58.0,54.0,76.0,35.0,71.0,70.0,45.0,54.0,48.0,65.0,69.0,69.0,6.0,11.0,10.0,8.0,8.0
1,2,218353,505942,2015-11-19 00:00:00,67.0,71.0,right,medium,medium,49.0,44.0,71.0,61.0,44.0,51.0,45.0,39.0,64.0,49.0,60.0,64.0,59.0,47.0,65.0,55.0,58.0,54.0,76.0,35.0,71.0,70.0,45.0,54.0,48.0,65.0,69.0,69.0,6.0,11.0,10.0,8.0,8.0
2,3,218353,505942,2015-09-21 00:00:00,62.0,66.0,right,medium,medium,49.0,44.0,71.0,61.0,44.0,51.0,45.0,39.0,64.0,49.0,60.0,64.0,59.0,47.0,65.0,55.0,58.0,54.0,76.0,35.0,63.0,41.0,45.0,54.0,48.0,65.0,66.0,69.0,6.0,11.0,10.0,8.0,8.0
3,4,218353,505942,2015-03-20 00:00:00,61.0,65.0,right,medium,medium,48.0,43.0,70.0,60.0,43.0,50.0,44.0,38.0,63.0,48.0,60.0,64.0,59.0,46.0,65.0,54.0,58.0,54.0,76.0,34.0,62.0,40.0,44.0,53.0,47.0,62.0,63.0,66.0,5.0,10.0,9.0,7.0,7.0
4,5,218353,505942,2007-02-22 00:00:00,61.0,65.0,right,medium,medium,48.0,43.0,70.0,60.0,43.0,50.0,44.0,38.0,63.0,48.0,60.0,64.0,59.0,46.0,65.0,54.0,58.0,54.0,76.0,34.0,62.0,40.0,44.0,53.0,47.0,62.0,63.0,66.0,5.0,10.0,9.0,7.0,7.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
183973,183974,102359,39902,2009-08-30 00:00:00,83.0,85.0,right,medium,low,84.0,77.0,59.0,89.0,77.0,84.0,86.0,78.0,84.0,85.0,66.0,72.0,77.0,86.0,73.0,76.0,58.0,72.0,67.0,81.0,56.0,78.0,86.0,88.0,83.0,22.0,31.0,30.0,9.0,20.0,84.0,20.0,20.0
183974,183975,102359,39902,2009-02-22 00:00:00,78.0,80.0,right,medium,low,74.0,76.0,53.0,84.0,77.0,85.0,86.0,74.0,73.0,86.0,66.0,67.0,77.0,74.0,73.0,75.0,58.0,66.0,65.0,73.0,61.0,64.0,72.0,88.0,70.0,32.0,31.0,30.0,9.0,20.0,73.0,20.0,20.0
183975,183976,102359,39902,2008-08-30 00:00:00,77.0,80.0,right,medium,low,74.0,71.0,53.0,84.0,77.0,85.0,86.0,74.0,73.0,86.0,66.0,67.0,77.0,74.0,73.0,75.0,58.0,66.0,65.0,73.0,67.0,64.0,72.0,88.0,70.0,32.0,31.0,30.0,9.0,20.0,73.0,20.0,20.0
183976,183977,102359,39902,2007-08-30 00:00:00,78.0,81.0,right,medium,low,74.0,64.0,57.0,86.0,77.0,87.0,86.0,73.0,73.0,91.0,61.0,60.0,77.0,69.0,73.0,72.0,58.0,67.0,59.0,78.0,63.0,63.0,68.0,88.0,53.0,28.0,32.0,30.0,9.0,20.0,73.0,20.0,20.0


### Preprocessing

In [60]:
# Work on copies so the raw query results (players_df / stats_df) stay untouched
players, stats = players_df.copy(), stats_df.copy()

In [61]:
# Drop unused columns
# player_api_id stays in both frames because it is the key used for the merges below
players = players.drop(columns=['id', 'player_name', 'player_fifa_api_id'])
stats = stats.drop(columns=['id', 'player_fifa_api_id', 'date'])

In [62]:
## Players
players

Unnamed: 0,player_api_id,birthday,height,weight
0,505942,1992-02-29 00:00:00,182.88,187
1,155782,1989-12-15 00:00:00,170.18,146
2,162549,1991-05-13 00:00:00,170.18,163
3,30572,1982-05-08 00:00:00,182.88,198
4,23780,1979-11-08 00:00:00,182.88,154
...,...,...,...,...
11055,26357,1979-04-03 00:00:00,182.88,168
11056,111182,1986-12-18 00:00:00,182.88,176
11057,36491,1979-04-29 00:00:00,180.34,154
11058,35506,1981-10-06 00:00:00,185.42,172


In [63]:
# Extract birthday date features
# Parse the birthday strings once, then read year / month / day through the
# vectorised `.dt` accessor instead of calling a Python lambda for every row.
players['birthday'] = pd.to_datetime(players['birthday'])
players['birth_year'] = players['birthday'].dt.year
players['birth_month'] = players['birthday'].dt.month
players['birth_day'] = players['birthday'].dt.day

In [64]:
# The raw timestamp column is removed; the derived birth_* columns remain
players = players.drop(columns='birthday')
players

Unnamed: 0,player_api_id,height,weight,birth_year,birth_month,birth_day
0,505942,182.88,187,1992,2,29
1,155782,170.18,146,1989,12,15
2,162549,170.18,163,1991,5,13
3,30572,182.88,198,1982,5,8
4,23780,182.88,154,1979,11,8
...,...,...,...,...,...,...
11055,26357,182.88,168,1979,4,3
11056,111182,182.88,176,1986,12,18
11057,36491,180.34,154,1979,4,29
11058,35506,185.42,172,1981,10,6


In [65]:
## Stats
stats

Unnamed: 0,player_api_id,overall_rating,potential,preferred_foot,attacking_work_rate,defensive_work_rate,crossing,finishing,heading_accuracy,short_passing,volleys,dribbling,curve,free_kick_accuracy,long_passing,ball_control,acceleration,sprint_speed,agility,reactions,balance,shot_power,jumping,stamina,strength,long_shots,aggression,interceptions,positioning,vision,penalties,marking,standing_tackle,sliding_tackle,gk_diving,gk_handling,gk_kicking,gk_positioning,gk_reflexes
0,505942,67.0,71.0,right,medium,medium,49.0,44.0,71.0,61.0,44.0,51.0,45.0,39.0,64.0,49.0,60.0,64.0,59.0,47.0,65.0,55.0,58.0,54.0,76.0,35.0,71.0,70.0,45.0,54.0,48.0,65.0,69.0,69.0,6.0,11.0,10.0,8.0,8.0
1,505942,67.0,71.0,right,medium,medium,49.0,44.0,71.0,61.0,44.0,51.0,45.0,39.0,64.0,49.0,60.0,64.0,59.0,47.0,65.0,55.0,58.0,54.0,76.0,35.0,71.0,70.0,45.0,54.0,48.0,65.0,69.0,69.0,6.0,11.0,10.0,8.0,8.0
2,505942,62.0,66.0,right,medium,medium,49.0,44.0,71.0,61.0,44.0,51.0,45.0,39.0,64.0,49.0,60.0,64.0,59.0,47.0,65.0,55.0,58.0,54.0,76.0,35.0,63.0,41.0,45.0,54.0,48.0,65.0,66.0,69.0,6.0,11.0,10.0,8.0,8.0
3,505942,61.0,65.0,right,medium,medium,48.0,43.0,70.0,60.0,43.0,50.0,44.0,38.0,63.0,48.0,60.0,64.0,59.0,46.0,65.0,54.0,58.0,54.0,76.0,34.0,62.0,40.0,44.0,53.0,47.0,62.0,63.0,66.0,5.0,10.0,9.0,7.0,7.0
4,505942,61.0,65.0,right,medium,medium,48.0,43.0,70.0,60.0,43.0,50.0,44.0,38.0,63.0,48.0,60.0,64.0,59.0,46.0,65.0,54.0,58.0,54.0,76.0,34.0,62.0,40.0,44.0,53.0,47.0,62.0,63.0,66.0,5.0,10.0,9.0,7.0,7.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
183973,39902,83.0,85.0,right,medium,low,84.0,77.0,59.0,89.0,77.0,84.0,86.0,78.0,84.0,85.0,66.0,72.0,77.0,86.0,73.0,76.0,58.0,72.0,67.0,81.0,56.0,78.0,86.0,88.0,83.0,22.0,31.0,30.0,9.0,20.0,84.0,20.0,20.0
183974,39902,78.0,80.0,right,medium,low,74.0,76.0,53.0,84.0,77.0,85.0,86.0,74.0,73.0,86.0,66.0,67.0,77.0,74.0,73.0,75.0,58.0,66.0,65.0,73.0,61.0,64.0,72.0,88.0,70.0,32.0,31.0,30.0,9.0,20.0,73.0,20.0,20.0
183975,39902,77.0,80.0,right,medium,low,74.0,71.0,53.0,84.0,77.0,85.0,86.0,74.0,73.0,86.0,66.0,67.0,77.0,74.0,73.0,75.0,58.0,66.0,65.0,73.0,67.0,64.0,72.0,88.0,70.0,32.0,31.0,30.0,9.0,20.0,73.0,20.0,20.0
183976,39902,78.0,81.0,right,medium,low,74.0,64.0,57.0,86.0,77.0,87.0,86.0,73.0,73.0,91.0,61.0,60.0,77.0,69.0,73.0,72.0,58.0,67.0,59.0,78.0,63.0,63.0,68.0,88.0,53.0,28.0,32.0,30.0,9.0,20.0,73.0,20.0,20.0


In [66]:
stats[['preferred_foot', 'attacking_work_rate', 'defensive_work_rate']]

Unnamed: 0,preferred_foot,attacking_work_rate,defensive_work_rate
0,right,medium,medium
1,right,medium,medium
2,right,medium,medium
3,right,medium,medium
4,right,medium,medium
...,...,...,...
183973,right,medium,low
183974,right,medium,low
183975,right,medium,low
183976,right,medium,low


In [67]:
# Label frequencies for each categorical column of the per-snapshot stats
categorical_columns = ['preferred_foot', 'attacking_work_rate', 'defensive_work_rate']
{name: stats[name].value_counts() for name in categorical_columns}

{'preferred_foot': preferred_foot
 right    138409
 left      44733
 Name: count, dtype: int64,
 'attacking_work_rate': attacking_work_rate
 medium    125070
 high       42823
 low         8569
 None        3639
 norm         348
 y            106
 le           104
 stoc          89
 Name: count, dtype: int64,
 'defensive_work_rate': defensive_work_rate
 medium    130846
 high       27041
 low        18432
 _0          2394
 o           1550
 1            441
 ormal        348
 2            342
 3            258
 5            234
 7            217
 0            197
 6            197
 9            152
 4            116
 es           106
 ean          104
 tocky         89
 8             78
 Name: count, dtype: int64}

In [68]:
# Get categorical stats
# For every player keep the first row (in the frame's current order) of the
# join key plus the three categorical columns.
key_and_categoricals = [
    'player_api_id',
    'preferred_foot',
    'attacking_work_rate',
    'defensive_work_rate',
]
grouped_by_player = stats.groupby(by='player_api_id', as_index=False)
categoricals = grouped_by_player[key_and_categoricals].head(1)

In [69]:
categoricals

Unnamed: 0,player_api_id,preferred_foot,attacking_work_rate,defensive_work_rate
0,505942,right,medium,medium
5,155782,left,high,medium
38,162549,right,medium,medium
64,30572,right,medium,medium
87,23780,right,medium,medium
...,...,...,...,...
183924,26357,right,low,medium
183937,111182,left,high,medium
183953,36491,left,,_0
183960,35506,right,,_0


In [70]:
# Clean categorical columns
# Any work-rate label other than low / medium / high (including missing values)
# is treated as missing and replaced by the column's most frequent valid label.
# `np.nan` is used because the `np.NaN` alias no longer exists in NumPy 2.x.
valid_work_rates = ['low', 'medium', 'high']
for column in ['attacking_work_rate', 'defensive_work_rate']:
    # Keep valid labels, blank out everything else
    cleaned = categoricals[column].where(categoricals[column].isin(valid_work_rates), np.nan)
    # mode() ignores NaN, so the fill value is always one of the valid labels
    categoricals[column] = cleaned.fillna(cleaned.mode()[0])

In [71]:
categoricals

Unnamed: 0,player_api_id,preferred_foot,attacking_work_rate,defensive_work_rate
0,505942,right,medium,medium
5,155782,left,high,medium
38,162549,right,medium,medium
64,30572,right,medium,medium
87,23780,right,medium,medium
...,...,...,...,...
183924,26357,right,low,medium
183937,111182,left,high,medium
183953,36491,left,medium,medium
183960,35506,right,medium,medium


In [72]:
categoricals.isna().sum()

player_api_id          0
preferred_foot         0
attacking_work_rate    0
defensive_work_rate    0
dtype: int64

In [73]:
# Label frequencies for each categorical column of the per-player frame
categorical_columns = ['preferred_foot', 'attacking_work_rate', 'defensive_work_rate']
{name: categoricals[name].value_counts() for name in categorical_columns}

{'preferred_foot': preferred_foot
 right    8373
 left     2687
 Name: count, dtype: int64,
 'attacking_work_rate': attacking_work_rate
 medium    8120
 high      2375
 low        565
 Name: count, dtype: int64,
 'defensive_work_rate': defensive_work_rate
 medium    8464
 high      1555
 low       1041
 Name: count, dtype: int64}

In [74]:
# Take the average numeric stats within groups and merge with categorical columns
# Step 1: one row per player holding the mean of every numeric column
numeric_means = stats.groupby(by="player_api_id").mean(numeric_only=True)
# Step 2: attach the per-player categorical labels via the shared player_api_id key
stats = numeric_means.merge(categoricals, on="player_api_id")

In [75]:
stats

Unnamed: 0,player_api_id,overall_rating,potential,crossing,finishing,heading_accuracy,short_passing,volleys,dribbling,curve,free_kick_accuracy,long_passing,ball_control,acceleration,sprint_speed,agility,reactions,balance,shot_power,jumping,stamina,strength,long_shots,aggression,interceptions,positioning,vision,penalties,marking,standing_tackle,sliding_tackle,gk_diving,gk_handling,gk_kicking,gk_positioning,gk_reflexes,preferred_foot,attacking_work_rate,defensive_work_rate
0,2625,60.142857,61.142857,50.142857,47.285714,46.285714,56.500000,38.000000,54.142857,50.000000,49.928571,65.571429,59.142857,66.857143,63.285714,66.428571,49.714286,66.214286,66.714286,58.714286,77.285714,55.785714,56.857143,71.285714,62.071429,50.357143,55.285714,61.928571,62.928571,63.857143,56.571429,12.428571,12.857143,19.071429,10.357143,10.428571,right,medium,medium
1,2752,69.380952,70.380952,36.428571,37.428571,75.333333,54.809524,20.428571,44.857143,30.428571,18.857143,59.571429,55.571429,41.619048,42.190476,43.333333,60.857143,44.571429,57.428571,56.523810,60.904762,82.238095,19.571429,80.285714,71.428571,31.428571,52.428571,35.428571,71.095238,70.666667,65.571429,11.095238,6.095238,7.095238,9.095238,15.095238,right,medium,medium
2,2768,69.285714,70.571429,42.238095,42.761905,66.666667,64.714286,28.761905,40.238095,51.761905,20.047619,57.000000,56.380952,56.952381,58.904762,58.047619,69.809524,60.190476,38.761905,76.904762,59.285714,67.428571,19.428571,62.761905,74.714286,44.190476,57.761905,31.904762,71.666667,70.619048,79.714286,10.761905,15.904762,22.714286,15.142857,12.095238,left,medium,medium
3,2770,71.133333,73.533333,61.866667,67.666667,67.800000,72.533333,67.133333,75.066667,79.800000,68.666667,70.266667,76.733333,62.200000,59.800000,62.600000,65.933333,61.266667,68.333333,57.133333,51.000000,62.600000,73.000000,56.800000,44.933333,64.533333,71.600000,63.933333,30.200000,37.533333,25.000000,8.666667,15.666667,27.200000,16.333333,17.000000,right,medium,low
4,2790,70.200000,75.800000,70.000000,37.400000,55.200000,68.000000,43.000000,60.800000,67.000000,55.400000,69.800000,66.800000,70.800000,71.200000,65.000000,69.200000,67.000000,46.400000,61.000000,70.400000,71.000000,42.400000,68.400000,63.400000,60.600000,60.000000,55.400000,74.600000,74.600000,69.000000,8.000000,17.000000,59.000000,16.600000,17.400000,left,medium,medium
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11055,744907,51.909091,65.909091,42.454545,44.454545,47.454545,59.454545,42.454545,56.454545,46.454545,43.454545,58.454545,55.454545,66.000000,64.000000,61.000000,53.454545,74.000000,50.454545,63.000000,54.000000,49.909091,38.454545,36.454545,22.454545,48.454545,51.454545,52.454545,39.454545,48.454545,49.454545,8.454545,13.454545,12.454545,11.454545,8.454545,left,medium,medium
11056,746419,59.000000,66.000000,55.000000,28.000000,50.000000,35.000000,29.000000,67.000000,37.000000,39.000000,35.000000,55.000000,77.000000,76.000000,70.000000,60.000000,74.000000,29.000000,55.000000,63.000000,46.000000,27.000000,59.000000,53.000000,55.000000,40.000000,43.000000,55.000000,69.000000,62.000000,7.000000,10.000000,7.000000,9.000000,9.000000,right,high,medium
11057,748432,58.000000,68.000000,48.000000,26.000000,57.000000,49.000000,24.000000,41.000000,39.000000,20.000000,41.000000,42.000000,64.000000,68.000000,50.000000,48.000000,52.000000,46.000000,68.000000,50.000000,67.000000,25.000000,52.000000,55.000000,38.000000,36.000000,45.000000,63.000000,69.000000,68.000000,8.000000,8.000000,12.000000,12.000000,6.000000,right,medium,medium
11058,750435,56.444444,70.444444,35.000000,57.000000,55.000000,60.000000,52.000000,58.555556,56.000000,37.000000,54.000000,56.333333,65.666667,63.555556,60.222222,61.000000,78.000000,57.000000,58.000000,57.000000,46.000000,50.000000,34.000000,13.000000,56.333333,58.000000,61.000000,18.000000,19.000000,21.000000,9.000000,10.000000,8.000000,10.000000,11.000000,right,medium,low


In [76]:
stats.isna().sum()

player_api_id            0
overall_rating           0
potential                0
crossing                 0
finishing                0
heading_accuracy         0
short_passing            0
volleys                478
dribbling                0
curve                  478
free_kick_accuracy       0
long_passing             0
ball_control             0
acceleration             0
sprint_speed             0
agility                478
reactions                0
balance                478
shot_power               0
jumping                478
stamina                  0
strength                 0
long_shots               0
aggression               0
interceptions            0
positioning              0
vision                 478
penalties                0
marking                  0
standing_tackle          0
sliding_tackle         478
gk_diving                0
gk_handling              0
gk_kicking               0
gk_positioning           0
gk_reflexes              0
preferred_foot           0
attacking_work_rate      0
defensive_work_rate      0
dtype: int64

In [77]:
# Show every player row that has at least one missing value
stats[stats.isna().any(axis=1)]

Unnamed: 0,player_api_id,overall_rating,potential,crossing,finishing,heading_accuracy,short_passing,volleys,dribbling,curve,free_kick_accuracy,long_passing,ball_control,acceleration,sprint_speed,agility,reactions,balance,shot_power,jumping,stamina,strength,long_shots,aggression,interceptions,positioning,vision,penalties,marking,standing_tackle,sliding_tackle,gk_diving,gk_handling,gk_kicking,gk_positioning,gk_reflexes,preferred_foot,attacking_work_rate,defensive_work_rate
17,3263,59.666667,61.333333,47.0,44.000000,49.333333,63.0,,44.333333,,52.666667,42.333333,50.666667,62.333333,70.000000,,62.0,,48.000000,,76.000000,67.000000,31.000000,67.0,58.333333,54.333333,,54.000000,61.666667,61.333333,,6.000000,20.000000,42.333333,20.000000,20.000000,left,medium,medium
26,4747,69.400000,73.000000,23.4,27.000000,72.000000,51.8,,21.800000,,22.000000,48.000000,62.600000,41.600000,41.600000,,53.4,,41.200000,,70.400000,80.400000,32.400000,66.8,57.200000,57.200000,,55.200000,72.200000,72.000000,,11.000000,23.000000,48.000000,23.000000,23.000000,right,medium,medium
35,5184,62.000000,67.000000,57.0,62.000000,67.000000,58.0,,61.000000,,55.000000,40.000000,57.000000,63.000000,65.000000,,51.0,,71.000000,,64.000000,66.000000,61.000000,38.0,58.000000,47.000000,,42.000000,20.000000,20.000000,,8.000000,20.000000,40.000000,20.000000,20.000000,right,medium,medium
44,5362,60.000000,66.000000,55.0,59.000000,57.000000,58.0,,71.000000,,56.000000,53.000000,66.000000,70.000000,71.000000,,61.0,,62.000000,,60.000000,54.000000,54.000000,57.0,54.000000,55.000000,,54.000000,49.000000,56.000000,,2.000000,22.000000,53.000000,22.000000,22.000000,right,medium,medium
54,5700,64.333333,69.000000,59.0,65.000000,51.000000,58.0,,64.000000,,48.000000,51.666667,65.000000,71.333333,73.000000,,66.0,,67.000000,,65.666667,65.666667,62.666667,61.0,61.000000,57.666667,,52.666667,30.000000,33.000000,,14.000000,21.000000,51.666667,21.000000,21.000000,right,medium,medium
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7495,181356,64.000000,68.000000,57.0,68.000000,56.000000,65.0,,63.000000,,57.000000,66.000000,56.000000,67.000000,68.000000,,69.0,,63.000000,,64.000000,67.000000,68.000000,59.0,63.000000,68.000000,,69.000000,47.000000,50.000000,,1.000000,23.000000,66.000000,23.000000,23.000000,right,medium,medium
7500,181376,63.333333,68.000000,54.0,59.333333,60.000000,52.0,,63.000000,,42.000000,45.000000,63.000000,73.000000,77.000000,,79.0,,58.000000,,59.000000,58.000000,66.000000,67.0,53.000000,45.000000,,48.000000,23.000000,22.000000,,6.000000,23.000000,45.000000,23.000000,23.000000,right,medium,medium
7621,184621,61.000000,65.666667,22.0,23.000000,23.000000,23.0,,23.000000,,22.000000,60.333333,23.000000,44.666667,39.333333,,32.0,,23.000000,,48.333333,36.000000,23.000000,37.0,54.000000,41.000000,,49.000000,23.000000,23.000000,,64.333333,61.666667,60.333333,61.666667,63.333333,right,medium,medium
7624,184644,56.666667,66.666667,36.0,59.333333,55.000000,53.0,,60.333333,,40.000000,37.000000,64.000000,68.333333,65.333333,,52.0,,50.333333,,53.000000,51.666667,47.333333,43.0,53.000000,57.000000,,52.000000,28.000000,27.000000,,10.000000,23.000000,37.000000,23.000000,23.000000,right,medium,medium


In [78]:
# Names of the columns that contain at least one missing value
stats.columns[stats.isna().any()]

Index(['volleys', 'curve', 'agility', 'balance', 'jumping', 'vision',
       'sliding_tackle'],
      dtype='object')

In [79]:
# Fill numeric missing values with column means
# Note: each mean is computed over all players, i.e. before the train/test split below
incomplete_columns = stats.columns[stats.isna().any()]
for name in incomplete_columns:
    column_mean = stats[name].mean()
    stats[name] = stats[name].fillna(column_mean)

In [80]:
stats.isna().sum()

player_api_id          0
overall_rating         0
potential              0
crossing               0
finishing              0
heading_accuracy       0
short_passing          0
volleys                0
dribbling              0
curve                  0
free_kick_accuracy     0
long_passing           0
ball_control           0
acceleration           0
sprint_speed           0
agility                0
reactions              0
balance                0
shot_power             0
jumping                0
stamina                0
strength               0
long_shots             0
aggression             0
interceptions          0
positioning            0
vision                 0
penalties              0
marking                0
standing_tackle        0
sliding_tackle         0
gk_diving              0
gk_handling            0
gk_kicking             0
gk_positioning         0
gk_reflexes            0
preferred_foot         0
attacking_work_rate    0
defensive_work_rate    0
dtype: int64

In [81]:
players.isna().sum()

player_api_id    0
height           0
weight           0
birth_year       0
birth_month      0
birth_day        0
dtype: int64

In [82]:
## Final Merge
# Combine the player attributes with the aggregated stats into one modelling frame,
# matching rows on player_api_id
df = pd.merge(players, stats, on='player_api_id')
df

Unnamed: 0,player_api_id,height,weight,birth_year,birth_month,birth_day,overall_rating,potential,crossing,finishing,heading_accuracy,short_passing,volleys,dribbling,curve,free_kick_accuracy,long_passing,ball_control,acceleration,sprint_speed,agility,reactions,balance,shot_power,jumping,stamina,strength,long_shots,aggression,interceptions,positioning,vision,penalties,marking,standing_tackle,sliding_tackle,gk_diving,gk_handling,gk_kicking,gk_positioning,gk_reflexes,preferred_foot,attacking_work_rate,defensive_work_rate
0,505942,182.88,187,1992,2,29,63.600000,67.600000,48.600000,43.600000,70.600000,60.600000,43.600000,50.600000,44.600000,38.600000,63.600000,48.600000,60.000000,64.000000,59.000000,46.600000,65.000000,54.600000,58.000000,54.000000,76.000000,34.600000,65.800000,52.200000,44.600000,53.600000,47.600000,63.800000,66.000000,67.800000,5.600000,10.600000,9.600000,7.600000,7.600000,right,medium,medium
1,155782,170.18,146,1989,12,15,66.969697,74.484848,70.787879,49.454545,52.939394,62.272727,29.151515,61.090909,61.878788,62.121212,63.242424,61.787879,76.000000,74.939394,75.242424,67.848485,84.727273,65.909091,75.303030,72.878788,51.757576,54.121212,65.060606,57.878788,51.484848,57.454545,53.121212,69.393939,68.787879,71.515152,12.181818,8.666667,14.242424,10.363636,12.909091,left,high,medium
2,162549,170.18,163,1991,5,13,67.000000,74.192308,68.115385,57.923077,58.692308,65.115385,54.269231,69.038462,60.192308,55.615385,60.461538,68.615385,75.538462,77.500000,77.615385,50.346154,80.730769,62.807692,67.307692,70.923077,70.076923,58.038462,59.115385,47.269231,61.807692,69.384615,60.538462,22.038462,21.115385,21.346154,14.038462,11.807692,17.730769,10.115385,13.500000,right,medium,medium
3,30572,182.88,198,1982,5,8,69.086957,70.782609,57.217391,26.260870,69.260870,64.695652,47.782609,55.565217,37.782609,40.391304,60.826087,63.173913,50.217391,52.826087,62.826087,58.217391,44.086957,64.869565,70.782609,55.434783,78.695652,31.304348,69.304348,61.565217,35.913043,53.782609,41.739130,70.608696,70.652174,68.043478,14.173913,11.173913,22.869565,11.173913,10.173913,right,medium,medium
4,23780,182.88,154,1979,11,8,73.240000,74.680000,45.080000,38.840000,73.040000,64.760000,32.080000,50.600000,45.480000,26.360000,56.840000,60.160000,51.120000,54.080000,50.760000,69.400000,64.160000,47.080000,77.840000,66.480000,73.800000,24.280000,69.680000,76.840000,40.080000,46.480000,52.960000,77.600000,76.040000,74.600000,8.280000,8.320000,24.920000,12.840000,11.920000,right,medium,medium
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11055,26357,182.88,168,1979,4,3,74.384615,75.461538,42.000000,27.000000,75.153846,70.000000,30.000000,56.384615,49.000000,35.692308,54.538462,66.846154,61.153846,64.000000,65.846154,68.769231,69.076923,57.000000,80.461538,60.000000,74.846154,30.000000,74.923077,66.538462,37.384615,60.076923,55.538462,78.769231,76.307692,72.692308,12.846154,12.769231,25.461538,11.384615,13.615385,right,low,medium
11056,111182,182.88,176,1986,12,18,65.687500,71.625000,67.250000,46.750000,60.312500,63.125000,54.562500,61.187500,63.812500,61.750000,62.312500,64.625000,71.625000,68.875000,62.437500,61.500000,59.875000,40.312500,67.875000,76.812500,71.375000,37.875000,66.250000,66.062500,59.625000,44.687500,51.875000,63.250000,65.250000,63.875000,9.000000,8.000000,5.000000,5.000000,8.000000,left,high,medium
11057,36491,180.34,154,1979,4,29,67.571429,72.857143,63.142857,44.571429,59.857143,66.857143,57.000000,66.857143,60.000000,52.571429,58.285714,68.142857,71.000000,72.142857,59.000000,69.142857,72.000000,60.714286,42.000000,70.000000,66.571429,56.714286,71.000000,68.285714,61.571429,66.000000,61.000000,61.000000,64.285714,59.000000,7.142857,18.714286,42.428571,16.714286,16.714286,left,medium,medium
11058,35506,185.42,172,1981,10,6,70.750000,78.125000,46.750000,43.000000,79.000000,58.875000,59.000000,36.000000,29.000000,41.375000,50.625000,62.000000,65.000000,61.500000,56.000000,62.750000,53.000000,50.000000,68.000000,70.500000,68.000000,46.000000,71.000000,72.250000,56.500000,63.000000,64.000000,70.750000,73.250000,65.000000,14.000000,18.500000,41.625000,19.000000,20.000000,right,medium,medium


In [83]:
# The join key is an identifier, not a feature, so it is removed after the merge
df = df.drop(columns='player_api_id')
df

Unnamed: 0,height,weight,birth_year,birth_month,birth_day,overall_rating,potential,crossing,finishing,heading_accuracy,short_passing,volleys,dribbling,curve,free_kick_accuracy,long_passing,ball_control,acceleration,sprint_speed,agility,reactions,balance,shot_power,jumping,stamina,strength,long_shots,aggression,interceptions,positioning,vision,penalties,marking,standing_tackle,sliding_tackle,gk_diving,gk_handling,gk_kicking,gk_positioning,gk_reflexes,preferred_foot,attacking_work_rate,defensive_work_rate
0,182.88,187,1992,2,29,63.600000,67.600000,48.600000,43.600000,70.600000,60.600000,43.600000,50.600000,44.600000,38.600000,63.600000,48.600000,60.000000,64.000000,59.000000,46.600000,65.000000,54.600000,58.000000,54.000000,76.000000,34.600000,65.800000,52.200000,44.600000,53.600000,47.600000,63.800000,66.000000,67.800000,5.600000,10.600000,9.600000,7.600000,7.600000,right,medium,medium
1,170.18,146,1989,12,15,66.969697,74.484848,70.787879,49.454545,52.939394,62.272727,29.151515,61.090909,61.878788,62.121212,63.242424,61.787879,76.000000,74.939394,75.242424,67.848485,84.727273,65.909091,75.303030,72.878788,51.757576,54.121212,65.060606,57.878788,51.484848,57.454545,53.121212,69.393939,68.787879,71.515152,12.181818,8.666667,14.242424,10.363636,12.909091,left,high,medium
2,170.18,163,1991,5,13,67.000000,74.192308,68.115385,57.923077,58.692308,65.115385,54.269231,69.038462,60.192308,55.615385,60.461538,68.615385,75.538462,77.500000,77.615385,50.346154,80.730769,62.807692,67.307692,70.923077,70.076923,58.038462,59.115385,47.269231,61.807692,69.384615,60.538462,22.038462,21.115385,21.346154,14.038462,11.807692,17.730769,10.115385,13.500000,right,medium,medium
3,182.88,198,1982,5,8,69.086957,70.782609,57.217391,26.260870,69.260870,64.695652,47.782609,55.565217,37.782609,40.391304,60.826087,63.173913,50.217391,52.826087,62.826087,58.217391,44.086957,64.869565,70.782609,55.434783,78.695652,31.304348,69.304348,61.565217,35.913043,53.782609,41.739130,70.608696,70.652174,68.043478,14.173913,11.173913,22.869565,11.173913,10.173913,right,medium,medium
4,182.88,154,1979,11,8,73.240000,74.680000,45.080000,38.840000,73.040000,64.760000,32.080000,50.600000,45.480000,26.360000,56.840000,60.160000,51.120000,54.080000,50.760000,69.400000,64.160000,47.080000,77.840000,66.480000,73.800000,24.280000,69.680000,76.840000,40.080000,46.480000,52.960000,77.600000,76.040000,74.600000,8.280000,8.320000,24.920000,12.840000,11.920000,right,medium,medium
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11055,182.88,168,1979,4,3,74.384615,75.461538,42.000000,27.000000,75.153846,70.000000,30.000000,56.384615,49.000000,35.692308,54.538462,66.846154,61.153846,64.000000,65.846154,68.769231,69.076923,57.000000,80.461538,60.000000,74.846154,30.000000,74.923077,66.538462,37.384615,60.076923,55.538462,78.769231,76.307692,72.692308,12.846154,12.769231,25.461538,11.384615,13.615385,right,low,medium
11056,182.88,176,1986,12,18,65.687500,71.625000,67.250000,46.750000,60.312500,63.125000,54.562500,61.187500,63.812500,61.750000,62.312500,64.625000,71.625000,68.875000,62.437500,61.500000,59.875000,40.312500,67.875000,76.812500,71.375000,37.875000,66.250000,66.062500,59.625000,44.687500,51.875000,63.250000,65.250000,63.875000,9.000000,8.000000,5.000000,5.000000,8.000000,left,high,medium
11057,180.34,154,1979,4,29,67.571429,72.857143,63.142857,44.571429,59.857143,66.857143,57.000000,66.857143,60.000000,52.571429,58.285714,68.142857,71.000000,72.142857,59.000000,69.142857,72.000000,60.714286,42.000000,70.000000,66.571429,56.714286,71.000000,68.285714,61.571429,66.000000,61.000000,61.000000,64.285714,59.000000,7.142857,18.714286,42.428571,16.714286,16.714286,left,medium,medium
11058,185.42,172,1981,10,6,70.750000,78.125000,46.750000,43.000000,79.000000,58.875000,59.000000,36.000000,29.000000,41.375000,50.625000,62.000000,65.000000,61.500000,56.000000,62.750000,53.000000,50.000000,68.000000,70.500000,68.000000,46.000000,71.000000,72.250000,56.500000,63.000000,64.000000,70.750000,73.250000,65.000000,14.000000,18.500000,41.625000,19.000000,20.000000,right,medium,medium


In [88]:
def one_hot_encode(df, column):
    """Return a new frame in which `column` is replaced by indicator columns.

    The indicator columns are built with ``pd.get_dummies`` using `column`
    as the prefix, and are appended after the remaining columns of `df`.
    The frame passed in is left untouched.
    """
    indicators = pd.get_dummies(df[column], prefix=column)
    remaining = df.drop(columns=column)
    return pd.concat([remaining, indicators], axis=1)

In [89]:
# Binary encoding: 'left' becomes 0 and 'right' becomes 1
foot_to_code = {'left': 0, 'right': 1}
df['preferred_foot'] = df['preferred_foot'].replace(foot_to_code)

In [91]:
# One-hot encode the two work-rate columns, one after the other
work_rate_columns = ('attacking_work_rate', 'defensive_work_rate')
for column in work_rate_columns:
    df = one_hot_encode(df, column)

In [92]:
# Separate the features (X) from the regression target (y)
X = df.drop(columns='overall_rating')
y = df['overall_rating']

In [93]:
# Shuffle the rows and keep 70% of them for training; the seed is fixed
# so the same partition is produced on every run
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    shuffle=True,
    random_state=1,
)

In [94]:
# Standardize the features: the scaler is fitted on the training rows only
# and then applied to both partitions, keeping each frame's index and columns
scaler = StandardScaler().fit(X_train)
X_train, X_test = (
    pd.DataFrame(scaler.transform(part), index=part.index, columns=part.columns)
    for part in (X_train, X_test)
)

In [95]:
# Display the scaled training features (the bare last expression renders the frame)
X_train

Unnamed: 0,height,weight,birth_year,birth_month,birth_day,potential,crossing,finishing,heading_accuracy,short_passing,volleys,dribbling,curve,free_kick_accuracy,long_passing,ball_control,acceleration,sprint_speed,agility,reactions,balance,shot_power,jumping,stamina,strength,long_shots,aggression,interceptions,positioning,vision,penalties,marking,standing_tackle,sliding_tackle,gk_diving,gk_handling,gk_kicking,gk_positioning,gk_reflexes,preferred_foot,attacking_work_rate_high,attacking_work_rate_low,attacking_work_rate_medium,defensive_work_rate_high,defensive_work_rate_low,defensive_work_rate_medium
5429,0.149907,-0.033384,0.076129,0.306169,1.310542,0.035981,-2.226408,-1.761449,-2.491015,-2.570139,-2.019655,-2.361242,-1.987480,-2.064442,-1.257468,-3.043471,-1.604453,-1.103804,0.307301,-0.517291,-1.035053,-2.501167,1.511246,-1.322517,-0.427244,-1.856978,-2.301607,-1.447759,-2.252874,-1.740026,-1.814019,-1.463532,-1.634102,-1.652956,3.655153,2.988354,2.500292,2.649907,3.479833,0.564172,-0.520873,-0.235574,0.601974,-0.40871,-0.323411,0.55860
5874,0.944722,1.230136,0.259888,-0.858476,-0.392766,1.308621,0.061943,1.711121,1.384466,0.416901,1.667145,1.056062,1.078016,1.002833,0.223666,1.109720,-0.404076,0.034740,-0.366976,1.213786,-0.563950,1.248801,0.958543,0.243247,1.347404,1.139239,0.195791,-1.313082,1.389715,0.816145,0.527774,-1.092226,-1.088564,-1.470356,-0.217026,-0.483761,-0.231833,-0.429216,-0.042825,-1.772509,-0.520873,-0.235574,0.601974,-0.40871,3.092040,-1.79019
755,0.149907,-0.033384,0.443648,-0.276154,-0.392766,-1.309262,-0.887226,0.917262,0.146312,-0.619679,0.746313,-0.613255,-0.498342,-0.369296,-0.533978,-0.292745,0.296813,0.319544,0.178120,-0.590706,0.069568,0.594420,-0.442540,-0.605161,0.319088,-0.201712,0.655742,-0.456045,0.211569,0.177765,0.922862,-0.059576,-0.289261,-0.205148,-0.418178,-0.216831,-0.590616,-0.445620,-0.544244,0.564172,1.919853,-0.235574,-1.661202,-0.40871,-0.323411,0.55860
6074,0.944722,0.831130,0.259888,-0.276154,-0.165659,-1.265387,-0.887784,-1.025011,0.557556,-0.269012,-1.176538,-0.106941,-1.221106,-0.576358,0.435075,-0.403009,-1.116889,-0.823979,-0.662321,-0.691227,-0.362264,-0.690687,-0.359442,-0.643871,0.422030,-0.317549,0.039703,0.555328,-1.664524,-0.624738,-1.081311,0.652664,0.605221,0.709280,-0.533403,-0.242982,-0.798593,-0.557207,-0.245866,-1.772509,-0.520873,-0.235574,0.601974,-0.40871,-0.323411,0.55860
2487,-1.042316,-0.764895,1.362444,1.761975,-0.052105,0.320386,0.377154,0.674039,-1.586912,0.409526,1.173613,0.894970,1.434878,-0.819310,0.205837,0.605477,0.918904,0.472983,1.143182,-1.262731,1.227143,0.490061,-0.316926,-0.129343,-0.801655,0.466579,-2.047408,-1.720908,0.426057,0.658207,-0.693406,-1.550363,-1.394000,-1.343114,-0.002833,-0.370829,-0.516155,-0.305421,-0.186728,0.564172,-0.520873,-0.235574,0.601974,-0.40871,-0.323411,0.55860
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7813,0.149907,-0.232887,-0.291389,0.306169,0.175003,-2.092746,-2.140441,-1.863503,0.506799,-0.549277,-0.236478,-0.578429,-1.841225,1.063571,-0.634704,-0.348497,-1.219534,-1.214845,-1.698814,-0.890011,-1.752995,-0.690687,0.533387,-0.661614,0.294182,-0.974833,-0.174359,0.327705,-2.660039,-0.581002,-0.765241,0.649356,0.566020,0.292853,-0.238642,-0.626523,-0.798593,-0.305421,-0.659832,0.564172,-0.520873,-0.235574,0.601974,-0.40871,-0.323411,0.55860
10955,-0.247501,-1.230403,1.362444,0.015008,0.515665,0.369634,0.245573,0.256369,-0.852754,0.672933,0.560895,1.122295,0.298270,-0.237960,0.478739,0.985119,0.369020,0.143032,0.271478,-1.653199,-0.308080,0.396351,-2.336419,-1.371309,-1.623533,-0.027619,-1.808499,-1.135590,0.298062,0.772756,0.712490,-0.876979,-0.777994,-0.769464,-0.263907,-0.526072,-0.483876,-0.080613,-0.507763,-1.772509,-0.520873,-0.235574,0.601974,-0.40871,-0.323411,0.55860
905,-0.644908,-0.232887,1.729962,0.015008,0.175003,0.010126,1.175416,-1.036043,0.316462,-0.180507,-1.059031,-0.106941,-1.080701,-1.123001,0.282249,-0.116817,-0.107546,-0.060015,-0.528580,-1.262731,0.865914,-2.002629,-0.316926,0.349702,-1.057350,-1.205460,0.962848,0.612234,-1.067215,-0.508107,-1.196246,0.980141,0.762022,0.639876,-0.356546,-0.306906,-0.798593,-0.242475,-0.068452,0.564172,-0.520873,-0.235574,0.601974,-0.40871,-0.323411,0.55860
5192,-1.042316,-1.695910,-1.577704,1.470814,-0.165659,0.772848,1.198443,1.294633,0.736790,0.455622,1.100171,1.130714,1.317874,0.934503,0.721623,0.784347,0.427063,0.939356,0.735690,0.492159,-0.127465,0.834446,-0.104348,0.802132,0.088713,0.985488,0.043048,0.726046,0.903904,0.803996,0.366147,-1.258859,-1.473626,-0.946516,-0.282856,-0.003269,1.545649,0.127334,0.035040,-1.772509,-0.520873,-0.235574,0.601974,-0.40871,-0.323411,0.55860


In [96]:
# Display the scaled test features (the bare last expression renders the frame)
X_test

Unnamed: 0,height,weight,birth_year,birth_month,birth_day,potential,crossing,finishing,heading_accuracy,short_passing,volleys,dribbling,curve,free_kick_accuracy,long_passing,ball_control,acceleration,sprint_speed,agility,reactions,balance,shot_power,jumping,stamina,strength,long_shots,aggression,interceptions,positioning,vision,penalties,marking,standing_tackle,sliding_tackle,gk_diving,gk_handling,gk_kicking,gk_positioning,gk_reflexes,preferred_foot,attacking_work_rate_high,attacking_work_rate_low,attacking_work_rate_medium,defensive_work_rate_high,defensive_work_rate_low,defensive_work_rate_medium
9853,1.739537,1.562641,0.994925,0.015008,0.742772,0.738991,-1.973771,-1.516758,-2.157924,-2.509029,-1.596208,-2.203378,-1.665720,-1.548168,-2.545023,-2.363522,-1.928274,-2.065104,-2.152578,0.458880,-2.204531,-2.564890,-0.726898,-2.892084,-0.201553,-1.716131,-2.085633,-1.834719,-2.159437,-1.820210,-2.027469,-1.203039,-1.429000,-1.328950,3.298492,3.482849,2.284025,3.111663,3.192591,0.564172,-0.520873,-0.235574,0.601974,-0.408710,-0.323411,0.55860
6847,-1.042316,-0.033384,-1.210186,-1.149637,-0.506320,0.545789,0.797541,0.996536,-0.117898,1.056292,1.349874,0.985641,0.849859,1.521443,1.246226,0.951423,0.353040,0.247484,0.011528,1.384536,-0.023265,0.929057,-0.472272,0.498464,0.041297,1.078625,-0.086882,0.227025,1.390941,1.577801,0.964305,-0.496940,0.426557,0.338614,-0.202364,-0.272485,0.248603,-0.494260,-0.318651,0.564172,-0.520873,-0.235574,0.601974,-0.408710,-0.323411,0.55860
9043,-0.247501,-0.498891,0.259888,-1.149637,-0.960536,0.341070,1.050151,-0.067365,-0.120045,0.235466,0.071392,0.359832,1.156409,0.665129,0.716274,0.553690,0.905218,1.069941,1.159900,0.973589,0.674463,-0.008477,0.954291,1.492310,0.604670,0.229034,0.617672,1.040167,0.313763,0.159608,0.243311,0.900753,0.901184,1.450919,-0.210345,-0.314576,0.123287,-0.408653,-0.042431,-1.772509,-0.520873,-0.235574,0.601974,-0.408710,-0.323411,0.55860
2939,-0.247501,-0.964398,1.178685,-0.567315,0.515665,-0.541447,-0.032211,-0.125838,0.443354,0.262017,-0.471493,0.482418,0.342843,0.537174,0.587900,0.469195,0.106298,-0.015599,0.363027,-0.641531,0.324071,0.358867,-0.423215,-0.765111,0.020223,0.187906,0.360797,-0.070637,-0.380310,0.366628,-0.406070,0.037404,0.223017,0.648138,-0.238642,-0.562600,-0.572642,-0.494260,-0.127590,0.564172,-0.520873,-0.235574,0.601974,-0.408710,-0.323411,0.55860
4257,-0.644908,-1.363405,-1.393945,-1.440799,-0.846982,-0.972363,0.553693,-0.656791,0.506799,0.640007,-0.530247,0.173005,-0.371367,0.691550,0.654762,0.281807,-0.374851,-0.448660,0.098331,0.228149,0.549839,0.211274,0.241092,0.114615,0.328427,0.127847,0.436054,-0.027957,-0.566969,0.749325,0.195541,0.514975,0.841648,0.800994,0.115072,-0.123125,0.599479,-0.077241,0.027648,0.564172,-0.520873,-0.235574,0.601974,-0.408710,-0.323411,0.55860
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6702,1.739537,2.294153,-2.496500,0.597330,0.402111,2.345694,-0.390405,-0.305121,1.997776,0.123728,0.879844,-0.622631,-0.510309,0.797842,0.301353,-0.144074,-0.321390,0.073234,-1.051006,0.895939,1.317450,0.801647,1.702566,0.513818,1.858034,0.726034,2.008076,1.522729,0.426057,0.348405,0.402064,1.656182,1.546030,1.395891,-0.327070,0.012712,1.234966,0.001442,-0.098021,-1.772509,-0.520873,-0.235574,0.601974,-0.408710,-0.323411,0.55860
2510,-0.247501,-0.232887,-0.475149,-1.440799,0.629219,1.354586,-0.093616,0.563711,0.675988,0.704542,0.005058,0.207384,-0.002886,1.650706,0.282249,0.423768,-0.535234,-0.060015,0.007203,0.269562,0.005781,0.577524,0.006023,-0.395478,-0.527695,1.139239,0.628376,0.669140,1.023366,-0.007542,0.958779,0.583199,0.402685,0.001584,-0.415499,0.268407,2.025795,0.261095,0.227239,0.564172,-0.520873,-0.235574,0.601974,-0.408710,-0.323411,0.55860
10142,1.739537,0.831130,0.994925,-1.440799,-1.074090,-0.445688,-0.052679,-0.631508,0.534998,-0.032999,-0.647754,-0.237910,-1.314709,-0.697834,0.426585,-0.174359,-1.133997,-0.543662,-0.667894,-0.517291,-0.980367,-0.326258,0.698725,-0.198341,1.136354,-0.398268,-0.330447,0.599588,-0.602642,-0.653896,-1.052577,0.864366,0.778356,0.755550,-0.356546,-0.051211,-0.798593,-0.620153,-0.186728,-1.772509,-0.520873,-0.235574,0.601974,2.446725,-0.323411,-1.79019
10750,1.342130,2.427155,1.729962,1.470814,1.196988,0.535844,-0.728131,0.784367,0.697137,0.188263,0.527321,0.025665,-0.437181,0.091761,-1.628070,0.077384,-0.706309,0.117651,-1.782402,-1.293791,-2.385146,0.490061,-0.529504,-1.016461,2.120578,0.062984,-0.308149,-1.720908,0.231932,0.111497,0.455940,-1.649598,-1.394000,-1.590988,-0.179690,-0.242982,-0.516155,-0.557207,-0.659832,0.564172,-0.520873,-0.235574,0.601974,-0.408710,-0.323411,0.55860


### Training/Results

In [97]:
# Fit a random forest on the training partition and evaluate it on the test partition.
# random_state is fixed (same seed as the train/test split) so that a fresh
# Restart & Run All builds the same forest and reports the same metrics;
# without it every run yields slightly different RMSE / R^2 values.
model = RandomForestRegressor(random_state=1)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
# Root mean squared error between the true and predicted targets
rmse = np.sqrt(np.mean((y_test - y_pred)**2))
# R^2 as reported by the regressor's own score() method
r2 = model.score(X_test, y_test)

print("     Test RMSE: {:.5f}".format(rmse))
print("Test R^2 Score: {:.5f}".format(r2))

     Test RMSE: 1.51451
Test R^2 Score: 0.94156


In [98]:
# Mean of the test targets
y_test.mean()

66.6136457768262