In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from joblib import dump

In [2]:
df_2022_train = pd.read_csv('../data/raw/2022_train.csv')

df_main = df_2022_train.rename(columns = {
    'GP':'Games Played', 
    'MIN':'Minutes Played',
    'PTS':'Points Per Game',
    'FGM' : 'Field Goals Made',
    'FGA' : 'Field Goals Attempts',
    'FG%' : 'Field Goals Percent',
    '3P Made' : '3Points Made',
    '3PA' : '3Points Attempts',
    '3P%' : '3Points Percent',
    'FTM' : 'Free Throw Made',
    'FTA' : 'Free Throw Attempts',
    'FT%' : 'Free Throw Percent',
    'OREB' : 'Offensive Rebounds',
    'DREB' : 'Defensive Rebounds',
    'REB' : 'Rebounds',
    'AST' : 'Assists',
    'STL' : 'Steals',
    'BLK' : 'Blocks',
    'TOV' : 'Turnovers'
    })

df_main.head()

Unnamed: 0,Id,Games Played,Minutes Played,Points Per Game,Field Goals Made,Field Goals Attempts,Field Goals Percent,3Points Made,3Points Attempts,3Points Percent,...,Free Throw Attempts,Free Throw Percent,Offensive Rebounds,Defensive Rebounds,Rebounds,Assists,Steals,Blocks,Turnovers,TARGET_5Yrs
0,3799,80,24.3,7.8,3.0,6.4,45.7,0.1,0.3,22.6,...,2.9,72.1,2.2,2.0,3.8,3.2,1.1,0.2,1.6,1
1,3800,75,21.8,10.5,4.2,7.9,55.1,-0.3,-1.0,34.9,...,3.6,67.8,3.6,3.7,6.6,0.7,0.5,0.6,1.4,1
2,3801,85,19.1,4.5,1.9,4.5,42.8,0.4,1.2,34.3,...,0.6,75.7,0.6,1.8,2.4,0.8,0.4,0.2,0.6,1
3,3802,63,19.1,8.2,3.5,6.7,52.5,0.3,0.8,23.7,...,1.5,66.9,0.8,2.0,3.0,1.8,0.4,0.1,1.9,1
4,3803,63,17.8,3.7,1.7,3.4,50.8,0.5,1.4,13.7,...,0.5,54.0,2.4,2.7,4.9,0.4,0.4,0.6,0.7,1


In [3]:
df_2022_test = pd.read_csv('../data/raw/2022_test.csv')

df_test = df_2022_test.rename(columns = {
    'GP':'Games Played', 
    'MIN':'Minutes Played',
    'PTS':'Points Per Game',
    'FGM' : 'Field Goals Made',
    'FGA' : 'Field Goals Attempts',
    'FG%' : 'Field Goals Percent',
    '3P Made' : '3Points Made',
    '3PA' : '3Points Attempts',
    '3P%' : '3Points Percent',
    'FTM' : 'Free Throw Made',
    'FTA' : 'Free Throw Attempts',
    'FT%' : 'Free Throw Percent',
    'OREB' : 'Offensive Rebounds',
    'DREB' : 'Defensive Rebounds',
    'REB' : 'Rebounds',
    'AST' : 'Assists',
    'STL' : 'Steals',
    'BLK' : 'Blocks',
    'TOV' : 'Turnovers'
    })

df_test.head()

Unnamed: 0,Id,Games Played,Minutes Played,Points Per Game,Field Goals Made,Field Goals Attempts,Field Goals Percent,3Points Made,3Points Attempts,3Points Percent,Free Throw Made,Free Throw Attempts,Free Throw Percent,Offensive Rebounds,Defensive Rebounds,Rebounds,Assists,Steals,Blocks,Turnovers
0,0,56,9.1,4.0,1.6,3.7,43.7,0.1,0.3,7.3,0.7,1.2,63.4,1.2,0.8,1.7,0.4,0.2,0.3,0.8
1,1,43,19.3,10.1,3.7,8.1,46.0,0.6,1.7,35.1,1.8,2.5,75.3,0.5,0.9,1.5,3.5,0.6,0.0,1.8
2,2,82,33.9,11.3,4.9,10.6,45.6,0.5,1.9,44.8,1.8,2.7,71.2,1.3,3.3,4.5,2.5,1.3,0.3,2.0
3,3,86,44.7,18.8,6.8,15.9,42.9,0.5,1.8,13.5,4.5,6.3,70.9,1.5,3.2,5.0,4.1,0.9,0.1,3.6
4,4,58,12.3,4.7,1.6,4.0,40.0,0.5,1.7,38.7,1.1,1.3,76.9,0.2,0.6,0.9,1.5,0.5,-0.4,0.9


## Standardise

#### Training Data

In [4]:
df_standard = df_main.copy()

In [5]:
target_var = df_standard.pop('TARGET_5Yrs')
target_var.head()

0    1
1    1
2    1
3    1
4    1
Name: TARGET_5Yrs, dtype: int64

In [6]:
scaler = StandardScaler()
df_standard = scaler.fit_transform(df_standard)

# Save the scaler into the folder models and call the file scaler.joblib
# using joblib - dump
dump(scaler, '../models/scaler.joblib')

['../models/scaler.joblib']

#### Test Data

In [7]:
X_test = df_test.copy()
X_test = scaler.fit_transform(X_test)

## Split into training and validation

#### Train data

In [8]:
# Split randomly the dataset with random_state=8 into 2 different sets: training data (80%) and validation data (20%)
X_train, X_val, y_train, y_val = train_test_split (df_standard, target_var, test_size=0.2, random_state=8)

# Save the different sets in the folder `data/processed`
np.save('../data/processed/X_train', X_train)
np.save('../data/processed/X_val',   X_val)
np.save('../data/processed/y_train', y_train)
np.save('../data/processed/y_val',   y_val)

In [9]:
X_train = pd.DataFrame(X_train, 
             columns=['ID',
                 'Games Played', 
    'Minutes Played',
    'Points Per Game',
    'Field Goals Made',
    'Field Goals Attempts',
    'Field Goals Percent',
    '3Points Made',
    '3Points Attempts',
    '3Points Percent',
    'Free Throw Made',
    'Free Throw Attempts',
    'Free Throw Percent',
    'Offensive Rebounds',
    'Defensive Rebounds',
    'Rebounds',
    'Assists',
    'Steals',
    'Blocks',
    'Turnovers'])

X_train.head()

Unnamed: 0,ID,Games Played,Minutes Played,Points Per Game,Field Goals Made,Field Goals Attempts,Field Goals Percent,3Points Made,3Points Attempts,3Points Percent,Free Throw Made,Free Throw Attempts,Free Throw Percent,Offensive Rebounds,Defensive Rebounds,Rebounds,Assists,Steals,Blocks,Turnovers
0,-0.165627,0.071395,-0.523427,-0.68707,-0.771903,-0.734087,-0.261394,-1.209483,-0.86395,-0.480167,0.008072,-0.038161,-0.207657,-0.608229,-0.336533,-0.501337,-0.755595,-0.610125,-0.176876,-0.771215
1,-1.24686,1.064519,0.248843,0.077091,-0.063214,-0.008708,0.031048,0.352736,0.267168,-0.467668,0.439993,0.121549,0.731958,0.791937,1.315603,1.129339,-0.681844,-0.119449,-0.176876,0.058402
2,-0.054776,-0.746471,-1.452389,-1.358606,-1.362477,-1.31997,-1.333681,-0.688743,-0.298391,-1.41754,-1.287692,-1.395691,-0.265185,-1.244669,-1.414013,-1.364637,-0.09183,-0.610125,-0.664095,-1.324294
3,-1.665583,0.538748,0.080958,-0.524976,-0.476616,-0.343498,-0.68381,-0.428374,-0.298391,0.357221,-0.53183,-0.277725,-2.201944,-0.480941,-0.121037,-0.16561,-0.386837,-0.364787,-0.055071,-0.494676
4,-0.450117,-1.038567,0.125727,-0.270255,-0.181328,-0.092406,-0.407615,-0.428374,-0.392651,-1.392544,-0.315869,-0.19787,-0.897987,-0.099078,-0.480197,-0.309493,-0.09183,-0.610125,0.066734,0.749749


In [10]:
X_val = pd.DataFrame(X_val, 
             columns=['ID',
                 'Games Played', 
    'Minutes Played',
    'Points Per Game',
    'Field Goals Made',
    'Field Goals Attempts',
    'Field Goals Percent',
    '3Points Made',
    '3Points Attempts',
    '3Points Percent',
    'Free Throw Made',
    'Free Throw Attempts',
    'Free Throw Percent',
    'Offensive Rebounds',
    'Defensive Rebounds',
    'Rebounds',
    'Assists',
    'Steals',
    'Blocks',
    'Turnovers'])

X_val.head()

Unnamed: 0,ID,Games Played,Minutes Played,Points Per Game,Field Goals Made,Field Goals Attempts,Field Goals Percent,3Points Made,3Points Attempts,3Points Percent,Free Throw Made,Free Throw Attempts,Free Throw Percent,Offensive Rebounds,Defensive Rebounds,Rebounds,Assists,Steals,Blocks,Turnovers
0,0.561834,-1.213824,-1.250927,-1.080729,-1.06719,-1.152574,0.37223,-0.949113,-1.052469,0.313477,-1.071732,-1.156127,0.942892,-0.735517,-0.983021,-0.980948,-0.97685,-0.855464,-0.298681,-1.324294
1,1.439118,-0.2207,-0.668927,-1.034416,-0.890017,-0.845683,-0.553836,-0.168004,-0.392651,-1.436288,-1.395672,-1.315836,-0.84046,-0.226366,-0.480197,-0.453376,-0.165582,-0.610125,-0.055071,-0.356407
2,1.493244,0.42191,-0.277196,1.350692,1.472279,1.386251,0.502204,-0.168004,-0.109872,-0.211453,1.087875,1.079806,0.453909,1.173801,-0.264701,0.170118,-0.903099,-0.610125,0.188539,0.196671
3,-1.027323,-0.863309,0.204074,0.517062,0.586418,0.04709,2.500557,-0.949113,-1.052469,-1.573769,0.439993,0.600677,-1.013042,3.083119,-0.049205,0.841573,-0.460589,-0.119449,0.188539,1.164558
4,-1.446479,-0.045443,-0.44508,-0.872322,-0.890017,-0.62249,-1.804838,0.873476,0.926986,0.588439,-1.071732,-1.235982,0.875777,-0.735517,-0.623861,-0.693181,-0.018078,0.616566,-0.176876,-0.218137


In [11]:
y_train.head()

3617    1
1120    1
3873    1
153     1
2960    1
Name: TARGET_5Yrs, dtype: int64

In [12]:
y_val.head()

5297    0
7323    1
7448    1
1627    1
659     1
Name: TARGET_5Yrs, dtype: int64

#### Test

In [13]:
# Save the different sets in the folder `data/processed`
np.save('../data/processed/X_test',  X_test)

In [14]:
X_test

array([[-1.73159494, -0.3996569 , -1.09444225, ..., -1.10492692,
         0.06609733, -0.64281172],
       [-1.7306831 , -1.15769727,  0.07446345, ..., -0.13054963,
        -0.40296321,  0.76098325],
       [-1.72977125,  1.11642385,  1.74760297, ...,  1.57461062,
         0.06609733,  1.04174224],
       ...,
       [ 1.72977125, -0.57458929, -1.00276337, ..., -0.61773827,
        -0.09025619, -1.06395021],
       [ 1.7306831 ,  1.52459944,  2.2518368 , ...,  1.57461062,
         0.06609733,  1.60326023],
       [ 1.73159494, -0.4579677 , -0.76210631, ..., -0.86133259,
        -0.09025619, -0.08129373]])

In [15]:
X_test = pd.DataFrame(X_test, 
             columns=['ID',
                 'Games Played', 
    'Minutes Played',
    'Points Per Game',
    'Field Goals Made',
    'Field Goals Attempts',
    'Field Goals Percent',
    '3Points Made',
    '3Points Attempts',
    '3Points Percent',
    'Free Throw Made',
    'Free Throw Attempts',
    'Free Throw Percent',
    'Offensive Rebounds',
    'Defensive Rebounds',
    'Rebounds',
    'Assists',
    'Steals',
    'Blocks',
    'Turnovers'])

X_test.head()

Unnamed: 0,ID,Games Played,Minutes Played,Points Per Game,Field Goals Made,Field Goals Attempts,Field Goals Percent,3Points Made,3Points Attempts,3Points Percent,Free Throw Made,Free Throw Attempts,Free Throw Percent,Offensive Rebounds,Defensive Rebounds,Rebounds,Assists,Steals,Blocks,Turnovers
0,-1.731595,-0.399657,-1.094442,-0.775014,-0.731786,-0.727231,-0.14887,-0.409418,-0.472033,-0.747469,-0.755754,-0.602751,-0.785478,0.132355,-1.005643,-0.76111,-0.925982,-1.104927,0.066097,-0.642812
1,-1.730683,-1.157697,0.074463,0.64552,0.512139,0.502248,0.231965,0.903137,0.857851,0.993635,0.432127,0.437073,0.352629,-0.758712,-0.932743,-0.857711,1.395558,-0.13055,-0.402963,0.760983
2,-1.729771,1.116424,1.747603,0.92497,1.222953,1.200816,0.165733,0.640626,1.047834,1.601142,0.432127,0.597045,-0.039492,0.25965,0.816841,0.591302,0.646674,1.574611,0.066097,1.041742
3,-1.728859,1.349667,2.985268,2.671528,2.348409,2.68178,-0.281334,0.640626,0.952843,-0.359165,3.347837,3.476558,-0.068184,0.514241,0.743942,0.832805,1.844888,0.600233,-0.24661,3.287814
4,-1.727948,-0.283035,-0.727727,-0.612002,-0.731786,-0.643403,-0.761516,0.640626,0.857851,1.219101,-0.323797,-0.522765,0.505652,-1.140598,-1.151442,-1.147514,-0.10221,-0.374144,-1.028377,-0.502432
