In [None]:
import pandas as pd
import numpy as np

from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

import matplotlib.pyplot as plt 
plt.rc("font", size=14)

import seaborn as sns
sns.set(style="white")
sns.set(style="whitegrid", color_codes=True)

from joblib import dump

In [None]:
# Read the raw training CSV exactly as it is stored on disk.
df_2022_train = pd.read_csv('../data/raw/2022_train.csv')

# Abbreviated column header -> descriptive column name.
# Columns not listed here keep their original names.
train_column_labels = {
    'GP': 'Games Played',
    'MIN': 'Minutes Played',
    'PTS': 'Points Per Game',
    # Field goals
    'FGM': 'Field Goals Made',
    'FGA': 'Field Goals Attempts',
    'FG%': 'Field Goals Percent',
    # Three-pointers
    '3P Made': '3Points Made',
    '3PA': '3Points Attempts',
    '3P%': '3Points Percent',
    # Free throws
    'FTM': 'Free Throw Made',
    'FTA': 'Free Throw Attempts',
    'FT%': 'Free Throw Percent',
    # Rebounds
    'OREB': 'Offensive Rebounds',
    'DREB': 'Defensive Rebounds',
    'REB': 'Rebounds',
    # Remaining stats
    'AST': 'Assists',
    'STL': 'Steals',
    'BLK': 'Blocks',
    'TOV': 'Turnovers',
}

# rename() returns a new frame; df_2022_train itself is left untouched.
df_main = df_2022_train.rename(columns=train_column_labels)

# Preview the first rows of the renamed frame.
df_main.head()

In [None]:
# Read the raw test CSV exactly as it is stored on disk.
df_2022_test = pd.read_csv('../data/raw/2022_test.csv')

# Abbreviated column header -> descriptive column name.
# The same mapping is applied to the training data, so both frames end up
# with matching descriptive headers. Unlisted columns keep their names.
test_column_labels = {
    'GP': 'Games Played',
    'MIN': 'Minutes Played',
    'PTS': 'Points Per Game',
    # Field goals
    'FGM': 'Field Goals Made',
    'FGA': 'Field Goals Attempts',
    'FG%': 'Field Goals Percent',
    # Three-pointers
    '3P Made': '3Points Made',
    '3PA': '3Points Attempts',
    '3P%': '3Points Percent',
    # Free throws
    'FTM': 'Free Throw Made',
    'FTA': 'Free Throw Attempts',
    'FT%': 'Free Throw Percent',
    # Rebounds
    'OREB': 'Offensive Rebounds',
    'DREB': 'Defensive Rebounds',
    'REB': 'Rebounds',
    # Remaining stats
    'AST': 'Assists',
    'STL': 'Steals',
    'BLK': 'Blocks',
    'TOV': 'Turnovers',
}

# rename() returns a new frame; df_2022_test itself is left untouched.
df_test = df_2022_test.rename(columns=test_column_labels)

# Preview the first rows of the renamed frame.
df_test.head()

## Standardise

#### Training Data

In [None]:
# Work on an independent (deep) copy so that df_main keeps the unscaled
# values and its target column after the steps below.
df_standard = df_main.copy(deep=True)

In [None]:
# Separate the label column from the features: grab it, then remove it from
# df_standard in place so only feature columns remain.
# Re-running this cell without re-creating df_standard raises KeyError,
# because the column is already gone.
target_var = df_standard['TARGET_5Yrs']
del df_standard['TARGET_5Yrs']

target_var.head()

In [None]:
# Learn the scaling statistics from the training features, then apply them.
# From here on df_standard is the array returned by the scaler rather than
# the DataFrame it was before.
# NOTE(review): the scaler is fitted before the train/validation split done
# later in this notebook, so validation rows contribute to the scaling
# statistics - consider fitting on the training split only.
scaler = StandardScaler()
scaler.fit(df_standard)
df_standard = scaler.transform(df_standard)

# Persist the fitted scaler as models/scaler.joblib (joblib `dump`) so the
# same scaling can be re-applied outside this notebook.
dump(scaler, '../models/scaler.joblib')

#### Test Data

In [None]:
# Scale the test features with the scaler that was fitted on the training
# data. Use transform(), NOT fit_transform(): refitting here would scale the
# test set with its own statistics, so it would no longer be comparable with
# the training features, and the in-memory `scaler` would stop matching the
# one saved to ../models/scaler.joblib.
# transform() returns a new array; df_test itself is not modified.
# NOTE(review): assumes df_test has exactly the feature columns the scaler was
# fitted on (i.e. the training columns without 'TARGET_5Yrs') - confirm.
X_test = scaler.transform(df_test)

## Split into training and validation

#### Train data

In [None]:
# Random 80% / 20% split into training and validation sets; random_state=8
# keeps the split reproducible between runs.
X_train, X_val, y_train, y_val = train_test_split(
    df_standard,
    target_var,
    test_size=0.2,
    random_state=8,
)

# Write every split to the folder `data/processed`
# (np.save appends the ".npy" extension to each path).
_processed_splits = {
    '../data/processed/X_train': X_train,
    '../data/processed/X_val': X_val,
    '../data/processed/y_train': y_train,
    '../data/processed/y_val': y_val,
}
for _split_path, _split_values in _processed_splits.items():
    np.save(_split_path, _split_values)

#### Test Data

In [None]:
# Store the scaled test features next to the other processed sets in
# `data/processed` (np.save appends the ".npy" extension).
np.save(file='../data/processed/X_test', arr=X_test)