# Data Preprocessing - Single Imputation with Mean

##  Load and explore the dataset

In [24]:
# Load the pandas and numpy packages
import pandas as pd
import numpy as np

In [25]:
# Read the raw training CSV from ../data/raw into the DataFrame `data`
data=pd.read_csv('../data/raw/2022_train.csv')

In [26]:
# Show every column when a DataFrame is rendered, then preview the first 5 rows.
# The fully qualified key 'display.max_columns' is used on purpose: option names are
# matched as patterns, and in newer pandas releases the short form 'max_columns' also
# matches 'styler.render.max_columns', which makes set_option raise an OptionError.
pd.set_option('display.max_columns', None)
data.head()

Unnamed: 0,Id,GP,MIN,PTS,FGM,FGA,FG%,3P Made,3PA,3P%,FTM,FTA,FT%,OREB,DREB,REB,AST,STL,BLK,TOV,TARGET_5Yrs
0,3799,80,24.3,7.8,3.0,6.4,45.7,0.1,0.3,22.6,2.0,2.9,72.1,2.2,2.0,3.8,3.2,1.1,0.2,1.6,1
1,3800,75,21.8,10.5,4.2,7.9,55.1,-0.3,-1.0,34.9,2.4,3.6,67.8,3.6,3.7,6.6,0.7,0.5,0.6,1.4,1
2,3801,85,19.1,4.5,1.9,4.5,42.8,0.4,1.2,34.3,0.4,0.6,75.7,0.6,1.8,2.4,0.8,0.4,0.2,0.6,1
3,3802,63,19.1,8.2,3.5,6.7,52.5,0.3,0.8,23.7,0.9,1.5,66.9,0.8,2.0,3.0,1.8,0.4,0.1,1.9,1
4,3803,63,17.8,3.7,1.7,3.4,50.8,0.5,1.4,13.7,0.2,0.5,54.0,2.4,2.7,4.9,0.4,0.4,0.6,0.7,1


In [27]:
# Display the summary of columns: non-null count and dtype of every column
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8000 entries, 0 to 7999
Data columns (total 21 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Id           8000 non-null   int64  
 1   GP           8000 non-null   int64  
 2   MIN          8000 non-null   float64
 3   PTS          8000 non-null   float64
 4   FGM          8000 non-null   float64
 5   FGA          8000 non-null   float64
 6   FG%          8000 non-null   float64
 7   3P Made      8000 non-null   float64
 8   3PA          8000 non-null   float64
 9   3P%          8000 non-null   float64
 10  FTM          8000 non-null   float64
 11  FTA          8000 non-null   float64
 12  FT%          8000 non-null   float64
 13  OREB         8000 non-null   float64
 14  DREB         8000 non-null   float64
 15  REB          8000 non-null   float64
 16  AST          8000 non-null   float64
 17  STL          8000 non-null   float64
 18  BLK          8000 non-null   float64
 19  TOV   

Based on the summary above, none of the columns in the dataframe has missing values.

In [28]:
# Display the dimensions of data as (number of rows, number of columns)
data.shape

(8000, 21)

In [29]:
# Display the descriptive statistics (count, mean, std, min, quartiles, max) of each column
data.describe()

Unnamed: 0,Id,GP,MIN,PTS,FGM,FGA,FG%,3P Made,3PA,3P%,FTM,FTA,FT%,OREB,DREB,REB,AST,STL,BLK,TOV,TARGET_5Yrs
count,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0
mean,7798.5,62.78,18.58,7.27,2.81,6.23,44.61,0.26,0.82,19.58,1.39,1.95,71.37,1.08,2.17,3.25,1.62,0.65,0.25,1.26,0.83
std,2309.55,17.12,8.94,4.32,1.69,3.58,6.16,0.38,1.06,16.0,0.93,1.25,10.43,0.79,1.39,2.09,1.36,0.41,0.82,0.72,0.37
min,3799.0,-8.0,2.9,0.8,0.3,0.8,21.3,-1.1,-3.1,-38.5,0.0,0.0,-13.3,0.0,0.2,0.3,0.0,0.0,-17.9,0.1,0.0
25%,5798.75,51.0,12.0,4.1,1.6,3.6,40.4,0.0,0.1,8.4,0.7,1.0,65.0,0.5,1.1,1.7,0.7,0.3,0.1,0.7,1.0
50%,7798.5,63.0,16.8,6.3,2.4,5.4,44.4,0.3,0.8,19.5,1.2,1.7,71.4,0.9,1.9,2.8,1.3,0.6,0.2,1.1,1.0
75%,9798.25,74.0,23.5,9.5,3.7,8.1,48.7,0.5,1.5,30.6,1.9,2.6,77.5,1.5,2.9,4.3,2.2,0.9,0.4,1.6,1.0
max,11798.0,123.0,73.8,34.2,13.1,28.9,67.2,1.7,4.7,82.1,8.1,11.1,168.9,5.5,11.0,15.9,12.8,3.6,18.9,5.3,1.0


**Unreasonable data based on descriptive summary**
- Games played cannot be negative
- 3P Made, 3PA and 3P% cannot be negative
- FT% cannot be negative and cannot be over 100%
- BLK cannot be negative

## Data Processing - Imputation

In [30]:
# Create a copy of data and save it into data_cleaned, so the cleaning steps below
# modify the copy while the loaded `data` frame stays as read from the CSV
data_cleaned=data.copy()

In [31]:
# Remove the Id column.
# The frame is derived from `data` instead of being modified in place, so this cell
# can be re-run without raising a KeyError for the already-removed column.
# DataFrame.drop returns a new frame and leaves `data` untouched.
data_cleaned=data.drop(columns='Id')

In [32]:
# Collect the names of all columns whose minimum is below zero into col_neg and display them
col_neg = [name for name in data_cleaned.columns if data_cleaned[name].min() < 0]
col_neg

['GP', '3P Made', '3PA', '3P%', 'FT%', 'BLK']

In [33]:
# Treat negative entries in the flagged columns as missing: Series.mask swaps every
# value that satisfies the condition for NaN and keeps all other values
for col in col_neg:
    data_cleaned[col] = data_cleaned[col].mask(data_cleaned[col] < 0)

In [34]:
# A free-throw percentage above 100 is not a valid value either, so mark it as missing (NaN)
data_cleaned['FT%'] = data_cleaned['FT%'].mask(data_cleaned['FT%'] > 100)

In [35]:
# Fill the missing values (NaN) with the column mean using scikit-learn's SimpleImputer
from sklearn.impute import SimpleImputer

# Use strategy='median' instead of 'mean' for median imputation
imp_mean = SimpleImputer(strategy='mean')

# fit_transform learns the per-column statistic and applies it in a single call.
# NOTE(review): data_cleaned still contains TARGET_5Yrs here and the imputer is fitted
# on all rows before the train/validation split — confirm this is intended.
imputed_data_cleaned = imp_mean.fit_transform(data_cleaned)

In [36]:
# Convert imputed_data_cleaned back into a DataFrame and restore the column names.
# The names are selected by label (every column of `data` except 'Id', which was
# removed before imputation) rather than by position, so the labels stay correct
# even if 'Id' is not the first column of the CSV.
data_cleaned=pd.DataFrame(imputed_data_cleaned, columns=data.columns.drop('Id'))

In [37]:
# Display number of rows and columns after data cleansing
data_cleaned.shape

(8000, 20)

In [38]:
# Separate the label column 'TARGET_5Yrs' from the features: keep it in `target`
# and leave only the remaining columns in data_cleaned
target = data_cleaned['TARGET_5Yrs']
data_cleaned = data_cleaned.drop(columns='TARGET_5Yrs')

In [39]:
# Import StandardScaler from sklearn.preprocessing and instantiate it with its default settings
from sklearn.preprocessing import StandardScaler

scaler=StandardScaler()

In [40]:
# Fit the scaler on data_cleaned and replace data_cleaned with the scaled values
# NOTE(review): the scaler is fitted on all rows before the train/validation split
# further down, so validation rows contribute to the scaling statistics — confirm
# this is intended
data_cleaned=scaler.fit_transform(data_cleaned)

In [41]:
# Import dump from joblib and save the fitted scaler into the folder models as scaler_imputed.joblib
from joblib import dump

dump(scaler, '../models/scaler_imputed.joblib')

['../models/scaler_imputed.joblib']

In [42]:
%load_ext autoreload
%autoreload 2

In [43]:
# Add the parent folder to the module search path, then import the function
# split_train_test from src.data.sets
import sys
sys.path.insert(1, '..')
from src.data.sets import split_train_test

In [44]:
# Split the scaled data and the target into training (80%) and validation (20%) sets
# NOTE(review): split_train_test lives in src.data.sets (not shown here); the 80/20
# ratio is inferred from test_ratio=0.2 — also check whether the helper fixes a random seed
X_train, X_val, y_train, y_val=split_train_test(df=data_cleaned,target=target,test_ratio=0.2)

In [45]:
# Import the function save_sets from src.data.sets and save the training and
# validation sets into the folder data/processed
from src.data.sets import save_sets
save_sets(X_train, y_train, X_val, y_val, path='../data/processed/')

In [46]:
# Import the function load_sets from src.data.sets and reload the sets from data/processed
# NOTE(review): load_sets also returns X_test and y_test, which the save_sets call in
# this notebook was not given — verify what these two contain before using them
from src.data.sets import load_sets
X_train, y_train, X_val, y_val, X_test, y_test = load_sets(path='../data/processed/')

## Basic Classification Models with Imputed Data

In [47]:
# Import lazypredict package
import lazypredict

In [48]:
# Import LazyClassifier from lazypredict.Supervised
from lazypredict.Supervised import LazyClassifier

In [49]:
# Train the LazyClassifier on the training set and evaluate its models on the validation set
clf = LazyClassifier(
    verbose=0,
    ignore_warnings=True,
    custom_metric=None,
)
models, predictions = clf.fit(X_train, X_val, y_train, y_val)

100%|██████████| 29/29 [00:15<00:00,  1.93it/s]


In [50]:
# Display the table of metrics (accuracy, balanced accuracy, ROC AUC, F1 score, time taken) per classifier
models

Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
BernoulliNB,0.64,0.63,0.68,0.69,0.02
GaussianNB,0.56,0.63,0.69,0.62,0.02
QuadraticDiscriminantAnalysis,0.68,0.62,0.68,0.72,0.03
LabelSpreading,0.76,0.56,0.63,0.76,3.63
LabelPropagation,0.76,0.56,0.61,0.76,2.98
DecisionTreeClassifier,0.75,0.56,0.56,0.76,0.09
BaggingClassifier,0.8,0.55,0.63,0.78,0.47
KNeighborsClassifier,0.81,0.53,0.6,0.78,0.18
ExtraTreeClassifier,0.73,0.53,0.53,0.74,0.02
LGBMClassifier,0.83,0.52,0.68,0.78,0.2


In [51]:
# Import dump from joblib and save the fitted LazyClassifier into the folder models as lazypredict_imputed.joblib
from joblib import dump 

dump(clf,  '../models/lazypredict_imputed.joblib')

['../models/lazypredict_imputed.joblib']