# Data Preprocessing and Model Performance Comparison with Lazypredict

##  Load and explore the dataset

In [20]:
# Load the pandas and numpy packages
import pandas as pd
import numpy as np

In [2]:
# Import csv file and save into data
data=pd.read_csv('../data/raw/2022_train.csv')

In [3]:
# Display the first 5 rows of data and all the columns
pd.set_option('max_columns', None)
data.head()

Unnamed: 0,Id,GP,MIN,PTS,FGM,FGA,FG%,3P Made,3PA,3P%,FTM,FTA,FT%,OREB,DREB,REB,AST,STL,BLK,TOV,TARGET_5Yrs
0,3799,80,24.3,7.8,3.0,6.4,45.7,0.1,0.3,22.6,2.0,2.9,72.1,2.2,2.0,3.8,3.2,1.1,0.2,1.6,1
1,3800,75,21.8,10.5,4.2,7.9,55.1,-0.3,-1.0,34.9,2.4,3.6,67.8,3.6,3.7,6.6,0.7,0.5,0.6,1.4,1
2,3801,85,19.1,4.5,1.9,4.5,42.8,0.4,1.2,34.3,0.4,0.6,75.7,0.6,1.8,2.4,0.8,0.4,0.2,0.6,1
3,3802,63,19.1,8.2,3.5,6.7,52.5,0.3,0.8,23.7,0.9,1.5,66.9,0.8,2.0,3.0,1.8,0.4,0.1,1.9,1
4,3803,63,17.8,3.7,1.7,3.4,50.8,0.5,1.4,13.7,0.2,0.5,54.0,2.4,2.7,4.9,0.4,0.4,0.6,0.7,1


In [4]:
# Display the summary of columns
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8000 entries, 0 to 7999
Data columns (total 21 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Id           8000 non-null   int64  
 1   GP           8000 non-null   int64  
 2   MIN          8000 non-null   float64
 3   PTS          8000 non-null   float64
 4   FGM          8000 non-null   float64
 5   FGA          8000 non-null   float64
 6   FG%          8000 non-null   float64
 7   3P Made      8000 non-null   float64
 8   3PA          8000 non-null   float64
 9   3P%          8000 non-null   float64
 10  FTM          8000 non-null   float64
 11  FTA          8000 non-null   float64
 12  FT%          8000 non-null   float64
 13  OREB         8000 non-null   float64
 14  DREB         8000 non-null   float64
 15  REB          8000 non-null   float64
 16  AST          8000 non-null   float64
 17  STL          8000 non-null   float64
 18  BLK          8000 non-null   float64
 19  TOV   

Based on the summary above, none of the columns in dataframe has missing values.

In [5]:
# Display the dimensions(shape) of data
data.shape

(8000, 21)

In [6]:
# Display the descriptive statistics
data.describe()

Unnamed: 0,Id,GP,MIN,PTS,FGM,FGA,FG%,3P Made,3PA,3P%,FTM,FTA,FT%,OREB,DREB,REB,AST,STL,BLK,TOV,TARGET_5Yrs
count,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0
mean,7798.5,62.777875,18.576662,7.267088,2.807037,6.231212,44.6089,0.264525,0.816562,19.5837,1.392525,1.947788,71.365825,1.077838,2.1685,3.2453,1.624513,0.648687,0.245212,1.257763,0.833625
std,2309.54541,17.118774,8.935263,4.318732,1.693373,3.584559,6.155453,0.384093,1.060964,16.003155,0.926153,1.252352,10.430447,0.78567,1.392224,2.085154,1.355986,0.407626,0.821037,0.72327,0.37244
min,3799.0,-8.0,2.9,0.8,0.3,0.8,21.3,-1.1,-3.1,-38.5,0.0,0.0,-13.3,0.0,0.2,0.3,0.0,0.0,-17.9,0.1,0.0
25%,5798.75,51.0,12.0,4.1,1.6,3.6,40.4,0.0,0.1,8.4,0.7,1.0,65.0,0.5,1.1,1.7,0.7,0.3,0.1,0.7,1.0
50%,7798.5,63.0,16.8,6.3,2.4,5.4,44.4,0.3,0.8,19.5,1.2,1.7,71.4,0.9,1.9,2.8,1.3,0.6,0.2,1.1,1.0
75%,9798.25,74.0,23.5,9.5,3.7,8.1,48.7,0.5,1.5,30.6,1.9,2.6,77.5,1.5,2.9,4.3,2.2,0.9,0.4,1.6,1.0
max,11798.0,123.0,73.8,34.2,13.1,28.9,67.2,1.7,4.7,82.1,8.1,11.1,168.9,5.5,11.0,15.9,12.8,3.6,18.9,5.3,1.0


**Unreasonable data based on descriptive summary**
- Games played cannot be negative
- 3P, 3PA and 3P% Made cannot be negative
- FT% can not be negative and cannot be over 100%
- BLK can not be negative

##  Split Train and Test Sets for Raw Data

In [7]:
# Create a copy of data and save it into a variable data_cleaned
data_cleaned=data.copy()

In [8]:
# Remove the id column
data_cleaned.drop('Id',axis=1,inplace=True)

In [9]:
# Extract the column 'TARGET_5Yrs' and save it into variable called target
target=data_cleaned.pop('TARGET_5Yrs')

In [10]:
%load_ext autoreload
%autoreload 2

In [11]:
# Import function scaler_split_train_test from data.sets
import sys
sys.path.insert(1, '..')
from src.data.sets import split_train_test

In [12]:
# del sys.path[1]
# sys.path

In [13]:
# Split the scaler data into training (80%) and validation (20%)
X_train, X_val, y_train, y_val=split_train_test(df=data_cleaned,target=target,test_ratio=0.2)

In [14]:
# Import the function save_sets from sets and save the sets into the folder data/processed
from src.data.sets import save_sets
save_sets(X_train, y_train, X_val, y_val, path='../data/processed/')

In [15]:
# Import the function load_sets from sets and load the sets from data/processed
from src.data.sets import load_sets
X_train, y_train, X_val, y_val, X_test, y_test = load_sets(path='../data/processed/')

## Baseline Model

In [16]:
# Import statistics
from statistics import mode

In [17]:
# Find the mode of the target variable from the training set
y_mode=mode(y_train)

In [18]:
# Create a numpy array called y_base of dimensions (len(y_train), 1) filled with the mode value
y_base=np.full((len(y_train),1),y_mode)

In [19]:
# Import the function print_class_perf from models.performance and display the ROC-AUC score
from src.models.performance import print_class_perf

print_class_perf(y_train,y_base,set_name='Training')

ROC AUC Score Training: 0.5


## Classification Models Comparison

In [20]:
# Import lazypredict package
import lazypredict

In [21]:
# Import LazyClassifier from lazypredict.Supervised
from lazypredict.Supervised import LazyClassifier

In [22]:
# Fit the Lazyclassifier model based on train set and predict the validation set
clf = LazyClassifier(verbose=0,ignore_warnings=True, custom_metric=None,random_state=8)
models,predictions = clf.fit(X_train, X_val, y_train, y_val)

100%|██████████| 29/29 [00:15<00:00,  1.84it/s]


In [23]:
# Display all the rows
pd.set_option('display.max_rows', None)

In [24]:
# Print the metrics for different classifiers
models

Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
GaussianNB,0.58,0.63,0.69,0.64,0.02
BernoulliNB,0.64,0.63,0.68,0.68,0.02
QuadraticDiscriminantAnalysis,0.77,0.6,0.67,0.77,0.04
DecisionTreeClassifier,0.75,0.56,0.56,0.76,0.09
ExtraTreeClassifier,0.74,0.55,0.55,0.75,0.02
KNeighborsClassifier,0.82,0.54,0.61,0.78,0.36
BaggingClassifier,0.81,0.54,0.62,0.78,0.46
LabelSpreading,0.75,0.54,0.63,0.75,3.83
LabelPropagation,0.75,0.53,0.62,0.75,3.21
LGBMClassifier,0.83,0.52,0.67,0.78,0.23


In [25]:
#  Import dump from joblib and save the fitted model into the folder models as a file called lazypredict_raw
from joblib import dump 

dump(clf,  '../models/lazypredict_raw.joblib')

['../models/lazypredict_raw.joblib']

## Data Preprocessing - Dropping Outliers

In [7]:
# Create a copy of data and save it into a variable data_cleaned
data_cleaned=data.copy()

In [8]:
# Remove the id column
data_cleaned.drop('Id',axis=1,inplace=True)

In [9]:
# Check the number of rows where Games played are below or equal to 0
len(data_cleaned[(data_cleaned['GP']<=0)])

2

In [10]:
# Considering the small number, remove the 2 records when the column Games played is negative
data_cleaned.drop(data_cleaned[data_cleaned['GP']<=0].index,inplace=True)

# Method 2
# data_cleaned=data_cleaned[(data_cleaned['GP']>0)]

In [11]:
# Check the number of rows where the columns of 3P Made, 3PA and 3P% are negative
print(f"There are {len(data_cleaned[data_cleaned['3P Made']<0])} rows where '3P Made' column is negative.")
print(f"There are {len(data_cleaned[data_cleaned['3PA']<0])} rows where '3PA' column is negative.")
print(f"There are {len(data_cleaned[data_cleaned['3P%']<0])} rows where '3P%' column is negative.")

There are 1628 rows where '3P Made' column is negative.
There are 1657 rows where '3PA' column is negative.
There are 878 rows where '3P%' column is negative.


In [12]:
# Considering the significant number of rows, the columns of '3P Made', '3PA' and '3P%' are removed from the dataset
data_cleaned.drop(['3P Made','3PA','3P%'],axis=1,inplace=True)

In [13]:
# Check the number of rows where FT% is negative or over 100%
print(f"There are {len(data_cleaned[data_cleaned['FT%']<0])} rows where 'FT%' column is negative.")
print(f"There are {len(data_cleaned[data_cleaned['FT%']>100])} rows where 'FT%' column is over 100%.")

There are 1 rows where 'FT%' column is negative.
There are 58 rows where 'FT%' column is over 100%.


In [14]:
# Considering the small number, remove the records that FT% is negative or over 100%
data_cleaned.drop(data_cleaned[(data_cleaned['FT%']<0)|(data_cleaned['FT%']>100)].index,inplace=True)

# Method 2
# data_cleaned=data_cleaned[(data_cleaned['FT%']>=0)&(data_cleaned['FT%']<=100)]

In [15]:
# Check the number of rows where BLK is negative
print(f"There are {len(data_cleaned[data_cleaned['BLK']<0])} rows where 'BLK' column is negative.")

There are 1029 rows where 'BLK' column is negative.


In [16]:
# Considering the significant number of rows, the column of 'BLK' is removed from the dataset
data_cleaned.drop(['BLK'],axis=1,inplace=True)

In [17]:
# Check the whether there are duplicate rows
sum(data_cleaned.duplicated())

0

In [18]:
# Display number of rows and columns after data cleansing
data_cleaned.shape

(7939, 16)

## Feature Engineering

In [19]:
# Add the columns 'TOTAL_MIN','TOTAL_PTS' and 'FG/FT'
data_cleaned['TOTAL_MIN']=data_cleaned['MIN'] * data_cleaned['GP']
data_cleaned['TOTAL_PTS']=data_cleaned['PTS'] * data_cleaned['GP']
data_cleaned['FG/FT']=data_cleaned['FG%']/data_cleaned['FT%']

In [20]:
# Display number of rows and columns after data cleansing
data_cleaned.shape

(7939, 19)

In [25]:
data_cleaned.head()

Unnamed: 0,GP,MIN,PTS,FGM,FGA,FG%,FTM,FTA,FT%,OREB,DREB,REB,AST,STL,TOV,TARGET_5Yrs,TOTAL_MIN,TOTAL_PTS,FG/FT
0,80,24.3,7.8,3.0,6.4,45.7,2.0,2.9,72.1,2.2,2.0,3.8,3.2,1.1,1.6,1,1944.0,624.0,0.633842
1,75,21.8,10.5,4.2,7.9,55.1,2.4,3.6,67.8,3.6,3.7,6.6,0.7,0.5,1.4,1,1635.0,787.5,0.812684
2,85,19.1,4.5,1.9,4.5,42.8,0.4,0.6,75.7,0.6,1.8,2.4,0.8,0.4,0.6,1,1623.5,382.5,0.56539
3,63,19.1,8.2,3.5,6.7,52.5,0.9,1.5,66.9,0.8,2.0,3.0,1.8,0.4,1.9,1,1203.3,516.6,0.784753
4,63,17.8,3.7,1.7,3.4,50.8,0.2,0.5,54.0,2.4,2.7,4.9,0.4,0.4,0.7,1,1121.4,233.1,0.940741


## Data Preparation

In [26]:
# Extract the column 'TARGET_5Yrs' and save it into variable called target
target=data_cleaned.pop('TARGET_5Yrs')

In [27]:
# Import StandardScaler from sklearn.preprocessing and instantiate the StandardScaler
from sklearn.preprocessing import StandardScaler

scaler=StandardScaler()

In [28]:
# Fit and apply the scaling on data_cleaned
data_cleaned=scaler.fit_transform(data_cleaned)

In [29]:
# Import dump from joblib and save the scaler into the folder models and call the file scaler.joblib
from joblib import dump

dump(scaler, '../models/scaler_dropped_fe.joblib')

['../models/scaler_dropped_fe.joblib']

In [30]:
%load_ext autoreload
%autoreload 2

In [4]:
# Import function scaler_split_train_test from data.sets
import sys
sys.path.insert(1, '..')
from src.data.sets import split_train_test

In [32]:
# Split the scaler data into training (80%) and validation (20%)
X_train, X_val, y_train, y_val=split_train_test(df=data_cleaned,target=target,test_ratio=0.2)

In [33]:
# Import the function save_sets from sets and save the sets into the folder data/processed
from src.data.sets import save_sets
save_sets(X_train, y_train, X_val, y_val, path='../data/processed/')

In [5]:
# Import the function load_sets from sets and load the sets from data/processed
from src.data.sets import load_sets
X_train, y_train, X_val, y_val, X_test, y_test = load_sets(path='../data/processed/')

## Classification Models Comparison with Cleaned Data

In [50]:
# Import lazypredict package and Import LazyClassifier from lazypredict.Supervised
import lazypredict
from lazypredict.Supervised import LazyClassifier

In [52]:
# Fit the Lazyclassifier model based on train set and predict the validation set
clf = LazyClassifier(verbose=0,ignore_warnings=True, custom_metric=None)
models,predictions = clf.fit(X_train, X_val, y_train, y_val)

100%|██████████| 29/29 [00:15<00:00,  1.90it/s]


In [53]:
# Print the metrics of classifiers
models

Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
QuadraticDiscriminantAnalysis,0.61,0.63,0.69,0.66,0.04
BernoulliNB,0.63,0.63,0.68,0.68,0.02
GaussianNB,0.54,0.61,0.69,0.6,0.02
LabelPropagation,0.76,0.54,,0.76,2.11
KNeighborsClassifier,0.82,0.54,0.59,0.79,0.16
LabelSpreading,0.76,0.54,,0.76,4.2
ExtraTreeClassifier,0.74,0.54,0.54,0.75,0.02
DecisionTreeClassifier,0.73,0.54,0.54,0.74,0.09
XGBClassifier,0.83,0.53,0.66,0.79,0.86
LinearDiscriminantAnalysis,0.84,0.53,0.73,0.79,0.06


In [54]:
#  Import dump from joblib and save the fitted model into the folder models as a file called lazypredict_cleaned
from joblib import dump 

dump(clf,  '../models/lazypredict_dropped_fe.joblib')

['../models/lazypredict_dropped_fe.joblib']

## Treatment of Imbalanced Data

In [33]:
# Count the number of target value = 0 or 1 in y_train
pd.DataFrame(y_train).value_counts()

1    5273
0    1078
dtype: int64

### Method 1: SMOTE

In [36]:
# Import SMOTE from imblearn.over_sampling and instantiate the SMOTE class called sm
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state=42)

In [37]:
# Convert to balanced training set
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)

In [38]:
# Count the number of target value = 0 or 1 in y_train_res
pd.DataFrame(y_train_res).value_counts()

0    5273
1    5273
dtype: int64

### Method 2: Oversampling

In [39]:
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=42)

In [40]:
X_train_ros, y_train_ros = ros.fit_resample(X_train, y_train)

In [41]:
# Count the number of target value = 0 or 1 in y_train_ros
pd.DataFrame(y_train_ros).value_counts()

0    5273
1    5273
dtype: int64

### Methond 3: Undersampling

In [12]:
from imblearn.under_sampling import RandomUnderSampler
rus = RandomUnderSampler(random_state=42)

In [28]:
X_train_rus, y_train_rus = rus.fit_resample(X_train, y_train)

In [30]:
# Count the number of target value = 0 or 1 in y_train_rus
pd.DataFrame(y_train_rus).value_counts()

0    1078
1    1078
dtype: int64

### Methond 4: Undersampling - Tomek links

In [42]:
# Import TomekLinks from imblearn.under_sampling and intantiate a TomekLinks Class
from imblearn.under_sampling import TomekLinks
tl = TomekLinks()

In [43]:
# Undersample the train set
X_train_tl, y_train_tl = tl.fit_resample(X_train, y_train)

In [44]:
# Count the number of target value = 0 or 1 in y_train_tl
pd.DataFrame(y_train_tl).value_counts()

1    4899
0    1078
dtype: int64

## Classification Models with Balanced Data

In [25]:
# Import lazypredict package and Import LazyClassifier from lazypredict.Supervised
import lazypredict
from lazypredict.Supervised import LazyClassifier

### Training Models with SMOTE

In [47]:
# Fit the Lazyclassifier model based on train set and predict the validation set
clf_bal = LazyClassifier(verbose=0,ignore_warnings=True, custom_metric=None)
models,predictions = clf_bal.fit(X_train_res, X_val, y_train_res, y_val)

100%|██████████| 29/29 [01:23<00:00,  2.88s/it]


In [48]:
models

Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
LogisticRegression,0.65,0.66,0.73,0.69,0.39
CalibratedClassifierCV,0.65,0.66,0.73,0.7,7.23
LinearDiscriminantAnalysis,0.65,0.66,0.73,0.7,0.21
AdaBoostClassifier,0.69,0.64,0.7,0.72,1.4
BernoulliNB,0.6,0.63,0.68,0.65,0.04
QuadraticDiscriminantAnalysis,0.51,0.62,0.69,0.57,0.06
GaussianNB,0.49,0.61,0.69,0.55,0.04
KNeighborsClassifier,0.6,0.57,0.59,0.65,0.31
ExtraTreesClassifier,0.8,0.57,0.68,0.79,2.37
RandomForestClassifier,0.81,0.56,0.66,0.79,4.57


### Training Models with Oversampling

In [9]:
# Fit the Lazyclassifier model based on train set and predict the validation set
clf_bal = LazyClassifier(verbose=0,ignore_warnings=True, custom_metric=None)
models,predictions = clf_bal.fit(X_train_ros, X_val, y_train_ros, y_val)

100%|██████████| 29/29 [01:18<00:00,  2.70s/it]


In [10]:
models

Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
LinearDiscriminantAnalysis,0.65,0.67,0.73,0.7,0.19
CalibratedClassifierCV,0.65,0.67,0.73,0.7,5.71
LogisticRegression,0.64,0.67,0.73,0.69,0.2
AdaBoostClassifier,0.65,0.64,0.69,0.7,1.13
BernoulliNB,0.59,0.62,0.68,0.65,0.05
LGBMClassifier,0.76,0.62,0.68,0.77,0.48
QuadraticDiscriminantAnalysis,0.47,0.61,0.69,0.53,0.06
GaussianNB,0.49,0.61,0.69,0.55,0.04
KNeighborsClassifier,0.63,0.58,0.59,0.68,0.48
XGBClassifier,0.79,0.58,0.65,0.78,1.45


### Training Models with Undersampling

In [31]:
# Fit the Lazyclassifier model based on train set and predict the validation set
clf_bal = LazyClassifier(verbose=0,ignore_warnings=True, custom_metric=None)
models,predictions = clf_bal.fit(X_train_rus, X_val, y_train_rus, y_val)

100%|██████████| 29/29 [00:07<00:00,  3.96it/s]


In [32]:
models

Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
LinearDiscriminantAnalysis,0.65,0.68,0.73,0.7,0.04
LogisticRegression,0.64,0.68,0.73,0.69,0.08
CalibratedClassifierCV,0.65,0.68,0.73,0.7,1.07
RandomForestClassifier,0.62,0.64,0.69,0.67,0.77
ExtraTreesClassifier,0.62,0.64,0.69,0.67,0.62
AdaBoostClassifier,0.64,0.63,0.69,0.69,0.37
LGBMClassifier,0.62,0.62,0.67,0.67,0.29
BernoulliNB,0.59,0.62,0.68,0.64,0.04
GaussianNB,0.49,0.61,0.69,0.54,0.02
QuadraticDiscriminantAnalysis,0.47,0.61,0.7,0.52,0.03


### Training Models with Undersampling - Tomek Links

In [45]:
# Fit the Lazyclassifier model based on train set and predict the validation set
clf_bal = LazyClassifier(verbose=0,ignore_warnings=True, custom_metric=None)
models,predictions = clf_bal.fit(X_train_tl, X_val, y_train_tl, y_val)

100%|██████████| 29/29 [00:24<00:00,  1.18it/s]


In [46]:
models

Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
BernoulliNB,0.61,0.63,0.68,0.67,0.03
QuadraticDiscriminantAnalysis,0.59,0.63,0.69,0.65,0.05
GaussianNB,0.54,0.61,0.69,0.6,0.04
ExtraTreeClassifier,0.73,0.56,0.56,0.75,0.04
KNeighborsClassifier,0.82,0.56,0.6,0.79,0.24
BaggingClassifier,0.8,0.55,0.63,0.78,0.84
LabelSpreading,0.75,0.55,,0.76,6.3
LabelPropagation,0.75,0.54,,0.75,3.76
LinearDiscriminantAnalysis,0.84,0.54,0.73,0.8,0.12
XGBClassifier,0.82,0.53,0.66,0.78,1.03
