# Data Preprocessing - Dropping Outliers

##  Load and explore the dataset

In [21]:
# Load the pandas and numpy packages
import pandas as pd
import numpy as np

In [22]:
# Import csv file and save into data
data=pd.read_csv('../data/raw/2022_train.csv')

In [23]:
# Display the first 5 rows of data and all the columns
pd.set_option('max_columns', None)
data.head()

Unnamed: 0,Id,GP,MIN,PTS,FGM,FGA,FG%,3P Made,3PA,3P%,FTM,FTA,FT%,OREB,DREB,REB,AST,STL,BLK,TOV,TARGET_5Yrs
0,3799,80,24.3,7.8,3.0,6.4,45.7,0.1,0.3,22.6,2.0,2.9,72.1,2.2,2.0,3.8,3.2,1.1,0.2,1.6,1
1,3800,75,21.8,10.5,4.2,7.9,55.1,-0.3,-1.0,34.9,2.4,3.6,67.8,3.6,3.7,6.6,0.7,0.5,0.6,1.4,1
2,3801,85,19.1,4.5,1.9,4.5,42.8,0.4,1.2,34.3,0.4,0.6,75.7,0.6,1.8,2.4,0.8,0.4,0.2,0.6,1
3,3802,63,19.1,8.2,3.5,6.7,52.5,0.3,0.8,23.7,0.9,1.5,66.9,0.8,2.0,3.0,1.8,0.4,0.1,1.9,1
4,3803,63,17.8,3.7,1.7,3.4,50.8,0.5,1.4,13.7,0.2,0.5,54.0,2.4,2.7,4.9,0.4,0.4,0.6,0.7,1


In [24]:
# Display the summary of columns
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8000 entries, 0 to 7999
Data columns (total 21 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Id           8000 non-null   int64  
 1   GP           8000 non-null   int64  
 2   MIN          8000 non-null   float64
 3   PTS          8000 non-null   float64
 4   FGM          8000 non-null   float64
 5   FGA          8000 non-null   float64
 6   FG%          8000 non-null   float64
 7   3P Made      8000 non-null   float64
 8   3PA          8000 non-null   float64
 9   3P%          8000 non-null   float64
 10  FTM          8000 non-null   float64
 11  FTA          8000 non-null   float64
 12  FT%          8000 non-null   float64
 13  OREB         8000 non-null   float64
 14  DREB         8000 non-null   float64
 15  REB          8000 non-null   float64
 16  AST          8000 non-null   float64
 17  STL          8000 non-null   float64
 18  BLK          8000 non-null   float64
 19  TOV   

Based on the summary above, none of the columns in dataframe has missing values.

In [25]:
# Display the dimensions(shape) of data
data.shape

(8000, 21)

In [26]:
# Display the descriptive statistics
data.describe()

Unnamed: 0,Id,GP,MIN,PTS,FGM,FGA,FG%,3P Made,3PA,3P%,FTM,FTA,FT%,OREB,DREB,REB,AST,STL,BLK,TOV,TARGET_5Yrs
count,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0
mean,7798.5,62.78,18.58,7.27,2.81,6.23,44.61,0.26,0.82,19.58,1.39,1.95,71.37,1.08,2.17,3.25,1.62,0.65,0.25,1.26,0.83
std,2309.55,17.12,8.94,4.32,1.69,3.58,6.16,0.38,1.06,16.0,0.93,1.25,10.43,0.79,1.39,2.09,1.36,0.41,0.82,0.72,0.37
min,3799.0,-8.0,2.9,0.8,0.3,0.8,21.3,-1.1,-3.1,-38.5,0.0,0.0,-13.3,0.0,0.2,0.3,0.0,0.0,-17.9,0.1,0.0
25%,5798.75,51.0,12.0,4.1,1.6,3.6,40.4,0.0,0.1,8.4,0.7,1.0,65.0,0.5,1.1,1.7,0.7,0.3,0.1,0.7,1.0
50%,7798.5,63.0,16.8,6.3,2.4,5.4,44.4,0.3,0.8,19.5,1.2,1.7,71.4,0.9,1.9,2.8,1.3,0.6,0.2,1.1,1.0
75%,9798.25,74.0,23.5,9.5,3.7,8.1,48.7,0.5,1.5,30.6,1.9,2.6,77.5,1.5,2.9,4.3,2.2,0.9,0.4,1.6,1.0
max,11798.0,123.0,73.8,34.2,13.1,28.9,67.2,1.7,4.7,82.1,8.1,11.1,168.9,5.5,11.0,15.9,12.8,3.6,18.9,5.3,1.0


**Unreasonable data based on descriptive summary**
- Games played cannot be negative
- 3P, 3PA and 3P% Made cannot be negative
- FT% can not be negative and cannot be over 100%
- BLK can not be negative

##  Split Train and Test Sets for Raw Data

In [27]:
# Create a copy of data and save it into a variable data_cleaned
data_cleaned=data.copy()

In [28]:
# Remove the id column
data_cleaned.drop('Id',axis=1,inplace=True)

In [29]:
# Extract the column 'TARGET_5Yrs' and save it into variable called target
target=data_cleaned.pop('TARGET_5Yrs')

In [30]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [31]:
# Import function scaler_split_train_test from data.sets
import sys
sys.path.insert(1, '..')
from src.data.sets import split_train_test

In [32]:
# del sys.path[1]
# sys.path

In [33]:
# Split the scaler data into training (80%) and validation (20%)
X_train, X_val, y_train, y_val=split_train_test(df=data_cleaned,target=target,test_ratio=0.2)

In [34]:
# Import the function save_sets from sets and save the sets into the folder data/processed
from src.data.sets import save_sets
save_sets(X_train, y_train, X_val, y_val, path='../data/processed/')

In [35]:
# Import the function load_sets from sets and load the sets from data/processed
from src.data.sets import load_sets
X_train, y_train, X_val, y_val, X_test, y_test = load_sets(path='../data/processed/')

## Baseline Model

In [15]:
# Import statistics
from statistics import mode

In [16]:
# Find the mode of the target variable from the training set
y_mode=mode(y_train)

In [17]:
# Create a numpy array called y_base of dimensions (len(y_train), 1) filled with the mode value
y_base=np.full((len(y_train),1),y_mode)

In [18]:
# Import the function print_class_perf from models.performance and display the ROC-AUC score
from src.models.performance import print_class_perf

print_class_perf(y_train,y_base,set_name='Training')

ROC AUC Score Training: 0.5


## Basic Classification Models

In [40]:
# Import lazypredict package
import lazypredict

In [41]:
# Import LazyClassifier from lazypredict.Supervised
from lazypredict.Supervised import LazyClassifier

In [42]:
# Fit the Lazyclassifier model based on train set and predict the validation set
clf = LazyClassifier(verbose=0,ignore_warnings=True, custom_metric=None,random_state=8)
models,predictions = clf.fit(X_train, X_val, y_train, y_val)

100%|██████████| 29/29 [00:24<00:00,  1.20it/s]


In [19]:
# Display all the rows
pd.set_option('display.max_rows', None)

In [43]:
# Print the metrics for different classifiers
models

Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
GaussianNB,0.58,0.63,0.69,0.64,0.02
BernoulliNB,0.64,0.63,0.68,0.68,0.25
QuadraticDiscriminantAnalysis,0.77,0.6,0.67,0.77,0.03
DecisionTreeClassifier,0.75,0.56,0.56,0.76,0.13
ExtraTreeClassifier,0.74,0.55,0.55,0.75,0.03
KNeighborsClassifier,0.82,0.54,0.61,0.78,0.21
BaggingClassifier,0.81,0.54,0.62,0.78,1.87
LabelSpreading,0.75,0.54,0.63,0.75,5.4
LabelPropagation,0.75,0.53,0.62,0.75,2.89
LGBMClassifier,0.83,0.52,0.67,0.78,0.29


In [44]:
#  Import dump from joblib and save the fitted model into the folder models as a file called lazypredict_raw
from joblib import dump 

dump(clf,  '../models/lazypredict_raw.joblib')

['../models/lazypredict_raw.joblib']

## Data Preprocessing - Dropping Outliers

In [45]:
# Create a copy of data and save it into a variable data_cleaned
data_cleaned=data.copy()

In [46]:
# Remove the id column
data_cleaned.drop('Id',axis=1,inplace=True)

In [47]:
# Check the number of rows where Games played are below or equal to 0
len(data_cleaned[(data_cleaned['GP']<=0)])

2

In [48]:
# Considering the small number, remove the 2 records when the column Games played is negative
data_cleaned.drop(data_cleaned[data_cleaned['GP']<=0].index,inplace=True)

# Method 2
# data_cleaned=data_cleaned[(data_cleaned['GP']>0)]

In [49]:
# Check the number of rows where the columns of 3P Made, 3PA and 3P% are negative
print(f"There are {len(data_cleaned[data_cleaned['3P Made']<0])} rows where '3P Made' column is negative.")
print(f"There are {len(data_cleaned[data_cleaned['3PA']<0])} rows where '3PA' column is negative.")
print(f"There are {len(data_cleaned[data_cleaned['3P%']<0])} rows where '3P%' column is negative.")

There are 1628 rows where '3P Made' column is negative.
There are 1657 rows where '3PA' column is negative.
There are 878 rows where '3P%' column is negative.


In [50]:
# Considering the significant number of rows, the columns of '3P Made', '3PA' and '3P%' are removed from the dataset
data_cleaned.drop(['3P Made','3PA','3P%'],axis=1,inplace=True)

In [51]:
# Check the number of rows where FT% is negative or over 100%
print(f"There are {len(data_cleaned[data_cleaned['FT%']<0])} rows where 'FT%' column is negative.")
print(f"There are {len(data_cleaned[data_cleaned['FT%']>100])} rows where 'FT%' column is over 100%.")

There are 1 rows where 'FT%' column is negative.
There are 58 rows where 'FT%' column is over 100%.


In [52]:
# Considering the small number, remove the records that FT% is negative or over 100%
data_cleaned.drop(data_cleaned[(data_cleaned['FT%']<0)|(data_cleaned['FT%']>100)].index,inplace=True)

# Method 2
# data_cleaned=data_cleaned[(data_cleaned['FT%']>=0)&(data_cleaned['FT%']<=100)]

In [53]:
# Check the number of rows where BLK is negative
print(f"There are {len(data_cleaned[data_cleaned['BLK']<0])} rows where 'BLK' column is negative.")

There are 1029 rows where 'BLK' column is negative.


In [54]:
# Considering the significant number of rows, the column of 'BLK' is removed from the dataset
data_cleaned.drop(['BLK'],axis=1,inplace=True)

In [55]:
# Check the whether there are duplicate rows
sum(data_cleaned.duplicated())

0

In [56]:
# Display number of rows and columns after data cleansing
data_cleaned.shape

(7939, 16)

In [57]:
# Extract the column 'TARGET_5Yrs' and save it into variable called target
target=data_cleaned.pop('TARGET_5Yrs')

In [58]:
# Import StandardScaler from sklearn.preprocessing and instantiate the StandardScaler
from sklearn.preprocessing import StandardScaler

scaler=StandardScaler()

In [60]:
# Fit and apply the scaling on data_cleaned
data_cleaned=scaler.fit_transform(data_cleaned)

In [61]:
# Import dump from joblib and save the scaler into the folder models and call the file scaler.joblib
from joblib import dump

dump(scaler, '../models/scaler.joblib')

In [63]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [64]:
# Import function scaler_split_train_test from data.sets
import sys
sys.path.insert(1, '..')
from src.data.sets import split_train_test

In [65]:
# Split the scaler data into training (80%) and validation (20%)
X_train, X_val, y_train, y_val=split_train_test(df=data_cleaned,target=target,test_ratio=0.2)

In [66]:
# Import the function save_sets from sets and save the sets into the folder data/processed
from src.data.sets import save_sets
save_sets(X_train, y_train, X_val, y_val, path='../data/processed/')

In [67]:
# Import the function load_sets from sets and load the sets from data/processed
from src.data.sets import load_sets
X_train, y_train, X_val, y_val, X_test, y_test = load_sets(path='../data/processed/')

## Basic Classification Models with Cleaned Data

In [68]:
# Import lazypredict package
import lazypredict

In [69]:
# Import LazyClassifier from lazypredict.Supervised
from lazypredict.Supervised import LazyClassifier

In [70]:
# Fit the Lazyclassifier model based on train set and predict the validation set
clf = LazyClassifier(verbose=0,ignore_warnings=True, custom_metric=None)
models,predictions = clf.fit(X_train, X_val, y_train, y_val)

100%|██████████| 29/29 [00:12<00:00,  2.27it/s]


In [71]:
# Print the metrics of classifiers
models

Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
QuadraticDiscriminantAnalysis,0.74,0.63,0.69,0.76,0.03
GaussianNB,0.57,0.62,0.69,0.63,0.02
BernoulliNB,0.65,0.62,0.68,0.69,0.02
ExtraTreeClassifier,0.75,0.55,0.55,0.75,0.02
LabelPropagation,0.77,0.54,,0.76,2.01
KNeighborsClassifier,0.82,0.54,0.6,0.79,0.31
LabelSpreading,0.77,0.54,,0.76,2.83
BaggingClassifier,0.81,0.54,0.62,0.78,0.38
DecisionTreeClassifier,0.73,0.53,0.53,0.74,0.07
XGBClassifier,0.82,0.52,0.66,0.78,0.39


In [72]:
#  Import dump from joblib and save the fitted model into the folder models as a file called lazypredict_cleaned
from joblib import dump 

dump(clf,  '../models/lazypredict_cleaned.joblib')

['../models/lazypredict_cleaned.joblib']