Importing the libraries

In [1]:
import numpy as np
import pandas as pd

#plotting
import matplotlib.pyplot as plt

#Normalisation
from sklearn.preprocessing import MinMaxScaler

#cross validation
from sklearn.model_selection import KFold

#metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score

# different models
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB

In [2]:
# Load dataset
# The path is relative, so the CSV file must sit in the notebook's working directory
df = pd.read_csv("nba_logreg.csv") 

Before we jump into model construction or anything else, let's spend some time on a first-hand exploratory data analysis (EDA).

In [3]:
# Look at the top 3 entries to get an idea of the columns and value ranges in the dataframe
df.head(3)

Unnamed: 0,Name,GP,MIN,PTS,FGM,FGA,FG%,3P Made,3PA,3P%,...,FTA,FT%,OREB,DREB,REB,AST,STL,BLK,TOV,TARGET_5Yrs
0,Brandon Ingram,36,27.4,7.4,2.6,7.6,34.7,0.5,2.1,25.0,...,2.3,69.9,0.7,3.4,4.1,1.9,0.4,0.4,1.3,0.0
1,Andrew Harrison,35,26.9,7.2,2.0,6.7,29.6,0.7,2.8,23.5,...,3.4,76.5,0.5,2.0,2.4,3.7,1.1,0.5,1.6,0.0
2,JaKarr Sampson,74,15.3,5.2,2.0,4.7,42.2,0.4,1.7,24.4,...,1.3,67.0,0.5,1.7,2.2,1.0,0.5,0.3,1.0,0.0


In [4]:
# (rows, columns) of the entire dataframe as loaded, before any cleaning
df.shape

(1340, 21)

Let's see if we have any duplicate rows in our dataframe.

In [5]:
# duplicated() flags every row that exactly repeats an earlier row;
# the first occurrence of each repeated row is not flagged.
duplicate_rows = df.loc[df.duplicated()]
print("no of duplicate rows:", len(duplicate_rows))

no of duplicate rows: 12


In [6]:
# keep=False flags every member of a duplicated group (the first occurrence as well as its repeats),
# so each original row is listed next to its copies
all_duplicate_rows = df[df.duplicated(keep=False)] #keep=False, for fetching all the duplicate rows

In [7]:
# Display the duplicated rows to confirm by eye that they are exact repeats
all_duplicate_rows

Unnamed: 0,Name,GP,MIN,PTS,FGM,FGA,FG%,3P Made,3PA,3P%,...,FTA,FT%,OREB,DREB,REB,AST,STL,BLK,TOV,TARGET_5Yrs
162,Charles Smith,60,8.7,2.9,1.0,2.2,44.4,0.0,0.1,0.0,...,1.3,69.7,0.2,0.9,1.2,1.7,0.6,0.1,0.6,1.0
163,Charles Smith,60,8.7,2.9,1.0,2.2,44.4,0.0,0.1,0.0,...,1.3,69.7,0.2,0.9,1.2,1.7,0.6,0.1,0.6,1.0
165,Charles Smith,71,30.4,16.3,6.1,12.4,49.5,0.0,0.0,0.0,...,5.5,72.5,2.4,4.1,6.5,1.5,1.0,1.3,2.1,1.0
166,Charles Smith,71,30.4,16.3,6.1,12.4,49.5,0.0,0.0,0.0,...,5.5,72.5,2.4,4.1,6.5,1.5,1.0,1.3,2.1,1.0
168,Charles Smith,34,8.6,3.5,1.4,3.7,39.2,0.4,1.4,31.9,...,0.3,54.5,0.4,0.4,0.8,0.6,0.3,0.2,0.8,1.0
169,Charles Smith,34,8.6,3.5,1.4,3.7,39.2,0.4,1.4,31.9,...,0.3,54.5,0.4,0.4,0.8,0.6,0.3,0.2,0.8,1.0
242,Reggie Williams,35,24.5,10.4,4.3,12.2,35.6,0.4,1.7,22.4,...,1.9,72.7,1.6,1.8,3.4,1.7,0.8,0.6,1.8,1.0
243,Reggie Williams,35,24.5,10.4,4.3,12.2,35.6,0.4,1.7,22.4,...,1.9,72.7,1.6,1.8,3.4,1.7,0.8,0.6,1.8,1.0
338,Ken Johnson,64,12.7,4.1,1.8,3.3,52.8,0.0,0.0,,...,1.3,43.5,1.4,2.4,3.8,0.3,0.2,0.3,0.9,0.0
339,Ken Johnson,64,12.7,4.1,1.8,3.3,52.8,0.0,0.0,,...,1.3,43.5,1.4,2.4,3.8,0.3,0.2,0.3,0.9,0.0


dropping the duplicate rows now

In [8]:
# Keep the first occurrence of each repeated row and discard the later copies.
# NOTE(review): the index is not reset here, so the surviving rows keep their original
# labels and the index has gaps — confirm later cells do not assume a contiguous 0..n-1 index.
df = df.drop_duplicates()

Let's get more info about the dataframe, such as the number of null values, the dtype of each column, and whether there are any categorical columns.

In [9]:
# (rows, columns) after dropping the duplicate rows
df.shape

(1328, 21)

In [10]:
# Per-column non-null counts and dtypes, used to spot missing values and non-numeric columns
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1328 entries, 0 to 1339
Data columns (total 21 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Name         1328 non-null   object 
 1   GP           1328 non-null   int64  
 2   MIN          1328 non-null   float64
 3   PTS          1328 non-null   float64
 4   FGM          1328 non-null   float64
 5   FGA          1328 non-null   float64
 6   FG%          1328 non-null   float64
 7   3P Made      1328 non-null   float64
 8   3PA          1328 non-null   float64
 9   3P%          1318 non-null   float64
 10  FTM          1328 non-null   float64
 11  FTA          1328 non-null   float64
 12  FT%          1328 non-null   float64
 13  OREB         1328 non-null   float64
 14  DREB         1328 non-null   float64
 15  REB          1328 non-null   float64
 16  AST          1328 non-null   float64
 17  STL          1328 non-null   float64
 18  BLK          1328 non-null   float64
 19  TOV   

- No categorical features (the only non-numeric column is `Name`): therefore no one-hot encoding is needed
- Missing values: 10 for **3P%**  

### Fix missing values: There are lots of things that can be done depending on the complexity of the project  
- Replacing the missing values by 0
- Replacing them by mean of the column
- Building a regression model and predicting the value
- Finding the column value of the nearest neighbor in multidimensional space and considering its value. 

Here, we will simply replace the missing values with the mean of the column to avoid complexity.

In [11]:
# Mean-impute '3P%': Series.mean() skips NaN by default, so the average is taken
# over the known values only and then written into every NaN slot.
# NOTE(review): the NaN row visible in the duplicate-rows output has 3PA == 0, i.e. the
# percentage is undefined because no three-pointers were attempted — confirm that the
# column mean (rather than 0) is the intended fill value for such players.
mean_value = df['3P%'].mean()
df['3P%'] = df['3P%'].fillna(value=mean_value)

In [12]:
# Re-check the non-null counts to verify that '3P%' no longer has missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1328 entries, 0 to 1339
Data columns (total 21 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Name         1328 non-null   object 
 1   GP           1328 non-null   int64  
 2   MIN          1328 non-null   float64
 3   PTS          1328 non-null   float64
 4   FGM          1328 non-null   float64
 5   FGA          1328 non-null   float64
 6   FG%          1328 non-null   float64
 7   3P Made      1328 non-null   float64
 8   3PA          1328 non-null   float64
 9   3P%          1328 non-null   float64
 10  FTM          1328 non-null   float64
 11  FTA          1328 non-null   float64
 12  FT%          1328 non-null   float64
 13  OREB         1328 non-null   float64
 14  DREB         1328 non-null   float64
 15  REB          1328 non-null   float64
 16  AST          1328 non-null   float64
 17  STL          1328 non-null   float64
 18  BLK          1328 non-null   float64
 19  TOV   

The dataframe looks clean now and shouldn't require any further processing.  
We can now drop the **'Name'** column, as it wouldn't add any value to the analysis or modeling.

In [13]:
# Build the modeling frame without the player-name column; df itself is left untouched
df_final = df.drop(columns=['Name'])

In [14]:
# Final dataset: preview the first 2 rows to confirm the 'Name' column is gone
df_final.head(2)

Unnamed: 0,GP,MIN,PTS,FGM,FGA,FG%,3P Made,3PA,3P%,FTM,FTA,FT%,OREB,DREB,REB,AST,STL,BLK,TOV,TARGET_5Yrs
0,36,27.4,7.4,2.6,7.6,34.7,0.5,2.1,25.0,1.6,2.3,69.9,0.7,3.4,4.1,1.9,0.4,0.4,1.3,0.0
1,35,26.9,7.2,2.0,6.7,29.6,0.7,2.8,23.5,2.6,3.4,76.5,0.5,2.0,2.4,3.7,1.1,0.5,1.6,0.0


The EDA step would be incomplete without some graphical observations of the features and how they relate to the labels