<a href="https://colab.research.google.com/github/aiswaryapaul/pca-handson/blob/main/PCA_Practical_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### **1. Importing the required packages**

In [None]:
import numpy as np
import pandas as pd

#machine learning related packages
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.preprocessing import LabelEncoder, StandardScaler

### **2. Reading and Exploring the data**

In [None]:
data = pd.read_csv('nba_final.csv')

In [None]:
data.head() #print the top 5 rows to take a quick look at the data

Unnamed: 0,Rk,Player.x,Player_ID,Pos1,Pos2,Age,Tm,G,GS,MP,...,Conference,Role,Fvot,FRank,Pvot,PRank,Mvot,MRank,Score,Play
0,170,A.J. Hammons,hammoaj01,C,,24,DAL,22,0,7.4,...,West,Front,786,123,,,,,83.5,No
1,58,Aaron Brooks,brookaa01,PG,,32,IND,65,0,13.8,...,Est,Back,2474,64,,,,,48.2,No
2,157,Aaron Gordon,gordoaa01,SF,,21,ORL,80,72,28.7,...,Est,Front,22774,29,,,,,40.0,No
3,352,Adreian Payne,paynead01,PF,,25,MIN,18,0,7.5,...,West,Front,861,120,1.0,52.0,,,75.5,No
4,10,Al-Farouq Aminu,aminual01,PF,,26,POR,61,25,29.1,...,West,Front,4971,69,7.0,23.0,,,42.8,No


In [None]:
data.shape  #print the number of rows and columns

(1408, 45)

In [None]:
data.dtypes #print the datatype of values in each column

Unnamed: 0,0
Rk,int64
Player.x,object
Player_ID,object
Pos1,object
Pos2,object
Age,int64
Tm,object
G,int64
GS,int64
MP,float64


In [None]:
data.isnull().sum(axis = 0) #print the total number of missing values column-wise

Unnamed: 0,0
Rk,0
Player.x,0
Player_ID,0
Pos1,0
Pos2,1396
Age,0
Tm,0
G,0
GS,0
MP,0


In [None]:
round((data.isnull().sum(axis = 0) / len(data)) * 100, 2)

Unnamed: 0,0
Rk,0.0
Player.x,0.0
Player_ID,0.0
Pos1,0.0
Pos2,99.15
Age,0.0
Tm,0.0
G,0.0
GS,0.0
MP,0.0


In [None]:
data['Pos2'] = data['Pos2'].fillna('None')

In [None]:
data = data.fillna(0)

In [None]:
data.duplicated().sum() #print the number of duplicate rows

np.int64(0)

In [None]:
data.drop_duplicates(inplace = True)

In [None]:
data.drop(columns = ['Player.x', 'Player_ID'], inplace = True)

#### **Encoding the categorical columns with LabelEncoder**

In [None]:
obj_cols = data.select_dtypes('object').columns

In [None]:
le = LabelEncoder()

for col in obj_cols:
  data[col] = le.fit_transform(data[col])

In [None]:
#check for data imbalance
data['Play'].value_counts()

Unnamed: 0_level_0,count
Play,Unnamed: 1_level_1
0,1335
1,73


### **Machine Learning Process**

In [None]:
X = data.drop(columns = 'Play')
y = data['Play']

In [None]:
#split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42, stratify = y)

#### **Mean Centering/Scaling/Standardization of the data**

- Always perform the scaling after splitting the data into training and testing

In [None]:
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

### Apply the Logistic Regression on the mean centered data before doing the PCA transformation

In [None]:
log_reg = LogisticRegression()
log_reg.fit(X_train_scaled, y_train)

In [None]:
y_pred = log_reg.predict(X_test_scaled)

In [None]:
accuracy_score(y_test, y_pred)  #this should be not be trusted in the case of imbalanced data

0.975177304964539

In [None]:
roc_auc_score(y_test, y_pred)

np.float64(0.7981273408239701)

#### **To check for overfitting**

In [None]:
y_pred_train = log_reg.predict(X_train_scaled)

In [None]:
roc_auc_score(y_train, y_pred_train)

np.float64(0.9210092987214258)

#### **Now we will transform the data using PCA and check the performance of the model using the PCA transformed data**

In [None]:
from sklearn.decomposition import PCA

pca = PCA(n_components = 0.85)

In [None]:
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

In [None]:
X_train_pca.shape

(1126, 13)

#### **Apply the logistic regression again on the pca transformed data**

In [None]:
log_reg2 = LogisticRegression()
log_reg2.fit(X_train_pca, y_train)

In [None]:
y_pred2 = log_reg2.predict(X_test_pca)

In [None]:
roc_auc_score(y_test, y_pred2)

np.float64(0.9)

### **Apply the Linear Discriminant Analysis(LDA) on the data**

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

lda = LinearDiscriminantAnalysis()

In [None]:
X_train_lda = lda.fit_transform(X_train_scaled, y_train)
X_test_lda = lda.transform(X_test_scaled)

In [None]:
X_train_lda.shape

(1126, 1)

#### **Apply the Logistic Regression on the LDA transformed data**

In [None]:
log_reg3 = LogisticRegression()
log_reg3.fit(X_train_lda, y_train)

In [None]:
y_pred_3 = log_reg3.predict(X_test_lda)

In [None]:
roc_auc_score(y_test, y_pred_3)

np.float64(0.7925093632958802)