## Dimensionality Reduction:

#### PCA - Principal Component Analysis
#### LDA - LinearDiscriminantAnalysis

- It means reducing the dimenision of the data
- Which means that we are reducing the number of columns present in the data
- Work: Derive new set of features(m) out of the original features present in the data(n)
- m < n
- Accuracy is not compromised

## Load the standard libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

## Load the data

In [2]:
data = pd.read_csv('Iris.csv')
data.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [3]:
data.drop('Id', axis = 1, inplace = True)
data.head()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


## Create X and y

In [4]:
X = data.drop('Species', axis = 1)
y = data['Species']

## Feature Scaling 

In [5]:
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()

In [6]:
ss.fit_transform(X)

array([[-9.00681170e-01,  1.03205722e+00, -1.34127240e+00,
        -1.31297673e+00],
       [-1.14301691e+00, -1.24957601e-01, -1.34127240e+00,
        -1.31297673e+00],
       [-1.38535265e+00,  3.37848329e-01, -1.39813811e+00,
        -1.31297673e+00],
       [-1.50652052e+00,  1.06445364e-01, -1.28440670e+00,
        -1.31297673e+00],
       [-1.02184904e+00,  1.26346019e+00, -1.34127240e+00,
        -1.31297673e+00],
       [-5.37177559e-01,  1.95766909e+00, -1.17067529e+00,
        -1.05003079e+00],
       [-1.50652052e+00,  8.00654259e-01, -1.34127240e+00,
        -1.18150376e+00],
       [-1.02184904e+00,  8.00654259e-01, -1.28440670e+00,
        -1.31297673e+00],
       [-1.74885626e+00, -3.56360566e-01, -1.34127240e+00,
        -1.31297673e+00],
       [-1.14301691e+00,  1.06445364e-01, -1.28440670e+00,
        -1.44444970e+00],
       [-5.37177559e-01,  1.49486315e+00, -1.28440670e+00,
        -1.31297673e+00],
       [-1.26418478e+00,  8.00654259e-01, -1.22754100e+00,
      

## Encoding y

In [8]:
data['Species'].unique()

array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype=object)

In [10]:
dic = {'Iris-setosa' : 0, 'Iris-versicolor' : 1, 'Iris-virginica' : 2}
y = y.replace(dic)
y

0      0
1      0
2      0
3      0
4      0
      ..
145    2
146    2
147    2
148    2
149    2
Name: Species, Length: 150, dtype: int64

## Applying principal component analysis

In [31]:
from sklearn.decomposition import PCA
pca = PCA(n_components = 2)

In [32]:
res = pca.fit_transform(X)
res

array([[-2.68420713,  0.32660731],
       [-2.71539062, -0.16955685],
       [-2.88981954, -0.13734561],
       [-2.7464372 , -0.31112432],
       [-2.72859298,  0.33392456],
       [-2.27989736,  0.74778271],
       [-2.82089068, -0.08210451],
       [-2.62648199,  0.17040535],
       [-2.88795857, -0.57079803],
       [-2.67384469, -0.1066917 ],
       [-2.50652679,  0.65193501],
       [-2.61314272,  0.02152063],
       [-2.78743398, -0.22774019],
       [-3.22520045, -0.50327991],
       [-2.64354322,  1.1861949 ],
       [-2.38386932,  1.34475434],
       [-2.6225262 ,  0.81808967],
       [-2.64832273,  0.31913667],
       [-2.19907796,  0.87924409],
       [-2.58734619,  0.52047364],
       [-2.3105317 ,  0.39786782],
       [-2.54323491,  0.44003175],
       [-3.21585769,  0.14161557],
       [-2.30312854,  0.10552268],
       [-2.35617109, -0.03120959],
       [-2.50791723, -0.13905634],
       [-2.469056  ,  0.13788731],
       [-2.56239095,  0.37468456],
       [-2.63982127,

## Converting the output to dataframe

In [33]:
pca_data = pd.DataFrame(res, columns = ['PC1', 'PC2'])
pca_data['Species'] = y
pca_data

Unnamed: 0,PC1,PC2,Species
0,-2.684207,0.326607,0
1,-2.715391,-0.169557,0
2,-2.889820,-0.137346,0
3,-2.746437,-0.311124,0
4,-2.728593,0.333925,0
...,...,...,...
145,1.944017,0.187415,2
146,1.525664,-0.375021,2
147,1.764046,0.078519,2
148,1.901629,0.115877,2


In [22]:
pca.explained_variance_

array([4.22484077, 0.24224357, 0.07852391])

## Apply Logistic Regression on the Original Data

In [23]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)

In [24]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr

In [25]:
lr.fit(X_train, y_train)

In [28]:
y_pred = lr.predict(X_test)
y_pred

array([1, 0, 2, 1, 1, 0, 1, 2, 1, 1, 2, 0, 0, 0, 0, 1, 2, 1, 1, 2, 0, 2,
       0, 2, 2, 2, 2, 2, 0, 0, 0, 0, 1, 0, 0, 2, 1, 0, 0, 0, 2, 1, 1, 0,
       0], dtype=int64)

In [29]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

1.0

## Apply Logistic Regression on PCA data

In [34]:
pca_data

Unnamed: 0,PC1,PC2,Species
0,-2.684207,0.326607,0
1,-2.715391,-0.169557,0
2,-2.889820,-0.137346,0
3,-2.746437,-0.311124,0
4,-2.728593,0.333925,0
...,...,...,...
145,1.944017,0.187415,2
146,1.525664,-0.375021,2
147,1.764046,0.078519,2
148,1.901629,0.115877,2


In [35]:
X = pca_data.drop('Species', axis = 1)
y = pca_data['Species']

In [36]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [37]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr

In [38]:
lr.fit(X_train, y_train)

In [39]:
y_pred_pca = lr.predict(X_test)
y_pred_pca

array([1, 0, 2, 1, 1, 0, 1, 2, 1, 1, 2, 0, 0, 0, 0, 1, 2, 1, 1, 2, 0, 2,
       0, 2, 2, 2, 2, 2, 0, 0], dtype=int64)

In [40]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred_pca)

1.0

### Difference between PCA and LDA

- PCA when deriving features considers only independent variables
- LDA when deriving features consider both independent variable and target

In [41]:
data.head()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [42]:
data['Species'].value_counts()

Iris-setosa        50
Iris-versicolor    50
Iris-virginica     50
Name: Species, dtype: int64

In [46]:
data[data['Species'] == 'Iris-setosa']

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
5,5.4,3.9,1.7,0.4,Iris-setosa
6,4.6,3.4,1.4,0.3,Iris-setosa
7,5.0,3.4,1.5,0.2,Iris-setosa
8,4.4,2.9,1.4,0.2,Iris-setosa
9,4.9,3.1,1.5,0.1,Iris-setosa


In [47]:
len(data[data['Species'] == 'Iris-setosa'])

50

## Create X and y

In [49]:
X = data.drop('Species', axis = 1)
y = data['Species']

## Feature Scaling 

In [50]:
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()

In [51]:
ss.fit_transform(X)

array([[-9.00681170e-01,  1.03205722e+00, -1.34127240e+00,
        -1.31297673e+00],
       [-1.14301691e+00, -1.24957601e-01, -1.34127240e+00,
        -1.31297673e+00],
       [-1.38535265e+00,  3.37848329e-01, -1.39813811e+00,
        -1.31297673e+00],
       [-1.50652052e+00,  1.06445364e-01, -1.28440670e+00,
        -1.31297673e+00],
       [-1.02184904e+00,  1.26346019e+00, -1.34127240e+00,
        -1.31297673e+00],
       [-5.37177559e-01,  1.95766909e+00, -1.17067529e+00,
        -1.05003079e+00],
       [-1.50652052e+00,  8.00654259e-01, -1.34127240e+00,
        -1.18150376e+00],
       [-1.02184904e+00,  8.00654259e-01, -1.28440670e+00,
        -1.31297673e+00],
       [-1.74885626e+00, -3.56360566e-01, -1.34127240e+00,
        -1.31297673e+00],
       [-1.14301691e+00,  1.06445364e-01, -1.28440670e+00,
        -1.44444970e+00],
       [-5.37177559e-01,  1.49486315e+00, -1.28440670e+00,
        -1.31297673e+00],
       [-1.26418478e+00,  8.00654259e-01, -1.22754100e+00,
      

## Encoding y

In [52]:
data['Species'].unique()

array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype=object)

In [53]:
dic = {'Iris-setosa' : 0, 'Iris-versicolor' : 1, 'Iris-virginica' : 2}
y = y.replace(dic)
y

0      0
1      0
2      0
3      0
4      0
      ..
145    2
146    2
147    2
148    2
149    2
Name: Species, Length: 150, dtype: int64

## Apply LDA on the data

In [54]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
lda = LinearDiscriminantAnalysis(n_components = 2)

In [56]:
res_lda = lda.fit_transform(X, y)
res_lda

array([[ 8.0849532 ,  0.32845422],
       [ 7.1471629 , -0.75547326],
       [ 7.51137789, -0.23807832],
       [ 6.83767561, -0.64288476],
       [ 8.15781367,  0.54063935],
       [ 7.72363087,  1.48232345],
       [ 7.23514662,  0.3771537 ],
       [ 7.62974497,  0.01667246],
       [ 6.58274132, -0.98737424],
       [ 7.36884116, -0.91362729],
       [ 8.42181434,  0.67622968],
       [ 7.24739721, -0.08292417],
       [ 7.35062105, -1.0393597 ],
       [ 7.59646896, -0.77671553],
       [ 9.86936588,  1.61486093],
       [ 9.18033614,  2.75558626],
       [ 8.59760709,  1.85442217],
       [ 7.7995682 ,  0.60905468],
       [ 8.1000091 ,  0.99610981],
       [ 8.04543611,  1.16244332],
       [ 7.52046427, -0.156233  ],
       [ 7.60526378,  1.22757267],
       [ 8.70408249,  0.89959416],
       [ 6.26374139,  0.46023935],
       [ 6.59191505, -0.36199821],
       [ 6.79210164, -0.93823664],
       [ 6.84048091,  0.4848487 ],
       [ 7.948386  ,  0.23871551],
       [ 8.01209273,

In [58]:
lda_data = pd.DataFrame(res_lda, columns = ['LD1', 'LD2'])
lda_data['Species'] = y
lda_data

Unnamed: 0,LD1,LD2,Species
0,8.084953,0.328454,0
1,7.147163,-0.755473,0
2,7.511378,-0.238078,0
3,6.837676,-0.642885,0
4,8.157814,0.540639,0
...,...,...,...
145,-5.674013,1.661346,2
146,-5.197129,-0.365506,2
147,-4.981712,0.812973,2
148,-5.901486,2.320751,2


## Apply logisticRegression on lda_data

In [59]:
X_lda = lda_data.drop('Species', axis = 1)
y_lda = lda_data['Species']

In [61]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_lda, y_lda, test_size = 0.25, random_state= 42)

In [62]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr

In [63]:
lr.fit(X_train, y_train)

In [64]:
y_pred_lda = lr.predict(X_test)

In [65]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred_lda)

1.0