# Linear Discriminant Analysis is a Dimensionality Reduction technique similar to PCA.
- The idea of LDA is to reduce the total Features(for 2D columns) present in the data.
- LDA like PCA also reduces features by deriving new set from the available set of features.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings 
warnings.filterwarnings("ignore")

In [2]:
data = sns.load_dataset('iris')
data.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [3]:
data.shape

(150, 5)

In [4]:
# Seperate X and y
X = data.drop("species", axis = 1)
y = data["species"]

# Apply LDA on the train set

In [5]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
lda = LinearDiscriminantAnalysis(n_components = 2)
lda

In [6]:
lda_data = lda.fit_transform(X,y)
lda_data

array([[ 8.06179978e+00,  3.00420621e-01],
       [ 7.12868772e+00, -7.86660426e-01],
       [ 7.48982797e+00, -2.65384488e-01],
       [ 6.81320057e+00, -6.70631068e-01],
       [ 8.13230933e+00,  5.14462530e-01],
       [ 7.70194674e+00,  1.46172097e+00],
       [ 7.21261762e+00,  3.55836209e-01],
       [ 7.60529355e+00, -1.16338380e-02],
       [ 6.56055159e+00, -1.01516362e+00],
       [ 7.34305989e+00, -9.47319209e-01],
       [ 8.39738652e+00,  6.47363392e-01],
       [ 7.21929685e+00, -1.09646389e-01],
       [ 7.32679599e+00, -1.07298943e+00],
       [ 7.57247066e+00, -8.05464137e-01],
       [ 9.84984300e+00,  1.58593698e+00],
       [ 9.15823890e+00,  2.73759647e+00],
       [ 8.58243141e+00,  1.83448945e+00],
       [ 7.78075375e+00,  5.84339407e-01],
       [ 8.07835876e+00,  9.68580703e-01],
       [ 8.02097451e+00,  1.14050366e+00],
       [ 7.49680227e+00, -1.88377220e-01],
       [ 7.58648117e+00,  1.20797032e+00],
       [ 8.68104293e+00,  8.77590154e-01],
       [ 6.

In [7]:
lda_data = pd.DataFrame(lda_data, columns = ["LD1", "LD2"])
lda_data["species"] = y
lda_data

Unnamed: 0,LD1,LD2,species
0,8.061800,0.300421,setosa
1,7.128688,-0.786660,setosa
2,7.489828,-0.265384,setosa
3,6.813201,-0.670631,setosa
4,8.132309,0.514463,setosa
...,...,...,...
145,-5.645003,1.677717,virginica
146,-5.179565,-0.363475,virginica
147,-4.967741,0.821141,virginica
148,-5.886145,2.345091,virginica


# Seperate X and y

In [8]:
X = lda_data.drop("species", axis = 1)
y = lda_data["species"]

In [9]:
# Split the data into train and test sets

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)

In [10]:
# apply Logistic Regression on the train set

from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr

In [11]:
lr.fit(X_train, y_train)

In [12]:
y_pred = lr.predict(X_test)

In [13]:
from sklearn.metrics import accuracy_score
accuracy_score(y_pred, y_test)

0.9736842105263158

In [14]:
from sklearn.model_selection import cross_val_score
cross_val_score(lr, X, y, cv = 10)

array([1.        , 1.        , 1.        , 1.        , 0.93333333,
       1.        , 0.86666667, 1.        , 1.        , 1.        ])

# Difference between PCA & LDA 
- PCA is Unsupervised
- LDA is Supervised