# Fisher's Linear Discriminant Analysis

## Importing Libraries and Data Handling

In [1]:
# Imports and notebook-wide configuration.
import numpy as np
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
import matplotlib.pyplot as plt
import warnings
# NOTE(review): later cells use `pd`, `train_test_split` and `evaluate` without
# importing them explicitly, so they presumably come from this star import --
# confirm, and prefer explicit imports so the dependencies are visible.
from feature_engineering import *
# Silences every warning for the rest of the notebook, including deprecation
# and convergence warnings.
warnings.filterwarnings('ignore')
PI = np.pi
%matplotlib inline

In [2]:
# Load the dataset and drop the 'id' column so it is not used downstream.
df = pd.read_csv("../data/dataset1.csv")
df = df.drop('id', axis=1)

# Impute missing values in every column after the first with that column's own
# mean. The first column is left untouched, exactly as the original loop did
# (presumably it holds the class label -- TODO confirm against dataset1.csv).
#
# The means are matched to columns by *label* through DataFrame.fillna(Series)
# rather than by integer position: positional pairing (column i <-> mean[i-1])
# is only right when the first column happens to be excluded from the mean,
# and chained `df.iloc[:, i].fillna(..., inplace=True)` is not guaranteed to
# write back into `df`.
#
# NOTE(review): the means are computed on the full dataset before the split,
# so test rows influence the values imputed into the training rows.
mean = df.iloc[:, 1:].mean(numeric_only=True)
df = df.fillna(mean)

# Split without shuffling (shuffle=False). `train_test_split` is not imported
# explicitly in this notebook and returns four objects from a single frame, so
# it is presumably the project helper from `feature_engineering` -- confirm.
X_train, X_test, y_train, y_test = train_test_split(df, shuffle = False)

## Task 1: FLDM on raw data (FLDM1)

In [3]:
# Fit Fisher's linear discriminant on the raw training features, keeping a
# single discriminant axis (n_components=1). `fit` returns the estimator, so
# construction and fitting are chained.
fldm1 = LinearDiscriminantAnalysis(n_components=1).fit(X_train, y_train)

# 1-D projection of the training samples onto the fitted discriminant axis
X_train_lda = fldm1.transform(X_train)

In [4]:
# Per-class statistics of the 1-D training projection. Labels are compared
# against +1 (positive) and -1 (negative).
mean_pos = np.mean(X_train_lda[y_train == 1])
mean_neg = np.mean(X_train_lda[y_train == -1])
std_pos = np.std(X_train_lda[y_train == 1])
std_neg = np.std(X_train_lda[y_train == -1])

# Decision boundary: midpoint between the two projected class means.
threshold = (mean_pos + mean_neg) / 2

# The sign of the discriminant axis is not guaranteed, so the positive class
# may land on either side of the threshold. Record which side it is on instead
# of assuming "greater than threshold" always means positive.
pos_above_threshold = mean_pos > mean_neg

# project the testing data onto the 1-dimensional FLDM space
X_test_lda = fldm1.transform(X_test)

# Predict +1 for samples on the positive class's side of the threshold and -1
# otherwise; identical to `X_test_lda > threshold` when mean_pos > mean_neg.
y_pred = np.where((X_test_lda > threshold) == pos_above_threshold, 1, -1)

# `evaluate` is defined outside this notebook; the recorded output shows it
# printing a confusion matrix, accuracy, precision and recall. Its return value
# is presumably the accuracy -- confirm.
accuracy_fldm1 = evaluate(y_test, y_pred)

Confusion Matrix: {'true_positive': 41, 'true_negative': 142, 'false_positive': 3, 'false_negative': 2}
Accuracy: 97.34042553191489%
Precision: 93.18181818181819%
Recall: 95.34883720930233%


## Task 2: FLDM on data with shuffled columns (FLDM2)

In [8]:
# Draw a reproducible random permutation of the feature positions (fixed seed)
# and reorder the training columns accordingly. `feature_order` is kept so the
# same permutation can be applied to the test data later.
np.random.seed(42)
n_features = len(X_train.columns)
feature_order = np.random.permutation(n_features)

shuffled_columns = X_train.columns[feature_order]
X_train_shuffled = X_train[shuffled_columns]
X_train_shuffled.head()

Unnamed: 0,concave points_worst,compactness_se,area_worst,concave points_se,symmetry_mean,fractal_dimension_mean,symmetry_worst,smoothness_worst,perimeter_se,radius_mean,...,texture_worst,concavity_worst,symmetry_se,fractal_dimension_worst,radius_worst,concave points_mean,radius_se,smoothness_se,fractal_dimension_se,concavity_mean
0,0.2654,0.04904,2019.0,0.01587,0.2419,0.07871,0.4601,0.1622,8.589,17.99,...,17.33,0.7119,0.03003,0.1189,25.38,0.1471,1.095,0.006399,0.006193,0.3001
1,0.186,0.01308,1956.0,0.0134,0.1812,0.05667,0.275,0.1238,3.398,20.57,...,23.41,0.2416,0.01389,0.08902,24.99,0.07017,0.5435,0.005225,0.003532,0.0869
2,0.243,0.04006,1709.0,0.02058,0.2069,0.05999,0.3613,0.1444,4.585,19.69,...,25.53,0.4504,0.0225,0.08758,23.57,0.1279,0.7456,0.00615,0.004571,0.1974
3,0.2575,0.07458,567.7,0.01867,0.2597,0.09744,0.6638,0.2098,3.445,11.42,...,26.5,0.6869,0.05963,0.173,14.91,0.1052,0.4956,0.00911,0.009208,0.2414
4,0.1625,0.02461,1575.0,0.01885,0.1809,0.05883,0.2364,0.1374,5.438,20.29,...,16.67,0.4,0.01756,0.07678,22.54,0.1043,0.7572,0.01149,0.005115,0.198


In [6]:
# Fit a second discriminant on the column-shuffled training features, again
# keeping a single discriminant axis. `fit` returns the estimator, so
# construction and fitting are chained.
fldm2 = LinearDiscriminantAnalysis(n_components=1).fit(X_train_shuffled, y_train)

# 1-D projection of the shuffled training samples onto the fitted axis
# (this rebinds X_train_lda, replacing the projection from the FLDM1 cells)
X_train_lda = fldm2.transform(X_train_shuffled)

In [7]:
# Per-class statistics of the 1-D training projection. Labels are compared
# against +1 (positive) and -1 (negative).
mean_pos = np.mean(X_train_lda[y_train == 1])
mean_neg = np.mean(X_train_lda[y_train == -1])
std_pos = np.std(X_train_lda[y_train == 1])
std_neg = np.std(X_train_lda[y_train == -1])

# Decision boundary: midpoint between the two projected class means.
threshold = (mean_pos + mean_neg) / 2

# The sign of the discriminant axis is not guaranteed (and may differ from the
# unshuffled fit), so record which side of the threshold the positive class
# falls on instead of assuming "greater than threshold" means positive.
pos_above_threshold = mean_pos > mean_neg

# Apply the same column permutation to the test data that was used for training
X_test_shuffled = X_test[X_test.columns[feature_order]]

# project the testing data onto the 1-dimensional FLDM space
X_test_lda = fldm2.transform(X_test_shuffled)

# Predict +1 for samples on the positive class's side of the threshold and -1
# otherwise; identical to `X_test_lda > threshold` when mean_pos > mean_neg.
y_pred = np.where((X_test_lda > threshold) == pos_above_threshold, 1, -1)

# `evaluate` is defined outside this notebook; the recorded output shows it
# printing a confusion matrix, accuracy, precision and recall. Its return value
# is presumably the accuracy -- confirm.
accuracy2 = evaluate(y_test, y_pred)

Confusion Matrix: {'true_positive': 41, 'true_negative': 142, 'false_positive': 3, 'false_negative': 2}
Accuracy: 97.34042553191489%
Precision: 93.18181818181819%
Recall: 95.34883720930233%
