In [29]:
import numpy as np
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
import matplotlib.pyplot as plt
import warnings
from feature_engineering import *
warnings.filterwarnings('ignore')
%matplotlib inline

In [30]:
# Load the raw dataset and drop the `id` column so it is not used as a feature.
df = pd.read_csv("../data/dataset1.csv")
df = df.drop('id', axis=1)

# Mean-impute missing values in every column except the first one, which is
# left untouched (presumably the label column -- TODO confirm against the CSV).
# The fill is aligned by column *name* and assigned back explicitly: filling
# `df.iloc[:, i]` in place is chained assignment, which pandas may apply to a
# temporary copy, and a positional `mean[i-1]` lookup can pair a column with
# its neighbour's mean.
# NOTE(review): the means are computed over all rows, i.e. before the
# train/test split below, so test rows influence the imputed values.
df_imputed = df.copy()
mean = df_imputed.mean(numeric_only=True)
feature_columns = df_imputed.columns[1:]
df_imputed[feature_columns] = df_imputed[feature_columns].fillna(mean)

# `train_test_split` is not imported explicitly in this notebook; it presumably
# comes from the `feature_engineering` star-import and returns the features and
# labels separately -- verify. `shuffle = False` keeps the rows in file order.
X_train, X_test, y_train, y_test = train_test_split(df_imputed, shuffle = False)

In [31]:
# Fit a one-component Fisher linear discriminant model (FLDM) on the training split.
fldm = LinearDiscriminantAnalysis(n_components=1).fit(X_train, y_train)

# Map every training sample to its coordinate on the single discriminant axis.
X_train_lda = fldm.transform(X_train)

In [32]:
# Separate the projected training scores by class label (+1 / -1).
pos_scores = X_train_lda[y_train == 1]
neg_scores = X_train_lda[y_train == -1]

# Per-class mean and standard deviation of the projected scores.
mean_pos, mean_neg = np.mean(pos_scores), np.mean(neg_scores)
std_pos, std_neg = np.std(pos_scores), np.std(neg_scores)

# Decision boundary: the midpoint between the two class means.
threshold = (mean_neg + mean_pos) / 2

# X_test and y_test are taken to be the held-out features and labels.

In [33]:
# project the testing data onto the 1-dimensional FLDM space
X_test_lda = fldm.transform(X_test)

# The sign of the discriminant axis is arbitrary, so the positive class may
# project either above or below the threshold. Orient the comparison using the
# class means from the training projection, so that +1 is always assigned to
# the side of the threshold on which the positive-class mean lies.
orientation = 1 if mean_pos >= mean_neg else -1

# evaluate the performance of the model on the testing data
y_pred = np.where(orientation * (X_test_lda - threshold) > 0, 1, -1)
evaluate(y_test, y_pred)

Confusion Matrix: {'true_positive': 41, 'true_negative': 142, 'false_positive': 3, 'false_negative': 2}
Accuracy: 97.34042553191489%
Precision: 93.18181818181819%
Recall: 95.34883720930233%


## Task 2

In [38]:
# Show the training features so their original column order can be compared
# with the shuffled version in the next cell.
X_train

Unnamed: 0,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,0.2419,0.07871,...,25.38,17.33,184.60,2019.0,0.1622,0.6656,0.71190,0.26540,0.4601,0.11890
1,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,0.1812,0.05667,...,24.99,23.41,158.80,1956.0,0.1238,0.1866,0.24160,0.18600,0.2750,0.08902
2,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,0.2069,0.05999,...,23.57,25.53,152.50,1709.0,0.1444,0.4245,0.45040,0.24300,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,0.2597,0.09744,...,14.91,26.50,98.87,567.7,0.2098,0.8663,0.68690,0.25750,0.6638,0.17300
4,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,0.1809,0.05883,...,22.54,16.67,152.20,1575.0,0.1374,0.2050,0.40000,0.16250,0.2364,0.07678
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
376,10.57,20.22,70.15,338.3,0.09073,0.16600,0.22800,0.05941,0.2188,0.08450,...,10.85,22.82,76.51,351.9,0.1143,0.3619,0.60300,0.14650,0.2597,0.12000
377,13.46,28.21,85.89,562.1,0.07517,0.04726,0.01271,0.01117,0.1421,0.05763,...,14.69,35.63,97.11,680.6,0.1108,0.1457,0.07934,0.05781,0.2694,0.07061
378,13.66,15.15,88.27,580.6,0.08268,0.07548,0.04249,0.02471,0.1792,0.05897,...,14.54,19.64,97.96,657.0,0.1275,0.3104,0.25690,0.10540,0.3387,0.09638
379,11.08,18.83,73.30,361.6,0.12160,0.21540,0.16890,0.06367,0.2196,0.07950,...,13.24,32.82,91.76,508.1,0.2184,0.9379,0.84020,0.25240,0.4154,0.14030


In [39]:
# Draw a reproducible random permutation of the feature (column) positions.
np.random.seed(42)
n_features = len(X_train.columns)
feature_order = np.random.permutation(n_features)

# Reorder the training columns by that permutation and display the result.
shuffled_columns = X_train.columns[feature_order]
X_train_shuffled = X_train[shuffled_columns]
X_train_shuffled

Unnamed: 0,concave points_worst,compactness_se,area_worst,concave points_se,symmetry_mean,fractal_dimension_mean,symmetry_worst,smoothness_worst,perimeter_se,radius_mean,...,texture_worst,concavity_worst,symmetry_se,fractal_dimension_worst,radius_worst,concave points_mean,radius_se,smoothness_se,fractal_dimension_se,concavity_mean
0,0.26540,0.04904,2019.0,0.015870,0.2419,0.07871,0.4601,0.1622,8.589,17.99,...,17.33,0.71190,0.03003,0.11890,25.38,0.14710,1.0950,0.006399,0.006193,0.30010
1,0.18600,0.01308,1956.0,0.013400,0.1812,0.05667,0.2750,0.1238,3.398,20.57,...,23.41,0.24160,0.01389,0.08902,24.99,0.07017,0.5435,0.005225,0.003532,0.08690
2,0.24300,0.04006,1709.0,0.020580,0.2069,0.05999,0.3613,0.1444,4.585,19.69,...,25.53,0.45040,0.02250,0.08758,23.57,0.12790,0.7456,0.006150,0.004571,0.19740
3,0.25750,0.07458,567.7,0.018670,0.2597,0.09744,0.6638,0.2098,3.445,11.42,...,26.50,0.68690,0.05963,0.17300,14.91,0.10520,0.4956,0.009110,0.009208,0.24140
4,0.16250,0.02461,1575.0,0.018850,0.1809,0.05883,0.2364,0.1374,5.438,20.29,...,16.67,0.40000,0.01756,0.07678,22.54,0.10430,0.7572,0.011490,0.005115,0.19800
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
376,0.14650,0.07643,351.9,0.029190,0.2188,0.08450,0.2597,0.1143,2.363,10.57,...,22.82,0.60300,0.01617,0.12000,10.85,0.05941,0.1115,0.008499,0.012200,0.22800
377,0.05781,0.01203,680.6,0.005179,0.1421,0.05763,0.2694,0.1108,1.400,13.46,...,35.63,0.07934,0.01442,0.07061,14.69,0.01117,0.1689,0.004942,0.001684,0.01271
378,0.10540,0.02984,657.0,0.008356,0.1792,0.05897,0.3387,0.1275,1.101,13.66,...,19.64,0.25690,0.01818,0.09638,14.54,0.02471,0.1402,0.005212,0.004868,0.04249
379,0.25240,0.04549,508.1,0.013390,0.2196,0.07950,0.4154,0.2184,1.719,11.08,...,32.82,0.84020,0.01738,0.14030,13.24,0.06367,0.2114,0.007405,0.004435,0.16890


In [40]:
# Refit the one-component FLDM, this time on the column-permuted training features.
fldm = LinearDiscriminantAnalysis(n_components=1).fit(X_train_shuffled, y_train)

# Map every (shuffled) training sample to its coordinate on the discriminant axis.
X_train_lda = fldm.transform(X_train_shuffled)

In [41]:
# find the decision boundary in the 1-dimensional FLDM space:
# per-class statistics of the projected training scores, then the midpoint
# between the two class means as the threshold
mean_pos = np.mean(X_train_lda[y_train == 1])
mean_neg = np.mean(X_train_lda[y_train == -1])
std_pos = np.std(X_train_lda[y_train == 1])
std_neg = np.std(X_train_lda[y_train == -1])

threshold = (mean_pos + mean_neg) / 2

# apply the same column permutation to the testing data, so that the test
# columns line up with the ones the model was fitted on
X_test_shuffled = X_test[X_test.columns[feature_order]]

# project the testing data onto the 1-dimensional FLDM space
X_test_lda = fldm.transform(X_test_shuffled)

# The sign of the discriminant axis is arbitrary, so the positive class may
# project either above or below the threshold. Orient the comparison using the
# class means, so that +1 is always assigned to the side of the threshold on
# which the positive-class mean lies.
orientation = 1 if mean_pos >= mean_neg else -1

# evaluate the performance of the model on the testing data
y_pred = np.where(orientation * (X_test_lda - threshold) > 0, 1, -1)
evaluate(y_test, y_pred)

Confusion Matrix: {'true_positive': 41, 'true_negative': 142, 'false_positive': 3, 'false_negative': 2}
Accuracy: 97.34042553191489%
Precision: 93.18181818181819%
Recall: 95.34883720930233%
