In [1]:
import itertools
import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RepeatedStratifiedKFold, cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

In [2]:
# '\s+' tells pandas to use any spaces as the delimiter instead of ,
train_df = pd.read_csv("../data/train.2024.04.16.csv")
feature_col_names = np.array(['F01', 'F02', 'F03', 'F04',
       'F05', 'F06', 'F07', 'F08', 'F09', 'F10', 'F11', 'F12', 'F13'])

In [3]:
all_feats_df = train_df[~np.any(np.isnan(train_df[feature_col_names]), axis=1)]
X = all_feats_df[feature_col_names].to_numpy()
y = all_feats_df["y"].to_numpy()

In [4]:
print(X.shape)
q1, q3  = np.percentile(X, [25, 75], axis=0)
IQR = q3 - q1
lower_fence = q1 - IQR*3.0
upper_fence = q3 + IQR*3.0
drop_inds = np.unique(np.concatenate([np.unique(np.where(X < lower_fence)[0]), 
                                      np.unique(np.where(X > upper_fence)[0])]))
keep_inds = np.delete(np.arange(0, X.shape[0]), drop_inds)
X = X[keep_inds, :]
y = y[keep_inds]
print(X.shape)

(757, 13)
(752, 13)


In [5]:
model = LDA()
pipeline = Pipeline([('scaler', StandardScaler()), ('m', model)])
scoring_method ='f1_macro'
larger_score_is_better = True
cv_random_state=2652124
n_jobs=10

In [6]:
cv_n_splits = 10
cv_n_repeats = 2
cv_outer = RepeatedStratifiedKFold(n_splits=cv_n_splits, 
                        n_repeats=cv_n_repeats, 
                        random_state=cv_random_state)

> In general, the model would require CP + P (P + 1)/2 parameters with P predictors and C classes - Kuhn and Johnson, p. 292

Means -> P means for each C  
Covariance -> symmetric PxP matrix => $(P^2 + P)/2$ unique elements

In [7]:
C = 3
P = np.arange(1, 14)
n_params = C*P + P*(P+1)/2
n_params

array([  4.,   9.,  15.,  22.,  30.,  39.,  49.,  60.,  72.,  85.,  99.,
       114., 130.])

In [10]:
n_feats = X.shape[1]
n_combs = np.sum([math.comb(n_feats, i) for i in range(1, n_feats+1)])
results = np.zeros((n_combs, cv_n_repeats*cv_n_splits))
feature_list = []
print(results.shape)

(8191, 20)


In [11]:
ind = 0
for i in np.arange(1, n_feats+1):
    for feat_inds in itertools.combinations(np.arange(13), i):
        X_red = X[:, feat_inds]
        print(ind, X_red.shape)
        cv_scores = cross_validate(pipeline, 
                                    X_red, 
                                    y, 
                                    scoring=scoring_method, 
                                    cv=cv_outer, 
                                    return_estimator=False,
                                    n_jobs=n_jobs)
        results[ind, :] = cv_scores['test_score']
        feature_list.append(feat_inds)
        ind += 1

0 (752, 1)
1 (752, 1)
2 (752, 1)
3 (752, 1)
4 (752, 1)
5 (752, 1)
6 (752, 1)
7 (752, 1)
8 (752, 1)
9 (752, 1)
10 (752, 1)
11 (752, 1)
12 (752, 1)
13 (752, 2)
14 (752, 2)
15 (752, 2)
16 (752, 2)
17 (752, 2)
18 (752, 2)
19 (752, 2)
20 (752, 2)
21 (752, 2)
22 (752, 2)
23 (752, 2)
24 (752, 2)
25 (752, 2)
26 (752, 2)
27 (752, 2)
28 (752, 2)
29 (752, 2)
30 (752, 2)
31 (752, 2)
32 (752, 2)
33 (752, 2)
34 (752, 2)
35 (752, 2)
36 (752, 2)
37 (752, 2)
38 (752, 2)
39 (752, 2)
40 (752, 2)
41 (752, 2)
42 (752, 2)
43 (752, 2)
44 (752, 2)
45 (752, 2)
46 (752, 2)
47 (752, 2)
48 (752, 2)
49 (752, 2)
50 (752, 2)
51 (752, 2)
52 (752, 2)
53 (752, 2)
54 (752, 2)
55 (752, 2)
56 (752, 2)
57 (752, 2)
58 (752, 2)
59 (752, 2)
60 (752, 2)
61 (752, 2)
62 (752, 2)
63 (752, 2)
64 (752, 2)
65 (752, 2)
66 (752, 2)
67 (752, 2)
68 (752, 2)
69 (752, 2)
70 (752, 2)
71 (752, 2)
72 (752, 2)
73 (752, 2)
74 (752, 2)
75 (752, 2)
76 (752, 2)
77 (752, 2)
78 (752, 2)
79 (752, 2)
80 (752, 2)
81 (752, 2)
82 (752, 2)
83 (752, 2)
84

In [24]:
col_names = ["fold" + f"{i + 1:02d}" for i in range(cv_n_repeats*cv_n_splits)]
df = pd.DataFrame(results, columns=col_names)
df["feature_inds"] = feature_list
mean = np.mean(df[col_names], axis=1)
std = np.std(df[col_names], axis=1)
comb_n_feats = [len(i) for i in feature_list]
df["mean"] = mean
df["std"] = std
df["n_features"] = comb_n_feats
col_names = ["feature_inds", "n_features", "mean", "std"] + col_names
df = df[col_names]
df.to_csv("../output_files/exhaustiveSelectionResultsLDA.csv", index=False)
df

Unnamed: 0,feature_inds,n_features,mean,std,fold01,fold02,fold03,fold04,fold05,fold06,...,fold11,fold12,fold13,fold14,fold15,fold16,fold17,fold18,fold19,fold20
0,"(0,)",1,0.447372,0.053722,0.419835,0.557362,0.441176,0.446480,0.478476,0.455381,...,0.496333,0.459592,0.480231,0.422739,0.473477,0.499415,0.450820,0.316436,0.483810,0.388596
1,"(1,)",1,0.432827,0.058525,0.373656,0.442334,0.534041,0.423103,0.393778,0.335619,...,0.433180,0.442593,0.412263,0.348606,0.440717,0.462185,0.489553,0.576886,0.423656,0.375390
2,"(2,)",1,0.370803,0.053193,0.387545,0.413943,0.328930,0.293333,0.288714,0.344269,...,0.335520,0.446834,0.371114,0.333158,0.428523,0.434029,0.381572,0.333333,0.286458,0.388121
3,"(3,)",1,0.508014,0.043824,0.471138,0.408465,0.523297,0.603581,0.546861,0.547980,...,0.506225,0.554515,0.529620,0.539344,0.478962,0.505179,0.493879,0.542424,0.507983,0.462963
4,"(4,)",1,0.436959,0.053821,0.425000,0.375347,0.410980,0.514049,0.412022,0.422047,...,0.379476,0.502297,0.475508,0.494745,0.386905,0.400314,0.495082,0.461749,0.383952,0.418333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8186,"(0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12)",12,0.945443,0.031639,0.933333,0.932660,0.893557,0.964519,0.924603,0.963285,...,0.966583,0.933333,0.929630,0.963285,0.893557,0.889855,0.909378,1.000000,1.000000,0.965899
8187,"(0, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12)",12,0.953572,0.036143,0.933333,0.965899,0.859259,0.964519,0.963285,0.963285,...,0.966583,0.933333,0.929630,0.963285,0.964519,0.893557,0.909378,1.000000,1.000000,0.930556
8188,"(0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12)",12,0.943602,0.035128,0.933333,0.965899,0.859259,0.964519,0.924603,0.963285,...,0.966583,0.933333,0.894737,0.963285,0.928030,0.889855,0.909378,1.000000,1.000000,0.930556
8189,"(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12)",12,0.936808,0.039279,0.899749,0.883759,0.893557,0.964519,0.909378,0.963285,...,0.933333,0.933333,0.912341,0.963285,0.928030,0.889855,0.850458,1.000000,1.000000,0.965899
