One instructor told me it's just that people know it from experience. But when I asked him for a source, he couldn't point to any study whatsoever.

So I thought, why not do an empirical study myself? So here it goes:

### Method
I collected 10 datasets, 9 from the UCI repo and 1 from Kaggle. All involve binary classification tasks. 
For each of them I'll extract 8 sets of features:
1. The 5 PCs with the highest explained variance
2. The 6th to 10th PCs in terms of explained variance

3-8. Groups of 5 random features draw

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib
import warnings
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.model_selection import train_test_split, KFold, GridSearchCV, ShuffleSplit
from sklearn.linear_model import LinearRegression, ElasticNetCV, ElasticNet
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.metrics import r2_score, make_scorer
from sklearn.preprocessing import StandardScaler
from pca_viz import do_PCA
from compare import compare_classification_features

In [2]:
RANDOM_STATE = 42

In [25]:
path = './raw/'

# Musk dataset
df = pd.read_csv(path + 'musk_ver2/clean2.data', header=None)
column_names = ['mol_name', 'conf_name']
column_names.extend(list(range(1, 163)))
column_names.extend(['oxy_dis', 'oxy_x', 'oxy_y', 'oxy_z', 'class_'])
df.columns = column_names
y_data_musk = df.class_.astype('int64')
x_data_musk = df.drop(['class_', 'mol_name', 'conf_name'], axis=1)

# colposcopy dataset
df_green = pd.read_csv(path + 'colposcopy/green.csv')
df_hinselmann = pd.read_csv(path + 'colposcopy/hinselmann.csv')
df_schiller = pd.read_csv(path + 'colposcopy/schiller.csv')
df = df_green.append([df_hinselmann, df_schiller])
df = df.reset_index()
del df['index']
y_data_colposcopy = df.consensus
# columns 62 to 68, starting with "experts", are also target labels.
# the column 'consensus' is made from these columns
x_data_colposcopy = df.iloc[:,:62]

# Z-Alizadeh Sani CAD diagnosis dataset
df = pd.read_excel(path + 'CAD_diagnosis/CAD_diagnosis.xlsx')
y_data_cad = df.Cath.apply(lambda x: 1 if x == 'Cad' else 0)
x_data_cad = pd.get_dummies(df.drop('Cath', axis=1), drop_first=True, 
                        dtype='int64')

# Spambase dataset
df = pd.read_csv(path + 'spambase/spambase.data', header=None)
df.head()
y_data_spam = df[57]
x_data_spam = df.drop(57, axis=1)

# sports articles for objectivity analysis dataset
df = pd.read_csv(path + 'sports_articles_objectivity/features.csv')
df = df.drop(['TextID', 'URL'], axis=1)
y_data_sports = df.Label.apply(lambda x: 1 if x == 'subjective' else 0)
x_data_sports = df.drop('Label', axis=1)

# sonar detection. mines vs rocks dataset
df = pd.read_csv(path + 'sonar_mines_rocks/sonar.all-data', header=None)
df.head()
y_data_sonar = df[60].apply(lambda x: 1 if x == 'R' else 0)
x_data_sonar = df.iloc[:,:60]

# first-order theorem proving dataset
df = pd.read_csv(path + 'first_order_theorem_proving/train.csv', header=None)
df = df.append(pd.read_csv(
    path + 'first_order_theorem_proving/test.csv', header=None
))
df = df.append(pd.read_csv(
    path + 'first_order_theorem_proving/validation.csv', header=None
))
y_data_thm = df[56].apply(lambda x: 1 if x == 1 else 0)
x_data_thm = df.iloc[:,:51]

# secom dataset
y_data = pd.read_csv(
    path + 'secom/secom_labels.data', delimiter=' ', header=None
)
x_data = pd.read_csv(path + 'secom/secom.data', delimiter=' ',header=None)
y_data_scm = y_data[0].apply(lambda x: 1 if x == 1 else 0)
x_data_scm = x_data.fillna(x_data.mean())

# Epileptic seizure recognition dataset 
df = pd.read_csv(path + 'epileptic_seizure/data.csv')
y_data_epi = df['y'].apply(lambda x: 1 if x == 1 else 0)
x_data_epi = df.drop(['y', 'Unnamed: 0'], axis=1)

# Santander customer satisfaction dataset
df = pd.read_csv(path + 'santander_customer_satisfaction/train.csv')
y_data_san = df.TARGET
x_data_san = df.drop(['TARGET', 'ID'], axis=1)

y_datas = [
    y_data_musk, y_data_colposcopy, y_data_cad, y_data_spam, y_data_sports,
    y_data_sonar, y_data_thm, y_data_scm, y_data_epi, y_data_san
]
x_datas_original = [
    x_data_musk, x_data_colposcopy, x_data_cad, x_data_spam, x_data_sports,
    x_data_sonar, x_data_thm, x_data_scm, x_data_epi, x_data_san
]

In [31]:
# standardize all the features
x_datas_std = []
for x_data in x_datas_original:
    x_datas_std.append(pd.DataFrame(StandardScaler().fit_transform(x_data)))

1
2
3
4
5
6
7
8
9
10


In [39]:
# generate groups of features for each dataset. 
# The first two groups come from PCA, with the first being the top 5 principle
# components (PCs) and the second the 6th to 10th PCs in terms of explained
# variance.
# The last 5 are groups of 5 randomly drawn columns from the dataset, without
# replacement.
x_datas_features_groups = []
for x_data in x_datas_std:
    # get the PCs
    _, pc_data = do_PCA(10, x_data, random_state=RANDOM_STATE)
    pca_top_5 = pc_data.iloc[:, :5]
    pca_next_5 = pc_data.iloc[:, 5:10]

    # get random groups of 5 columns
    column_indices = np.array(range(len(x_data.columns)))
    np.random.shuffle(column_indices)
    rand_features_1 = x_data[column_indices[:5]]
    rand_features_2 = x_data[column_indices[5:10]]
    rand_features_3 = x_data[column_indices[10:15]]
    rand_features_4 = x_data[column_indices[15:20]]
    rand_features_5 = x_data[column_indices[20:25]]
    
    # This will be the various groups of features of one dataset (each group 
    # to be used in a model instance)
    feature_groups_list = [
        pca_top_5, pca_next_5, rand_features_1, rand_features_2, 
        rand_features_3, rand_features_4, rand_features_5
    ]
    # This will be the list containing data from all the datasets
    x_datas_features_groups.append(feature_groups_list)

Number of PCs: 10
Total explained variance: 0.7336116567646198
PCA completed
Number of PCs: 10
Total explained variance: 0.8233506321285604
PCA completed
Number of PCs: 10
Total explained variance: 0.427786382709891
PCA completed
Number of PCs: 10
Total explained variance: 0.3806001048512228
PCA completed
Number of PCs: 10
Total explained variance: 0.7457095255196374
PCA completed
Number of PCs: 10
Total explained variance: 0.739275479954541
PCA completed
Number of PCs: 10
Total explained variance: 0.7876884236878547
PCA completed
Number of PCs: 10
Total explained variance: 0.2604670756508573
PCA completed
Number of PCs: 10
Total explained variance: 0.4421220783698221
PCA completed
Number of PCs: 10
Total explained variance: 0.37902430591332814
PCA completed


In [40]:
warnings.filterwarnings('ignore')
results = []
# loop thru all the datasets 
for i in range(10):
    # compare_classification_features() automatically loops thru all feature
    # groups 
    result = compare_classification_features(
        x_data_list=x_datas_features_groups[i],
        y_data=y_datas[i],
        num_folds=3,
        random_state=RANDOM_STATE,
        verbose=False
    )
    results.append(result)

doing a data 
doing a split


  if diff:
  if diff:


doing a split


  if diff:
  if diff:


doing a split


  if diff:
  if diff:


auc score SD: 
train_score    0.000000
test_score     0.010879
dtype: float64
auc score: 
train_score    1.000000
test_score     0.774702
dtype: float64
Detailed results: 
      train_score  test_score
fold                         
0             1.0    0.778189
1             1.0    0.783410
2             1.0    0.762508
doing a data 
doing a split


  if diff:
  if diff:


doing a split


  if diff:
  if diff:


doing a split


  if diff:
  if diff:


auc score SD: 
train_score    0.00000
test_score     0.00379
dtype: float64
auc score: 
train_score    1.000000
test_score     0.715673
dtype: float64
Detailed results: 
      train_score  test_score
fold                         
0             1.0    0.712814
1             1.0    0.719972
2             1.0    0.714232
doing a data 
doing a split


  if diff:
  if diff:


doing a split


  if diff:
  if diff:


doing a split


  if diff:
  if diff:


auc score SD: 
train_score    0.001230
test_score     0.010057
dtype: float64
auc score: 
train_score    0.993086
test_score     0.745484
dtype: float64
Detailed results: 
      train_score  test_score
fold                         
0        0.991669    0.752717
1        0.993883    0.749734
2        0.993706    0.734000
doing a data 
doing a split


  if diff:
  if diff:


doing a split


  if diff:
  if diff:


doing a split


  if diff:
  if diff:


auc score SD: 
train_score    0.001580
test_score     0.009686
dtype: float64
auc score: 
train_score    0.994583
test_score     0.785643
dtype: float64
Detailed results: 
      train_score  test_score
fold                         
0        0.993294    0.787939
1        0.996345    0.793976
2        0.994109    0.775015
doing a data 
doing a split


  if diff:
  if diff:


doing a split


  if diff:
  if diff:


doing a split


  if diff:
  if diff:


auc score SD: 
train_score    0.000745
test_score     0.005770
dtype: float64
auc score: 
train_score    0.999260
test_score     0.785648
dtype: float64
Detailed results: 
      train_score  test_score
fold                         
0        0.998510    0.791638
1        0.999269    0.785179
2        1.000000    0.780126
doing a data 
doing a split


  if diff:
  if diff:


doing a split


  if diff:
  if diff:


doing a split


  if diff:
  if diff:


auc score SD: 
train_score    0.001475
test_score     0.014815
dtype: float64
auc score: 
train_score    0.996487
test_score     0.755354
dtype: float64
Detailed results: 
      train_score  test_score
fold                         
0        0.995261    0.770694
1        0.996076    0.754239
2        0.998124    0.741127
doing a data 
doing a split


  if diff:
  if diff:


doing a split


  if diff:
  if diff:


doing a split


  if diff:
  if diff:


auc score SD: 
train_score    0.002433
test_score     0.003592
dtype: float64
auc score: 
train_score    0.982383
test_score     0.774862
dtype: float64
Detailed results: 
      train_score  test_score
fold                         
0        0.984963    0.776841
1        0.980129    0.777029
2        0.982058    0.770716
doing a data 
doing a split
doing a split


  if diff:
  if diff:
  if diff:
  if diff:


doing a split
auc score SD: 
train_score    0.000000
test_score     0.061607
dtype: float64
auc score: 
train_score    1.000000
test_score     0.586545
dtype: float64
Detailed results: 
      train_score  test_score
fold                         
0             1.0    0.653333
1             1.0    0.531941
2             1.0    0.574360
doing a data 
doing a split


  if diff:
  if diff:
  if diff:
  if diff:


doing a split
doing a split


  if diff:
  if diff:
  if diff:
  if diff:


auc score SD: 
train_score    0.000000
test_score     0.024271
dtype: float64
auc score: 
train_score    1.000000
test_score     0.509692
dtype: float64
Detailed results: 
      train_score  test_score
fold                         
0             1.0    0.483810
1             1.0    0.531941
2             1.0    0.513326
doing a data 
doing a split
doing a split


  if diff:
  if diff:
  if diff:
  if diff:


doing a split
auc score SD: 
train_score    0.000000
test_score     0.050823
dtype: float64
auc score: 
train_score    1.000000
test_score     0.527296
dtype: float64
Detailed results: 
      train_score  test_score
fold                         
0             1.0    0.585714
1             1.0    0.493243
2             1.0    0.502932
doing a data 
doing a split


  if diff:
  if diff:
  if diff:
  if diff:


doing a split
doing a split


  if diff:
  if diff:
  if diff:
  if diff:


auc score SD: 
train_score    0.000000
test_score     0.060224
dtype: float64
auc score: 
train_score    1.000000
test_score     0.571033
dtype: float64
Detailed results: 
      train_score  test_score
fold                         
0             1.0    0.629524
1             1.0    0.509214
2             1.0    0.574360
doing a data 
doing a split
doing a split


  if diff:
  if diff:
  if diff:
  if diff:


doing a split
auc score SD: 
train_score    0.000000
test_score     0.065717
dtype: float64
auc score: 
train_score    1.000000
test_score     0.580828
dtype: float64
Detailed results: 
      train_score  test_score
fold                         
0             1.0    0.653333
1             1.0    0.525184
2             1.0    0.563966
doing a data 
doing a split


  if diff:
  if diff:
  if diff:
  if diff:


doing a split
doing a split


  if diff:
  if diff:
  if diff:
  if diff:


auc score SD: 
train_score    0.000000
test_score     0.045283
dtype: float64
auc score: 
train_score    1.000000
test_score     0.542906
dtype: float64
Detailed results: 
      train_score  test_score
fold                         
0             1.0    0.581905
1             1.0    0.493243
2             1.0    0.553571
doing a data 
doing a split
doing a split


  if diff:
  if diff:
  if diff:
  if diff:


doing a split
auc score SD: 
train_score    0.00000
test_score     0.08221
dtype: float64
auc score: 
train_score    1.000000
test_score     0.594509
dtype: float64
Detailed results: 
      train_score  test_score
fold                         
0             1.0    0.687619
1             1.0    0.531941
2             1.0    0.563966
doing a data 
doing a split


  if diff:
  if diff:
  if diff:
  if diff:


doing a split
doing a split


  if diff:
  if diff:
  if diff:
  if diff:


auc score SD: 
train_score    0.000000
test_score     0.028413
dtype: float64
auc score: 
train_score    1.000000
test_score     0.777292
dtype: float64
Detailed results: 
      train_score  test_score
fold                         
0             1.0    0.794359
1             1.0    0.744492
2             1.0    0.793025
doing a data 
doing a split
doing a split


  if diff:
  if diff:
  if diff:
  if diff:


doing a split
auc score SD: 
train_score    0.000000
test_score     0.031115
dtype: float64
auc score: 
train_score    1.000000
test_score     0.635183
dtype: float64
Detailed results: 
      train_score  test_score
fold                         
0             1.0    0.647179
1             1.0    0.599856
2             1.0    0.658514
doing a data 
doing a split


  if diff:
  if diff:
  if diff:
  if diff:


doing a split
doing a split


  if diff:
  if diff:
  if diff:
  if diff:


auc score SD: 
train_score    0.023627
test_score     0.061791
dtype: float64
auc score: 
train_score    0.891877
test_score     0.570030
dtype: float64
Detailed results: 
      train_score  test_score
fold                         
0        0.919137    0.540513
1        0.879191    0.641044
2        0.877304    0.528533
doing a data 
doing a split
doing a split


  if diff:
  if diff:
  if diff:
  if diff:


doing a split
auc score SD: 
train_score    0.005249
test_score     0.027568
dtype: float64
auc score: 
train_score    0.996970
test_score     0.683121
dtype: float64
Detailed results: 
      train_score  test_score
fold                         
0        1.000000    0.665641
1        1.000000    0.668822
2        0.990909    0.714900
doing a data 
doing a split


  if diff:
  if diff:
  if diff:
  if diff:


doing a split
doing a split


  if diff:
  if diff:
  if diff:
  if diff:


auc score SD: 
train_score    0.000000
test_score     0.024905
dtype: float64
auc score: 
train_score    1.000000
test_score     0.580192
dtype: float64
Detailed results: 
      train_score  test_score
fold                         
0             1.0    0.593077
1             1.0    0.551485
2             1.0    0.596014
doing a data 
doing a split
doing a split


  if diff:
  if diff:
  if diff:
  if diff:


doing a split
auc score SD: 
train_score    0.004481
test_score     0.015668
dtype: float64
auc score: 
train_score    0.985758
test_score     0.466909
dtype: float64
Detailed results: 
      train_score  test_score
fold                         
0        0.983607    0.479231
1        0.982759    0.472222
2        0.990909    0.449275
doing a data 
doing a split


  if diff:
  if diff:
  if diff:
  if diff:


doing a split
doing a split


  if diff:
  if diff:
  if diff:
  if diff:


auc score SD: 
train_score    0.050158
test_score     0.043647
dtype: float64
auc score: 
train_score    0.921445
test_score     0.491548
dtype: float64
Detailed results: 
      train_score  test_score
fold                         
0        0.947274    0.464359
1        0.953424    0.468391
2        0.863636    0.541893
doing a data 
doing a split


  if diff:
  if diff:


doing a split


  if diff:
  if diff:


doing a split


  if diff:
  if diff:


auc score SD: 
train_score    0.00008
test_score     0.00925
dtype: float64
auc score: 
train_score    0.999687
test_score     0.912043
dtype: float64
Detailed results: 
      train_score  test_score
fold                         
0        0.999735    0.922723
1        0.999730    0.906875
2        0.999595    0.906532
doing a data 
doing a split


  if diff:
  if diff:


doing a split


  if diff:
  if diff:


doing a split


  if diff:
  if diff:


auc score SD: 
train_score    0.000010
test_score     0.013706
dtype: float64
auc score: 
train_score    0.999586
test_score     0.820932
dtype: float64
Detailed results: 
      train_score  test_score
fold                         
0        0.999576    0.832072
1        0.999588    0.825098
2        0.999595    0.805626
doing a data 
doing a split


  if diff:
  if diff:


doing a split


  if diff:
  if diff:


doing a split


  if diff:
  if diff:


auc score SD: 
train_score    0.005146
test_score     0.013563
dtype: float64
auc score: 
train_score    0.708816
test_score     0.686348
dtype: float64
Detailed results: 
      train_score  test_score
fold                         
0        0.703118    0.701897
1        0.713125    0.676954
2        0.710204    0.680193
doing a data 
doing a split


  if diff:
  if diff:


doing a split


  if diff:
  if diff:


doing a split


  if diff:
  if diff:


auc score SD: 
train_score    0.003922
test_score     0.019659
dtype: float64
auc score: 
train_score    0.898727
test_score     0.728046
dtype: float64
Detailed results: 
      train_score  test_score
fold                         
0        0.897667    0.705881
1        0.895444    0.743373
2        0.903070    0.734885
doing a data 
doing a split


  if diff:
  if diff:


doing a split


  if diff:
  if diff:


doing a split


  if diff:
  if diff:


auc score SD: 
train_score    0.002645
test_score     0.005866
dtype: float64
auc score: 
train_score    0.688792
test_score     0.623396
dtype: float64
Detailed results: 
      train_score  test_score
fold                         
0        0.686318    0.628600
1        0.691581    0.624548
2        0.688476    0.617039
doing a data 
doing a split


  if diff:
  if diff:


doing a split


  if diff:
  if diff:


doing a split


  if diff:
  if diff:


auc score SD: 
train_score    0.004392
test_score     0.008641
dtype: float64
auc score: 
train_score    0.865077
test_score     0.797852
dtype: float64
Detailed results: 
      train_score  test_score
fold                         
0        0.860006    0.807806
1        0.867523    0.793483
2        0.867702    0.792268
doing a data 
doing a split


  if diff:
  if diff:


doing a split


  if diff:
  if diff:


doing a split


  if diff:
  if diff:


auc score SD: 
train_score    0.063179
test_score     0.072015
dtype: float64
auc score: 
train_score    0.698617
test_score     0.656529
dtype: float64
Detailed results: 
      train_score  test_score
fold                         
0        0.739002    0.688908
1        0.731041    0.706672
2        0.625809    0.574008
doing a data 
doing a split


  if diff:
  if diff:
  if diff:


doing a split


  if diff:
  if diff:


doing a split


  if diff:


auc score SD: 
train_score    0.000000
test_score     0.006331
dtype: float64
auc score: 
train_score    1.000000
test_score     0.802301
dtype: float64
Detailed results: 
      train_score  test_score
fold                         
0             1.0    0.795002
1             1.0    0.806306
2             1.0    0.805595
doing a data 
doing a split


  if diff:
  if diff:
  if diff:


doing a split


  if diff:


doing a split


  if diff:
  if diff:


auc score SD: 
train_score    0.000000
test_score     0.030623
dtype: float64
auc score: 
train_score    1.000000
test_score     0.687112
dtype: float64
Detailed results: 
      train_score  test_score
fold                         
0             1.0    0.710648
1             1.0    0.698198
2             1.0    0.652490
doing a data 
doing a split


  if diff:
  if diff:
  if diff:


doing a split
doing a split


  if diff:
  if diff:
  if diff:


auc score SD: 
train_score    0.000337
test_score     0.020975
dtype: float64
auc score: 
train_score    0.976023
test_score     0.727038
dtype: float64
Detailed results: 
      train_score  test_score
fold                         
0        0.975709    0.727558
1        0.976378    0.747748
2        0.975983    0.705808
doing a data 
doing a split
doing a split


  if diff:
  if diff:
  if diff:
  if diff:


doing a split
auc score SD: 
train_score    0.000112
test_score     0.010926
dtype: float64
auc score: 
train_score    0.997941
test_score     0.764317
dtype: float64
Detailed results: 
      train_score  test_score
fold                         
0        0.997976    0.776091
1        0.998031    0.754505
2        0.997817    0.762354
doing a data 
doing a split


  if diff:
  if diff:
  if diff:
  if diff:


doing a split
doing a split


  if diff:
  if diff:
  if diff:
  if diff:


auc score SD: 
train_score    0.004357
test_score     0.010912
dtype: float64
auc score: 
train_score    0.973927
test_score     0.712176
dtype: float64
Detailed results: 
      train_score  test_score
fold                         
0        0.969636    0.702880
1        0.978346    0.709459
2        0.973799    0.724190
doing a data 
doing a split
doing a split


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


doing a split
auc score SD: 
train_score    0.000000
test_score     0.028871
dtype: float64
auc score: 
train_score    1.00000
test_score     0.78846
dtype: float64
Detailed results: 
      train_score  test_score
fold                         
0             1.0    0.756866
1             1.0    0.795045
2             1.0    0.813470
doing a data 
doing a split


  if diff:
  if diff:
  if diff:
  if diff:


doing a split
doing a split


  if diff:
  if diff:
  if diff:
  if diff:


auc score SD: 
train_score    0.010023
test_score     0.006487
dtype: float64
auc score: 
train_score    0.936451
test_score     0.736530
dtype: float64
Detailed results: 
      train_score  test_score
fold                         
0        0.946644    0.729088
1        0.936102    0.740991
2        0.926607    0.739512
doing a data 
doing a split
doing a split


  if diff:
  if diff:
  if diff:
  if diff:


doing a split
auc score SD: 
train_score    0.000000
test_score     0.022509
dtype: float64
auc score: 
train_score    1.000000
test_score     0.821855
dtype: float64
Detailed results: 
      train_score  test_score
fold                         
0             1.0    0.842928
1             1.0    0.824495
2             1.0    0.798142
doing a data 
doing a split


  if diff:
  if diff:
  if diff:
  if diff:


doing a split
doing a split


  if diff:
  if diff:
  if diff:
  if diff:


auc score SD: 
train_score    0.000000
test_score     0.043113
dtype: float64
auc score: 
train_score    1.000000
test_score     0.647693
dtype: float64
Detailed results: 
      train_score  test_score
fold                         
0             1.0    0.602796
1             1.0    0.651515
2             1.0    0.688767
doing a data 
doing a split
doing a split


  if diff:
  if diff:
  if diff:
  if diff:


doing a split
auc score SD: 
train_score    0.000000
test_score     0.025437
dtype: float64
auc score: 
train_score    1.000000
test_score     0.731847
dtype: float64
Detailed results: 
      train_score  test_score
fold                         
0             1.0    0.722862
1             1.0    0.712121
2             1.0    0.760557
doing a data 
doing a split


  if diff:
  if diff:
  if diff:
  if diff:


doing a split
doing a split


  if diff:
  if diff:
  if diff:
  if diff:


auc score SD: 
train_score    0.000000
test_score     0.065023
dtype: float64
auc score: 
train_score    1.0000
test_score     0.7323
dtype: float64
Detailed results: 
      train_score  test_score
fold                         
0             1.0    0.775493
1             1.0    0.763889
2             1.0    0.657517
doing a data 
doing a split
doing a split


  if diff:
  if diff:
  if diff:
  if diff:


doing a split
auc score SD: 
train_score    0.000000
test_score     0.010647
dtype: float64
auc score: 
train_score    1.000000
test_score     0.763979
dtype: float64
Detailed results: 
      train_score  test_score
fold                         
0             1.0    0.774671
1             1.0    0.763889
2             1.0    0.753378
doing a data 
doing a split


  if diff:
  if diff:
  if diff:
  if diff:


doing a split
doing a split


  if diff:
  if diff:
  if diff:
  if diff:


auc score SD: 
train_score    0.000000
test_score     0.061916
dtype: float64
auc score: 
train_score    1.000000
test_score     0.704484
dtype: float64
Detailed results: 
      train_score  test_score
fold                         
0             1.0    0.754112
1             1.0    0.635101
2             1.0    0.724240
doing a data 
doing a split
doing a split


  if diff:
  if diff:
  if diff:
  if diff:


doing a split
auc score SD: 
train_score    0.00000
test_score     0.04287
dtype: float64
auc score: 
train_score    1.000000
test_score     0.725548
dtype: float64
Detailed results: 
      train_score  test_score
fold                         
0             1.0    0.680921
1             1.0    0.766414
2             1.0    0.729307
doing a data 
doing a split


  if diff:
  if diff:


doing a split


  if diff:
  if diff:


doing a split


  if diff:
  if diff:


auc score SD: 
train_score    0.001328
test_score     0.004539
dtype: float64
auc score: 
train_score    0.994883
test_score     0.799495
dtype: float64
Detailed results: 
      train_score  test_score
fold                         
0        0.994767    0.804345
1        0.993616    0.798789
2        0.996265    0.795350
doing a data 
doing a split


  if diff:
  if diff:


doing a split


  if diff:
  if diff:


doing a split


  if diff:
  if diff:


auc score SD: 
train_score    0.001278
test_score     0.008469
dtype: float64
auc score: 
train_score    0.994964
test_score     0.781934
dtype: float64
Detailed results: 
      train_score  test_score
fold                         
0        0.994914    0.789361
1        0.993711    0.772710
2        0.996265    0.783730
doing a data 
doing a split


  if diff:
  if diff:


doing a split


  if diff:
  if diff:


doing a split


  if diff:
  if diff:


auc score SD: 
train_score    0.001054
test_score     0.007351
dtype: float64
auc score: 
train_score    0.942872
test_score     0.753370
dtype: float64
Detailed results: 
      train_score  test_score
fold                         
0        0.942775    0.753825
1        0.943972    0.745802
2        0.941870    0.760484
doing a data 
doing a split


  if diff:
  if diff:


doing a split


  if diff:
  if diff:


doing a split


  if diff:
  if diff:


auc score SD: 
train_score    0.001188
test_score     0.004387
dtype: float64
auc score: 
train_score    0.993430
test_score     0.778842
dtype: float64
Detailed results: 
      train_score  test_score
fold                         
0        0.993478    0.783853
1        0.992218    0.775698
2        0.994593    0.776974
doing a data 
doing a split


  if diff:
  if diff:


doing a split


  if diff:
  if diff:


doing a split


  if diff:
  if diff:


auc score SD: 
train_score    0.001483
test_score     0.005004
dtype: float64
auc score: 
train_score    0.980068
test_score     0.758336
dtype: float64
Detailed results: 
      train_score  test_score
fold                         
0        0.980967    0.755436
1        0.980879    0.764114
2        0.978357    0.755457
doing a data 
doing a split


  if diff:
  if diff:


doing a split


  if diff:
  if diff:


doing a split


  if diff:
  if diff:


auc score SD: 
train_score    0.001797
test_score     0.004325
dtype: float64
auc score: 
train_score    0.985512
test_score     0.781572
dtype: float64
Detailed results: 
      train_score  test_score
fold                         
0        0.983533    0.786526
1        0.985961    0.779641
2        0.987042    0.778549
doing a data 
doing a split


  if diff:
  if diff:


doing a split


  if diff:
  if diff:


doing a split


  if diff:
  if diff:


auc score SD: 
train_score    0.001286
test_score     0.010747
dtype: float64
auc score: 
train_score    0.986752
test_score     0.779836
dtype: float64
Detailed results: 
      train_score  test_score
fold                         
0        0.985298    0.789769
1        0.987216    0.781311
2        0.987742    0.768427
doing a data 
doing a split


  if diff:
  if diff:


doing a split


  if diff:
  if diff:


doing a split


  if diff:
  if diff:


auc score SD: 
train_score    0.0
test_score     0.0
dtype: float64
auc score: 
train_score    1.0
test_score     0.5
dtype: float64
Detailed results: 
      train_score  test_score
fold                         
0             1.0         0.5
1             1.0         0.5
2             1.0         0.5
doing a data 
doing a split


  if diff:
  if diff:


doing a split


  if diff:
  if diff:


doing a split


  if diff:
  if diff:


auc score SD: 
train_score    0.0
test_score     0.0
dtype: float64
auc score: 
train_score    1.0
test_score     0.5
dtype: float64
Detailed results: 
      train_score  test_score
fold                         
0             1.0         0.5
1             1.0         0.5
2             1.0         0.5
doing a data 
doing a split


  if diff:
  if diff:


doing a split


  if diff:
  if diff:


doing a split


  if diff:
  if diff:


auc score SD: 
train_score    0.0
test_score     0.0
dtype: float64
auc score: 
train_score    1.0
test_score     0.5
dtype: float64
Detailed results: 
      train_score  test_score
fold                         
0             1.0         0.5
1             1.0         0.5
2             1.0         0.5
doing a data 
doing a split


  if diff:
  if diff:
  if diff:


doing a split


  if diff:
  if diff:


doing a split


  if diff:
  if diff:


auc score SD: 
train_score    0.005183
test_score     0.008320
dtype: float64
auc score: 
train_score    0.990111
test_score     0.509566
dtype: float64
Detailed results: 
      train_score  test_score
fold                         
0        0.993056    0.513588
1        0.984127    0.500000
2        0.993151    0.515111
doing a data 
doing a split


  if diff:
  if diff:
  if diff:


doing a split
doing a split


  if diff:
  if diff:
  if diff:


auc score SD: 
train_score    0.0
test_score     0.0
dtype: float64
auc score: 
train_score    1.0
test_score     0.5
dtype: float64
Detailed results: 
      train_score  test_score
fold                         
0             1.0         0.5
1             1.0         0.5
2             1.0         0.5
doing a data 
doing a split


  if diff:
  if diff:
  if diff:


doing a split
doing a split


  if diff:
  if diff:


auc score SD: 
train_score    0.031096
test_score     0.018669
dtype: float64
auc score: 
train_score    0.940666
test_score     0.506658
dtype: float64
Detailed results: 
      train_score  test_score
fold                         
0        0.958333    0.494908
1        0.904762    0.496881
2        0.958904    0.528185
doing a data 
doing a split


  if diff:
  if diff:


doing a split


  if diff:
  if diff:


doing a split


  if diff:
  if diff:


auc score SD: 
train_score    0.0000
test_score     0.0012
dtype: float64
auc score: 
train_score    1.000000
test_score     0.499307
dtype: float64
Detailed results: 
      train_score  test_score
fold                         
0             1.0    0.500000
1             1.0    0.497921
2             1.0    0.500000
doing a data 
doing a split


  if diff:
  if diff:


doing a split


  if diff:
  if diff:


doing a split


  if diff:
  if diff:


auc score SD: 
train_score    0.000000
test_score     0.008972
dtype: float64
auc score: 
train_score    1.000000
test_score     0.889683
dtype: float64
Detailed results: 
      train_score  test_score
fold                         
0             1.0    0.891048
1             1.0    0.897893
2             1.0    0.880107
doing a data 
doing a split


  if diff:
  if diff:


doing a split


  if diff:
  if diff:


doing a split


  if diff:
  if diff:


auc score SD: 
train_score    0.00000
test_score     0.01145
dtype: float64
auc score: 
train_score    1.000
test_score     0.818
dtype: float64
Detailed results: 
      train_score  test_score
fold                         
0             1.0    0.815284
1             1.0    0.830564
2             1.0    0.808153
doing a data 
doing a split


  if diff:
  if diff:


doing a split


  if diff:
  if diff:


doing a split


  if diff:
  if diff:


auc score SD: 
train_score    0.000000
test_score     0.008168
dtype: float64
auc score: 
train_score    1.000000
test_score     0.844833
dtype: float64
Detailed results: 
      train_score  test_score
fold                         
0             1.0    0.838323
1             1.0    0.853998
2             1.0    0.842177
doing a data 
doing a split


  if diff:
  if diff:


doing a split


  if diff:
  if diff:


doing a split


  if diff:
  if diff:


auc score SD: 
train_score    0.000000
test_score     0.013065
dtype: float64
auc score: 
train_score    1.000000
test_score     0.850334
dtype: float64
Detailed results: 
      train_score  test_score
fold                         
0             1.0    0.855909
1             1.0    0.859687
2             1.0    0.835407
doing a data 
doing a split


  if diff:
  if diff:


doing a split


  if diff:
  if diff:


doing a split


  if diff:
  if diff:


auc score SD: 
train_score    0.000000
test_score     0.006804
dtype: float64
auc score: 
train_score    1.000000
test_score     0.841362
dtype: float64
Detailed results: 
      train_score  test_score
fold                         
0             1.0    0.833655
1             1.0    0.846536
2             1.0    0.843894
doing a data 
doing a split


  if diff:
  if diff:


doing a split


  if diff:
  if diff:


doing a split


  if diff:
  if diff:


auc score SD: 
train_score    0.000000
test_score     0.006414
dtype: float64
auc score: 
train_score    1.000000
test_score     0.846104
dtype: float64
Detailed results: 
      train_score  test_score
fold                         
0             1.0    0.849851
1             1.0    0.849763
2             1.0    0.838699
doing a data 
doing a split


  if diff:
  if diff:


doing a split


  if diff:
  if diff:


doing a split


  if diff:
  if diff:


auc score SD: 
train_score    0.000000
test_score     0.001577
dtype: float64
auc score: 
train_score    1.000000
test_score     0.850469
dtype: float64
Detailed results: 
      train_score  test_score
fold                         
0             1.0    0.848834
1             1.0    0.851982
2             1.0    0.850590
doing a data 
doing a split


  if diff:
  if diff:


doing a split


  if diff:
  if diff:


doing a split


  if diff:
  if diff:


auc score SD: 
train_score    0.002354
test_score     0.000325
dtype: float64
auc score: 
train_score    0.947965
test_score     0.504512
dtype: float64
Detailed results: 
      train_score  test_score
fold                         
0        0.947958    0.504747
1        0.950323    0.504142
2        0.945614    0.504648
doing a data 
doing a split


  if diff:
  if diff:


doing a split


  if diff:
  if diff:


doing a split


  if diff:
  if diff:


auc score SD: 
train_score    0.002533
test_score     0.000439
dtype: float64
auc score: 
train_score    0.947051
test_score     0.503868
dtype: float64
Detailed results: 
      train_score  test_score
fold                         
0        0.946464    0.503452
1        0.949826    0.504327
2        0.944862    0.503826
doing a data 
doing a split


  if diff:
  if diff:


doing a split


  if diff:
  if diff:


doing a split


  if diff:
  if diff:


auc score SD: 
train_score    0.000951
test_score     0.000031
dtype: float64
auc score: 
train_score    0.503824
test_score     0.499966
dtype: float64
Detailed results: 
      train_score  test_score
fold                         
0        0.502739    0.500000
1        0.504223    0.499959
2        0.504511    0.499938
doing a data 
doing a split


  if diff:
  if diff:


doing a split


  if diff:
  if diff:


doing a split


  if diff:
  if diff:


auc score SD: 
train_score    0.000144
test_score     0.000000
dtype: float64
auc score: 
train_score    0.500166
test_score     0.500000
dtype: float64
Detailed results: 
      train_score  test_score
fold                         
0        0.500249         0.5
1        0.500248         0.5
2        0.500000         0.5
doing a data 
doing a split


  if diff:
  if diff:


doing a split


  if diff:
  if diff:


doing a split


  if diff:
  if diff:


auc score SD: 
train_score    0.0
test_score     0.0
dtype: float64
auc score: 
train_score    0.5
test_score     0.5
dtype: float64
Detailed results: 
      train_score  test_score
fold                         
0             0.5         0.5
1             0.5         0.5
2             0.5         0.5
doing a data 
doing a split


  if diff:
  if diff:


doing a split


  if diff:
  if diff:


doing a split


  if diff:
  if diff:


auc score SD: 
train_score    0.001031
test_score     0.000036
dtype: float64
auc score: 
train_score    0.503572
test_score     0.499959
dtype: float64
Detailed results: 
      train_score  test_score
fold                         
0        0.504731    0.499938
1        0.503229    0.499938
2        0.502757    0.500000
doing a data 
doing a split


  if diff:
  if diff:


doing a split


  if diff:
  if diff:


doing a split


  if diff:


auc score SD: 
train_score    0.003844
test_score     0.000915
dtype: float64
auc score: 
train_score    0.555032
test_score     0.500599
dtype: float64
Detailed results: 
      train_score  test_score
fold                         
0        0.553536    0.499548
1        0.552161    0.501220
2        0.559398    0.501029


  if diff:


In [57]:
df_r = pd.DataFrame(results)
df_r.index.name = 'dataset'
df_r.columns = [
    'top_5_PCs', 'next_5_PCs', 'rand_features_1', 
    'rand_features_2', 'rand_features_3', 
    'rand_features_4', 'rand_features_5'
]
df_r['rand_features_mean'] = df_r.iloc[:, 2:].mean(axis=1)
df_r = df_r[['top_5_PCs', 'next_5_PCs', 'rand_features_mean']]
df_r['top_5_wins'] = df_r.top_5_PCs > df_r.next_5_PCs
df_r

Unnamed: 0_level_0,top_5_PCs,next_5_PCs,rand_features_mean,top_5_wins
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.774702,0.715673,0.769398,True
1,0.586545,0.509692,0.563314,True
2,0.777292,0.635183,0.55836,True
3,0.912043,0.820932,0.698434,True
4,0.802301,0.687112,0.745704,True
5,0.821855,0.647693,0.731632,True
6,0.799495,0.781934,0.770391,True
7,0.5,0.5,0.503106,False
8,0.889683,0.818,0.84662,True
9,0.504512,0.503868,0.500105,True


In [58]:
y_data_scm.value_counts

0    1463
1     104
Name: 0, dtype: int64

In [67]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_auc_score
_, pc_data = do_PCA(10, x_data_scm, random_state=RANDOM_STATE)
pca_top_5 = pc_data.iloc[:, :5]
pca_next_5 = pc_data.iloc[:, 5:10]

rfc = RandomForestClassifier(n_estimators=100,
                             random_state=RANDOM_STATE)
lr = LogisticRegression(C=1000, random_state=RANDOM_STATE,
                        solver='liblinear', max_iter=500)
knn = KNeighborsClassifier(n_neighbors=20, weights='distance')

eclf = VotingClassifier(
    estimators=[('rfc', rfc), ('lr', lr),
                ('knn', knn)],
    voting='soft'
)
kf = KFold(n_splits=3, random_state=RANDOM_STATE, shuffle=True)
for train_index, test_index in kf.split(x_data_scm):
    x_train = x_data_scm.iloc[train_index]
    y_train = y_data_scm.iloc[train_index]
    x_test = x_data_scm.iloc[test_index]
    y_test = y_data_scm.iloc[test_index]

    eclf.fit(x_train, y_train)
    train_pred = eclf.predict(x_train)
    test_pred = eclf.predict(x_test)
    train_score = roc_auc_score(y_train, train_pred)
    test_score = roc_auc_score(y_test, test_pred)

    print(f'train_score: {train_score}')
    print(f'test_score: {test_score}')

print(train_pred)
print(test_pred)

Number of PCs: 10
Total explained variance: 0.9793819977377975
PCA completed


  if diff:
  if diff:


train_score: 1.0
test_score: 0.4989816700610998


  if diff:
  if diff:


train_score: 1.0
test_score: 0.498960498960499


  if diff:


train_score: 1.0
test_score: 0.4989816700610998
[0 1 0 ... 0 0 0]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

  if diff:
