In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib
import warnings
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.model_selection import train_test_split, KFold, GridSearchCV, ShuffleSplit
from sklearn.linear_model import LinearRegression, ElasticNetCV, ElasticNet
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.metrics import r2_score, make_scorer
from sklearn.preprocessing import StandardScaler
from pca_viz import do_PCA
from compare import compare_classification_features

In [2]:
# Shared seed: passed as `random_state` to the PCA helper, the model
# comparison, the classifiers and the K-fold splitter used in this notebook.
RANDOM_STATE = 42

In [3]:
# Load the ten binary-classification datasets from `path`. Each one ends up
# as a feature frame `x_data_*` and a label series `y_data_*`; the two lists
# at the bottom collect them in matching order.
path = './raw/'

# Musk dataset
df = pd.read_csv(path + 'musk_ver2/clean2.data', header=None)
column_names = ['mol_name', 'conf_name']
column_names.extend(list(range(1, 163)))
column_names.extend(['oxy_dis', 'oxy_x', 'oxy_y', 'oxy_z', 'class_'])
df.columns = column_names
y_data_musk = df.class_.astype('int64')
x_data_musk = df.drop(['class_', 'mol_name', 'conf_name'], axis=1)

# colposcopy dataset
df_green = pd.read_csv(path + 'colposcopy/green.csv')
df_hinselmann = pd.read_csv(path + 'colposcopy/hinselmann.csv')
df_schiller = pd.read_csv(path + 'colposcopy/schiller.csv')
# pd.concat replaces DataFrame.append (removed in pandas 2.0);
# ignore_index=True yields a fresh 0..n-1 index for the stacked rows.
df = pd.concat([df_green, df_hinselmann, df_schiller], ignore_index=True)
y_data_colposcopy = df.consensus
# columns 62 to 68, starting with "experts", are also target labels.
# the column 'consensus' is made from these columns
x_data_colposcopy = df.iloc[:, :62]

# Z-Alizadeh Sani CAD diagnosis dataset
df = pd.read_excel(path + 'CAD_diagnosis/CAD_diagnosis.xlsx')
y_data_cad = df.Cath.apply(lambda x: 1 if x == 'Cad' else 0)
x_data_cad = pd.get_dummies(df.drop('Cath', axis=1), drop_first=True,
                            dtype='int64')

# Spambase dataset
df = pd.read_csv(path + 'spambase/spambase.data', header=None)
y_data_spam = df[57]
x_data_spam = df.drop(57, axis=1)

# sports articles for objectivity analysis dataset
df = pd.read_csv(path + 'sports_articles_objectivity/features.csv')
df = df.drop(['TextID', 'URL'], axis=1)
y_data_sports = df.Label.apply(lambda x: 1 if x == 'subjective' else 0)
x_data_sports = df.drop('Label', axis=1)

# sonar detection. mines vs rocks dataset
df = pd.read_csv(path + 'sonar_mines_rocks/sonar.all-data', header=None)
y_data_sonar = df[60].apply(lambda x: 1 if x == 'R' else 0)
x_data_sonar = df.iloc[:, :60]

# first-order theorem proving dataset
# The train, test and validation files are stacked into one frame.
# ignore_index=True gives the combined rows a unique 0..n-1 index instead of
# repeating each file's own row labels.
df = pd.concat(
    [
        pd.read_csv(path + 'first_order_theorem_proving/train.csv',
                    header=None),
        pd.read_csv(path + 'first_order_theorem_proving/test.csv',
                    header=None),
        pd.read_csv(path + 'first_order_theorem_proving/validation.csv',
                    header=None),
    ],
    ignore_index=True
)
y_data_thm = df[56].apply(lambda x: 1 if x == 1 else 0)
x_data_thm = df.iloc[:, :51]

# secom dataset
y_data = pd.read_csv(
    path + 'secom/secom_labels.data', delimiter=' ', header=None
)
x_data = pd.read_csv(path + 'secom/secom.data', delimiter=' ', header=None)
y_data_scm = y_data[0].apply(lambda x: 1 if x == 1 else 0)
# missing measurements are imputed with the column mean
x_data_scm = x_data.fillna(x_data.mean())

# Epileptic seizure recognition dataset
df = pd.read_csv(path + 'epileptic_seizure/data.csv')
# label 1 is kept as the positive class; every other label becomes 0
y_data_epi = df['y'].apply(lambda x: 1 if x == 1 else 0)
x_data_epi = df.drop(['y', 'Unnamed: 0'], axis=1)

# Santander customer satisfaction dataset
df = pd.read_csv(path + 'santander_customer_satisfaction/train.csv')
y_data_san = df.TARGET
x_data_san = df.drop(['TARGET', 'ID'], axis=1)

# Position k in both lists refers to the same dataset.
y_datas = [
    y_data_musk, y_data_colposcopy, y_data_cad, y_data_spam, y_data_sports,
    y_data_sonar, y_data_thm, y_data_scm, y_data_epi, y_data_san
]
x_datas_original = [
    x_data_musk, x_data_colposcopy, x_data_cad, x_data_spam, x_data_sports,
    x_data_sonar, x_data_thm, x_data_scm, x_data_epi, x_data_san
]

In [4]:
# Standardize the features of every dataset. Each result is wrapped in a new
# DataFrame, so the standardized frames carry default integer column labels.
x_datas_std = [
    pd.DataFrame(StandardScaler().fit_transform(features))
    for features in x_datas_original
]

In [5]:
# Generate groups of features for each dataset.
# The first two groups come from PCA, with the first being the top 5 principal
# components (PCs) and the second the 6th to 10th PCs in terms of explained
# variance.
# The last 5 are groups of 5 randomly drawn columns from the dataset, without
# replacement.

# A dedicated, seeded generator makes the random column groups reproducible
# on every run; the global NumPy RNG is not seeded in the cells above.
rng = np.random.RandomState(RANDOM_STATE)

x_datas_features_groups = []
for x_data in x_datas_std:
    # get the PCs
    _, pc_data = do_PCA(10, x_data, random_state=RANDOM_STATE)
    pca_top_5 = pc_data.iloc[:, :5]
    pca_next_5 = pc_data.iloc[:, 5:10]

    # get random groups of 5 columns: one permutation of the column positions,
    # cut into five consecutive, non-overlapping chunks
    column_indices = rng.permutation(x_data.shape[1])
    random_groups = [
        x_data.iloc[:, column_indices[start:start + 5]]
        for start in range(0, 25, 5)
    ]

    # This will be the various groups of features of one dataset (each group
    # to be used in a model instance)
    feature_groups_list = [pca_top_5, pca_next_5, *random_groups]
    # This will be the list containing data from all the datasets
    x_datas_features_groups.append(feature_groups_list)

Number of PCs: 10
Total explained variance: 0.7336116567646198
PCA completed
Number of PCs: 10
Total explained variance: 0.8233506321285604
PCA completed
Number of PCs: 10
Total explained variance: 0.427786382709891
PCA completed
Number of PCs: 10
Total explained variance: 0.3806001048512228
PCA completed
Number of PCs: 10
Total explained variance: 0.7457095255196374
PCA completed
Number of PCs: 10
Total explained variance: 0.739275479954541
PCA completed
Number of PCs: 10
Total explained variance: 0.7876884236878547
PCA completed
Number of PCs: 10
Total explained variance: 0.2604670756508573
PCA completed
Number of PCs: 10
Total explained variance: 0.4421220783698221
PCA completed
Number of PCs: 10
Total explained variance: 0.37902430591332814
PCA completed


In [None]:
# Score every feature group of every dataset.
warnings.filterwarnings('ignore')

# Features and labels are paired by position, so the two lists must line up.
assert len(x_datas_features_groups) == len(y_datas), (
    'feature groups and labels must cover the same datasets'
)

results = []
# loop thru all the datasets (however many were loaded)
for dataset_number, (feature_groups, labels) in enumerate(
        zip(x_datas_features_groups, y_datas), start=1):
    print(f'Doing dataset {dataset_number}')
    # compare_classification_features() automatically loops thru all feature
    # groups
    result = compare_classification_features(
        x_data_list=feature_groups,
        y_data=labels,
        num_folds=10,
        random_state=RANDOM_STATE,
        verbose=False
    )
    results.append(result)

Doing dataset 1
Doing dataset 2
Doing dataset 3
Doing dataset 4
Doing dataset 5
Doing dataset 6
Doing dataset 7
Doing dataset 8
Doing dataset 9
Doing dataset 10


In [None]:
# Summarise the comparison: one row per dataset, one column per feature group.
df_scores = pd.DataFrame(results)
df_scores.index.name = 'dataset'
df_scores.columns = [
    'top_5_PCs', 'next_5_PCs', 'rand_features_1',
    'rand_features_2', 'rand_features_3',
    'rand_features_4', 'rand_features_5'
]

# Build the summary with .assign so the new columns are added to a fresh
# frame rather than written onto a column subset of another frame (the
# pattern that triggers pandas' SettingWithCopyWarning).
df_r = df_scores[['top_5_PCs', 'next_5_PCs']].assign(
    # average score over the five random feature groups
    rand_features_mean=df_scores.iloc[:, 2:].mean(axis=1),
    # True where the top-5 PCs scored higher than PCs 6-10
    top_5_wins=df_scores.top_5_PCs > df_scores.next_5_PCs
)
df_r

In [None]:
# Class balance of the secom labels. value_counts must be *called*; without
# the parentheses the cell only displays the bound method's repr.
y_data_scm.value_counts()

In [None]:
# Soft-voting ensemble (random forest + logistic regression + k-NN) on the
# full secom feature set, evaluated with 3-fold cross-validated ROC AUC.
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_auc_score

# NOTE(review): these PCs are not used anywhere in this cell; kept in case
# later cells read pc_data / pca_top_5 / pca_next_5 - confirm and remove.
_, pc_data = do_PCA(10, x_data_scm, random_state=RANDOM_STATE)
pca_top_5 = pc_data.iloc[:, :5]
pca_next_5 = pc_data.iloc[:, 5:10]

rfc = RandomForestClassifier(n_estimators=100,
                             random_state=RANDOM_STATE)
lr = LogisticRegression(C=1000, random_state=RANDOM_STATE,
                        solver='liblinear', max_iter=500)
knn = KNeighborsClassifier(n_neighbors=20, weights='distance')

eclf = VotingClassifier(
    estimators=[('rfc', rfc), ('lr', lr),
                ('knn', knn)],
    voting='soft'
)
kf = KFold(n_splits=3, random_state=RANDOM_STATE, shuffle=True)
for train_index, test_index in kf.split(x_data_scm):
    x_train = x_data_scm.iloc[train_index]
    y_train = y_data_scm.iloc[train_index]
    x_test = x_data_scm.iloc[test_index]
    y_test = y_data_scm.iloc[test_index]

    eclf.fit(x_train, y_train)
    # hard class predictions, kept for the inspection printout below
    train_pred = eclf.predict(x_train)
    test_pred = eclf.predict(x_test)

    # ROC AUC ranks samples by a continuous score, so it is computed from the
    # predicted probability of the positive class (column 1) rather than from
    # the thresholded 0/1 predictions.
    train_score = roc_auc_score(y_train, eclf.predict_proba(x_train)[:, 1])
    test_score = roc_auc_score(y_test, eclf.predict_proba(x_test)[:, 1])

    print(f'train_score: {train_score}')
    print(f'test_score: {test_score}')

# predictions of the last fold only
print(train_pred)
print(test_pred)