In [1]:
import RENT 
import BIC_hyperparameter_search
import numpy as np
import pandas as pd

from sklearn.metrics import f1_score, matthews_corrcoef, accuracy_score, matthews_corrcoef
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Forward slashes are accepted by Python on Windows as well as on Linux/macOS;
# the previous '\\' separators only resolved on Windows.
DATA_DIR = 'data/c0'

# Feature tables: the first CSV column is dropped (presumably the index that
# was written when the files were saved -- confirm against the export code).
train_data = pd.read_csv(f'{DATA_DIR}/class_train.csv').iloc[:, 1:]
test_data = pd.read_csv(f'{DATA_DIR}/class_test.csv').iloc[:, 1:]

# Label files: only the second column is used as the target vector.
train_labels = pd.read_csv(f'{DATA_DIR}/class_train_labels.csv').iloc[:, 1]
test_labels = pd.read_csv(f'{DATA_DIR}/class_test_labels.csv').iloc[:, 1]

# the data was generated with the following code

#from sklearn.datasets import make_classification
#from sklearn.model_selection import train_test_split
#    
#data = make_classification(n_samples=250, n_features=1000, n_informative=20, n_redundant=100, random_state=0, shuffle=False)
#
#my_data = pd.DataFrame(data[0])
#my_target = data[1]
#my_feat_names = ['f{0}'.format(x+1) for x in range(len(my_data.columns))]
#
#train_data, test_data, train_labels, test_labels = train_test_split(my_data, my_target, test_size=0.3, random_state=0)

In [3]:
# Candidate regularization strengths C for the penalty term.
# At least one value is required.
my_C_params = [0.1, 1, 10]

# Candidate l1-ratios for the elastic net. At least one value is required.
my_l1_ratios = [0, 0.1, 0.25, 0.5, 0.75, 0.9, 1]

# Configure RENT on the training split. The column labels of the training
# frame serve as feature names, and K is the number of models in the ensemble
# (echoed as such in the verbose output of this cell).
model = RENT.RENT_Classification(
    data=train_data,
    target=train_labels,
    feat_names=train_data.columns,
    C=my_C_params,
    l1_ratios=my_l1_ratios,
    autoEnetParSel=True,
    BIC=True,
    poly='OFF',
    testsize_range=(0.25, 0.25),
    scoring='mcc',
    classifier='logreg',
    K=100,
    random_state=0,
    verbose=1,
)

data dimension: (175, 1000)  data type: <class 'pandas.core.frame.DataFrame'>
target dimension: (175,)
regularization parameters C: [0.1, 1, 10]
elastic net l1_ratios: [0, 0.1, 0.25, 0.5, 0.75, 0.9, 1]
poly: OFF
number of models in ensemble: 100
random state: 0
verbose: 1


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.


classifier: logreg
scoring: mcc


[Parallel(n_jobs=-1)]: Done   7 out of   7 | elapsed:    2.8s finished


In [4]:
# Show the BIC table held by the RENT object: as the output below shows, rows
# are labelled with the l1_ratio candidates and columns with the C candidates.
model._BIC_df

Unnamed: 0,0.1,1.0,10.0
0.0,5180.528134,5172.763243,5172.077357
0.1,2111.615137,4388.93376,5167.008545
0.25,1108.488513,3399.363997,5053.533439
0.5,554.683852,2519.693602,4816.218837
0.75,407.937528,2016.873843,4640.898728
0.9,373.822666,1750.154084,4465.472747
1.0,323.260014,1642.904857,4367.46197


In [5]:
# Run RENT training with the configuration defined above.
model.train()

In [6]:
# Show the elastic-net parameters RENT settled on.
# Presumably ordered (C, l1_ratio): the smallest entry of the BIC table above
# sits at C=0.1 / l1_ratio=1.0 -- confirm against the RENT documentation.
model.get_enet_params()

(0.1, 1.0)

In [7]:
# Candidate cutoffs for the three RENT selection criteria. t1 and t2 share the
# same np.arange grid (nominally 0.2, 0.25, ..., 1.0); t3 is an explicit list.
cutoff_parameters = dict(
    t1=np.arange(0.2, 1.05, 0.05),
    t2=np.arange(0.2, 1.05, 0.05),
    t3=[0.9, 0.95, 0.975, 0.99],
)

# NOTE(review): the search is handed the *test* split. If it scores candidates
# on that data, the test metrics reported further down are optimistically
# biased -- confirm what BIC_hyperparameter_search does with these arguments.
result = BIC_hyperparameter_search.BIC_hyperparameter_search(
    model, cutoff_parameters, test_data, test_labels
)

In [8]:
# Grid of BIC values from the cutoff search; its three axes are indexed in
# parallel with cutoff_parameters['t1'], ['t2'] and ['t3'] below.
BIC = result['BIC']

# All grid positions that attain the minimal BIC, one column per position
# (row 0: t1 index, row 1: t2 index, row 2: t3 index).
indices_matrix = np.stack(np.where(BIC == np.min(BIC)), axis=0)

# Break ties hierarchically: take the largest t1 index, then the largest t2
# index among the remaining positions, then the largest t3 index.
tau1 = np.max(indices_matrix[0,:])
indices_matrix = indices_matrix[:,np.where(indices_matrix[0,:] == tau1)[0]]
tau2 = np.max(indices_matrix[1,:])
indices_matrix = indices_matrix[:,np.where(indices_matrix[1,:] == tau2)[0]]
tau3 = np.max(indices_matrix[2,:])

# Translate grid indices back into cutoff values. t1/t2 come from np.arange
# with a float step, so rounding to two decimals strips representation noise
# from the 0.05-spaced grid.
tau1 = np.round(cutoff_parameters['t1'][tau1], 2)
tau2 = np.round(cutoff_parameters['t2'][tau2], 2)
# t3 is a list of exact literals that includes 0.975. Rounding it to two
# decimals would replace that candidate with a different cutoff than the one
# the search evaluated, so the grid value is used unchanged.
tau3 = cutoff_parameters['t3'][tau3]


selected_features = model.select_features(tau_1_cutoff=tau1, tau_2_cutoff=tau2, tau_3_cutoff=tau3)

In [9]:
# Display the selected features; the values are used as positional column
# indices (via .iloc) when the final model is fitted below.
selected_features

array([  2,  24,  42,  47,  53,  66,  70,  91, 147, 168, 226, 350],
      dtype=int64)

In [10]:
# Standardize only the RENT-selected columns. The scaler is fitted on the
# training split and then applied unchanged to the test split.
sc = StandardScaler()
train_data_sc = sc.fit_transform(train_data.iloc[:, selected_features])
test_data_sc = sc.transform(test_data.iloc[:, selected_features])

# Train an unpenalized logistic regression on the selected, scaled features.
# From here on `model` refers to this classifier, not to the RENT object.
# NOTE(review): recent scikit-learn releases expect penalty=None instead of
# the string 'none' -- confirm against the installed version.
model = (
    LogisticRegression(penalty='none', max_iter=8000, solver="saga", random_state=0)
    .fit(train_data_sc, train_labels)
)

In [11]:
def predictions(estimator=None, features=None, labels=None):
    """Print per-class F1, accuracy and Matthews correlation for a classifier.

    Called without arguments it behaves as before: the notebook-level
    ``model`` is evaluated on ``test_data_sc`` against ``test_labels``.
    Passing the objects explicitly removes the dependence on whatever those
    globals happen to hold when the cell is run.

    Parameters
    ----------
    estimator : object with a ``predict`` method, optional
        Fitted classifier. Defaults to the global ``model``.
    features : array-like, optional
        Inputs passed to ``estimator.predict``. Defaults to the global
        ``test_data_sc``.
    labels : array-like of 0/1, optional
        True labels. Defaults to the global ``test_labels``. The labels must
        be coded 0/1 because the class-0 F1 is computed on ``1 - labels``.

    Returns
    -------
    None
        The four scores are printed, one per line.
    """
    if estimator is None:
        estimator = model
    if features is None:
        features = test_data_sc
    if labels is None:
        labels = test_labels

    labels = np.asarray(labels)
    # Predict once and reuse the result for all four metrics.
    predicted = estimator.predict(features)

    print("f1 1: ", f1_score(labels, predicted))
    # Flipping 0/1 in both vectors makes class 0 the positive class.
    print("f1 0: ", f1_score(1 - labels, 1 - predicted))
    print("Accuracy: ", accuracy_score(labels, predicted))
    print("Matthews correlation coefficient: ", matthews_corrcoef(labels, predicted))

In [12]:
# Test-set scores of the logistic regression fitted on the RENT-selected features.
predictions()

f1 1:  0.7532467532467533
f1 0:  0.7397260273972601
Accuracy:  0.7466666666666667
Matthews correlation coefficient:  0.49536820303070234


# Competing methods

In [13]:
from sklearn.svm import SVC
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
from ReliefF import ReliefF

In [14]:
# M0: baseline that keeps every feature with a non-zero coefficient in a single
# elastic-net logistic regression (C=0.1, l1_ratio=1).

# Standardize the full training feature matrix. `scaled_data` is reused by the
# RFE, reliefF and random forest cells further down.
sc = StandardScaler()
scaled_data = sc.fit_transform(train_data)

sgd = LogisticRegression(
    penalty="elasticnet", C=0.1, l1_ratio=1, solver="saga", random_state=0
)
sgd.fit(scaled_data, train_labels)
# Index [1] picks the column (feature) positions of the non-zero coefficients.
params = np.where(sgd.coef_ != 0)[1]

# Refit an unpenalized logistic regression on the retained columns only, with
# a scaler fitted on the training split and reused for the test split.
sc = StandardScaler()
train_data_sc = sc.fit_transform(train_data.iloc[:, params])
test_data_sc = sc.transform(test_data.iloc[:, params])
model = (
    LogisticRegression(penalty='none', max_iter=8000, solver="saga", random_state=0)
    .fit(train_data_sc, train_labels)
)

predictions()

f1 1:  0.6301369863013698
f1 0:  0.6493506493506493
Accuracy:  0.64
Matthews correlation coefficient:  0.29074308227141493


In [15]:
# Number of features RENT selected; the competing selectors below are asked
# to return this many features.
num_features = len(selected_features)

In [16]:
# RFE: recursive feature elimination driven by a linear SVM, keeping as many
# features as RENT selected and removing one feature per iteration.

svc = SVC(kernel="linear", C=1)
rfe = RFE(estimator=svc, n_features_to_select=num_features, step=1)
rfe.fit(scaled_data, train_labels)
# Features that survive the elimination carry rank 1.
params = np.where(rfe.ranking_ == 1)[0]

# Refit an unpenalized logistic regression on the retained columns only, with
# a scaler fitted on the training split and reused for the test split.
sc = StandardScaler()
train_data_sc = sc.fit_transform(train_data.iloc[:,params])
test_data_sc = sc.transform(test_data.iloc[:, params])
# random_state is fixed like in the other final fits of this notebook: the
# saga solver is stochastic, so an unseeded fit is not reproducible.
model = LogisticRegression(penalty='none', max_iter=8000, solver="saga", random_state=0).\
        fit(train_data_sc,train_labels)

predictions()

f1 1:  0.7
f1 0:  0.6571428571428571
Accuracy:  0.68
Matthews correlation coefficient:  0.35714285714285715


In [17]:
# reliefF: keep as many features as RENT selected, with the neighbour count
# set to one third of the number of training rows (rounded).
relief_neighbors = np.round(train_data.shape[0] / 3).astype(int)
rfF = ReliefF(n_neighbors=relief_neighbors, n_features_to_keep=num_features)
rfF.fit(scaled_data, train_labels)
params = rfF.top_features[:num_features]


# Refit an unpenalized logistic regression on the retained columns only, with
# a scaler fitted on the training split and reused for the test split.
sc = StandardScaler()
train_data_sc = sc.fit_transform(train_data.iloc[:, params])
test_data_sc = sc.transform(test_data.iloc[:, params])
model = (
    LogisticRegression(penalty='none', max_iter=8000, solver="saga", random_state=0)
    .fit(train_data_sc, train_labels)
)

predictions()

f1 1:  0.5277777777777778
f1 0:  0.5641025641025641
Accuracy:  0.5466666666666666
Matthews correlation coefficient:  0.10447076320889258


In [18]:
# random forest: keep the num_features columns with the largest
# feature_importances_ of a forest fitted on the scaled training data.
clf = RandomForestClassifier(random_state=0)
clf.fit(scaled_data, train_labels)
# argsort of the negated importances lists columns from most to least
# important; the first num_features of them are kept and put back into
# ascending column order (the same columns, in the same order, as the former
# double-argsort rank mask).
ind = np.sort(np.argsort(-clf.feature_importances_)[:num_features])

# Refit an unpenalized logistic regression on the retained columns only, with
# a scaler fitted on the training split and reused for the test split.
sc = StandardScaler()
train_data_sc = sc.fit_transform(train_data.iloc[:,ind])
test_data_sc = sc.transform(test_data.iloc[:, ind])
# random_state is fixed like in the other final fits of this notebook: the
# saga solver is stochastic, so an unseeded fit is not reproducible.
model = LogisticRegression(penalty='none', max_iter=8000, solver="saga", random_state=0).\
        fit(train_data_sc,train_labels)

predictions()

f1 1:  0.7466666666666666
f1 0:  0.7466666666666666
Accuracy:  0.7466666666666667
Matthews correlation coefficient:  0.5
