In [1]:
import os
import pandas as pd
from tqdm import tqdm
from collections import Counter
from pycm import ConfusionMatrix
from imblearn.over_sampling import SMOTE

from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score as accuracy
from sklearn.metrics import balanced_accuracy_score as bal_score
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV

# Load data

In [2]:
# Load .csv files

path = r'Data/Train'
filename = ['Descriptor_global_train.csv', 'Descriptor_atom_train.csv', 'Descriptor_bond_train.csv', 'Descriptor_metal_train.csv',
            'Descriptor_linker_RDKit_train.csv', 'Descriptor_linker_MACCS_train.csv',
            'Descriptor_RACs_train.csv', 'Target_train_1035.csv']

load_data = []
for f in filename:
    if f != filename[-1]:
        load_data.append(pd.read_csv(filepath_or_buffer=os.path.join(path,f),index_col=0).astype(float))
    else:
        load_data.append(pd.read_csv(filepath_or_buffer=os.path.join(path,f),index_col=0))

load_data_content = {'Global':0 , 'Atom': 1, 'Bond': 2, 'Metal': 3, 'Linker_RDKit':4, 'Linker_MACCS':5,'RACs': 6, 'Target': 7}


# Load classifier

In [3]:
# Classifier parameter

ncvfold = 5
scoring = 'f1_macro'
param_grid = {'classifier1': {"n_estimators": [50,100,150,200], "max_depth": [15,25,35,45], "max_features": [2,3,4]},        # classifier 1
              'classifier2': {"n_estimators": [50,200,400], "max_depth": [10,20,30], "max_features": [2,6,10,14]}}           # classifier 2

# SMOTE parameter

smote_grid = {'classifier1': SMOTE(sampling_strategy="auto", random_state=0, k_neighbors=5),                                 # classifier 1
              'classifier2': SMOTE(sampling_strategy="auto", random_state=0, k_neighbors=5)}                                 # classifier 2

In [4]:
def select_classification(select_data, select_data_content, select_descriptor, select_classifier):
    
    # (1) Build dataset for training
    
    all_index = [select_data[select_data_content[i]].index.tolist() for i in select_descriptor]
    common_index = list(set.intersection(*[set(list_) for list_ in all_index]))
    
    output_train = pd.DataFrame()
    for i in select_descriptor:
        output_train = pd.concat([output_train,select_data[select_data_content[i]].loc[common_index,:]], axis=1, ignore_index=False)
    output_train = pd.concat([output_train,select_data[-1].loc[common_index,'stability']], axis=1, ignore_index=False)
    output_train.sort_index(key=lambda x: [int(i[3:]) for i in x], inplace=True)
    
    # (2) Classification strategy
    
    file = os.path.join(r'Data/Descriptor',select_classifier+'_descriptor.csv')
    selXcols = [str(i) for i in list(pd.read_csv(file)['descriptor'])]
    
    if select_classifier == 'classifier1':
        # Classifier 1: 3 / 2, 1, 0
        output_train.loc[output_train['stability']==3,'stability']=-1
        output_train.loc[output_train['stability']==2,'stability']=1
        output_train.loc[output_train['stability']==1,'stability']=1
        output_train.loc[output_train['stability']==0,'stability']=1

    elif select_classifier == 'classifier2':
        # Classfier 2: 3, 2 / 1, 0
        output_train.loc[output_train['stability']==3,'stability']=-1
        output_train.loc[output_train['stability']==2,'stability']=-1
        output_train.loc[output_train['stability']==1,'stability']=1
        output_train.loc[output_train['stability']==0,'stability']=1
    
    output_X = output_train.iloc[:,:-1]
    output_Y = output_train.iloc[:,-1]
    output_X = output_X[selXcols].copy()
    output_X.sort_index(axis=1,inplace=True)
    output_train = output_train[selXcols]

    # (3) Classifier
    
    output_rf_clf = RandomForestClassifier(class_weight="balanced_subsample", oob_score=True, random_state=0, n_jobs=-1)
    output_CLF = GridSearchCV(output_rf_clf, cv = StratifiedKFold(n_splits=ncvfold, random_state=0, shuffle=True),
                              param_grid = param_grid[select_classifier], scoring=scoring, n_jobs=-1)

    # (4) SMOTE

    output_smote = smote_grid[select_classifier]
    
    print(select_classifier, ':')
    print('Stable MOF: ', Counter(output_Y)[-1], ', Unstable MOF: ', Counter(output_Y)[1])
    print('Feature space dimension:', output_X.shape)
    return output_train, output_X, output_Y, output_CLF, output_smote
    

In [5]:
def metric(input_seed, input_type, input_true, input_pred):
    
    output_metric_all = pd.DataFrame()
    output_metric_all.loc[input_seed,input_type+'_ACC(w)'] = bal_score(y_true=input_true, y_pred=input_pred)
    output_metric_all.loc[input_seed,input_type+'_ACC(u)'] = accuracy(y_true=input_true, y_pred=input_pred)
    output_metric_all.loc[input_seed,input_type+'_PPV'] = precision_score(y_true=input_true, y_pred=input_pred, pos_label=-1)
    output_metric_all.loc[input_seed,input_type+'_TPR'] = recall_score(y_true=input_true, y_pred=input_pred, pos_label=-1)
    output_metric_all.loc[input_seed,input_type+'_F1'] = f1_score(y_true=input_true, y_pred=input_pred, pos_label=-1)
    output_metric_all.loc[input_seed,input_type+'_AUC'] = roc_auc_score(input_true,input_pred)

    output_metric_per_class = pd.DataFrame()
    cm = ConfusionMatrix(actual_vector=input_true, predict_vector=input_pred)
    output_metric_per_class.loc[input_seed,'AUC_S_'+input_type], output_metric_per_class.loc[input_seed,'AUC_U_'+input_type] = cm.AUC[-1], cm.AUC[1]
    output_metric_per_class.loc[input_seed,'PPV_S_'+input_type], output_metric_per_class.loc[input_seed,'PPV_U_'+input_type] = cm.PPV[-1], cm.PPV[1]
    output_metric_per_class.loc[input_seed,'TPR_S_'+input_type], output_metric_per_class.loc[input_seed,'TPR_U_'+input_type] = cm.TPR[-1], cm.TPR[1]
    output_metric_per_class.loc[input_seed,'F1_S_'+input_type], output_metric_per_class.loc[input_seed,'F1_U_'+input_type] = cm.F1[-1], cm.F1[1] 
    
    return output_metric_all, output_metric_per_class


# Train classifier: Ⅰ

In [6]:
splits = [0.8]
seeds = range(20)

select_descriptor = ['Global','Metal','Linker_MACCS','Linker_RDKit']
data_all, data_X, data_Y, CLF, smote = select_classification(select_data = load_data,
                                                             select_data_content = load_data_content, 
                                                             select_descriptor=select_descriptor,
                                                             select_classifier='classifier1')


classifier1 :
Stable MOF:  678 , Unstable MOF:  357
Feature space dimension: (1035, 75)


In [7]:
for split in tqdm(splits):

    metric_all = pd.DataFrame()
    metric_per_class = pd.DataFrame()
    
    for rnseed in tqdm(seeds, position=0):

 		# (1) Test-train split
        X_train, X_test, y_train, y_test = train_test_split(data_X.values, data_Y.values, test_size=1-split,
                                                            stratify=data_Y.values, random_state=rnseed)
        scaler = StandardScaler()
        scaler_save = scaler.fit(X_train)
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)
       
        # (2) SMOTE
        X_train, y_train = smote.fit_resample(X_train, y_train)
                
        # (3) Train classifier
        CLF.fit(X_train, y_train)
        CLF_best = CLF.best_estimator_ 
        
 		# (4) Evaluate classifier         
        
 		# (4.1) Training result
        pred_train = CLF_best.predict(X_train)
        m1_tr, m2_tr = metric(input_seed=rnseed, input_type='Train', input_true = y_train, input_pred = pred_train)

 		# (4.2) Test result
        pred_test = CLF_best.predict(X_test)
        m1_tt, m2_tt = metric(input_seed=rnseed, input_type='Test', input_true = y_test, input_pred = pred_test)

        metric_all = pd.concat([metric_all, pd.concat([m1_tr, m1_tt],axis=1)], axis=0)
        metric_per_class = pd.concat([metric_per_class, pd.concat([m2_tr, m2_tt],axis=1)], axis=0)
        
        # (5) Save model
        # if savemodel:        
            # filename = r'Model/classifier1.pkl'
            # pickle.dump(CLF_best, open(filename, 'wb'))		
            # filename = r'Model/classifier1_scaler.pkl'
            # pickle.dump(scaler_save, open(filename, 'wb'))

# Print metric
metric_summary = pd.concat([metric_all.mean(), metric_all.median(), metric_all.min(), metric_all.max()],axis=1)
metric_summary.columns=['mean','median','min','max']
print(metric_summary.iloc[7:,].round(2))


100%|██████████| 20/20 [02:14<00:00,  6.72s/it]
100%|██████████| 1/1 [02:14<00:00, 134.48s/it]

             mean  median   min   max
Test_ACC(u)  0.82    0.82  0.76  0.87
Test_PPV     0.86    0.86  0.83  0.91
Test_TPR     0.87    0.88  0.80  0.93
Test_F1      0.86    0.86  0.81  0.90
Test_AUC     0.80    0.79  0.74  0.86





# Train classifier: Ⅱ

In [8]:
splits = [0.8]
seeds = range(20)

select_descriptor = ['Global','Metal','Linker_MACCS','Linker_RDKit']
data_all, data_X, data_Y, CLF, smote = select_classification(select_data = load_data,
                                                             select_data_content = load_data_content, 
                                                             select_descriptor=select_descriptor,
                                                             select_classifier='classifier2')


classifier2 :
Stable MOF:  863 , Unstable MOF:  172
Feature space dimension: (1035, 60)


In [9]:
for split in tqdm(splits):

    metric_all = pd.DataFrame()
    metric_per_class = pd.DataFrame()
    
    for rnseed in tqdm(seeds, position=0):

 		# (1) Test-train split
        X_train, X_test, y_train, y_test = train_test_split(data_X.values, data_Y.values, test_size=1-split,
                                                            stratify=data_Y.values, random_state=rnseed)
        scaler = StandardScaler()
        scaler_save = scaler.fit(X_train)
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)
       
        # (2) SMOTE
        X_train, y_train = smote.fit_resample(X_train, y_train)
                
        # (3) Train classifier
        CLF.fit(X_train, y_train)
        CLF_best = CLF.best_estimator_ 
        
 		# (4) Evaluate classifier         
        
 		# (4.1) Training result
        pred_train = CLF_best.predict(X_train)
        m1_tr, m2_tr = metric(input_seed=rnseed, input_type='Train', input_true = y_train, input_pred = pred_train)

 		# (4.2) Test result
        pred_test = CLF_best.predict(X_test)
        m1_tt, m2_tt = metric(input_seed=rnseed, input_type='Test', input_true = y_test, input_pred = pred_test)

        metric_all = pd.concat([metric_all, pd.concat([m1_tr, m1_tt],axis=1)], axis=0)
        metric_per_class = pd.concat([metric_per_class, pd.concat([m2_tr, m2_tt],axis=1)], axis=0)
        
        # (5) Save model
        # if savemodel:        
            # filename = r'Model/classifier1.pkl'
            # pickle.dump(CLF_best, open(filename, 'wb'))		
            # filename = r'Model/classifier1_scaler.pkl'
            # pickle.dump(scaler_save, open(filename, 'wb'))

# Print metric
metric_summary = pd.concat([metric_all.mean(), metric_all.median(), metric_all.min(), metric_all.max()],axis=1)
metric_summary.columns=['mean','median','min','max']
print(metric_summary.iloc[7:,].round(2))


100%|██████████| 20/20 [04:02<00:00, 12.11s/it]
100%|██████████| 1/1 [04:02<00:00, 242.28s/it]

             mean  median   min   max
Test_ACC(u)  0.88    0.88  0.84  0.92
Test_PPV     0.91    0.90  0.88  0.94
Test_TPR     0.95    0.95  0.92  0.99
Test_F1      0.93    0.93  0.91  0.95
Test_AUC     0.73    0.72  0.63  0.81



