In [2]:
from numpy import array
from numpy import argmax
from sklearn.preprocessing import LabelEncoder


In [3]:
# ! pip install --upgrade augmentdata

In [4]:
import pandas as pd
import time
import matplotlib.pyplot as plt
import numpy as np

In [5]:
from keras.callbacks import ModelCheckpoint
from keras.models import Sequential
import keras
from keras.layers import Dense

from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn import svm
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import ADASYN


Using TensorFlow backend.


In [6]:
from augmentdata import data_augment

In [7]:
# ! pip freeze

In [8]:
def check_cm_others(y_actual,y_predict):
    """Print and return binary-classification metrics taken from the confusion matrix.

    The confusion matrix is indexed as [actual, predicted]; row/column 0 is
    treated as the negative class and row/column 1 as the positive class.

    Parameters
    ----------
    y_actual : array-like
        Ground-truth labels.
    y_predict : array-like
        Predicted labels, same length as ``y_actual``.

    Returns
    -------
    tuple
        ``(sensitivity, specificity, f1_score)``. A metric whose denominator is
        zero (for example precision when no positive is predicted) is returned
        as NaN, so callers can filter it with ``math.isnan``.

    Raises
    ------
    ValueError
        If the labels do not produce a 2x2 confusion matrix.
    """
    from sklearn.metrics import confusion_matrix

    def ratio(numerator, denominator):
        # Return NaN explicitly for an undefined ratio instead of relying on
        # NumPy's 0/0 RuntimeWarning to produce it as a side effect.
        if denominator == 0 or denominator != denominator:  # zero or NaN
            return float("nan")
        return numerator / denominator

    cm1 = confusion_matrix(y_actual,y_predict)
    print('Confusion Matrix : \n', cm1)

    # The cell look-ups below are only meaningful for exactly two classes.
    if cm1.shape != (2, 2):
        raise ValueError(
            "check_cm_others expects a binary problem (2x2 confusion matrix), got shape %s"
            % (cm1.shape,)
        )

    tn=cm1[0,0]
    fp=cm1[0,1]
    fn=cm1[1,0]
    tp=cm1[1,1]

    accuracy1 = ratio(tn+tp, cm1.sum())
    print ('Accuracy : ', accuracy1)

    sensitivity1 = ratio(tp, tp+fn)
    print('Sensitivity : ', sensitivity1 )

    specificity1 = ratio(tn, tn+fp)
    print('Specificity : ', specificity1)

    # Recall is the same quantity as sensitivity.
    recall = sensitivity1
    precision = ratio(tp, tp+fp)

    print("Precision = ",precision)
    print("Recall = ",recall)

    f1_score = ratio(2*(precision*recall), precision+recall)

    print("F1 score = ",f1_score)

    return sensitivity1,specificity1,f1_score

def create_model(weight_path,input_dim):
    """Build and compile the MLP used throughout the notebook.

    The network has six hidden ``Dense(20, relu)`` layers followed by a
    two-unit softmax output. It is compiled with categorical cross-entropy
    and Adam (learning rate 1e-5).

    Parameters
    ----------
    weight_path : str
        File the checkpoint callback writes to. Its parent directory is
        created if it does not exist yet.
    input_dim : int
        Number of input features.

    Returns
    -------
    tuple
        ``(model, callbacks_list)``. ``callbacks_list`` holds one
        ``ModelCheckpoint`` that keeps only the weights with the lowest
        ``val_loss``.
    """
    import os

    # The checkpoint is written into this directory during fit(); create it up
    # front so a fresh checkout does not fail on the first save.
    checkpoint_dir = os.path.dirname(weight_path)
    if checkpoint_dir:
        os.makedirs(checkpoint_dir, exist_ok=True)

    checkpoint = ModelCheckpoint(weight_path, monitor='val_loss', verbose=0, save_best_only=True, mode='min')
    callbacks_list = [checkpoint]

    model = Sequential()
    # First hidden layer also declares the input width.
    model.add(Dense(20, input_dim=input_dim, activation='relu'))
    # Five more identical hidden layers.
    for _ in range(5):
        model.add(Dense(20, activation='relu'))
    # One output unit per class; targets are one-hot encoded by the callers.
    model.add(Dense(2, activation='softmax'))

    opt=keras.optimizers.Adam(lr=.00001, beta_1=0.9, beta_2=0.999, amsgrad=False)
    model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy'])

    return model, callbacks_list
    

### Data preprocessing

In [9]:
# Load the diabetes dataset; the last column ("Outcome") is used as the class label below.
df=pd.read_csv("data/diabetes.csv")

In [10]:
# (rows, columns) of the loaded dataset.
df.shape

(768, 9)

In [11]:
# Preview the first rows.
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [12]:
# Class balance of the label column.
df["Outcome"].value_counts()

0    500
1    268
Name: Outcome, dtype: int64

In [13]:
# Column dtypes.
df.dtypes

Pregnancies                   int64
Glucose                       int64
BloodPressure                 int64
SkinThickness                 int64
Insulin                       int64
BMI                         float64
DiabetesPedigreeFunction    float64
Age                           int64
Outcome                       int64
dtype: object

In [14]:
# Random train/test split: each row goes to the training set with probability 0.8,
# so about 20% of the rows are held out for testing. Seeding NumPy's global RNG
# makes the split repeatable.
np.random.seed(42)
msk = np.random.rand(len(df)) < 0.8
train, test = df[msk], df[~msk]

In [15]:
# Row counts of both splits first, then their full shapes.
for part in (test, train):
    print(len(part))
for part in (test, train):
    print(part.shape)

159
609
(159, 9)
(609, 9)


In [16]:
# Class balance of the test split, then of the training split.
for part in (test, train):
    print(part["Outcome"].value_counts())


0    115
1     44
Name: Outcome, dtype: int64
0    385
1    224
Name: Outcome, dtype: int64


In [17]:
# Keep the column index; it is reused later to rebuild the augmented array as a DataFrame.
columns=df.columns
print(columns)

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')


### Train with no augmentation

In [18]:
# One result table per oversampling method; `results` holds the augmentdata runs.
results, smote_results, adasyn_results = {}, {}, {}

In [None]:
# Baseline run: fit every classifier on the original (un-augmented) training split.
# The same scores are stored under key "0" in all three result tables, presumably so
# each oversampling method can be compared with an identical reference.
results["0"]={}
smote_results["0"]={}
adasyn_results["0"]={}


def _record_baseline(model_name, scores):
    """Store one classifier's baseline metrics in every result table.

    ``scores`` is the ``(sensitivity, specificity, f1_score)`` tuple returned by
    ``check_cm_others``.
    """
    sensitivity, specificity, f1_score = scores
    for table in (results, smote_results, adasyn_results):
        # A fresh dict per table so the three tables never share mutable state.
        table["0"][model_name] = {
            "F1": f1_score,
            "sensitivity": sensitivity,
            "specificity": specificity,
        }


# Every column except the last one ("Outcome") is a feature.
source=train[train.columns[:-1]]
print(source.shape)
test_features_only=test[test.columns[:-1]]

# ----- MLP -----
weight_path="weights/wt1.hdf5"
model,callbacks_list=create_model(weight_path,source.shape[1])
# One-hot targets to match the two-unit softmax output.
target=pd.get_dummies(list(train["Outcome"]))
start=time.time()
print("Training MLP")
history = model.fit(source.values, target,epochs=200,validation_split=0.2,callbacks=callbacks_list,verbose=0)
difference = int(time.time() - start)
print("Time taken to train = ",difference)

plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train','validation'], loc='upper left')
plt.show()

# Restore the best-val_loss checkpoint. create_model() already compiled the network,
# so no second compile is needed before evaluate()/predict().
model.load_weights(weight_path)
print("used model and loaded weights from file")

print("Trying MLP")
_, accuracy = model.evaluate(test_features_only.values, pd.get_dummies(list(test["Outcome"])))
print('Accuracy: %.2f' % (accuracy*100))

y_actual=np.asarray(test["Outcome"].astype(int))
# Sequential.predict_classes() was removed from recent Keras releases; taking the
# argmax over the softmax outputs gives the same class indices on every version.
y_predict=np.argmax(model.predict(test_features_only.values), axis=1)
_record_baseline("MLP", check_cm_others(y_actual,y_predict))

# ----- scikit-learn classifiers (plain label vector instead of one-hot targets) -----
target = list(train["Outcome"])
for model_name, make_classifier in (("RF", RandomForestClassifier),
                                    ("GNB", GaussianNB),
                                    ("SVM", svm.SVC)):
    print(model_name)
    clf = make_classifier()
    clf.fit(source.values,target)
    y_predict = clf.predict(test_features_only.values)
    _record_baseline(model_name, check_cm_others(y_actual,y_predict))






In [None]:
# Dump the three result tables in the order KNNOR, SMOTE, ADASYN.
for store in (results, smote_results, adasyn_results):
    print(store)

In [None]:
# Class balance of the training split, i.e. before any oversampling.
train["Outcome"].value_counts()

### Train with augmentation

In [None]:
# Grid search over N and k for three oversampling methods: augmentdata (reported as
# "KNNOR" further down), SMOTE and ADASYN. For every (N, k) pair each method produces
# a training set, on which the MLP, RF, GNB and SVM are trained and scored on `test`.
N_range=[50,100,160]
k_range=[1,2,5,10]

# Settings shared by every run. They stay at module level because later cells print them.
class_index=1
randmx=.001
dist_percent=0.2


def _fit_mlp(features, labels, weight_path, banner):
    """Train a fresh MLP and return it with its best checkpoint loaded.

    features    -- 2-D array of training features
    labels      -- 1-D sequence of class labels (one-hot encoded here)
    weight_path -- checkpoint file handed to create_model()
    banner      -- message printed just before training starts
    """
    model, callbacks_list = create_model(weight_path, features.shape[1])
    start = time.time()
    print(banner)
    history = model.fit(features, pd.get_dummies(labels), epochs=300, validation_split=0.2,
                        callbacks=callbacks_list, verbose=0)
    print("Time taken to train = ", int(time.time() - start))

    plt.plot(history.history['loss'])
    plt.plot(history.history['val_loss'])
    plt.title('model loss')
    plt.ylabel('loss')
    plt.xlabel('epoch')
    plt.legend(['train','validation'], loc='upper left')
    plt.show()

    # Restore the best-val_loss checkpoint. create_model() already compiled the
    # network, so no second compile is needed before evaluate()/predict().
    model.load_weights(weight_path)
    print("used model and loaded weights from file")
    return model


def _score_models(model, features, labels, store):
    """Score the trained MLP plus freshly fitted RF/GNB/SVM on the test split.

    Each classifier's metrics are written to ``store[name]`` as a dict with the
    keys "F1", "sensitivity" and "specificity".
    """
    test_features = test[test.columns[:-1]].values
    y_actual = np.asarray(test["Outcome"].astype(int))

    def record(name, y_predict):
        sensitivity, specificity, f1_score = check_cm_others(y_actual, y_predict)
        store[name] = {"F1": f1_score, "sensitivity": sensitivity, "specificity": specificity}

    print("Trying MLP")
    _, accuracy = model.evaluate(test_features, pd.get_dummies(list(test["Outcome"])))
    print('Accuracy: %.2f' % (accuracy*100))
    # Sequential.predict_classes() was removed from recent Keras releases; the argmax
    # over the softmax outputs gives the same class indices on every version.
    record("MLP", np.argmax(model.predict(test_features), axis=1))

    for name, make_classifier in (("RF", RandomForestClassifier),
                                  ("GNB", GaussianNB),
                                  ("SVM", svm.SVC)):
        print(name)
        classifier = make_classifier()
        classifier.fit(features, labels)
        record(name, classifier.predict(test_features))


def _make_smote(k, samp_strategy):
    """SMOTE configured to reach the requested minority/majority ratio."""
    return SMOTE(random_state=2, k_neighbors=k, sampling_strategy=samp_strategy)


def _make_adasyn(k, samp_strategy):
    """ADASYN with k neighbours.

    NOTE(review): samp_strategy is deliberately not passed on, mirroring the
    original code where that variant was commented out. ADASYN therefore uses its
    default sampling strategy and its results do not depend on N. Confirm this is
    intended.
    """
    return ADASYN(random_state=2, n_neighbors=k)


def _run_oversampler(label, make_sampler, store, N, k):
    """Oversample the original training split, then train and score all classifiers.

    label        -- "SMOTE" or "ADASYN"; used in log lines and the checkpoint name
    make_sampler -- callable (k, samp_strategy) -> imbalanced-learn sampler
    store        -- dict that receives one metrics dict per classifier
    """
    print("******Starting {}***********".format(label))
    train_features = train[train.columns[:-1]]
    print("Shape of training ", train_features.shape)

    y_train = train["Outcome"].values
    num_minority = sum(y_train==class_index)
    num_majority = len(y_train)-num_minority
    print(num_majority,num_minority)

    # Target minority/majority ratio after adding N synthetic minority rows.
    samp_strategy = (num_minority+N)/num_majority
    print(samp_strategy)
    now = time.time()
    sampler = make_sampler(k, samp_strategy)
    # fit_resample() replaces fit_sample(), which imbalanced-learn 0.8 removed.
    X_train_res, y_train_res = sampler.fit_resample(train_features, y_train.ravel())
    # Depending on the imbalanced-learn version this is a DataFrame or an ndarray.
    X_train_res = np.asarray(X_train_res)
    print("Time taken to augment by {} = ".format(label), int(time.time() - now))

    print('After OverSampling, the shape of train_X: {}'.format(X_train_res.shape))
    print('After OverSampling, the shape of train_y: {} \n'.format(y_train_res.shape))

    print("After OverSampling, counts of label {}: {}".format(class_index,sum(y_train_res==class_index)))
    print("After OverSampling, counts of other label: {}".format(len(y_train_res)-sum(y_train_res==class_index)))

    weight_path = "weights/{}_{}_{}_wt1.hdf5".format(label.lower(), N, k)
    model = _fit_mlp(X_train_res, y_train_res, weight_path, "Training model")
    _score_models(model, X_train_res, y_train_res, store)


for N in N_range:
    results[N]={}
    smote_results[N]={}
    adasyn_results[N]={}

    for k in k_range:
        results[N][k]={}
        smote_results[N][k]={}
        adasyn_results[N][k]={}

        # ----- augmentdata (KNNOR) -----
        daug = data_augment.DataAugment()
        print("randmx = ",randmx)
        now = time.time()
        # Only the first returned array (the augmented dataset) is used below.
        [Data_a,Ext_d,Ext_not]=daug.augment(data=train.values,k=k,class_ind=class_index,N=N,
                                            randmx=randmx,dist_percent=dist_percent)
        print("Time taken to augment = ",int(time.time() - now))
        print(len(Data_a))

        train_aug=pd.DataFrame(data=Data_a,index=None,columns=columns)

        print("After augmentation of ",N," items with ",k," neighbors")
        print(train_aug["Outcome"].value_counts())

        aug_features = train_aug[train_aug.columns[:-1]].values
        aug_labels = list(train_aug["Outcome"])

        model = _fit_mlp(aug_features, aug_labels,
                         "weights/"+str(N)+"_"+str(k)+"_wt1.hdf5",
                         "Training model for N =  {}  k =  {}".format(N, k))
        print("Test distribution")
        print(test["Outcome"].value_counts())
        _score_models(model, aug_features, aug_labels, results[N][k])

        # ----- SMOTE and ADASYN, both starting from the original training split -----
        _run_oversampler("SMOTE", _make_smote, smote_results[N][k], N, k)
        _run_oversampler("ADASYN", _make_adasyn, adasyn_results[N][k], N, k)
        
        
        



        


In [None]:
# Report, per classifier, the (N, k) combination with the highest F1 in `results`.
import math

print("randmx = ",randmx)
print("dist_percent = ",dist_percent)

checks=["MLP","RF","GNB","SVM"]

# "Nothing found yet" state; an F1 has to be strictly above 0 to replace it.
best_n = best_k = None
best_f1 = best_sensitivity = best_specificity = 0

print("KNNOR")

for check in checks:
    for n_key, by_k in results.items():
        # "0" holds the un-augmented baseline, which has no k level.
        if n_key=="0":
            continue
        for k_key, by_model in by_k.items():
            metrics = by_model[check]
            candidate = metrics["F1"]
            # NaN marks an undefined F1; strict '>' keeps the first of equal scores.
            if not math.isnan(candidate) and candidate > best_f1:
                best_f1 = candidate
                best_n, best_k = n_key, k_key
                best_sensitivity = metrics["sensitivity"]
                best_specificity = metrics["specificity"]

    print(check,best_f1,"[N=",best_n,",k=",best_k,"]")
    print("Sensitivity = ",best_sensitivity)
    print("Specificity = ",best_specificity)

    # Reset for the next classifier.
    best_n = best_k = None
    best_f1 = best_sensitivity = best_specificity = 0



In [None]:
# Report, per classifier, the (N, k) combination with the highest F1 in `smote_results`.
import math

print("randmx = ",randmx)
print("dist_percent = ",dist_percent)

checks=["MLP","RF","GNB","SVM"]

# "Nothing found yet" state; an F1 has to be strictly above 0 to replace it.
best_n = best_k = None
best_f1 = best_sensitivity = best_specificity = 0

print("SMOTE")

for check in checks:
    for n_key, by_k in smote_results.items():
        # "0" holds the un-augmented baseline, which has no k level.
        if n_key=="0":
            continue
        for k_key, by_model in by_k.items():
            metrics = by_model[check]
            candidate = metrics["F1"]
            # NaN marks an undefined F1; strict '>' keeps the first of equal scores.
            if not math.isnan(candidate) and candidate > best_f1:
                best_f1 = candidate
                best_n, best_k = n_key, k_key
                best_sensitivity = metrics["sensitivity"]
                best_specificity = metrics["specificity"]

    print(check,best_f1,"[N=",best_n,",k=",best_k,"]")
    print("Sensitivity = ",best_sensitivity)
    print("Specificity = ",best_specificity)

    # Reset for the next classifier.
    best_n = best_k = None
    best_f1 = best_sensitivity = best_specificity = 0
    



In [None]:
# Report, per classifier, the (N, k) combination with the highest F1 in `adasyn_results`.
import math

print("randmx = ",randmx)
print("dist_percent = ",dist_percent)

checks=["MLP","RF","GNB","SVM"]

# "Nothing found yet" state; an F1 has to be strictly above 0 to replace it.
best_n = best_k = None
best_f1 = best_sensitivity = best_specificity = 0

print("ADASYN")

for check in checks:
    for n_key, by_k in adasyn_results.items():
        # "0" holds the un-augmented baseline, which has no k level.
        if n_key=="0":
            continue
        for k_key, by_model in by_k.items():
            metrics = by_model[check]
            candidate = metrics["F1"]
            # NaN marks an undefined F1; strict '>' keeps the first of equal scores.
            if not math.isnan(candidate) and candidate > best_f1:
                best_f1 = candidate
                best_n, best_k = n_key, k_key
                best_sensitivity = metrics["sensitivity"]
                best_specificity = metrics["specificity"]

    print(check,best_f1,"[N=",best_n,",k=",best_k,"]")
    print("Sensitivity = ",best_sensitivity)
    print("Specificity = ",best_specificity)

    # Reset for the next classifier.
    best_n = best_k = None
    best_f1 = best_sensitivity = best_specificity = 0
    



### Retrain as required


In [31]:
# Re-run the KNNOR augmentation and the GaussianNB classifier for the (N, k)
# settings listed below, repeating until the F1 measured on `test` exceeds
# F1_TARGET or MAX_ATTEMPTS full passes have been made.
#
# Only results[N][k]["GNB"] is written. Metrics already stored for other k
# values, other classifiers, or in smote_results / adasyn_results are left
# untouched, because this cell does not recompute them.
#
# NOTE(review): every attempt is scored on `test` and the loop stops on the
# first attempt that beats the target, so the stored score is selected on the
# test set -- confirm this is intended.

F1_TARGET=0.62      # stop as soon as one run scores above this F1
MAX_ATTEMPTS=200    # hard cap so the cell always terminates; raise if needed

to_break=False
attempt=0
while not to_break and attempt<MAX_ATTEMPTS:
    attempt+=1
    for dist_percent in [0.5]:
        for randmx in [0.5]:
            N_range=[160]
            for N in N_range:
                k_range=[2]
                for k in k_range:
                    # Create the (N, k) slot only when it is missing so that
                    # previously stored metrics survive the retraining.
                    results.setdefault(N,{}).setdefault(k,{})

                    # Passed to augment() as class_ind.
                    # NOTE(review): presumably the label of the class to
                    # oversample -- confirm against the augmentdata docs.
                    class_index=1
    #                 randmx=.25
    #                 dist_percent=0.9


                    daug = data_augment.DataAugment()
                    print("randmx = ",randmx)
                    print("dist_percent = ",dist_percent)

                    now = time.time()
                    [Data_a,Ext_d,Ext_not]=daug.augment(data=train.values,k=k,class_ind=class_index,N=N,
                                                        randmx=randmx,dist_percent=dist_percent)
                    later = time.time()
                    difference = int(later - now)
                    print("Time taken to augment = ",difference)
                    print(len(Data_a))

                    train_aug=pd.DataFrame(data=Data_a,index=None,    # values
                            columns=columns)

                    print("After augmentation of ",N," items with ",k," neighbors")

                    # --- MLP (disabled; uncomment to retrain the MLP) ---
#                     print(train_aug["Outcome"].value_counts())
#                     source=train_aug[train_aug.columns[:-1]]
#                     target = list(train_aug["Outcome"])
#                     target=pd.get_dummies(target)

#                     weight_path="weights/"+str(N)+"_"+str(k)+"_wt1.hdf5"
#                     model,callbacks_list=create_model(weight_path,source.shape[1])
#                     start=time.time()
#                     print("Training model for N = ",N," k = ",k)
#                     history = model.fit(source.values, target,epochs=300,validation_split=0.2,callbacks=callbacks_list,verbose=0)
#                     end=time.time()
#                     difference = int(end - start)
#                     print("Time taken to train = ",difference)

#                     plt.plot(history.history['loss'])
#                     plt.plot(history.history['val_loss'])
#                     plt.title('model loss')
#                     plt.ylabel('loss')
#                     plt.xlabel('epoch')
#                     plt.legend(['train','validation'], loc='upper left')
#                     plt.show()

#                     # load weights
#                     model.load_weights(weight_path)
#                     # Compile model (required to make predictions)
#                     opt=keras.optimizers.Adam(lr=0.00001, beta_1=0.9, beta_2=0.999, amsgrad=False)
#                     model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy'])

#                     print("used model and loaded weights from file")
#                     print("Test distribution")
#                     print(test["Outcome"].value_counts())
#                     print("Trying MLP")
#                     y_actual = list(test["Outcome"])
#                     y_actual=pd.get_dummies(y_actual)
#                     test_features_only=test[test.columns[:-1]]
#                     _, accuracy = model.evaluate(test_features_only.values, y_actual)
#                     print('Accuracy: %.2f' % (accuracy*100))


                    # Ground-truth test labels as an integer array.
                    y_actual = test["Outcome"].astype(int)
                    y_actual=np.asarray(y_actual)
#                     y_predict=model.predict_classes(test_features_only.values)
#                     sensitivity,specificity,f1_score=check_cm_others(y_actual,y_predict)
#                     results[N][k]["MLP"]={"F1":f1_score,"sensitivity":sensitivity,"specificity":specificity}


                    # Inputs for the scikit-learn classifiers: every column but
                    # the last is used as a feature, "Outcome" is the label.
                    target = list(train_aug["Outcome"])
                    source=train_aug[train_aug.columns[:-1]]
                    test_features_only=test[test.columns[:-1]]

                    # --- Random forest (disabled) ---
#                     print("RF")
#                     clf=RandomForestClassifier()
#                     clf.fit(source.values,target)
#                     y_predict=clf.predict(test_features_only.values)
#                     sensitivity,specificity,f1_score=check_cm_others(y_actual,y_predict)
#                     results[N][k]["RF"]={"F1":f1_score,"sensitivity":sensitivity,"specificity":specificity}


                    # --- Gaussian NB ---
                    print("GNB")
                    gnb = GaussianNB()
                    gnb.fit(source.values,target)
                    y_predict = gnb.predict(test_features_only.values)
                    sensitivity,specificity,f1_score=check_cm_others(y_actual,y_predict)
                    # Replace the GNB entry in one step so it is never left
                    # half-filled or empty.
                    results[N][k]["GNB"]={"F1":f1_score,
                                          "sensitivity":sensitivity,
                                          "specificity":specificity}


                    # --- SVM (disabled) ---
#                     print("SVM")
#                     clf = svm.SVC()
#                     clf.fit(source.values,target)
#                     y_predict=clf.predict(test_features_only.values)
#                     sensitivity,specificity,f1_score=check_cm_others(y_actual,y_predict)
#                     results[N][k]["SVM"]={"F1":f1_score,"sensitivity":sensitivity,"specificity":specificity}


                    # A NaN F1 compares False here, so such a run never stops
                    # the search.
                    if f1_score>F1_TARGET:
                        to_break=True
                # Unwind the nested loops once the target has been reached.
                if to_break:
                    break
            if to_break:
                break
        if to_break:
            break

if not to_break:
    print("Stopped after ",attempt," attempts without reaching F1 > ",F1_TARGET)



        


randmx =  0.5
dist_percent =  0.5
Time taken to augment =  0
769
After augmentation of  160  items with  2  neighbors
GNB
Confusion Matrix : 
 [[84 31]
 [13 31]]
Accuracy :  0.7232704402515723
Sensitivity :  0.7045454545454546
Specificity :  0.7304347826086957
Precision =  0.5
Recall =  0.7045454545454546
F1 score =  0.5849056603773585
randmx =  0.5
dist_percent =  0.5
Time taken to augment =  0
769
After augmentation of  160  items with  2  neighbors
GNB
Confusion Matrix : 
 [[84 31]
 [12 32]]
Accuracy :  0.7295597484276729
Sensitivity :  0.7272727272727273
Specificity :  0.7304347826086957
Precision =  0.5079365079365079
Recall =  0.7272727272727273
F1 score =  0.5981308411214953
randmx =  0.5
dist_percent =  0.5
Time taken to augment =  0
769
After augmentation of  160  items with  2  neighbors
GNB
Confusion Matrix : 
 [[84 31]
 [13 31]]
Accuracy :  0.7232704402515723
Sensitivity :  0.7045454545454546
Specificity :  0.7304347826086957
Precision =  0.5
Recall =  0.7045454545454546
F1

Time taken to augment =  0
769
After augmentation of  160  items with  2  neighbors
GNB
Confusion Matrix : 
 [[84 31]
 [13 31]]
Accuracy :  0.7232704402515723
Sensitivity :  0.7045454545454546
Specificity :  0.7304347826086957
Precision =  0.5
Recall =  0.7045454545454546
F1 score =  0.5849056603773585
randmx =  0.5
dist_percent =  0.5
Time taken to augment =  0
769
After augmentation of  160  items with  2  neighbors
GNB
Confusion Matrix : 
 [[85 30]
 [12 32]]
Accuracy :  0.7358490566037735
Sensitivity :  0.7272727272727273
Specificity :  0.7391304347826086
Precision =  0.5161290322580645
Recall =  0.7272727272727273
F1 score =  0.6037735849056604
randmx =  0.5
dist_percent =  0.5
Time taken to augment =  0
769
After augmentation of  160  items with  2  neighbors
GNB
Confusion Matrix : 
 [[85 30]
 [13 31]]
Accuracy :  0.7295597484276729
Sensitivity :  0.7045454545454546
Specificity :  0.7391304347826086
Precision =  0.5081967213114754
Recall =  0.7045454545454546
F1 score =  0.5904761

Time taken to augment =  0
769
After augmentation of  160  items with  2  neighbors
GNB
Confusion Matrix : 
 [[85 30]
 [13 31]]
Accuracy :  0.7295597484276729
Sensitivity :  0.7045454545454546
Specificity :  0.7391304347826086
Precision =  0.5081967213114754
Recall =  0.7045454545454546
F1 score =  0.5904761904761906
randmx =  0.5
dist_percent =  0.5
Time taken to augment =  0
769
After augmentation of  160  items with  2  neighbors
GNB
Confusion Matrix : 
 [[83 32]
 [12 32]]
Accuracy :  0.7232704402515723
Sensitivity :  0.7272727272727273
Specificity :  0.7217391304347827
Precision =  0.5
Recall =  0.7272727272727273
F1 score =  0.5925925925925926
randmx =  0.5
dist_percent =  0.5
Time taken to augment =  0
769
After augmentation of  160  items with  2  neighbors
GNB
Confusion Matrix : 
 [[85 30]
 [13 31]]
Accuracy :  0.7295597484276729
Sensitivity :  0.7045454545454546
Specificity :  0.7391304347826086
Precision =  0.5081967213114754
Recall =  0.7045454545454546
F1 score =  0.5904761

Time taken to augment =  0
769
After augmentation of  160  items with  2  neighbors
GNB
Confusion Matrix : 
 [[85 30]
 [13 31]]
Accuracy :  0.7295597484276729
Sensitivity :  0.7045454545454546
Specificity :  0.7391304347826086
Precision =  0.5081967213114754
Recall =  0.7045454545454546
F1 score =  0.5904761904761906
randmx =  0.5
dist_percent =  0.5
Time taken to augment =  0
769
After augmentation of  160  items with  2  neighbors
GNB
Confusion Matrix : 
 [[85 30]
 [13 31]]
Accuracy :  0.7295597484276729
Sensitivity :  0.7045454545454546
Specificity :  0.7391304347826086
Precision =  0.5081967213114754
Recall =  0.7045454545454546
F1 score =  0.5904761904761906
randmx =  0.5
dist_percent =  0.5
Time taken to augment =  0
769
After augmentation of  160  items with  2  neighbors
GNB
Confusion Matrix : 
 [[84 31]
 [13 31]]
Accuracy :  0.7232704402515723
Sensitivity :  0.7045454545454546
Specificity :  0.7304347826086957
Precision =  0.5
Recall =  0.7045454545454546
F1 score =  0.5849056

KeyboardInterrupt: 

In [None]:
# Display the nested KNNOR results: results[N][k][classifier] -> {"F1", "sensitivity", "specificity"}.
results