## Part I: Running KNNOR

In [None]:
from sklearn.datasets import load_breast_cancer
import numpy as np
import collections
from knnor import data_augment

In [None]:
# Load the breast-cancer dataset returned by load_breast_cancer().
dataset = load_breast_cancer()

# Distinct target labels together with the number of samples carrying each.
unique, counts = np.unique(dataset['target'], return_counts=True)

print('Unique values of the target variable', unique)
print('Counts of the target variable :', counts)


In [None]:
# Feature matrix and label vector taken from the loaded dataset.
X, y = dataset["data"], dataset["target"]

print("Original shape=",X.shape,y.shape)

# Class distribution before any augmentation: label -> number of samples.
elements_count = collections.Counter(y)
print("Original distribution:")
for label, frequency in elements_count.items():
    print(f"{label}: {frequency}")



## Augmentation without any parameters

The algorithm derives its parameters from the data.

The final result has an equal number of minority and majority data points.


In [None]:
# Oversample with KNNOR's default settings; only the first two of the four
# returned values (augmented features and labels) are used here.
knnor = data_augment.KNNOR()
X_new, y_new, _, _ = knnor.fit_resample(X, y)
print("Shape after augmentation",X_new.shape,y_new.shape)

# Class distribution after augmentation: label -> number of samples.
elements_count = collections.Counter(y_new)
print("Final distribution:")
for label, frequency in elements_count.items():
    print(f"{label}: {frequency}")


## Augmentation with user defined parameters


In [None]:
# Re-run the augmentation on the same KNNOR instance with explicit settings.
X_new, y_new, _, _ = knnor.fit_resample(
    X,
    y,
    # number of neighbors used to generate each artificial point
    num_neighbors=10,
    # maximum distance at which the new point will be placed
    max_dist_point=0.01,
    # proportion of the minority population used to generate artificial points
    proportion_minority=0.3,
    # final amount of minority data points; judging by the example below it is
    # presumably expressed relative to the majority count (TODO confirm).
    # Example: with 15 majority and 5 minority points, final_proportion=1
    # adds 10 artificial minority points.
    final_proportion=2,
)
print("Shape after augmentation",X_new.shape,y_new.shape)

# Class distribution after augmentation: label -> number of samples.
elements_count = collections.Counter(y_new)
print("Final distribution:")
for label, frequency in elements_count.items():
    print(f"{label}: {frequency}")


## Part II: Testing with benchmark datasets against state-of-the-art oversamplers

In [None]:
import numpy as np
import sys
import pandas as pd
import matplotlib.pyplot as plt
import itertools
import os.path
from sklearn.calibration import CalibratedClassifierCV
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from smote_variant import MLPClassifierWrapper
from sklearn import svm
import pickle

import imbalanced_databases as imbd
# library containing the imbalanced datasets
# datasets will be present in the "/data" folder
# in case the data is not already there
# the above library will be used to download and save as pickle


import smote_variant as sv
# the library containing the oversampling code;
# it includes the first version of KNNOR as well
# as other state-of-the-art oversamplers

In [None]:
# Directory used for caching during the evaluation: a "results" folder that
# sits next to (i.e. in the parent of) the current working directory.
print("At",os.getcwd())
cache_path= os.path.join(os.path.dirname(os.getcwd()), 'results')
print(cache_path)
# exist_ok=True creates the folder when missing and is a no-op otherwise,
# avoiding the race between a separate existence check and the creation.
os.makedirs(cache_path, exist_ok=True)

In [None]:
# Classifiers used to evaluate every oversampler.

# Support vector classifiers: 2 values of C x 3 (penalty, loss, dual)
# settings = 6 parameter combinations, each wrapped for probability calibration.
linear_svc_settings = [
    ('l1', 'squared_hinge', False),
    ('l2', 'hinge', True),
    ('l2', 'squared_hinge', False),
]
sv_classifiers = [
    CalibratedClassifierCV(LinearSVC(C=c_value, penalty=penalty, loss=loss, dual=dual))
    for c_value, (penalty, loss, dual) in itertools.product([1.0, 10.0], linear_svc_settings)
]

# Multilayer perceptrons: 2 activations x 3 hidden-layer fractions = 6 combinations.
mlp_classifiers = [
    MLPClassifierWrapper(activation=activation, hidden_layer_fraction=fraction)
    for activation, fraction in itertools.product(['relu', 'logistic'], [1.0, 0.5, 0.1])
]

# Nearest-neighbor classifiers: 3 neighbor counts x 2 weightings x 3 values of p
# = 18 combinations.
nn_classifiers = [
    KNeighborsClassifier(n_neighbors=neighbors, weights=weighting, p=power)
    for neighbors, weighting, power in itertools.product([3, 5, 7], ['uniform', 'distance'], [1, 2, 3])
]

# Decision trees: 2 split criteria x 3 depth limits = 6 combinations.
dt_classifiers = [
    DecisionTreeClassifier(criterion=criterion, max_depth=depth)
    for criterion, depth in itertools.product(['gini', 'entropy'], [None, 3, 5])
]

# Full evaluation grid, in the order SVC, MLP, kNN, decision tree.
classifiers = sv_classifiers + mlp_classifiers + nn_classifiers + dt_classifiers



In [None]:
# Oversampling algorithms compared in the evaluation.
# Add or remove entries in this list to change which algorithms are run.
oversamplers = [
    sv.SMOTE,
    sv.NoSMOTE,
    sv.KNNOR_SMOTE,
    sv.polynom_fit_SMOTE,
    sv.ProWSyn,
    sv.SMOTE_IPF,
    sv.Lee,
    sv.SMOBD,
    sv.G_SMOTE,
    sv.CCR,
    sv.LVQ_SMOTE,
    sv.Assembled_SMOTE,
    sv.SMOTE_TomekLinks,
]

In [None]:
# Obtain the list of benchmark dataset loaders: reuse the pickle cache when it
# exists, otherwise build the list from imbalanced_databases and cache it.
datasets_pickle_path = '../data/sampled_datasets_below_1000.p'
if os.path.isfile(datasets_pickle_path):
    # NOTE(review): unpickling can execute arbitrary code; only load a cache
    # file that was produced locally by this notebook.
    # The context manager guarantees the file handle is closed after loading.
    with open(datasets_pickle_path, "rb") as pickle_file:
        sampled_datasets_below_1000 = pickle.load(pickle_file)
else:
    sampled_datasets_below_1000=[
        imbd.load_ecoli_0_1_3_7_vs_2_6,
        imbd.load_pima,
        imbd.load_cm1,
        imbd.load_vowel0,
        imbd.load_glass_0_1_6_vs_2,
        imbd.load_yeast_0_5_6_7_9_vs_4,
        imbd.load_yeast_1_vs_7,
        imbd.load_ecoli_0_3_4_7_vs_5_6,
        imbd.load_cleveland_0_vs_4,
        imbd.load_iris0,
        imbd.load_ecoli_0_6_7_vs_5,
        imbd.load_winequality_white_3_vs_7,
        imbd.load_ecoli_0_1_4_7_vs_2_3_5_6,
        imbd.load_ecoli_0_3_4_vs_5,
        imbd.load_glass0,
        imbd.load_habarman,
        imbd.load_glass_0_1_2_3_vs_4_5_6,
        imbd.load_ecoli_0_6_7_vs_3_5,
        imbd.load_poker_9_vs_7,
    ]
    # Make sure the target folder exists so that writing the cache cannot fail
    # with FileNotFoundError on a fresh checkout.
    os.makedirs(os.path.dirname(datasets_pickle_path), exist_ok=True)
    with open(datasets_pickle_path, "wb") as pickle_file:
        pickle.dump(sampled_datasets_below_1000, pickle_file)

In [None]:
# Number of dataset loaders available in total.
count_files = len(sampled_datasets_below_1000)
print("total files=",count_files)

# Quick-run mode: keep only the first `samples_used` datasets.
# Comment out the lines below when running the full version.
samples_used = 1
sampled_datasets_below_1000 = sampled_datasets_below_1000[:samples_used]
print("files used in this round of experiment",len(sampled_datasets_below_1000))

In [None]:
import time

# Passed to evaluate_oversamplers as max_samp_par_comb.
# Change to a higher value (e.g. 50) for more trial runs.
max_samp_par_comb=1 #50

# perf_counter is monotonic, so the measured duration is unaffected by system
# clock adjustments and is not distorted by truncating both endpoints.
start = time.perf_counter()
results= sv.evaluate_oversamplers(datasets= sampled_datasets_below_1000,
                                    samplers= oversamplers,
                                    classifiers=classifiers,
                                    cache_path= cache_path,
                                    n_jobs= 12,
                                    max_samp_par_comb= max_samp_par_comb)
duration = int(time.perf_counter() - start)
print("Time taken = ",duration," seconds")

# Persist the evaluation table next to the cached intermediate results.
results.to_csv(os.path.join(cache_path,"Results.csv"),index=False)

In [None]:
# Preview the first rows of the evaluation results table.
results.head()

In [None]:
### Calculate time taken on average

In [None]:
# Mean runtime per sampler, sorted ascending (fastest first).
df_speed = results.groupby('sampler')['runtime'].mean().sort_values()
df_speed.head()

In [None]:
# Identifier columns followed by the four score columns that get analysed.
interesting_cols=["db_name","classifier","sampler","auc","f1","p_top20","gacc"]

# Keep only those columns and show KNNOR_SMOTE under the shorter name KNNOR.
df = results[interesting_cols].replace("KNNOR_SMOTE","KNNOR")

# Distinct samplers present in the results.
samplers = df["sampler"].unique()
count_samplers = len(samplers)
print("Number of samplers:",count_samplers,"\n",samplers)

# Distinct classifier names present in the results.
# NOTE: this rebinds `classifiers`, which previously held the classifier
# objects passed to the evaluation, to an array of names.
classifiers = df["classifier"].unique()
count_classifiers = len(classifiers)
print("Number of classifiers:",count_classifiers,"\n",classifiers)

# Distinct dataset names present in the results.
db_names = df["db_name"].unique()
count_db_names = len(db_names)
print("Number of db_names:",count_db_names,"\n",db_names)

# The metrics are the last four columns of the frame.
metrics = list(df.columns[-4:])
count_metrics = len(metrics)
print("Number of metrics:",count_metrics,"\n",metrics)

In [None]:
# For every metric add a "<metric>_rank" column: the row's rank inside its
# (db_name, classifier) group, where rank 1 is the highest score and tied
# scores share the smallest rank (method="min").
for metric in metrics:
    rank_column = metric + "_rank"
    grouped_scores = df.groupby(['db_name', 'classifier'])[metric]
    df[rank_column] = grouped_scores.rank(ascending=False, method="min")

In [None]:
# Write the frame, including the rank columns, to an Excel workbook.
ranks_path = os.path.join(cache_path, "Ranks.xlsx")
df.to_excel(ranks_path, engine='openpyxl', index=False)


In [None]:
# create dictionary
# Target output
'''

{
"knnor_smote":{
    "CalibratedClassifierCV":{
        "auc":{
            1:4,
            2:3,
            3:6,
            ...
        
            }
        "f1":{
            1:4,
            2:3,
            3:6,
            ...
        
            }
            ...
    
        }
        "DecisionTreeClassifier":{
        ...
        }

    }
"SMOTE_IPF":{

    }


}


'''

In [None]:
# Count how often every rank occurred for each sampler / classifier / metric.
# Result layout: great_dict[sampler][classifier][metric][rank] -> frequency,
# where ranks 1..len(samplers) are always present (0 when never observed).
rank_slots = range(1, len(samplers) + 1)
great_dict = {}
for sampler in samplers:
    great_dict[sampler] = {}
    for classifier in classifiers:
        great_dict[sampler][classifier] = {}
        # The row selection depends only on (sampler, classifier), so it is
        # computed once here instead of once per metric and per rank slot.
        newdf = df[(df["sampler"] == sampler) & (df["classifier"] == classifier)]
        for metric in metrics:
            rank_counts = dict.fromkeys(rank_slots, 0)
            # Rows whose metric is missing have a NaN rank; skip them rather
            # than failing on the integer conversion.
            observed_ranks = newdf[metric + "_rank"].dropna().astype(int)
            for rank, frequency in observed_ranks.value_counts().items():
                rank = int(rank)
                # .get tolerates a rank beyond the pre-filled slots.
                rank_counts[rank] = rank_counts.get(rank, 0) + int(frequency)
            great_dict[sampler][classifier][metric] = rank_counts

                
            
            
                

### The following is used to extract the top 5 oversamplers that were consistently in the top 3


In [None]:
# Plot, in a 2x2 grid (one panel per metric), how often each sampler achieved
# each rank, summed over all classifiers.

# Samplers drawn as coloured dashed lines with their own legend entry.
highlighted_colors = {
    "KNNOR": 'red',
    "polynom_fit_SMOTE": 'blue',
    "ProWSyn": 'yellow',
    "SMOTE_IPF": 'green',
    "Lee": 'orange',
}

fig, axs = plt.subplots(2,2,figsize=(10,10))
fig.suptitle('Count of rank for 4 metrics')
img_labels=['a','b','c','d']
ranks = list(range(1, len(samplers) + 1))

# axs.flat walks the grid row by row, so panel_index matches row*2+column.
for panel_index, ax in enumerate(axs.flat):
    metric = metrics[panel_index]
    for sampler in samplers:
        # Frequency of each rank for this sampler, summed over classifiers.
        rank_counter = [
            sum(great_dict[sampler][classifier][metric][rank] for classifier in classifiers)
            for rank in ranks
        ]
        if sampler in highlighted_colors:
            ax.plot(ranks, rank_counter, color=highlighted_colors[sampler],
                    linewidth=1.2, linestyle='--', label=sampler)
        elif sampler == "SMOTE":
            # A single thin black line carries the shared "Others" legend entry.
            ax.plot(ranks, rank_counter, color='black', linewidth=0.8, label="Others")
        else:
            ax.plot(ranks, rank_counter, color='black', linewidth=0.8)
    # Panel decoration does not depend on the sampler, so it is set once.
    ax.set_xlabel("Ranks")
    ax.set_ylabel("Frequency")
    ax.set_title(img_labels[panel_index]+'. Ranking Frequency for metric '+str(metric))
    ax.legend()

fig.tight_layout(pad=1.0)
plt.savefig(os.path.join("../results","Running_All.svg"))
plt.savefig(os.path.join("../results","Running_All.jpg"))

plt.show()
    

### Calculate average ranking

In [None]:
# Recompute the distinct samplers, classifiers, datasets and metric columns
# from the current state of `df`.
samplers = df["sampler"].unique()
count_samplers = len(samplers)
print("Number of samplers:",count_samplers,"\n",samplers)

classifiers = df["classifier"].unique()
count_classifiers = len(classifiers)
print("Number of classifiers:",count_classifiers,"\n",classifiers)

db_names = df["db_name"].unique()
count_db_names = len(db_names)
print("Number of db_names:",count_db_names,"\n",db_names)

# NOTE(review): this takes whatever the last four columns of `df` are. If the
# "<metric>_rank" columns have already been appended to `df`, these are the
# rank columns rather than the raw scores - confirm this is intended.
metrics = list(df.columns[-4:])
count_metrics = len(metrics)
print("Number of metrics:",count_metrics,"\n",metrics)

In [None]:
'''
samplers
classifiers
db_names
metrics
'''

In [None]:
# For every metric and classifier, build a list of [sampler, mean value] pairs
# sorted by the mean in descending order:
# my_dict[metric][classifier] -> [[sampler, mean], ...]
my_dict = {}
for metric in metrics:
    print(metric)
    my_dict[metric] = {}

    for classifier in classifiers:
        list_vals = []
        print(classifier)
        for sampler in samplers:
            # Rows of this sampler/classifier pair across all datasets.
            newdf = df[(df["sampler"] == sampler) & (df["classifier"] == classifier)]
            list_vals.append([sampler, round(newdf[metric].mean(), 4)])
        # Largest mean first; the sort is stable, so ties keep sampler order.
        list_vals.sort(key=lambda pair: pair[1], reverse=True)
        print(list_vals)
        my_dict[metric][classifier] = list_vals
        print()

In [None]:
### Below dict has been used to keep top 10 results table
#### The big table with 4 sub tables etc etc

In [None]:
# Print, per metric and classifier, the ordered "position,sampler,value" lines
# followed by the two baseline rows (SMOTE and NoSMOTE).
for metric, per_classifier in my_dict.items():
    print(metric)
    for classifier, ranking in per_classifier.items():
        print(classifier)
        for position, (sampler, val) in enumerate(ranking, start=1):
            print(str(position)+","+str(sampler)+","+str(val))
        # -1 marks a baseline that is absent from the ranking.
        baselines = {"SMOTE": -1, "NoSMOTE": -1}
        for sampler, val in ranking:
            if sampler in baselines:
                baselines[sampler] = val
        print("BL,SMOTE,",baselines["SMOTE"])
        print("BL,NoSMOTE,",baselines["NoSMOTE"])
        print("********************")
    print("*******________***********")

    

### Below is for top performer table

In [None]:
# Column-oriented table: one "sampler" column plus one "avg_<metric>" column
# per metric, filled with the per-sampler mean over all rows of that sampler.
data = {"sampler": [], **{"avg_"+str(metric): [] for metric in metrics}}

for sampler in samplers:
    print(sampler)
    data["sampler"].append(sampler)
    newdf = df[df["sampler"] == sampler]
    for metric in metrics:
        print(metric)
        avg = round(newdf[metric].mean(), 4)
        print(avg)
        data["avg_"+str(metric)].append(avg)
    

In [None]:
# Turn the per-sampler averages into a DataFrame (one row per sampler).
avg_df = pd.DataFrame(data)

In [None]:
# NOTE(review): the head() result is discarded here because it is not the last
# expression of the cell; move it to the end if a preview is wanted.
avg_df.head()

# Save the per-sampler averages as an Excel workbook in the results folder.
avg_ranks_path = os.path.join(cache_path, 'AvgRanks.xlsx')
avg_df.to_excel(avg_ranks_path, engine='openpyxl', index=False)