In [1]:
import pandas as pd
import numpy as np
import scipy.stats as s
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
import seaborn as sns
from Rashtriya_Raksha_University_Gaussian_NB import rru_gaussian_nb
from imblearn.over_sampling import SMOTE
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score,precision_score,recall_score

In [2]:
class rru_gaussian_nb_scania(rru_gaussian_nb):
    
    def __init__(xerox_copy,data,non_missing_threshold,split_ratio,apply_pca_or_not,n_principal_components):
        data.replace(to_replace='na',value=np.nan,inplace=True)
        data.dropna(axis=1,inplace=True,thresh=int(non_missing_threshold*data.shape[0]))
        data_labels = data['class']
        imputer = SimpleImputer()
        data_array = imputer.fit_transform(X=data.iloc[:,1:])
        data_columns = data.columns
        data = pd.DataFrame(data=data_array,columns=data_columns[1:])
        np_array_list = list()
        
        for column in data.columns:
            data[column] = pd.qcut(x=data[column],q=10,duplicates='drop').cat.codes
            np_array_list.append(np.eye(10,10)[data[column]])
            
        data_array = np.concatenate(np_array_list,axis=1)
        data = pd.DataFrame(data=data_array)
        data['class'] = data_labels
        xerox_copy.data = data
        
        super().__init__(features=data.iloc[:,0:data.shape[1]-1],labels=data['class'],data_split_ratio=split_ratio,
                         apply_pca=apply_pca_or_not,n_components=n_principal_components)

In [3]:
data = pd.read_csv("aps_failure_training_set.csv",header=None,skiprows=20)

  data = pd.read_csv("aps_failure_training_set.csv",header=None,skiprows=20)


In [4]:
column_names = data.iloc[0]

data = pd.read_csv("aps_failure_training_set.csv",header=None,skiprows=21,names=column_names)
data

Unnamed: 0,class,aa_000,ab_000,ac_000,ad_000,ae_000,af_000,ag_000,ag_001,ag_002,...,ee_002,ee_003,ee_004,ee_005,ee_006,ee_007,ee_008,ee_009,ef_000,eg_000
0,neg,76698,na,2130706438,280,0,0,0,0,0,...,1240520,493384,721044,469792,339156,157956,73224,0,0,0
1,neg,33058,na,0,na,0,0,0,0,0,...,421400,178064,293306,245416,133654,81140,97576,1500,0,0
2,neg,41040,na,228,100,0,0,0,0,0,...,277378,159812,423992,409564,320746,158022,95128,514,0,0
3,neg,12,0,70,66,0,10,0,0,0,...,240,46,58,44,10,0,0,0,4,32
4,neg,60874,na,1368,458,0,0,0,0,0,...,622012,229790,405298,347188,286954,311560,433954,1218,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59995,neg,153002,na,664,186,0,0,0,0,0,...,998500,566884,1290398,1218244,1019768,717762,898642,28588,0,0
59996,neg,2286,na,2130706538,224,0,0,0,0,0,...,10578,6760,21126,68424,136,0,0,0,0,0
59997,neg,112,0,2130706432,18,0,0,0,0,0,...,792,386,452,144,146,2622,0,0,0,0
59998,neg,80292,na,2130706432,494,0,0,0,0,0,...,699352,222654,347378,225724,194440,165070,802280,388422,0,0


In [5]:
naive_bayes_configs = dict()

for non_na_thresh in np.arange(0.7,1,0.1):
    for n_comp in np.arange(20,170,50):
        
        naive_bayes_configs[(non_na_thresh,n_comp)] = rru_gaussian_nb_scania(data,non_na_thresh,(0.8,0.2,0.0),True,n_comp)

In [6]:
naive_bayes_configs

{(0.7, 20): <__main__.rru_gaussian_nb_scania at 0x21aa05a4550>,
 (0.7, 70): <__main__.rru_gaussian_nb_scania at 0x21aa366be10>,
 (0.7, 120): <__main__.rru_gaussian_nb_scania at 0x21ac4b4ec10>,
 (0.7999999999999999, 20): <__main__.rru_gaussian_nb_scania at 0x21ad00d2090>,
 (0.7999999999999999, 70): <__main__.rru_gaussian_nb_scania at 0x21aadbed150>,
 (0.7999999999999999, 120): <__main__.rru_gaussian_nb_scania at 0x21ac70d5190>,
 (0.8999999999999999, 20): <__main__.rru_gaussian_nb_scania at 0x21ac7f089d0>,
 (0.8999999999999999, 70): <__main__.rru_gaussian_nb_scania at 0x21ac2a71d50>,
 (0.8999999999999999, 120): <__main__.rru_gaussian_nb_scania at 0x21ad005c3d0>,
 (0.9999999999999999, 20): <__main__.rru_gaussian_nb_scania at 0x21acf2ebad0>,
 (0.9999999999999999, 70): <__main__.rru_gaussian_nb_scania at 0x21ad0190390>,
 (0.9999999999999999, 120): <__main__.rru_gaussian_nb_scania at 0x21ad0193490>}

In [7]:
naive_bayes = list()
cv_data_list = list()

for obj in naive_bayes_configs.values():
    X_resampled,y_resampled = SMOTE(sampling_strategy='minority').fit_resample(X=obj.X_new,y=data['class'])
    data_resampled = pd.DataFrame(data=X_resampled)
    data_resampled['class'] = data['class']
    train_data,cv_data,test_data = obj.data_splitting(data_resampled)
    cv_data_list.append(cv_data)
    naive_bayes.append(GaussianNB().fit(X=np.array(train_data.iloc[:,0:train_data.shape[1]-1]),y=train_data['label']))

In [8]:
metrics = dict()
for obj,cv_data,config in tuple(zip(naive_bayes,cv_data_list,naive_bayes_configs.keys())):
    predicted_category = obj.predict(X=np.array(cv_data.iloc[:,0:cv_data.shape[1]-1]))
    acc = accuracy_score(y_true=np.array(cv_data['label']),y_pred=predicted_category)
    precision = precision_score(y_true=np.array(cv_data['label']),y_pred=predicted_category,
                                pos_label='pos',zero_division=1)
    recall = recall_score(y_true=np.array(cv_data['label']),y_pred=predicted_category,
                                pos_label='pos',zero_division=1)
    metrics[config] = {'accuracy':acc,'precision':precision,'recall':recall}


In [9]:
metrics

{(0.7, 20): {'accuracy': 0.9361016949152542, 'precision': 0.0, 'recall': 1.0},
 (0.7, 70): {'accuracy': 0.9513559322033899, 'precision': 0.0, 'recall': 1.0},
 (0.7, 120): {'accuracy': 0.9513559322033899, 'precision': 0.0, 'recall': 1.0},
 (0.7999999999999999, 20): {'accuracy': 0.9408474576271186,
  'precision': 0.0,
  'recall': 1.0},
 (0.7999999999999999, 70): {'accuracy': 0.9533898305084746,
  'precision': 0.0,
  'recall': 1.0},
 (0.7999999999999999, 120): {'accuracy': 0.956864406779661,
  'precision': 0.0,
  'recall': 1.0},
 (0.8999999999999999, 20): {'accuracy': 0.9374576271186441,
  'precision': 0.0,
  'recall': 1.0},
 (0.8999999999999999, 70): {'accuracy': 0.9509322033898305,
  'precision': 0.0,
  'recall': 1.0},
 (0.8999999999999999, 120): {'accuracy': 0.9536440677966102,
  'precision': 0.0,
  'recall': 1.0},
 (0.9999999999999999, 20): {'accuracy': 0.9141525423728813,
  'precision': 0.0,
  'recall': 1.0},
 (0.9999999999999999, 70): {'accuracy': 0.9141525423728813,
  'precision': 

In [10]:
df = pd.DataFrame(metrics)
row_name = 'accuracy'
max_value = df.loc[row_name].max()
df_transposed = df.transpose()
max_accuracy_column = df_transposed['accuracy'].idxmax()
print("Column with the maximum value of accuracy:",max_value, max_accuracy_column)

Column with the maximum value of accuracy: 0.956864406779661 (0.7999999999999999, 120)
