In [3]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
import seaborn as sns
from Rashtriya_Raksha_University_Gaussian_NB import rru_gaussian_nb
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,precision_score,recall_score

In [4]:
class rru_gaussian_nb_scania(rru_gaussian_nb):
    
    def __init__(xerox_copy,data,non_missing_threshold,split_ratio,apply_pca_or_not,n_principal_components):
        data.replace(to_replace='na',value=np.nan,inplace=True)
        data.dropna(axis=1,inplace=True,thresh=int(non_missing_threshold*data.shape[0]))
        data_labels = data['class']
        imputer = SimpleImputer()
        data_array = imputer.fit_transform(X=data.iloc[:,1:])
        data_columns = data.columns
        data = pd.DataFrame(data=data_array,columns=data_columns[1:])
        np_array_list = list()
        
        for column in data.columns:
            data[column] = pd.qcut(x=data[column],q=10,duplicates='drop').cat.codes
            np_array_list.append(np.eye(10,10)[data[column]])
            
        data_array = np.concatenate(np_array_list,axis=1)
        data = pd.DataFrame(data=data_array)
        data['class'] = data_labels
        xerox_copy.data = data
        
        super().__init__(features=data.iloc[:,0:data.shape[1]-1],labels=data['class'],data_split_ratio=split_ratio,
                         apply_pca=apply_pca_or_not,n_components=n_principal_components)

In [5]:
data = pd.read_csv("./aps_failure_training_set.csv",header=None,skiprows=20)

  data = pd.read_csv("./aps_failure_training_set.csv",header=None,skiprows=20)


In [6]:
column_names = data.iloc[0]

data = pd.read_csv("./aps_failure_training_set.csv",header=None,skiprows=21,names=column_names)

In [None]:
logistic_regression_configs = dict()

for non_na_thresh in np.arange(0.7,1,0.1):
    for n_comp in np.arange(20,170,50):
        
        logistic_regression_configs[(non_na_thresh,n_comp)] = rru_gaussian_nb_scania(data,non_na_thresh,
                                                                                     (0.7,0.2,0.0),
                                                                                     True,n_comp)

In [None]:
logistic_regression_configs

In [None]:
logistic_regression = dict()
cv_data_list = list()

for configs,obj in logistic_regression_configs.items():
    for reg_strength in [0.001,0.01,0.1,1,10,100,1000]:
    
        X_resampled,y_resampled = SMOTE(sampling_strategy='minority').fit_resample(X=obj.X_new,y=data['class'])
        data_resampled = pd.DataFrame(data=X_resampled)
        data_resampled['class'] = data['class']
        train_data,cv_data,test_data = obj.data_splitting(data_resampled)
        cv_data_list.append(cv_data)
        k = tuple(list(configs)+[reg_strength])
        logistic_regression[k] = LogisticRegression(C=reg_strength,verbose=1,n_jobs=-1).fit(X=np.array(train_data.iloc[:,0:train_data.shape[1]-1]),
                                                                        y=train_data['label'])

In [None]:
logistic_regression

In [None]:
metrics = dict()

for obj,cv_data,config in tuple(zip(logistic_regression.values(),cv_data_list,logistic_regression.keys())):
    
    predicted_category = obj.predict(X=np.array(cv_data.iloc[:,0:cv_data.shape[1]-1]))
    acc = accuracy_score(y_true=np.array(cv_data['label']),y_pred=predicted_category)
    precision = precision_score(y_true=np.array(cv_data['label']),y_pred=predicted_category,
                                pos_label='pos',zero_division=1)
    recall = recall_score(y_true=np.array(cv_data['label']),y_pred=predicted_category,
                                pos_label='pos',zero_division=1)
    metrics[config] = {'accuracy':acc,'precision':precision,'recall':recall}

In [None]:
metrics

#For test data after receiving the best configuration

#Testing test data from train data only.

In [7]:
logistic_regression_configs_best = dict()
non_na_thresh=0.7
n_comp=20
logistic_regression_configs_best[(non_na_thresh,n_comp)] = rru_gaussian_nb_scania(data,non_na_thresh,
                                                                                     (0.5,0.2,0.0),
                                                                                     True,n_comp)

In [47]:
logistic_regression_best = dict()
test_data_list = list()
reg_strength=0.001
for configs,obj in logistic_regression_configs_best.items():
        X_resampled,y_resampled = SMOTE(sampling_strategy='minority').fit_resample(X=obj.X_new,y=data['class'])
        data_resampled = pd.DataFrame(data=X_resampled)
        data_resampled['class'] = data['class']
        train_data,cv_data,test_data = obj.data_splitting(data_resampled)
        test_data_list.append(test_data)
        k = tuple(list(configs)+[reg_strength])
        logistic_regression_best[k] = LogisticRegression(C=reg_strength,verbose=1,n_jobs=-1).fit(X=np.array(train_data.iloc[:,0:train_data.shape[1]-1]),
                                                                        y=train_data['label'])


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    2.6s finished


In [52]:
metrics_test = dict()

for obj,cv_data,config in tuple(zip(logistic_regression_best.values(),test_data_list,logistic_regression_best.keys())):
    
    predicted_category = obj.predict(X=np.array(test_data.iloc[:,0:test_data.shape[1]-1]))
    acc = accuracy_score(y_true=np.array(test_data['label']),y_pred=predicted_category)
    precision = precision_score(y_true=np.array(test_data['label']),y_pred=predicted_category,
                                pos_label='pos',zero_division=1)
    recall = recall_score(y_true=np.array(test_data['label']),y_pred=predicted_category,
                                pos_label='pos',zero_division=1)
    metrics_test[config] = {'accuracy':acc,'precision':precision,'recall':recall}

In [53]:
metrics_test

{(0.7, 20, 0.001): {'accuracy': 0.9966101694915255,
  'precision': 0.0,
  'recall': 1.0}}