# The assignment here involves classifying whether the Air Pressure System (APS) Failure will happen for a Scania Truck or not. Based on the 170 different vitals on different trucks, we have to train a model to predict the failure of APS in any random scania truck. 

# This dataset is a perfect example of a kind of datasets which we encounter is real life situations or in industry and pose as a big challenge to solve because there are numerous problems with this dataset and is not ideal from many aspects such as: 

# $\bullet\quad$The dataset is highly imbalanced which messes up with the model training. 

# $\bullet\quad$There are lot missing values in this dataset in almost every column which makes it hard to refine the dataset. 

# $\bullet\quad$The dimensionality of the dataset is very much, that is the number of columns are 170 which is a rarely occuring phenomena and hence itself poses a challenge as to how to deal with such rare situations. 

# $\bullet\quad$The variation in values in several columns is very much. 

# $\bullet\quad$There are several columns which don't look like categorical because the magnitude of the values in those columns is huge but at the same time, these values are integer values signifying that we have to treat them as categorcial which itself poses a challenge as to how to process these values. 

In [1]:
import pandas as pd
import numpy as np
import scipy.stats as s
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
import seaborn as sns
from Rashtriya_Raksha_University_Gaussian_NB import rru_gaussian_nb
from imblearn.over_sampling import SMOTE
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score,precision_score,recall_score

In [2]:
class rru_gaussian_nb_scania(rru_gaussian_nb):
    
    def __init__(xerox_copy,data,non_missing_threshold,split_ratio,apply_pca_or_not,n_principal_components):
        data.replace(to_replace='na',value=np.nan,inplace=True)
        data.dropna(axis=1,inplace=True,thresh=int(non_missing_threshold*data.shape[0]))
        data_labels = data['class']
        imputer = SimpleImputer()
        data_array = imputer.fit_transform(X=data.iloc[:,1:])
        data_columns = data.columns
        data = pd.DataFrame(data=data_array,columns=data_columns[1:])
        np_array_list = list()
        
        for column in data.columns:
            data[column] = pd.qcut(x=data[column],q=10,duplicates='drop').cat.codes
            np_array_list.append(np.eye(10,10)[data[column]])
            
        data_array = np.concatenate(np_array_list,axis=1)
        data = pd.DataFrame(data=data_array)
        data['class'] = data_labels
        xerox_copy.data = data
        
        super().__init__(features=data.iloc[:,0:data.shape[1]-1],labels=data['class'],data_split_ratio=split_ratio,
                         apply_pca=apply_pca_or_not,n_components=n_principal_components)

In [3]:
data = pd.read_csv("./aps_failure_training_set.csv",header=None,skiprows=20)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [4]:
column_names = data.iloc[0]

data = pd.read_csv("./aps_failure_training_set.csv",header=None,skiprows=21,names=column_names)

In [5]:
naive_bayes_configs = dict()

for non_na_thresh in np.arange(0.7,1,0.1):
    for n_comp in np.arange(20,170,50):
        
        naive_bayes_configs[(non_na_thresh,n_comp)] = rru_gaussian_nb_scania(data,non_na_thresh,
                                                                             (0.8,0.2,0.0),True,n_comp)

In [6]:
naive_bayes_configs

{(0.7, 20): <__main__.rru_gaussian_nb_scania at 0x179204ad0d0>,
 (0.7, 70): <__main__.rru_gaussian_nb_scania at 0x1791ad10790>,
 (0.7, 120): <__main__.rru_gaussian_nb_scania at 0x179435a7fd0>,
 (0.7999999999999999, 20): <__main__.rru_gaussian_nb_scania at 0x1791ad04490>,
 (0.7999999999999999, 70): <__main__.rru_gaussian_nb_scania at 0x179204adb80>,
 (0.7999999999999999, 120): <__main__.rru_gaussian_nb_scania at 0x1794a1935b0>,
 (0.8999999999999999, 20): <__main__.rru_gaussian_nb_scania at 0x1794a193100>,
 (0.8999999999999999, 70): <__main__.rru_gaussian_nb_scania at 0x1794a1931f0>,
 (0.8999999999999999, 120): <__main__.rru_gaussian_nb_scania at 0x1794a193e20>,
 (0.9999999999999999, 20): <__main__.rru_gaussian_nb_scania at 0x1794a193ac0>,
 (0.9999999999999999, 70): <__main__.rru_gaussian_nb_scania at 0x1794946e3a0>,
 (0.9999999999999999, 120): <__main__.rru_gaussian_nb_scania at 0x1794946e670>}

In [7]:
naive_bayes = list()
cv_data_list = list()

for obj in naive_bayes_configs.values():
    X_resampled,y_resampled = SMOTE(sampling_strategy='minority').fit_sample(X=obj.X_new,y=data['class'])
    data_resampled = pd.DataFrame(data=X_resampled)
    data_resampled['class'] = data['class']
    train_data,cv_data,test_data = obj.data_splitting(data_resampled)
    cv_data_list.append(cv_data)
    naive_bayes.append(GaussianNB().fit(X=np.array(train_data.iloc[:,0:train_data.shape[1]-1]),y=train_data['label']))

In [9]:
metrics = dict()

for obj,cv_data,config in tuple(zip(naive_bayes,cv_data_list,naive_bayes_configs.keys())):
    predicted_category = obj.predict(X=np.array(cv_data.iloc[:,0:cv_data.shape[1]-1]))
    acc = accuracy_score(y_true=np.array(cv_data['label']),y_pred=predicted_category)
    precision = precision_score(y_true=np.array(cv_data['label']),y_pred=predicted_category,
                                pos_label='pos',zero_division=1)
    recall = recall_score(y_true=np.array(cv_data['label']),y_pred=predicted_category,
                                pos_label='pos',zero_division=1)
    metrics[config] = {'accuracy':acc,'precision':precision,'recall':recall}

In [10]:
metrics

{(0.7, 20): {'accuracy': 0.9361016949152542, 'precision': 0.0, 'recall': 1.0},
 (0.7, 70): {'accuracy': 0.9513559322033899, 'precision': 0.0, 'recall': 1.0},
 (0.7, 120): {'accuracy': 0.9513559322033899, 'precision': 0.0, 'recall': 1.0},
 (0.7999999999999999, 20): {'accuracy': 0.9408474576271186,
  'precision': 0.0,
  'recall': 1.0},
 (0.7999999999999999, 70): {'accuracy': 0.9533898305084746,
  'precision': 0.0,
  'recall': 1.0},
 (0.7999999999999999, 120): {'accuracy': 0.956864406779661,
  'precision': 0.0,
  'recall': 1.0},
 (0.8999999999999999, 20): {'accuracy': 0.9374576271186441,
  'precision': 0.0,
  'recall': 1.0},
 (0.8999999999999999, 70): {'accuracy': 0.9509322033898305,
  'precision': 0.0,
  'recall': 1.0},
 (0.8999999999999999, 120): {'accuracy': 0.9536440677966102,
  'precision': 0.0,
  'recall': 1.0},
 (0.9999999999999999, 20): {'accuracy': 0.9141525423728813,
  'precision': 0.0,
  'recall': 1.0},
 (0.9999999999999999, 70): {'accuracy': 0.9141525423728813,
  'precision': 

# Using the metrics above, you can decide which configuration (which naive bayes classifier configuration in naive_bayes list)is working best for the case of Cross Validation Data.

# Use that trained configuration of naive bayes classifier in naive_bayes list to perform the prediction on testing data

# This is going to be your assignment. 