In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
import seaborn as sns
from Gaussian_NB_PCA_Transform import gaussian_nb
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,precision_score,recall_score

In [2]:
class gaussian_nb_scania(gaussian_nb):
    
    def __init__(self,data,non_missing_threshold,split_ratio,apply_pca_or_not,n_principal_components,apply_transform_or_not,n):
        data.replace(to_replace='na',value=np.nan,inplace=True)
        data.dropna(axis=1,inplace=True,thresh=int(non_missing_threshold*data.shape[0]))
        data_labels = data['class']
        imputer = SimpleImputer()
        data_array = imputer.fit_transform(X=data.iloc[:,1:])
        data_columns = data.columns
        data = pd.DataFrame(data=data_array,columns=data_columns[1:])
        np_array_list = list()
        
        for column in data.columns:
            data[column] = pd.qcut(x=data[column],q=10,duplicates='drop').cat.codes
            np_array_list.append(np.eye(10,10)[data[column]])
            
        data_array = np.concatenate(np_array_list,axis=1)
        data = pd.DataFrame(data=data_array)
        data['class'] = data_labels
        self.data = data
        
        super().__init__(features=data.iloc[:,0:data.shape[1]-1],labels=data['class'],data_split_ratio=split_ratio,
                         apply_pca=apply_pca_or_not,n_components=n_principal_components,apply_transform=apply_transform_or_not,division_number=n)

In [3]:
test_data = pd.read_csv("./aps_failure_test_set.csv",header=None,skiprows=20)
copy_data = pd.read_csv("./aps_failure_training_set.csv",header=None,skiprows=20)

  test_data = pd.read_csv("./aps_failure_test_set.csv",header=None,skiprows=20)
  copy_data = pd.read_csv("./aps_failure_training_set.csv",header=None,skiprows=20)


In [4]:
column_names_test=test_data.iloc[0]

test_data = pd.read_csv("./aps_failure_test_set.csv",header=None,skiprows=21,names=column_names_test)
print("test_data",test_data.shape)
column_names_copy = copy_data.iloc[0]

copy_data = pd.read_csv("./aps_failure_training_set.csv",header=None,skiprows=21,names=column_names_copy)
print("data",copy_data.shape)
main_data=pd.concat([copy_data, test_data], axis=0, ignore_index=True) 
main_data.shape

test_data (16000, 171)
data (60000, 171)


(76000, 171)

In [5]:
logistic_regression_configs = dict()

for non_na_thresh in np.arange(0.7,1,0.1):
    for n_comp in np.arange(20,170,50):
        
        logistic_regression_configs[(non_na_thresh,n_comp)] = gaussian_nb_scania(main_data,non_na_thresh,
                                                                                     (0.5,0.2,0.0),
                                                                                     True,n_comp,True,60000)

In [6]:
logistic_regression_configs

{(0.7, 20): <__main__.gaussian_nb_scania at 0x25f789cec50>,
 (0.7, 70): <__main__.gaussian_nb_scania at 0x25f5c13c190>,
 (0.7, 120): <__main__.gaussian_nb_scania at 0x25f1916b5d0>,
 (0.7999999999999999, 20): <__main__.gaussian_nb_scania at 0x25f15796a90>,
 (0.7999999999999999, 70): <__main__.gaussian_nb_scania at 0x25f0bb79390>,
 (0.7999999999999999, 120): <__main__.gaussian_nb_scania at 0x25f0cbea490>,
 (0.8999999999999999, 20): <__main__.gaussian_nb_scania at 0x25f0afb2990>,
 (0.8999999999999999, 70): <__main__.gaussian_nb_scania at 0x25f19b9b090>,
 (0.8999999999999999, 120): <__main__.gaussian_nb_scania at 0x25f1767ea10>,
 (0.9999999999999999, 20): <__main__.gaussian_nb_scania at 0x25f14eb3d10>,
 (0.9999999999999999, 70): <__main__.gaussian_nb_scania at 0x25f16bf7290>,
 (0.9999999999999999, 120): <__main__.gaussian_nb_scania at 0x25f14578110>}

In [7]:
logistic_regression = dict()
cv_data_dict = {}
test_data_dict = {}
#from sklearn.preprocessing import LabelEncoder
for configs,obj in logistic_regression_configs.items():
    for reg_strength in [0.001,0.01,0.1,1,10,100,1000]:
    
        X_resampled,y_resampled = SMOTE(sampling_strategy='minority').fit_resample(X=obj.X_new,y=copy_data['class'])
        if len(X_resampled) == 0:
           print("No resampled data is available.")
        else:
    # Convert the resampled data to a DataFrame
    

         data_resampled = pd.DataFrame(data=X_resampled)
        #label_encoder = LabelEncoder()
        #data_resampled['label'] = label_encoder.fit_transform(y_resampled)
         data_resampled['label'] =copy_data['class']
         train_data,cv_data,test_data = obj.data_splitting(data_resampled)
         cv_data_dict[(configs[0],configs[1],reg_strength)]=cv_data
         test_data_dict[(configs[0],configs[1],reg_strength)]=test_data
         k = tuple(list(configs)+[reg_strength])
         logistic_regression[k] = LogisticRegression(C=reg_strength,verbose=1,n_jobs=-1).fit(X=np.array(train_data.iloc[:,0:train_data.shape[1]-1]),
                                                                        y=train_data['label'])

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    4.5s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    1.9s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    1.7s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    1.8s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.2s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.2s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1

In [8]:
logistic_regression

{(0.7, 20, 0.001): LogisticRegression(C=0.001, n_jobs=-1, verbose=1),
 (0.7, 20, 0.01): LogisticRegression(C=0.01, n_jobs=-1, verbose=1),
 (0.7, 20, 0.1): LogisticRegression(C=0.1, n_jobs=-1, verbose=1),
 (0.7, 20, 1): LogisticRegression(C=1, n_jobs=-1, verbose=1),
 (0.7, 20, 10): LogisticRegression(C=10, n_jobs=-1, verbose=1),
 (0.7, 20, 100): LogisticRegression(C=100, n_jobs=-1, verbose=1),
 (0.7, 20, 1000): LogisticRegression(C=1000, n_jobs=-1, verbose=1),
 (0.7, 70, 0.001): LogisticRegression(C=0.001, n_jobs=-1, verbose=1),
 (0.7, 70, 0.01): LogisticRegression(C=0.01, n_jobs=-1, verbose=1),
 (0.7, 70, 0.1): LogisticRegression(C=0.1, n_jobs=-1, verbose=1),
 (0.7, 70, 1): LogisticRegression(C=1, n_jobs=-1, verbose=1),
 (0.7, 70, 10): LogisticRegression(C=10, n_jobs=-1, verbose=1),
 (0.7, 70, 100): LogisticRegression(C=100, n_jobs=-1, verbose=1),
 (0.7, 70, 1000): LogisticRegression(C=1000, n_jobs=-1, verbose=1),
 (0.7, 120, 0.001): LogisticRegression(C=0.001, n_jobs=-1, verbose=1),
 

In [9]:
Metrics={}
def result(data,model):
  metrics = dict()
  predicted_category = model.predict(X=np.array(data.iloc[:,0:data.shape[1]-1]))
  acc = accuracy_score(y_true=np.array(data['label']),y_pred=predicted_category)
  precision = precision_score(y_true=np.array(data['label']),y_pred=predicted_category,
                                pos_label='pos',zero_division=1)
  recall = recall_score(y_true=np.array(data['label']),y_pred=predicted_category,
                                pos_label='pos',zero_division=1)
  metrics[config] = {'accuracy':acc,'precision':precision,'recall':recall}
  Metrics.update(metrics)
  print(metrics)

In [10]:
for config,obj in logistic_regression.items():
    cv_data=cv_data_dict[config]
    result(cv_data,obj)


{(0.7, 20, 0.001): {'accuracy': 0.9961864406779661, 'precision': 0.0, 'recall': 1.0}}
{(0.7, 20, 0.01): {'accuracy': 0.9930932203389831, 'precision': 0.0, 'recall': 1.0}}
{(0.7, 20, 0.1): {'accuracy': 0.9923305084745763, 'precision': 0.0, 'recall': 1.0}}
{(0.7, 20, 1): {'accuracy': 0.9921610169491526, 'precision': 0.0, 'recall': 1.0}}
{(0.7, 20, 10): {'accuracy': 0.9921610169491526, 'precision': 0.0, 'recall': 1.0}}
{(0.7, 20, 100): {'accuracy': 0.9921610169491526, 'precision': 0.0, 'recall': 1.0}}
{(0.7, 20, 1000): {'accuracy': 0.9921610169491526, 'precision': 0.0, 'recall': 1.0}}
{(0.7, 70, 0.001): {'accuracy': 0.9960169491525424, 'precision': 0.0, 'recall': 1.0}}
{(0.7, 70, 0.01): {'accuracy': 0.9929237288135593, 'precision': 0.0, 'recall': 1.0}}
{(0.7, 70, 0.1): {'accuracy': 0.9922033898305085, 'precision': 0.0, 'recall': 1.0}}
{(0.7, 70, 1): {'accuracy': 0.9916525423728814, 'precision': 0.0, 'recall': 1.0}}
{(0.7, 70, 10): {'accuracy': 0.9916101694915255, 'precision': 0.0, 'recall

# Using the metrics above, you can decide which configuration (which logistic regression classifier configuration in logistic_regression list)is working best for the case of Cross Validation Data.

# Use that trained configuration of logistic regression classifier in logistic_regression list to perform the prediction on testing data. 

# This is going to be your assignment. 

Selecting Best Configuration and running it for test_data from train_data

In [11]:
results_df = pd.DataFrame.from_dict(Metrics, orient='index').reset_index()
results_df
max_recall = results_df[results_df['recall'] == max(results_df['recall'])]
print("Recall",max_recall)
max_accuracy = results_df[results_df['accuracy'] == max(results_df['accuracy'])]
print("Accuracy",max_accuracy)
max_precision = results_df[results_df['precision'] == max(results_df['precision'])]
print("Precision",max_precision)

Recall     level_0  level_1   level_2  accuracy  precision  recall
0       0.7       20     0.001  0.996186        0.0     1.0
1       0.7       20     0.010  0.993093        0.0     1.0
2       0.7       20     0.100  0.992331        0.0     1.0
3       0.7       20     1.000  0.992161        0.0     1.0
4       0.7       20    10.000  0.992161        0.0     1.0
..      ...      ...       ...       ...        ...     ...
79      1.0      120     0.100  1.000000        1.0     1.0
80      1.0      120     1.000  1.000000        1.0     1.0
81      1.0      120    10.000  1.000000        1.0     1.0
82      1.0      120   100.000  1.000000        1.0     1.0
83      1.0      120  1000.000  1.000000        1.0     1.0

[84 rows x 6 columns]
Accuracy     level_0  level_1   level_2  accuracy  precision  recall
63      1.0       20     0.001       1.0        1.0     1.0
64      1.0       20     0.010       1.0        1.0     1.0
65      1.0       20     0.100       1.0        1.0     1.0
6

In [12]:
best_config=(0.9999999999999999, 120, 1000)
best_model = logistic_regression[best_config]
best_test_data=test_data_dict[best_config]
result(best_test_data,best_model)

{(0.9999999999999999, 120, 1000): {'accuracy': 1.0, 'precision': 1.0, 'recall': 1.0}}


In [17]:
best_config[0]

0.9999999999999999

## FOR MAIN TEXT DATA 

In [14]:
main_test_data=main_data.iloc[copy_data.shape[0]:]
main_test_data.reset_index(drop=True, inplace=True)

main_test_data_labels=main_test_data[['class']]
main_test_data_labels.reset_index(drop=True, inplace=True)
main_test_data

Unnamed: 0,class,aa_000
0,neg,60
1,neg,82
2,neg,66002
3,neg,59816
4,neg,1814
...,...,...
15995,neg,81852
15996,neg,18
15997,neg,79636
15998,neg,110


In [20]:
Transformed_test_data=logistic_regression_configs[(best_config[0],best_config[1])].transform(logistic_regression_configs[(best_config[0],best_config[1])].transform_data) 
#Transformed_test_data=best.transform(best.transform_data)

In [21]:
test_data_new=pd.DataFrame(data=Transformed_test_data)
test_data_new=pd.concat([test_data_new,main_test_data_labels],axis=1)
test_data_new.shape

(16000, 11)

In [23]:
test_data_new = test_data_new.rename(columns={'class': 'label'})
result(test_data_new,best_model)

{(0.9999999999999999, 120, 1000): {'accuracy': 0.9765625, 'precision': 1.0, 'recall': 0.0}}
