In [9]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix, f1_score, classification_report
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import cross_val_score
from imblearn.pipeline import Pipeline
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import GridSearchCV
import time

In [10]:
import warnings
warnings.filterwarnings('ignore')

from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
all_data = pd.read_csv("Nomao.data", header = None)
all_data.replace('?', np.NaN, inplace=True)

all_data = all_data.drop(0, axis=1)

data_train, data_test, labels_train, labels_test = train_test_split(all_data.iloc[:,:118], all_data.iloc[:,[118]], test_size=0.3, random_state = 42, stratify=all_data.iloc[:,-1])

imp = SimpleImputer(missing_values=np.NaN, strategy="most_frequent")
cat_data_train = list ()
num_data_train = list ()
cat_data_test = list ()
num_data_test = list ()
cat_cols = [7, 8, 15, 16, 23, 24, 31, 32, 39, 40, 47, 48, 55, 56, 63, 64, 71, 72, 79, 80, 87, 88, 92, 96, 100, 104, 108, 112, 116] 
for col in cat_cols :
  cat_data_train.append(pd.DataFrame(imp.fit_transform(data_train.iloc[:,[col-1]])))
  cat_data_test.append(pd.DataFrame(imp.transform(data_test.iloc[:,[col-1]])))
cols = all_data.columns
num_cols = list(set (cols) - set (cat_cols) - {119})
imp = SimpleImputer(missing_values=np.NaN, strategy="mean")
for col in num_cols :
  num_data_train.append(pd.DataFrame(imp.fit_transform(data_train.iloc[:,[col-1]])))
  num_data_test.append(pd.DataFrame(imp.transform(data_test.iloc[:,[col-1]])))

j1 , j2 = 0, 0

idata_train = pd.DataFrame()
idata_test = pd.DataFrame()
for col in range(1,119) :
  if j2 < 29 and cat_cols[j2] == col :
    idata_train = pd.concat([idata_train,cat_data_train[j2]],join="outer",axis=1)
    idata_test = pd.concat([idata_test, cat_data_test[j2]],join="outer",axis=1)
    j2 = j2 + 1
  else :
    idata_train = pd.concat([idata_train,num_data_train[j1]],join="outer",axis=1)
    idata_test = pd.concat([idata_test,num_data_test[j1]],join="outer",axis=1)    
    j1 = j1 + 1

temp = pd.concat([idata_train,idata_test], join="outer", axis=0)
dummy_data = pd.get_dummies(temp, [i-1 for i in cat_cols])
np_dummy_data = dummy_data.apply(pd.to_numeric).values

np_data_train = np_dummy_data[:24125,:] #οι γραμμές δεν αλλάζουν θέση οπότε αφού σειριακά
np_data_test = np_dummy_data[24125:,:]  #ενώσαμε τα train, test σειριακά θα τα χωρίσουμε

mapping = {-1 : 0, +1 : 1}

labels_train[119] = labels_train[119].map(mapping)
labels_test[119] = labels_test[119].map(mapping)

np_labels_train = labels_train[119].values.flatten()
np_labels_test = labels_test[119].values.flatten()

X_train = np_data_train
y_train = np_labels_train
X_test = np_data_test
y_test = np_labels_test

In [11]:
from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()
gnb_model = gnb.fit(X_train, y_train)
pred_gnb = gnb_model.predict(X_test)

f1_micro_score_gnb = cross_val_score(gnb, X_train, y_train, cv = 5, scoring ='f1_micro')
f1_macro_score_gnb = cross_val_score(gnb, X_train, y_train, cv = 5, scoring = "f1_macro")

print("The GaussianNB Classifier has f1_micro score on training set : %f \n" % (f1_micro_score_gnb.mean()))
print("The GaussianNB Classifier has f1_macro score on training set : %f \n" % (f1_macro_score_gnb.mean()))
print("The GaussianNB Classifier has scores on test set : \n" + classification_report(y_test,pred_gnb))

The GaussianNB Classifier has f1_micro score on training set : 0.828560 

The GaussianNB Classifier has f1_macro score on training set : 0.769438 

The GaussianNB Classifier has scores on test set : 
              precision    recall  f1-score   support

           0       0.77      0.55      0.64      2953
           1       0.84      0.93      0.88      7387

    accuracy                           0.82     10340
   macro avg       0.80      0.74      0.76     10340
weighted avg       0.82      0.82      0.81     10340



In [12]:
#pipe με default τιμές (αρχικοποίηση) και επιλεγμένο τον StandardScaler

selector = VarianceThreshold()
scaler = StandardScaler()
ros = RandomOverSampler()
pca = PCA()
gnb = GaussianNB()
pipe_gnb = Pipeline(steps=[('selector', selector), ('scaler', scaler), ('sampler', ros), ('pca', pca), ('gnb', gnb)])
pipe_gnb.fit(X_train, y_train)
pred_pipe_gnb = pipe_gnb.predict(X_test)
f1_micro_score_pipe_gnb = cross_val_score(pipe_gnb, X_train, y_train, cv = 5, scoring = 'f1_micro')
f1_macro_score_pipe_gnb = cross_val_score(pipe_gnb, X_train, y_train, cv = 5, scoring = 'f1_macro')

print("The GaussianNB Classifier with default preprocessing has f1_micro score on training set : %f \n" % (f1_micro_score_pipe_gnb.mean()))
print("The GaussianNB Classifier with default preprocessing has f1_macro score on training set : %f \n" % (f1_macro_score_pipe_gnb.mean()))
print("The GaussianNB Classifier with default preprocessing parameters has scores on test set: \n" + classification_report(y_test,pred_pipe_gnb))

The GaussianNB Classifier with default preprocessing has f1_micro score on training set : 0.651772 

The GaussianNB Classifier with default preprocessing has f1_macro score on training set : 0.623323 

The GaussianNB Classifier with default preprocessing parameters has scores on test set: 
              precision    recall  f1-score   support

           0       0.51      0.94      0.66      2953
           1       0.96      0.63      0.76      7387

    accuracy                           0.72     10340
   macro avg       0.74      0.79      0.71     10340
weighted avg       0.83      0.72      0.73     10340



In [13]:
#pipe με default τιμές (αρχικοποίηση) και επιλεγμένο τον MinMaxScaler

selector = VarianceThreshold()
mscaler = StandardScaler()
ros = RandomOverSampler()
pca = PCA()
gnb = GaussianNB()
mpipe_gnb = Pipeline(steps=[('scaler', mscaler), ('selector', selector), ('sampler', ros), ('pca', pca), ('gnb', gnb)])
mpipe_gnb.fit(X_train, y_train)
mpred_pipe_gnb = mpipe_gnb.predict(X_test)
mf1_micro_score_pipe_gnb = cross_val_score(mpipe_gnb, X_train, y_train, cv = 5, scoring = 'f1_micro')
mf1_macro_score_pipe_gnb = cross_val_score(mpipe_gnb, X_train, y_train, cv = 5, scoring = 'f1_macro')

print("The GaussianNB Classifier with default preprocessing (using MinMax Scaler) has f1_micro score on training set : %f \n" % (mf1_micro_score_pipe_gnb.mean()))
print("The GaussianNB Classifier with default preprocessing (using MinMax Scaler) has f1_macro score on training set : %f \n" % (mf1_macro_score_pipe_gnb.mean()))
print("The GaussianNB Classifier with default preprocessing parameters (using MinMax Scaler) has scores on test set: \n" + classification_report(y_test,mpred_pipe_gnb))

The GaussianNB Classifier with default preprocessing (using MinMax Scaler) has f1_micro score on training set : 0.681741 

The GaussianNB Classifier with default preprocessing (using MinMax Scaler) has f1_macro score on training set : 0.626196 

The GaussianNB Classifier with default preprocessing parameters (using MinMax Scaler) has scores on test set: 
              precision    recall  f1-score   support

           0       0.44      0.95      0.60      2953
           1       0.96      0.51      0.67      7387

    accuracy                           0.64     10340
   macro avg       0.70      0.73      0.64     10340
weighted avg       0.81      0.64      0.65     10340



In [14]:
#Grid Search Cross Validation με επιλεγμένο τον StandardScaler
vthreshold = [0.08, 0.088, 0.09, 0.095, 1e-1]
n_components = [1, 18, 20, 23, 25]

estimator = GridSearchCV(pipe_gnb, dict(selector__threshold=vthreshold, pca__n_components=n_components), cv=5, scoring='f1_micro',n_jobs=-1)
start_time = time.time()
estimator.fit(X_train, y_train)
estimator_pred = estimator.predict(X_test)
estimator_time = time.time() - start_time

print("The GaussianNB Classifier has f1_micro score on training set : %f \n" % (estimator.best_score_))
print("The GaussianNB Classifier optimized by f1_micro : \n" + classification_report(y_test,estimator_pred))
print("with best parameters")
print(estimator.best_params_)

#pipe_gnb = estimator.best_estimator_
#pred_pipe_gnb = estimator_pred
#time_pipe_gnb = estimator_time

print("\n")

estimator = GridSearchCV(pipe_gnb, dict(selector__threshold=vthreshold, pca__n_components=n_components), cv=5, scoring='f1_macro',n_jobs=-1)
start_time = time.time()
estimator.fit(X_train, y_train)
estimator_pred = estimator.predict(X_test)
estimator_time = time.time() - start_time

print("The GaussianNB Classifier has f1_macro score on training set : %f \n" % (estimator.best_score_))
print("The GaussianNB Classifier optimized by f1_macro : \n" + classification_report(y_test,estimator_pred))
print("with best parameters")
print(estimator.best_params_)

print("\n")

The GaussianNB Classifier has f1_micro score on training set : 0.908104 

The GaussianNB Classifier optimized by f1_micro : 
              precision    recall  f1-score   support

           0       0.79      0.91      0.85      2953
           1       0.96      0.90      0.93      7387

    accuracy                           0.91     10340
   macro avg       0.88      0.91      0.89     10340
weighted avg       0.91      0.91      0.91     10340

with best parameters
{'pca__n_components': 18, 'selector__threshold': 0.09}


The GaussianNB Classifier has f1_macro score on training set : 0.892444 

The GaussianNB Classifier optimized by f1_macro : 
              precision    recall  f1-score   support

           0       0.80      0.91      0.85      2953
           1       0.96      0.91      0.93      7387

    accuracy                           0.91     10340
   macro avg       0.88      0.91      0.89     10340
weighted avg       0.92      0.91      0.91     10340

with best paramete

In [15]:
#Grid Search Cross Validation με επιλεγμένο τον MinMaxScaler
vthreshold = [0.08, 0.088, 0.09, 0.095, 1e-1]
n_components = [1, 18, 20, 23, 25]

mestimator = GridSearchCV(mpipe_gnb, dict(selector__threshold=vthreshold, pca__n_components=n_components), cv=5, scoring='f1_micro',n_jobs=-1)
start_time = time.time()
mestimator.fit(X_train, y_train)
mestimator_pred = mestimator.predict(X_test)
mestimator_time = time.time() - start_time

print("The GaussianNB Classifier (using MinMaxScaler) has f1_micro score on training set : %f \n" % (mestimator.best_score_))
print("The GaussianNB Classifier (using MinMaxScaler) optimized by f1_micro : \n" + classification_report(y_test,mestimator_pred))
print("with best parameters")
print(mestimator.best_params_)

#pipe_gnb = mestimator.best_estimator_
#pred_pipe_gnb = mestimator_pred
#time_pipe_gnb = mestimator_time

print("\n")

mestimator = GridSearchCV(mpipe_gnb, dict(selector__threshold=vthreshold, pca__n_components=n_components), cv=5, scoring='f1_macro',n_jobs=-1)
start_time = time.time()
mestimator.fit(X_train, y_train)
mestimator_pred = mestimator.predict(X_test)
mestimator_time = time.time() - start_time

print("The GaussianNB Classifier (using MinMaxScaler) has f1_macro score on training set : %f \n" % (mestimator.best_score_))
print("The GaussianNB Classifier (using MinMaxScaler) optimized by f1_macro : \n" + classification_report(y_test,mestimator_pred))
print("with best parameters")
print(mestimator.best_params_)

print("\n")

The GaussianNB Classifier (using MinMaxScaler) has f1_micro score on training set : 0.865865 

The GaussianNB Classifier (using MinMaxScaler) optimized by f1_micro : 
              precision    recall  f1-score   support

           0       0.75      0.68      0.71      2953
           1       0.88      0.91      0.89      7387

    accuracy                           0.84     10340
   macro avg       0.81      0.79      0.80     10340
weighted avg       0.84      0.84      0.84     10340

with best parameters
{'pca__n_components': 23, 'selector__threshold': 0.1}


The GaussianNB Classifier (using MinMaxScaler) has f1_macro score on training set : 0.835984 

The GaussianNB Classifier (using MinMaxScaler) optimized by f1_macro : 
              precision    recall  f1-score   support

           0       0.80      0.70      0.75      2953
           1       0.89      0.93      0.91      7387

    accuracy                           0.86     10340
   macro avg       0.84      0.82      0.83 

In [16]:
#Grid Search Cross Validation χωρίς Scaler
vthreshold = [0.08, 0.088, 0.09, 0.095, 1e-1]
n_components = [1, 18, 20, 23, 25]

pipe_gnb = Pipeline(steps=[('selector', selector), ('sampler', ros), ('pca', pca), ('gnb', gnb)])
estimator = GridSearchCV(pipe_gnb, dict(selector__threshold=vthreshold, pca__n_components=n_components), cv=5, scoring='f1_micro',n_jobs=-1)
start_time = time.time()
estimator.fit(X_train, y_train)
estimator_pred = estimator.predict(X_test)
estimator_time = time.time() - start_time

print("The GaussianNB Classifier (with no Scaler applied) has f1_micro score on training set : %f \n" % (estimator.best_score_))
print("The GaussianNB Classifier (with no Scaler applied) optimized by f1_micro : \n" + classification_report(y_test,estimator_pred))
print("with best parameters")
print(estimator.best_params_)

#pipe_gnb = estimator.best_estimator_
#pred_pipe_gnb = estimator_pred
#time_pipe_gnb = estimator_time

print("\n")

pipe_gnb = Pipeline(steps=[('selector', selector), ('sampler', ros), ('pca', pca), ('gnb', gnb)])
estimator = GridSearchCV(pipe_gnb, dict(selector__threshold=vthreshold, pca__n_components=n_components), cv=5, scoring='f1_macro',n_jobs=-1)
start_time = time.time()
estimator.fit(X_train, y_train)
estimator_pred = estimator.predict(X_test)
estimator_time = time.time() - start_time

print("The GaussianNB Classifier (with no Scaler applied) has f1_macro score on training set : %f \n" % (estimator.best_score_))
print("The GaussianNB Classifier (with no Scaler applied) optimized by f1_macro : \n" + classification_report(y_test,estimator_pred))
print("with best parameters")
print(estimator.best_params_)

print("\n")

The GaussianNB Classifier (with no Scaler applied) has f1_micro score on training set : 0.914363 

The GaussianNB Classifier (with no Scaler applied) optimized by f1_micro : 
              precision    recall  f1-score   support

           0       0.81      0.91      0.86      2953
           1       0.96      0.92      0.94      7387

    accuracy                           0.91     10340
   macro avg       0.89      0.91      0.90     10340
weighted avg       0.92      0.91      0.92     10340

with best parameters
{'pca__n_components': 18, 'selector__threshold': 0.088}


The GaussianNB Classifier (with no Scaler applied) has f1_macro score on training set : 0.897679 

The GaussianNB Classifier (with no Scaler applied) optimized by f1_macro : 
              precision    recall  f1-score   support

           0       0.81      0.91      0.86      2953
           1       0.96      0.91      0.94      7387

    accuracy                           0.91     10340
   macro avg       0.89   