#*AUTHORS : COUTAREL Allan, DEVOUCOUX Maxime*

## IMPORT OF LIBRARIES AND METHOD DEFINITIONS

In [None]:
import numpy as np
import pandas as pd
import sklearn as sk
import random
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_validate
from sklearn.pipeline import Pipeline
le = LabelEncoder()

In [None]:
def getKNeighborsClassifier(X_train, y_train):
  """
    Fit a k-nearest-neighbours classifier and print its accuracy on the training data

    Parameters
    ----------
    X_train : dataset
      training features
    y_train : dataset
      training labels

    Returns
    -------
    sklearn KNeighbors model
      KNeighborsClassifier (default hyper-parameters) after training
  """
  # fit() hands back the estimator itself, so creation and training are chained
  knn = KNeighborsClassifier().fit(X_train, y_train)
  training_accuracy = knn.score(X_train, y_train)
  print("Accuracy of the model (with training data) :", training_accuracy)
  return knn

In [None]:
def getSVClassifier(X_train, y_train):
  """
    Fit a linear-kernel support vector classifier and print its accuracy on the training data

    Parameters
    ----------
    X_train : dataset
      training features
    y_train : dataset
      training labels

    Returns
    -------
    sklearn Support Vector model
      svm.SVC(kernel='linear') after training
  """
  # fit() hands back the estimator itself, so creation and training are chained
  svc = svm.SVC(kernel='linear').fit(X_train, y_train)
  training_accuracy = svc.score(X_train, y_train)
  print("Accuracy of the model (with training data) :", training_accuracy)
  return svc

In [None]:
def getGaussianNBClassifier(X_train, y_train):
  """
    Fit a Gaussian Naive Bayes classifier and print its accuracy on the training data

    Parameters
    ----------
    X_train : dataset
      training features
    y_train : dataset
      training labels

    Returns
    -------
    sklearn Gaussian Naive Bayes model
      GaussianNB (default hyper-parameters) after training
  """
  # fit() hands back the estimator itself, so creation and training are chained
  gnb = GaussianNB().fit(X_train, y_train)
  training_accuracy = gnb.score(X_train, y_train)
  print("Accuracy of the model (with training data) :", training_accuracy)
  return gnb

In [None]:
def getLogisticRegressionClassifier(X_train, y_train):
  """
    Fit a logistic regression classifier and print its accuracy on the training data

    Parameters
    ----------
    X_train : dataset
      training features
    y_train : dataset
      training labels

    Returns
    -------
    sklearn Logistic Regression model
      LogisticRegression (default hyper-parameters) after training
  """
  # fit() hands back the estimator itself, so creation and training are chained
  logreg = LogisticRegression().fit(X_train, y_train)
  training_accuracy = logreg.score(X_train, y_train)
  print("Accuracy of the model (with training data) :", training_accuracy)
  return logreg

In [None]:
def evaluateModel(model, X_test, y_test, isBinary):
  """
    Prints five evaluation metrics for the model and returns expected vs predicted values

    Parameters
    ----------
    model : sklearn model
      trained model used to make the predictions
    X_test : dataset
      testing features
    y_test : dataset
      testing labels
    isBinary : boolean
      True for binary classes, False for multi classes

    Returns
    -------
    dataframe
      expected values and predicted values, as built by get_Results_Dataframe
  """
  y_pred = pd.DataFrame(model.predict(X_test))

  # Binary problems keep the library's default averaging; multi-class problems are
  # macro-averaged, with undefined per-class scores counted as 0
  averaging = {} if isBinary else {'average': 'macro', 'zero_division': 0}

  # (printed label, metric function, extra keyword arguments), in display order
  report = [
    ("Accuracy :", metrics.accuracy_score, {}),
    ("Balanced accuracy :", metrics.balanced_accuracy_score, {}),
    ("Confusion matrix :", metrics.confusion_matrix, {}),
    ("Precision :", metrics.precision_score, averaging),
    ("Recall :", metrics.recall_score, averaging),
  ]
  for label, metric, extra in report:
    print(label, metric(y_test, y_pred, **extra))

  return get_Results_Dataframe(y_pred, y_test)

In [None]:
def get_Results_Dataframe(predict, y_test):
  """
    Makes a dataframe with the expected values and the predicted values

    Parameters
    ----------
    predict : dataframe
      predicted values, one row per row of y_test and in the same order;
      it is left unmodified
    y_test : dataset
      testing labels = expected values

    Returns
    -------
    dataframe
      indexed like y_test; the first column is named 'y_test' and the
      second one 'y_pred'
  """
  # Re-index a copy of the predictions so the caller's dataframe is not altered
  aligned = predict.set_index(y_test.index)
  result = pd.concat([y_test, aligned], axis=1)
  # Rename by position: a label-based rename would give both columns the same
  # name whenever y_test and predict happen to share a column label
  columns = list(result.columns)
  columns[:2] = ['y_test', 'y_pred']
  result.columns = columns
  return result

In [None]:
def printResults(data, nbRowsMin, nbRowsMax):
  """
    Prints the dataset with temporary pandas display limits

    Parameters
    ----------
    data : dataset
      dataset to print
    nbRowsMin : int
      value used for the pandas "display.min_rows" option while printing
    nbRowsMax : int
      value used for the pandas "display.max_rows" option while printing

    Returns
    -------
    None
  """
  # option_context restores the previous display options on exit, so printing
  # here does not change how every later dataframe is displayed
  with pd.option_context("display.min_rows", nbRowsMin, "display.max_rows", nbRowsMax, "display.max_columns", None):
    print(data)

## H1N1 DATASET

In [None]:
"""
H1N1 flu vaccines dataset import
"""
# Download the feature table and the label table of the H1N1 survey
X_vaccines = pd.read_csv('https://raw.githubusercontent.com/a-coutarel/Projet-4A-IA/main/vaccines_data/training_set_features.csv')
y_vaccines = pd.read_csv('https://raw.githubusercontent.com/a-coutarel/Projet-4A-IA/main/vaccines_data/training_set_labels.csv')
# Join features and labels row by row on their shared key (the key is named once;
# it was previously listed twice in `on`)
dataset_vaccines = pd.merge(X_vaccines, y_vaccines, on='respondent_id')

In [None]:
"""
Counts the occurrences of each class
"""
# Number of rows per value of the h1n1_vaccine label (shows the class imbalance)
y_vaccines.loc[:, 'h1n1_vaccine'].value_counts()

0    21033
1     5674
Name: h1n1_vaccine, dtype: int64

In [None]:
"""
Converts categorical variables into numbers
"""
# Label-encode each categorical column in place with the shared encoder `le`.
# The encoder is re-fitted for every column, so afterwards it only holds the
# classes of the last column in the list.
categorical_columns = [
    'sex', 'race', 'age_group', 'education', 'income_poverty', 'marital_status',
    'rent_or_own', 'employment_status', 'census_msa', 'hhs_geo_region',
    'employment_industry', 'employment_occupation',
]
for column in categorical_columns:
    dataset_vaccines[column] = le.fit_transform(dataset_vaccines[column])

# Drop the rows that still contain NaN, then separate the label from the features.
# NOTE(review): encoding runs before dropna(), so missing values in the columns
# above may already have been turned into a class instead of being dropped - confirm.
# NOTE(review): every other merged column, including the respondent_id join key,
# stays in X_vaccines as a feature - confirm this is intended.
dataset_vaccines = dataset_vaccines.dropna()
y_vaccines = dataset_vaccines.pop('h1n1_vaccine')
X_vaccines = dataset_vaccines

In [None]:
"""
Train/test split and removal of outliers + model training and testing 
"""
# 70/30 split with a fixed seed
X_train_vaccines, X_test_vaccines, y_train_vaccines, y_test_vaccines = train_test_split(
    X_vaccines, y_vaccines, test_size=0.3, random_state=42)

# The isolation forest is fitted on the training rows and used to discard the
# test rows it flags as outliers (prediction -1).
# NOTE(review): the training set itself is never filtered - confirm this is intended.
IF = IsolationForest(random_state=42).fit(X_train_vaccines)
y_pred = IF.predict(X_test_vaccines)
is_inlier = y_pred != -1
X_test_vaccines = X_test_vaccines[is_inlier]
y_test_vaccines = y_test_vaccines[is_inlier]

# Standardise with statistics learned from the training set only
scaler = StandardScaler()
X_train_vaccines = scaler.fit_transform(X_train_vaccines)
X_test_vaccines = scaler.transform(X_test_vaccines)

# Each model below is trained on the scaled training set and evaluated as a
# binary problem; any returned frame can be shown with printResults(<frame>, 0, 15)

# KNN Classifier for H1N1 vaccines
print("\nKNN CLASSIFIER :\n")
KNNmodel = getKNeighborsClassifier(X_train_vaccines, y_train_vaccines)
KNN = evaluateModel(KNNmodel, X_test_vaccines, y_test_vaccines, isBinary=True)

# SVC Classifier for H1N1 vaccines
print("\nSVC CLASSIFIER :\n")
SVCmodel = getSVClassifier(X_train_vaccines, y_train_vaccines)
SVC = evaluateModel(SVCmodel, X_test_vaccines, y_test_vaccines, isBinary=True)

# GNB Classifier for H1N1 vaccines
print("\nGNB CLASSIFIER :\n")
GNBmodel = getGaussianNBClassifier(X_train_vaccines, y_train_vaccines)
GNB = evaluateModel(GNBmodel, X_test_vaccines, y_test_vaccines, isBinary=True)

# LR Classifier for H1N1 vaccines
print("\nLR CLASSIFIER :\n")
LRmodel = getLogisticRegressionClassifier(X_train_vaccines, y_train_vaccines)
LR = evaluateModel(LRmodel, X_test_vaccines, y_test_vaccines, isBinary=True)


KNN CLASSIFIER :

Accuracy of the model (with training data) : 0.8537127141950497
Accuracy : 0.808532249873032
Balanced accuracy : 0.7019666720770438
Confusion matrix : [[1348  116]
 [ 261  244]]
Precision : 0.6777777777777778
Recall : 0.48316831683168315

SVC CLASSIFIER :

Accuracy of the model (with training data) : 0.8422889782102814
Accuracy : 0.8481462671406805
Balanced accuracy : 0.7655744467889412
Confusion matrix : [[1369   95]
 [ 204  301]]
Precision : 0.76010101010101
Recall : 0.596039603960396

GNB CLASSIFIER :

Accuracy of the model (with training data) : 0.7743812143008251
Accuracy : 0.7836465210766886
Balanced accuracy : 0.7442514743277606
Confusion matrix : [[1208  256]
 [ 170  335]]
Precision : 0.5668358714043993
Recall : 0.6633663366336634

LR CLASSIFIER :

Accuracy of the model (with training data) : 0.8416543262111276
Accuracy : 0.845606907059421
Balanced accuracy : 0.7619210896499486
Confusion matrix : [[1367   97]
 [ 207  298]]
Precision : 0.7544303797468355
Recal

In [None]:
"""
KNN Classifier with kfolds for H1N1 vaccines
"""
# 5-fold shuffled cross-validation of a scaler + KNN pipeline on the H1N1 data
kf = KFold(n_splits=5, shuffle=True, random_state=42)
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', KNeighborsClassifier())
])
scoring = ['accuracy', 'balanced_accuracy', 'precision', 'recall']
# Only the test_* scores are reported, so training-set scoring is not requested
# (it was computed before but never read)
scores = cross_validate(pipe, X=X_vaccines, y=y_vaccines, cv=kf, scoring=scoring)
# Per-fold values followed by their mean, one pair of lines per metric
for label, metric in zip(["Accuracy", "Balanced accuracy", "Precision", "Recall"], scoring):
    fold_scores = scores['test_' + metric]
    print(label + " :", fold_scores)
    print("Mean " + label + " :", fold_scores.mean())

Accuracy : [0.80643967 0.80451685 0.79340985 0.80007405 0.79526101]
Mean Accuracy : 0.7999402858441825
Balanced accuracy : [0.73953533 0.73412831 0.73479683 0.73763398 0.7308517 ]
Mean Balanced accuracy : 0.7353892316771897
Precision : [0.69692308 0.70253165 0.7012987  0.6958457  0.70757576]
Mean Precision : 0.7008349757393065
Recall : [0.58151476 0.56632653 0.58064516 0.58333333 0.56469166]
Mean Recall : 0.5753022888684064


In [None]:
"""
SVC Classifier with kfolds for H1N1 vaccines
"""
# 5-fold shuffled cross-validation of a scaler + linear SVC pipeline on the H1N1 data
kf = KFold(n_splits=5, shuffle=True, random_state=42)
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', svm.SVC(kernel='linear'))
])
scoring = ['accuracy', 'balanced_accuracy', 'precision', 'recall']
# Only the test_* scores are reported, so training-set scoring is not requested
# (it was computed before but never read)
scores = cross_validate(pipe, X=X_vaccines, y=y_vaccines, cv=kf, scoring=scoring)
# Per-fold values followed by their mean, one pair of lines per metric
for label, metric in zip(["Accuracy", "Balanced accuracy", "Precision", "Recall"], scoring):
    fold_scores = scores['test_' + metric]
    print(label + " :", fold_scores)
    print("Mean " + label + " :", fold_scores.mean())

Accuracy : [0.85418209 0.84376157 0.83524621 0.8430211  0.8374676 ]
Mean Accuracy : 0.8427357140253726
Balanced accuracy : [0.80362372 0.78929164 0.78715978 0.79687389 0.79200644]
Mean Balanced accuracy : 0.7937910951787528
Precision : [0.78267254 0.76934524 0.7745098  0.76462396 0.76648352]
Mean Precision : 0.7715270108627739
Recall : [0.68421053 0.65943878 0.66069295 0.68283582 0.67472793]
Mean Recall : 0.6723812012044833


In [None]:
"""
GNB Classifier with kfolds for H1N1 vaccines
"""
# 5-fold shuffled cross-validation of a scaler + Gaussian Naive Bayes pipeline on the H1N1 data
kf = KFold(n_splits=5, shuffle=True, random_state=42)
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', GaussianNB())
])
scoring = ['accuracy', 'balanced_accuracy', 'precision', 'recall']
# Only the test_* scores are reported, so training-set scoring is not requested
# (it was computed before but never read)
scores = cross_validate(pipe, X=X_vaccines, y=y_vaccines, cv=kf, scoring=scoring)
# Per-fold values followed by their mean, one pair of lines per metric
for label, metric in zip(["Accuracy", "Balanced accuracy", "Precision", "Recall"], scoring):
    fold_scores = scores['test_' + metric]
    print(label + " :", fold_scores)
    print("Mean " + label + " :", fold_scores.mean())

Accuracy : [0.77831236 0.77823029 0.77304702 0.77378749 0.77082562]
Mean Accuracy : 0.7748405544345639
Balanced accuracy : [0.76368593 0.75782606 0.75888109 0.75761841 0.75681669]
Mean Balanced accuracy : 0.758965635550126
Precision : [0.59414226 0.59978425 0.61382114 0.60041623 0.60569106]
Mean Precision : 0.6027709875792789
Recall : [0.72913992 0.70918367 0.72162485 0.71766169 0.72067715]
Mean Recall : 0.7196574569917866


In [None]:
"""
LR Classifier with kfolds for H1N1 vaccines
"""
# 5-fold shuffled cross-validation of a scaler + logistic regression pipeline on the H1N1 data
kf = KFold(n_splits=5, shuffle=True, random_state=42)
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', LogisticRegression())
])
scoring = ['accuracy', 'balanced_accuracy', 'precision', 'recall']
# Only the test_* scores are reported, so training-set scoring is not requested
# (it was computed before but never read)
scores = cross_validate(pipe, X=X_vaccines, y=y_vaccines, cv=kf, scoring=scoring)
# Per-fold values followed by their mean, one pair of lines per metric
for label, metric in zip(["Accuracy", "Balanced accuracy", "Precision", "Recall"], scoring):
    fold_scores = scores['test_' + metric]
    print(label + " :", fold_scores)
    print("Mean " + label + " :", fold_scores.mean())

Accuracy : [0.85418209 0.84228064 0.83561644 0.84116994 0.83672714]
Mean Accuracy : 0.8419952475314816
Balanced accuracy : [0.80209637 0.78824834 0.78940281 0.79412276 0.79113504]
Mean Balanced accuracy : 0.7930010628233763
Precision : [0.78603269 0.7647929  0.77103448 0.76223776 0.76510989]
Mean Precision : 0.769841544792956
Recall : [0.67907574 0.65943878 0.66786141 0.6778607  0.67351874]
Mean Recall : 0.6715510724785754


In [None]:
"""
KNN Classifier with Stratified kfolds for H1N1 vaccines
"""
# 5-fold shuffled stratified cross-validation of a scaler + KNN pipeline on the H1N1 data
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', KNeighborsClassifier())
])
scoring = ['accuracy', 'balanced_accuracy', 'precision', 'recall']
# Only the test_* scores are reported, so training-set scoring is not requested
# (it was computed before but never read)
scores = cross_validate(pipe, X=X_vaccines, y=y_vaccines, cv=skf, scoring=scoring)
# Per-fold values followed by their mean, one pair of lines per metric
for label, metric in zip(["Accuracy", "Balanced accuracy", "Precision", "Recall"], scoring):
    fold_scores = scores['test_' + metric]
    print(label + " :", fold_scores)
    print("Mean " + label + " :", fold_scores.mean())

Accuracy : [0.79348631 0.80673825 0.78711588 0.79711218 0.80932988]
Mean Accuracy : 0.7987564986074462
Balanced accuracy : [0.7243532  0.73785232 0.72351166 0.73170515 0.74718569]
Mean Balanced accuracy : 0.7329216038918581
Precision : [0.69362364 0.72539683 0.66960352 0.69545455 0.71879699]
Mean Precision : 0.7005751053505879
Recall : [0.55266419 0.56699752 0.56575682 0.56947891 0.59305211]
Mean Recall : 0.5695899096306819


In [None]:
"""
SVC Classifier with Stratified kfolds for H1N1 vaccines
"""
# 5-fold shuffled stratified cross-validation of a scaler + linear SVC pipeline on the H1N1 data
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', svm.SVC(kernel='linear'))
])
scoring = ['accuracy', 'balanced_accuracy', 'precision', 'recall']
# Only the test_* scores are reported, so training-set scoring is not requested
# (it was computed before but never read)
scores = cross_validate(pipe, X=X_vaccines, y=y_vaccines, cv=skf, scoring=scoring)
# Per-fold values followed by their mean, one pair of lines per metric
for label, metric in zip(["Accuracy", "Balanced accuracy", "Precision", "Recall"], scoring):
    fold_scores = scores['test_' + metric]
    print(label + " :", fold_scores)
    print("Mean " + label + " :", fold_scores.mean())

Accuracy : [0.83604737 0.85042577 0.83783784 0.84450204 0.84116994]
Mean Accuracy : 0.8419965903463668
Balanced accuracy : [0.78422151 0.80071495 0.78604006 0.79506734 0.79411865]
Mean Balanced accuracy : 0.7920325003434442
Precision : [0.76224784 0.79130435 0.76589595 0.7765043  0.76363636]
Mean Precision : 0.771917760366132
Recall : [0.65551425 0.67741935 0.65756824 0.67245658 0.67741935]
Mean Recall : 0.6680755547765981


In [None]:
"""
GNB Classifier with Stratified kfolds for H1N1 vaccines
"""
# 5-fold shuffled stratified cross-validation of a scaler + Gaussian Naive Bayes pipeline on the H1N1 data
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', GaussianNB())
])
scoring = ['accuracy', 'balanced_accuracy', 'precision', 'recall']
# Only the test_* scores are reported, so training-set scoring is not requested
# (it was computed before but never read)
scores = cross_validate(pipe, X=X_vaccines, y=y_vaccines, cv=skf, scoring=scoring)
# Per-fold values followed by their mean, one pair of lines per metric
for label, metric in zip(["Accuracy", "Balanced accuracy", "Precision", "Recall"], scoring):
    fold_scores = scores['test_' + metric]
    print(label + " :", fold_scores)
    print("Mean " + label + " :", fold_scores.mean())

Accuracy : [0.77720207 0.78119215 0.76897445 0.77378749 0.77156609]
Mean Accuracy : 0.7745444500501638
Balanced accuracy : [0.76147823 0.76420841 0.75264933 0.75465342 0.76055671]
Mean Balanced accuracy : 0.7587092189529507
Precision : [0.60665973 0.61327713 0.5942029  0.6031746  0.59516616]
Mean Precision : 0.6024961056281783
Recall : [0.72242875 0.72208437 0.71215881 0.70719603 0.73325062]
Mean Recall : 0.7194237149507566


In [None]:
"""
LR Classifier with Stratified kfolds for H1N1 vaccines
"""
# 5-fold shuffled stratified cross-validation of a scaler + logistic regression pipeline on the H1N1 data
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', LogisticRegression())
])
scoring = ['accuracy', 'balanced_accuracy', 'precision', 'recall']
# Only the test_* scores are reported, so training-set scoring is not requested
# (it was computed before but never read)
scores = cross_validate(pipe, X=X_vaccines, y=y_vaccines, cv=skf, scoring=scoring)
# Per-fold values followed by their mean, one pair of lines per metric
for label, metric in zip(["Accuracy", "Balanced accuracy", "Precision", "Recall"], scoring):
    fold_scores = scores['test_' + metric]
    print(label + " :", fold_scores)
    print("Mean " + label + " :", fold_scores.mean())

Accuracy : [0.83197631 0.85042577 0.83820807 0.8430211  0.8430211 ]
Mean Accuracy : 0.8413304719501042
Balanced accuracy : [0.77989622 0.80000196 0.78559092 0.79365543 0.79722039]
Mean Balanced accuracy : 0.7912729851515055
Precision : [0.75322812 0.79300292 0.76855895 0.77285714 0.76527778]
Mean Precision : 0.7705849817136761
Recall : [0.65055762 0.67493797 0.65508685 0.67121588 0.68362283]
Mean Recall : 0.6670842288782091


## WATER PUMPS DATASET

In [None]:
"""
water pumps dataset import
"""
# Download the feature table and the label table of the water pumps dataset
X_water_pumps = pd.read_csv('https://raw.githubusercontent.com/a-coutarel/Projet-4A-IA/main/water_pumps_data/training_set_values.csv')
y_water_pumps = pd.read_csv('https://raw.githubusercontent.com/a-coutarel/Projet-4A-IA/main/water_pumps_data/training_set_labels.csv')
# Join features and labels row by row on their shared key (the key is named once;
# it was previously listed twice in `on`)
dataset_water_pumps = pd.merge(X_water_pumps, y_water_pumps, on='id')

In [None]:
"""
Counts the occurrences of each class
"""
# Number of rows per value of the status_group label (shows the class imbalance)
y_water_pumps.loc[:, 'status_group'].value_counts()

functional                 32259
non functional             22824
functional needs repair     4317
Name: status_group, dtype: int64

In [None]:
"""
Converts categorical variables into numbers
"""
# Label-encode each listed column in place with the shared encoder `le`.
# The encoder is re-fitted for every column; status_group comes last, so
# afterwards `le` holds the classes of the target label.
encoded_columns = [
    'date_recorded', 'funder', 'installer', 'wpt_name', 'basin', 'subvillage',
    'region', 'lga', 'ward', 'public_meeting', 'recorded_by',
    'scheme_management', 'scheme_name', 'permit', 'extraction_type',
    'extraction_type_group', 'extraction_type_class', 'management',
    'management_group', 'payment', 'payment_type', 'water_quality',
    'quality_group', 'quantity', 'quantity_group', 'source', 'source_type',
    'source_class', 'waterpoint_type', 'waterpoint_type_group',
    'status_group',
]
for column in encoded_columns:
    dataset_water_pumps[column] = le.fit_transform(dataset_water_pumps[column])

# Drop the rows that still contain NaN, then separate the label from the features.
# NOTE(review): encoding runs before dropna(), so missing values in the columns
# above may already have been turned into a class instead of being dropped - confirm.
# NOTE(review): every other merged column, including the id join key, stays in
# X_water_pumps as a feature - confirm this is intended.
dataset_water_pumps = dataset_water_pumps.dropna()
y_water_pumps = dataset_water_pumps.pop('status_group')
X_water_pumps = dataset_water_pumps

In [None]:
"""
Train/test split and removal of outliers + model training and testing 
"""
# 70/30 split with a fixed seed
X_train_wp, X_test_wp, y_train_wp, y_test_wp = train_test_split(
    X_water_pumps, y_water_pumps, test_size=0.3, random_state=42)

# The isolation forest is fitted on the training rows and used to discard the
# test rows it flags as outliers (prediction -1).
# NOTE(review): the training set itself is never filtered - confirm this is intended.
IF = IsolationForest(random_state=42).fit(X_train_wp)
y_pred = IF.predict(X_test_wp)
is_inlier = y_pred != -1
X_test_wp = X_test_wp[is_inlier]
y_test_wp = y_test_wp[is_inlier]

# Standardise with statistics learned from the training set only
scaler = StandardScaler()
X_train_wp = scaler.fit_transform(X_train_wp)
X_test_wp = scaler.transform(X_test_wp)

# Each model below is evaluated as a multi-class problem; any returned frame
# can be shown with printResults(<frame>, 0, 15)

# KNN Classifier for water pumps
print("\nKNN CLASSIFIER :\n")
KNNmodel = getKNeighborsClassifier(X_train_wp, y_train_wp)
KNN = evaluateModel(KNNmodel, X_test_wp, y_test_wp, isBinary=False)

# SVC Classifier for water pumps, trained on the first half of the training rows only
# (presumably to keep the linear SVC training time manageable - confirm)
print("\nSVC CLASSIFIER :\n")
X_train_subset = X_train_wp[:int(X_train_wp.shape[0]*0.5), :]
# X_train_wp is a NumPy array after scaling, y_train_wp is still a pandas Series:
# .iloc makes the positional slice explicit
y_train_subset = y_train_wp.iloc[:int(y_train_wp.shape[0]*0.5)]
SVCmodel = getSVClassifier(X_train_subset, y_train_subset)
SVC = evaluateModel(SVCmodel, X_test_wp, y_test_wp, isBinary=False)

# GNB Classifier for water pumps
print("\nGNB CLASSIFIER :\n")
GNBmodel = getGaussianNBClassifier(X_train_wp, y_train_wp)
GNB = evaluateModel(GNBmodel, X_test_wp, y_test_wp, isBinary=False)

# LR Classifier for water pumps
print("\nLR CLASSIFIER :\n")
LRmodel = getLogisticRegressionClassifier(X_train_wp, y_train_wp)
LR = evaluateModel(LRmodel, X_test_wp, y_test_wp, isBinary=False)


KNN CLASSIFIER :

Accuracy of the model (with training data) : 0.7609187109187109
Accuracy : 0.7426775253607658
Balanced accuracy : 0.5703772521417362
Confusion matrix : [[7199  111  684]
 [ 688  225  187]
 [1876   56 2972]]
Precision : 0.6949031829522947
Recall : 0.5703772521417362

SVC CLASSIFIER :

Accuracy of the model (with training data) : 0.6375180375180375
Accuracy : 0.634804972138877
Balanced accuracy : 0.41521960189554896
Confusion matrix : [[7185    0  809]
 [ 982    0  118]
 [3203    0 1701]]
Precision : 0.4263954651148782
Recall : 0.41521960189554896

GNB CLASSIFIER :

Accuracy of the model (with training data) : 0.5441077441077441
Accuracy : 0.5292184597799686
Balanced accuracy : 0.50839902406908
Confusion matrix : [[4306 1947 1741]
 [ 406  502  192]
 [1544  760 2600]]
Precision : 0.47276861512353857
Recall : 0.50839902406908

LR CLASSIFIER :

Accuracy of the model (with training data) : 0.6404521404521405
Accuracy : 0.6362337476782397
Balanced accuracy : 0.4303573125738

In [None]:
"""
KNN Classifier with kfolds for water pumps
"""
# 3-fold shuffled cross-validation of a scaler + KNN pipeline on the water pumps data
kf = KFold(n_splits=3, shuffle=True, random_state=42)
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', KNeighborsClassifier())
])
# Macro averaging, as in evaluateModel's multi-class branch: micro-averaged
# precision/recall are always equal to plain accuracy for single-label
# multi-class data, so they added no information
scoring = ['accuracy', 'balanced_accuracy', 'precision_macro', 'recall_macro']
# Only the test_* scores are reported, so training-set scoring is not requested
scores = cross_validate(pipe, X=X_water_pumps, y=y_water_pumps, cv=kf, scoring=scoring)
# Per-fold values followed by their mean, one pair of lines per metric
for label, metric in zip(["Accuracy", "Balanced accuracy", "Precision", "Recall"], scoring):
    fold_scores = scores['test_' + metric]
    print(label + " :", fold_scores)
    print("Mean " + label + " :", fold_scores.mean())

Accuracy : [0.75045455 0.74787879 0.7470202 ]
Mean Accuracy : 0.7484511784511785
Balanced accuracy : [0.60005258 0.60452316 0.60178515]
Mean Balanced accuracy : 0.6021202964523646
Precision : [0.75045455 0.74787879 0.7470202 ]
Mean Precision : 0.7484511784511785
Recall : [0.75045455 0.74787879 0.7470202 ]
Mean Recall : 0.7484511784511785


In [None]:
"""
SVC Classifier with kfolds for water pumps
"""
# 3-fold shuffled cross-validation of a scaler + linear SVC pipeline on a random
# half of the water pumps data
kf = KFold(n_splits=3, shuffle=True, random_state=42)
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', svm.SVC(kernel='linear'))
])
# Macro averaging, as in evaluateModel's multi-class branch: micro-averaged
# precision/recall are always equal to plain accuracy for single-label
# multi-class data, so they added no information
scoring = ['accuracy', 'balanced_accuracy', 'precision_macro', 'recall_macro']
# Draw the 50% subset with a dedicated seeded generator so the cell gives the
# same rows on every run, like the seeded folds (the draw used to be unseeded)
sample_size = int(X_water_pumps.shape[0]*0.5)
random_sample = random.Random(42).sample(range(X_water_pumps.shape[0]), sample_size)
X_subset = X_water_pumps.iloc[random_sample]
y_subset = y_water_pumps.iloc[random_sample]
# Only the test_* scores are reported, so training-set scoring is not requested
scores = cross_validate(pipe, X=X_subset, y=y_subset, cv=kf, scoring=scoring)
# Per-fold values followed by their mean, one pair of lines per metric
for label, metric in zip(["Accuracy", "Balanced accuracy", "Precision", "Recall"], scoring):
    fold_scores = scores['test_' + metric]
    print(label + " :", fold_scores)
    print("Mean " + label + " :", fold_scores.mean())

Accuracy : [0.63949495 0.64515152 0.63818182]
Mean Accuracy : 0.6409427609427609
Balanced accuracy : [0.4316604  0.43721806 0.43332878]
Mean Balanced accuracy : 0.43406907826509467
Precision : [0.63949495 0.64515152 0.63818182]
Mean Precision : 0.6409427609427609
Recall : [0.63949495 0.64515152 0.63818182]
Mean Recall : 0.6409427609427609


In [None]:
"""
GNB Classifier with kfolds for water pumps
"""
# 3-fold shuffled cross-validation of a scaler + Gaussian Naive Bayes pipeline on the water pumps data
kf = KFold(n_splits=3, shuffle=True, random_state=42)
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', GaussianNB())
])
# Macro averaging, as in evaluateModel's multi-class branch: micro-averaged
# precision/recall are always equal to plain accuracy for single-label
# multi-class data, so they added no information
scoring = ['accuracy', 'balanced_accuracy', 'precision_macro', 'recall_macro']
# Only the test_* scores are reported, so training-set scoring is not requested
scores = cross_validate(pipe, X=X_water_pumps, y=y_water_pumps, cv=kf, scoring=scoring)
# Per-fold values followed by their mean, one pair of lines per metric
for label, metric in zip(["Accuracy", "Balanced accuracy", "Precision", "Recall"], scoring):
    fold_scores = scores['test_' + metric]
    print(label + " :", fold_scores)
    print("Mean " + label + " :", fold_scores.mean())

Accuracy : [0.5410101  0.55737374 0.42641414]
Mean Accuracy : 0.5082659932659933
Balanced accuracy : [0.51972741 0.5166385  0.50244302]
Mean Balanced accuracy : 0.5129363107406023
Precision : [0.5410101  0.55737374 0.42641414]
Mean Precision : 0.5082659932659933
Recall : [0.5410101  0.55737374 0.42641414]
Mean Recall : 0.5082659932659933


In [None]:
"""
LR Classifier with kfolds for water pumps
"""
# 3-fold shuffled cross-validation of a scaler + logistic regression pipeline on the water pumps data
kf = KFold(n_splits=3, shuffle=True, random_state=42)
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', LogisticRegression())
])
# Macro averaging, as in evaluateModel's multi-class branch: micro-averaged
# precision/recall are always equal to plain accuracy for single-label
# multi-class data, so they added no information
scoring = ['accuracy', 'balanced_accuracy', 'precision_macro', 'recall_macro']
# Only the test_* scores are reported, so training-set scoring is not requested
scores = cross_validate(pipe, X=X_water_pumps, y=y_water_pumps, cv=kf, scoring=scoring)
# Per-fold values followed by their mean, one pair of lines per metric
for label, metric in zip(["Accuracy", "Balanced accuracy", "Precision", "Recall"], scoring):
    fold_scores = scores['test_' + metric]
    print(label + " :", fold_scores)
    print("Mean " + label + " :", fold_scores.mean())

Accuracy : [0.64494949 0.64035354 0.63893939]
Mean Accuracy : 0.6414141414141413
Balanced accuracy : [0.44710777 0.44570636 0.44404247]
Mean Balanced accuracy : 0.4456188681784871
Precision : [0.64494949 0.64035354 0.63893939]
Mean Precision : 0.6414141414141413
Recall : [0.64494949 0.64035354 0.63893939]
Mean Recall : 0.6414141414141413


In [None]:
"""
KNN Classifier with Stratified kfolds for water pumps
"""
# 3-fold shuffled stratified cross-validation of a scaler + KNN pipeline on the water pumps data
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', KNeighborsClassifier())
])
# Macro averaging, as in evaluateModel's multi-class branch: micro-averaged
# precision/recall are always equal to plain accuracy for single-label
# multi-class data, so they added no information
scoring = ['accuracy', 'balanced_accuracy', 'precision_macro', 'recall_macro']
# Only the test_* scores are reported, so training-set scoring is not requested
scores = cross_validate(pipe, X=X_water_pumps, y=y_water_pumps, cv=skf, scoring=scoring)
# Per-fold values followed by their mean, one pair of lines per metric
for label, metric in zip(["Accuracy", "Balanced accuracy", "Precision", "Recall"], scoring):
    fold_scores = scores['test_' + metric]
    print(label + " :", fold_scores)
    print("Mean " + label + " :", fold_scores.mean())

Accuracy : [0.75191919 0.74267677 0.74676768]
Mean Accuracy : 0.7471212121212121
Balanced accuracy : [0.59967273 0.60327531 0.60239759]
Mean Balanced accuracy : 0.6017818731613791
Precision : [0.75191919 0.74267677 0.74676768]
Mean Precision : 0.7471212121212121
Recall : [0.75191919 0.74267677 0.74676768]
Mean Recall : 0.7471212121212121


In [None]:
"""
SVC Classifier with Stratified kfolds for water pumps
"""
# 3-fold shuffled stratified cross-validation of a scaler + linear SVC pipeline
# on a random half of the water pumps data
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', svm.SVC(kernel='linear'))
])
# Macro averaging, as in evaluateModel's multi-class branch: micro-averaged
# precision/recall are always equal to plain accuracy for single-label
# multi-class data, so they added no information
scoring = ['accuracy', 'balanced_accuracy', 'precision_macro', 'recall_macro']
# Draw the 50% subset with a dedicated seeded generator so the cell gives the
# same rows on every run, like the seeded folds (the draw used to be unseeded)
sample_size = int(X_water_pumps.shape[0]*0.5)
random_sample = random.Random(42).sample(range(X_water_pumps.shape[0]), sample_size)
X_subset = X_water_pumps.iloc[random_sample]
y_subset = y_water_pumps.iloc[random_sample]
# Only the test_* scores are reported, so training-set scoring is not requested
scores = cross_validate(pipe, X=X_subset, y=y_subset, cv=skf, scoring=scoring)
# Per-fold values followed by their mean, one pair of lines per metric
for label, metric in zip(["Accuracy", "Balanced accuracy", "Precision", "Recall"], scoring):
    fold_scores = scores['test_' + metric]
    print(label + " :", fold_scores)
    print("Mean " + label + " :", fold_scores.mean())

Accuracy : [0.64545455 0.63656566 0.63919192]
Mean Accuracy : 0.6404040404040404
Balanced accuracy : [0.44025904 0.43347171 0.43516542]
Mean Balanced accuracy : 0.4362987232345117
Precision : [0.64545455 0.63656566 0.63919192]
Mean Precision : 0.6404040404040404
Recall : [0.64545455 0.63656566 0.63919192]
Mean Recall : 0.6404040404040404


In [None]:
"""
GNB Classifier with Stratified kfolds for water pumps 
"""
# 3-fold shuffled stratified cross-validation of a scaler + Gaussian Naive Bayes pipeline on the water pumps data
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', GaussianNB())
])
# Macro averaging, as in evaluateModel's multi-class branch: micro-averaged
# precision/recall are always equal to plain accuracy for single-label
# multi-class data, so they added no information
scoring = ['accuracy', 'balanced_accuracy', 'precision_macro', 'recall_macro']
# Only the test_* scores are reported, so training-set scoring is not requested
scores = cross_validate(pipe, X=X_water_pumps, y=y_water_pumps, cv=skf, scoring=scoring)
# Per-fold values followed by their mean, one pair of lines per metric
for label, metric in zip(["Accuracy", "Balanced accuracy", "Precision", "Recall"], scoring):
    fold_scores = scores['test_' + metric]
    print(label + " :", fold_scores)
    print("Mean " + label + " :", fold_scores.mean())

Accuracy : [0.535      0.45964646 0.53888889]
Mean Accuracy : 0.5111784511784512
Balanced accuracy : [0.52057768 0.50503581 0.51724262]
Mean Balanced accuracy : 0.51428536847884
Precision : [0.535      0.45964646 0.53888889]
Mean Precision : 0.5111784511784512
Recall : [0.535      0.45964646 0.53888889]
Mean Recall : 0.5111784511784512


In [None]:
"""
LR Classifier with Stratified kfolds for water pumps
"""
# 3-fold shuffled stratified cross-validation of a scaler + logistic regression pipeline on the water pumps data
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', LogisticRegression())
])
# Macro averaging, as in evaluateModel's multi-class branch: micro-averaged
# precision/recall are always equal to plain accuracy for single-label
# multi-class data, so they added no information
scoring = ['accuracy', 'balanced_accuracy', 'precision_macro', 'recall_macro']
# Only the test_* scores are reported, so training-set scoring is not requested
scores = cross_validate(pipe, X=X_water_pumps, y=y_water_pumps, cv=skf, scoring=scoring)
# Per-fold values followed by their mean, one pair of lines per metric
for label, metric in zip(["Accuracy", "Balanced accuracy", "Precision", "Recall"], scoring):
    fold_scores = scores['test_' + metric]
    print(label + " :", fold_scores)
    print("Mean " + label + " :", fold_scores.mean())

Accuracy : [0.64530303 0.64141414 0.6379798 ]
Mean Accuracy : 0.6415656565656566
Balanced accuracy : [0.44779315 0.44715198 0.44238003]
Mean Balanced accuracy : 0.445775055474539
Precision : [0.64530303 0.64141414 0.6379798 ]
Mean Precision : 0.6415656565656566
Recall : [0.64530303 0.64141414 0.6379798 ]
Mean Recall : 0.6415656565656566
