In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import termcolor
from termcolor import colored
import warnings

from sklearn.preprocessing import MinMaxScaler
from sklearn import metrics
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
# Confirmation banner: only reached if none of the imports above raised
print(colored('\nAll libraries imported succesfully', 'green'))

ModuleNotFoundError: No module named 'seaborn'

In [None]:
# pandas display: never truncate the column list or the text inside a cell
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
# Opt in to pandas copy-on-write semantics
pd.set_option('mode.copy_on_write', True)
# Seaborn plot theme
sns.set_style('darkgrid')
# Suppress every Python warning for the rest of the session
warnings.filterwarnings('ignore')

In [None]:
# Confirmation banner for the configuration cell
print(colored('\nAll libraries Configed succesfully.', 'green'))

In [None]:
# Read the data set from the CSV file in the working directory
data = pd.read_csv(filepath_or_buffer='water_potability.csv')
# Preview the first five rows
data.head(n=5)

In [None]:
# Print the frame summary (columns, dtypes, non-null counts)
data.info()

In [None]:
# Selected summary statistics, one row per column, shaded row-wise
(
    data.describe()
    .loc[['min', '50%', 'mean', 'max', 'std']]
    .T
    .style.background_gradient(axis=1)
)

In [None]:
# Keep the column index around (reused by the plotting cell) and list it
columns_name = data.columns
for index in range(len(columns_name)):
    col_name = columns_name[index]
    print(index, col_name)

In [None]:
# One horizontal box plot per feature column. The number of panels follows the
# data instead of a hard-coded 9, so adding or removing a feature can neither
# raise IndexError nor silently skip a column. The target is excluded.
feature_names = [name for name in data.columns if name != 'Potability']
fig, ax = plt.subplots(len(feature_names), 1, figsize=(10, 20))
ax = np.atleast_1d(ax)  # plt.subplots returns a bare Axes when there is one panel
fig.subplots_adjust(hspace=0.75)
for axis, name in zip(ax, feature_names):
    sns.boxplot(x=name, data=data, ax=axis)

In [None]:
# Class balance of the target
tar = data['Potability'].value_counts()
print(tar)

# value_counts() orders the classes by frequency, not by label, so the wedge
# labels are taken from the index instead of assuming class 0 comes first.
plt.pie(tar, labels=tar.index, explode=[0, 0.01], autopct='%.f%%', shadow=True)
plt.legend()
plt.show()

In [None]:
# Pairwise correlations between all columns
corr = data.corr()

# Hide the diagonal and upper triangle by *position*. Building the mask from
# the correlation values themselves would leave an exactly-zero correlation
# in the upper triangle visible.
upper_mask = np.triu(np.ones_like(corr, dtype=bool))

plt.figure(figsize=(10, 5))
sns.heatmap(corr, annot=True, fmt='.2f', linewidths=0.5, linecolor='white', mask=upper_mask, cmap='Blues')
plt.show()

In [None]:
# Lower-triangle scatter matrix of every column pair, coloured by the target
sns.pairplot(
    data=data,
    hue='Potability',
    palette='Blues',
    corner=True,
)
plt.show()

In [None]:
# Missing-value count per column
data.isnull().sum()

In [None]:
# Total number of rows in the data set
print("Number of all samples : ", data.shape[0])

In [None]:
# Share of missing values per column, as a percentage of all rows
print('Percentage(%) of nulls for each columns : \n')
print(data.isna().sum().div(len(data)).mul(100))

In [None]:
# Medians of the three inspected columns, restricted to rows with Potability == 1
print("For Potability=1")
data.loc[data['Potability'] == 1, ['ph', 'Sulfate', 'Trihalomethanes']].median()

In [None]:
# Medians of the three inspected columns, restricted to rows with Potability == 0.
# The header now names the class that is actually filtered (it used to say 1).
print("For Potability=0")
data[data.Potability==0][['ph', 'Sulfate', 'Trihalomethanes']].median()

<div style="font-size:120%; font-weight:500; background-color:#edfeff"><p>➡️ For both Potability=1 and Potability=0, the medians are approximately equal.</p>
<p>➡️ Replace the missing values in each column with that column's overall median.</p>
</div>

In [None]:
# Fill the gaps in each of these columns with that column's own median
for incomplete in ('ph', 'Sulfate', 'Trihalomethanes'):
    column_median = data[incomplete].median()
    data[incomplete] = data[incomplete].fillna(value=column_median)

In [None]:
# Total number of missing values left in the whole frame
data.isna().sum().sum()

<div style="font-size:120%; font-weight:500; background-color:#edfeff">
    <p>
        ➡️ Use <code>Potability</code> as target.  👍
    </p>
    <p>
        ➡️ Using <code>MinMaxScaler()</code> to map values in range [0, 1] (Normalization)
    </p>
</div>

In [None]:
# Target vector: the Potability column; feature matrix: every other column
y = data['Potability']
X = data.drop('Potability', axis=1)

In [None]:
# Min-max normalise every feature to [0, 1] and keep the column names
scaler = MinMaxScaler(feature_range=(0, 1))
df = pd.DataFrame(data=scaler.fit_transform(X), columns=X.columns)
# Summary of the scaled features, one row per column, shaded row-wise
(
    df.describe()
    .loc[['min', 'mean', 'std', 'max']]
    .T
    .style.background_gradient(axis=1)
)

In [None]:
# Hold out 20% of the samples for testing.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.2)

# Apply the min-max normalisation the models are meant to see: previously the
# scaled frame was computed but the raw features were split and trained on.
# The scaler is fitted on the training rows only, so no test-set statistics
# leak into training. Test values may therefore fall slightly outside [0, 1].
train_scaler = MinMaxScaler(feature_range=(0, 1)).fit(X_train)
X_train = pd.DataFrame(train_scaler.transform(X_train), columns=X.columns, index=X_train.index)
X_test = pd.DataFrame(train_scaler.transform(X_test), columns=X.columns, index=X_test.index)

In [None]:
# A function to plot "Confusion Matrix" and "Classification Report"
def plot_result(y_pred, y_true=None) :
    '''
    Plot a model's performance as two side-by-side heatmaps.

    1) Confusion Matrix (left panel)
    2) Classification Report without the 'support' column (right panel)

    Parameters
    ----------
    y_pred : array-like
        Predicted labels.
    y_true : array-like, optional
        True labels to compare against. Defaults to the notebook-level
        ``y_test`` so existing ``plot_result(y_pred)`` calls behave as before.
    '''
    if y_true is None:
        y_true = y_test

    fig, ax = plt.subplots(1, 2, figsize=(15, 4))

    # Left panel - confusion matrix
    cm = metrics.confusion_matrix(y_true, y_pred)
    sns.heatmap(cm, cmap='Blues', annot=True, fmt='', linewidths=0.5, ax=ax[0])
    ax[0].set_xlabel('Predicted labels', fontsize=18)
    ax[0].set_ylabel('True labels', fontsize=18)
    ax[0].set_title('Confusion Matrix', fontsize=25)
    ax[0].xaxis.set_ticklabels(['0', '1'])
    ax[0].yaxis.set_ticklabels(['0', '1'])

    # Right panel - classification report (rows: classes and averages)
    cr = pd.DataFrame(metrics.classification_report(y_true, y_pred, digits=3, output_dict=True)).T
    cr = cr.drop(columns='support')  # no in-place mutation needed
    sns.heatmap(cr, cmap='Blues', annot=True, fmt='0.3f', linewidths=0.5, ax=ax[1])
    ax[1].xaxis.tick_top()
    ax[1].set_title('Classification Report', fontsize=25)

    # Lay out the figure only after both panels are drawn, so that labels,
    # titles and colour bars are taken into account.
    fig.tight_layout()
    plt.show()

In [None]:
# Hyper-parameter grid searched for logistic regression
parameters = dict(
    C=[0.001, 1, 1000],
    class_weight=['balanced', None],
    solver=['liblinear', 'sag'],
    penalty=['l2'],
    n_jobs=[-1],
)

# 20-fold cross-validated search over the grid, fitted on the training split
lr = LogisticRegression()
lr_cv = GridSearchCV(estimator=lr, param_grid=parameters, cv=20)
lr_cv.fit(X_train, y_train)

print(colored(f'Tuned hyper parameters :\n{lr_cv.best_params_}', 'blue'))

In [None]:
# Refit logistic regression with the best grid-search parameters
lr = LogisticRegression(**lr_cv.best_params_)
lr.fit(X_train, y_train)

# Predictions and mean accuracy on the held-out test split
y_pred_lr = lr.predict(X_test)
lr_score = round(lr.score(X_test, y_test), 3)

print(colored(f'LogisticRegression Score : {lr_score}', 'green'))

In [None]:
# Confusion matrix and classification report for the logistic-regression predictions
plot_result(y_pred_lr)

In [None]:
# Single-configuration grid for the random forest: every list has one value,
# so the search cross-validates exactly this setting
parameters = dict(
    n_estimators=[1000],
    criterion=['log_loss'],
    max_features=['sqrt'],
    n_jobs=[-1],
)

# 20-fold cross-validation on the training split
rf = RandomForestClassifier()
rf_cv = GridSearchCV(estimator=rf, param_grid=parameters, cv=20)
rf_cv.fit(X_train, y_train)
print(colored(f'Tuned hyper parameters :\n{rf_cv.best_params_}', 'blue'))

In [None]:
# Refit the random forest with the parameters selected by the grid search
rf = RandomForestClassifier(**rf_cv.best_params_)
rf.fit(X_train, y_train)

# Predictions and mean accuracy on the held-out test split
y_pred_rf = rf.predict(X_test)
rf_score = round(rf.score(X_test, y_test), 3)

print(colored(f'RandomForestClassifier Score : {rf_score}', 'green'))

In [None]:
# Confusion matrix and classification report for the random-forest predictions
plot_result(y_pred_rf)

In [None]:
# Single-configuration grid for the multi-layer perceptron: every list has one
# value, so the search cross-validates exactly this setting
parameters = dict(
    hidden_layer_sizes=[500],
    activation=['logistic'],
    alpha=[0.0001],
    batch_size=[200],
)

# 20-fold cross-validation on the training split
mlp = MLPClassifier()
mlp_cv = GridSearchCV(estimator=mlp, param_grid=parameters, cv=20)
mlp_cv.fit(X_train, y_train)

print(colored(f'Tuned hyper parameters :\n{mlp_cv.best_params_}', 'blue'))

In [None]:
# Refit the MLP with the parameters selected by the grid search
mlp = MLPClassifier(**mlp_cv.best_params_)
mlp.fit(X_train, y_train)

# Predictions and mean accuracy on the held-out test split
y_pred_mlp = mlp.predict(X_test)
mlp_score = round(mlp.score(X_test, y_test), 3)

print(colored(f'MLPClassifier Score : {mlp_score}', 'green'))

In [None]:
# Confusion matrix and classification report for the MLP predictions
plot_result(y_pred_mlp)

In [None]:
# Hyper-parameter grid for k-nearest neighbours: odd k from 3 to 49, both
# weighting schemes, Minkowski powers 1-4
parameters = dict(
    n_neighbors=list(np.arange(3, 50, 2)),
    weights=['uniform', 'distance'],
    p=[1, 2, 3, 4],
)

# 10-fold cross-validated search, fitted on the training split
knn = KNeighborsClassifier()
knn_cv = GridSearchCV(estimator=knn, param_grid=parameters, cv=10)
knn_cv.fit(X_train, y_train)

print(colored(f'Tuned hyper parameters :\n{knn_cv.best_params_}', 'blue'))

In [None]:
# Refit k-nearest neighbours with the best grid-search parameters
knn = KNeighborsClassifier(**knn_cv.best_params_)
knn.fit(X_train, y_train)

# Predictions and mean accuracy on the held-out test split
y_pred_knn = knn.predict(X_test)
knn_score = round(knn.score(X_test, y_test), 3)

print(colored(f'KNeighborsClassifier Score : {knn_score}', 'green'))

In [None]:
# Confusion matrix and classification report for the k-NN predictions
plot_result(y_pred_knn)

In [None]:
# One row per classifier with its rounded test accuracy
result = pd.DataFrame(
    [
        ('RandomForestClassifier', rf_score),
        ('LogisticRegression', lr_score),
        ('KNeighborsClassifier', knn_score),
        ('MLPClassifier', mlp_score),
    ],
    columns=['Algorithm', 'Score'],
)

# Shade the numeric column so the best score stands out
result.style.background_gradient()

In [None]:
# Bar chart of the test accuracy per classifier, with the value on each bar
fig, ax = plt.subplots(figsize=(15, 5))

sns.barplot(data=result, x='Algorithm', y='Score', ax=ax)
ax.bar_label(ax.containers[0], fmt='%.3f')
ax.set_xticklabels(labels=result.Algorithm, rotation=300)
plt.show()

In [None]:
# Quantum Approach (Classical SVC vs Quantum SVC)
# 20% for training, same split as classical so can be used 
import time 
import qiskit
import sklearn.decomposition
from sklearn.discriminant_analysis import StandardScaler
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from qiskit.circuit.library import ZZFeatureMap
from qiskit_machine_learning.kernels import FidelityQuantumKernel
from qiskit_aer import Aer 
from qiskit.primitives import Sampler
from qiskit_algorithms.utils import algorithm_globals
from qiskit_algorithms.state_fidelities import ComputeUncompute

# Seed qiskit's global random source
algorithm_globals.random_seed = 12345

# A bare expression in the middle of a cell is never displayed, so print it
print("qiskit version:", qiskit.__version__)

# NOTE(review): this rebinds the notebook-level X (previously the full feature
# matrix) to the classical training split; re-running earlier cells afterwards
# will see the smaller frame.
X = X_train
Y = y_train

# Carve a 10% evaluation subset out of the classical training split
sample_train, sample_test, label_train, label_test = train_test_split(X, Y, test_size=0.1, random_state=1)
print("Original Size:")
print(sample_train.shape, sample_test.shape)
print()

# Rotate onto principal components. n_components equals the number of input
# features, so no dimension is actually dropped here.
n_dim = X_train.shape[1]
pca = PCA(n_components=n_dim).fit(sample_train)
sample_train = pca.transform(sample_train)
sample_test = pca.transform(sample_test)

# Standardise using statistics of the training subset
std_scale = StandardScaler().fit(sample_train)
sample_train = std_scale.transform(sample_train)
sample_test = std_scale.transform(sample_test)

# Rescale to [-1, 1]. The scaler is fitted on the training subset only:
# fitting it on train+test together leaks test-set ranges into training.
# Test values may therefore fall slightly outside [-1, 1].
minmax_scale = MinMaxScaler((-1, 1)).fit(sample_train)
sample_train = minmax_scale.transform(sample_train)
sample_test = minmax_scale.transform(sample_test)

print("After Cut:")
# Cap the subset sizes used by the kernel computations
# (smaller values such as 100 / 30 give a quick run)
train_size = 1000
sample_train = sample_train[:train_size]
label_train = label_train[:train_size]

test_size = 336
sample_test = sample_test[:test_size]
label_test = label_test[:test_size]
print(sample_train.shape, sample_test.shape)

In [None]:
# Feature map sized to the number of columns of the prepared samples
zz_map = ZZFeatureMap(
    feature_dimension=sample_train.shape[1],
    reps=2,
    entanglement='linear',
)

# Fidelity-based quantum kernel built on that feature map
sampler = Sampler()
fidelity = ComputeUncompute(sampler=sampler)
zz_kernel = FidelityQuantumKernel(feature_map=zz_map, fidelity=fidelity)

In [None]:
from qiskit_machine_learning.algorithms import QSVC

# Support-vector classifier that evaluates the quantum kernel itself
qsvc = QSVC(quantum_kernel=zz_kernel)
qsvc.fit(sample_train, label_train)

# Mean accuracy on the evaluation subset
qsvc_score = qsvc.score(sample_test, label_test)
print("QSVC classification test score: {}".format(qsvc_score))

In [None]:
# zz_circuit = zz_kernel.construct_circuit(sample_train[0], sample_train[1])
# Time the evaluation of the train/train and test/train kernel matrices
start = time.perf_counter()
matrix_train = zz_kernel.evaluate(x_vec=sample_train)
matrix_test = zz_kernel.evaluate(x_vec=sample_test, y_vec=sample_train)
end = time.perf_counter()
print(matrix_train.shape, matrix_test.shape)

# Left: log of the training kernel matrix; right: the testing kernel matrix
fig, axs = plt.subplots(1, 2, figsize=(10, 5))
panels = [
    (np.log(matrix_train), 'Blues', "training kernel matrix"),
    (matrix_test, 'Reds', "testing kernel matrix"),
]
for axis, (values, colour_map, title) in zip(axs, panels):
    axis.imshow(np.asmatrix(values),
                interpolation='nearest', origin='upper', cmap=colour_map)
    axis.set_title(title)
plt.show()

print(round(end-start,2),' (s)')

In [None]:
# Classical SVC trained directly on the precomputed quantum kernel matrix
zzpc_svc = SVC(kernel='precomputed').fit(matrix_train, label_train)

# Mean accuracy using the test-vs-train kernel matrix
zzpc_score = zzpc_svc.score(matrix_test, label_test)

print('Precomputed kernel classification test score: {}'.format(zzpc_score))

In [None]:
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# Predict the evaluation labels from the precomputed test-vs-train kernel matrix
predicted_labels = zzpc_svc.predict(matrix_test)

# Overall accuracy and support-weighted F1
accuracy = accuracy_score(label_test, predicted_labels)
f1 = f1_score(label_test, predicted_labels, average='weighted')

# Sensitivity is the recall of the positive class (1) and specificity is the
# recall of the negative class (0). Previously "specificity" was a weighted
# precision and "sensitivity" a weighted recall (which equals accuracy).
sensitivity = recall_score(label_test, predicted_labels, pos_label=1)
specificity = recall_score(label_test, predicted_labels, pos_label=0)

# Confusion matrix with a fixed class order (rows: true, columns: predicted)
cm = confusion_matrix(label_test, predicted_labels, labels=[0,1])

# Display labels for the two Potability classes (digit zero, not the letter O)
sp_type_dict = {0: '0', 1: '1'}

# Same matrix as `cm`; the name is kept for compatibility with later cells
matrix = cm

# Plot the confusion matrix
cf = ConfusionMatrixDisplay(matrix, display_labels=[sp_type_dict[i] for i in range(2)])
cf.plot(cmap="BuGn")
plt.show()

# Report the metrics for the precomputed-kernel classifier
print(f'Accuracy: {round(accuracy,3)}')
print(f'F1 score: {round(f1,3)}')
print(f'Sensitivity: {round(sensitivity,3)}')
print(f'Specificity: {round(specificity,3)}')
print(f'Confusion matrix:\n{cm}')

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score

classical_kernels = ['linear', 'poly', 'rbf', 'sigmoid']

best_kernel = None
# Start below any attainable accuracy so a kernel is always selected,
# even if every kernel scores exactly 0
best_score = float('-inf')

# NOTE(review): the kernel is chosen by its score on the same subset that is
# reported below, so the reported metrics are optimistically biased.
for kernel in classical_kernels:
    classical_svc = SVC(kernel=kernel)
    classical_svc.fit(sample_train, label_train)
    classical_score = classical_svc.score(sample_test, label_test)

    print('%s kernel classification test score:  %0.2f' % (kernel, classical_score))

    if classical_score > best_score:
        best_kernel = kernel
        best_score = classical_score

# Train the SVC model with the best kernel.
best_svc = SVC(kernel=best_kernel)
best_svc.fit(sample_train, label_train)

# Predict the evaluation subset.
y_pred = best_svc.predict(sample_test)

# Overall accuracy and support-weighted F1
accuracy = accuracy_score(label_test, y_pred)
f1 = f1_score(label_test, y_pred, average='weighted')

# Sensitivity is the recall of the positive class (1) and specificity is the
# recall of the negative class (0). Previously "specificity" was a weighted
# precision and "sensitivity" a weighted recall (which equals accuracy).
sensitivity = recall_score(label_test, y_pred, pos_label=1)
specificity = recall_score(label_test, y_pred, pos_label=0)

# Compute this model's confusion matrix BEFORE printing it; the old code
# printed the matrix left over from the previous (quantum-kernel) cell.
cm = confusion_matrix(label_test, y_pred, labels=[0,1])

print()

# Output the results.
print(f'Accuracy: {round(accuracy,3)}')
print(f'F1 score: {round(f1,3)}')
print(f'Sensitivity: {round(sensitivity,3)}')
print(f'Specificity: {round(specificity,3)}')
print(f'Confusion matrix:\n{cm}')

# Display labels for the two Potability classes (digit zero, not the letter O)
sp_type_dict = {0: '0', 1: '1'}

# Same matrix as `cm`; the name is kept for compatibility
matrix = cm

# Plot the confusion matrix
cf = ConfusionMatrixDisplay(matrix, display_labels=[sp_type_dict[i] for i in range(2)])
cf.plot(cmap='BuGn')
plt.show()