In [1]:
# Importing necessary libraries
# NumPy provides support for large, multi-dimensional arrays and matrices.
import numpy as np
# Pandas provides support for data manipulation and analysis.
import pandas as pd
# Matplotlib.pyplot is a plotting library in Python that provides a MATLAB-like interface for creating visualizations.
import matplotlib.pyplot as plt

In [2]:
# Load the dataset
data = pd.read_csv("CKD.csv")

In [3]:
# Map 'yes' to 1 and 'no' to 0 for the target variable 'classification'
data['classification'] = data['classification'].map({'yes': 1, 'no': 0})

In [4]:
# Create the dependent variable (target)
dependent = data[["classification"]]

In [5]:
# Check for missing values
print(data.isnull().sum())

age               0
bp                0
sg                0
al                0
su                0
rbc               0
pc                0
pcc               0
ba                0
bgr               0
bu                0
sc                0
sod               0
pot               0
hrmo              0
pcv               0
wc                0
rc                0
htn               0
dm                0
cad               0
appet             0
pe                0
ane               0
classification    0
dtype: int64


In [6]:
# Check the distribution of the target variable
print(data['classification'].value_counts())

1    249
0    150
Name: classification, dtype: int64


In [7]:
# Create the independent variables (features)
independent = data.drop(columns=["classification"])

In [8]:
# Convert categorical variables into dummy/indicator variables
independent = pd.get_dummies(independent, drop_first=True)
print(independent.head(4))


   age         bp   al   su         bgr         bu        sc         sod  \
0  2.0  76.459948  3.0  0.0  148.112676  57.482105  3.077356  137.528754   
1  3.0  76.459948  2.0  0.0  148.112676  22.000000  0.700000  137.528754   
2  4.0  76.459948  1.0  0.0   99.000000  23.000000  0.600000  138.000000   
3  5.0  76.459948  1.0  0.0  148.112676  16.000000  0.700000  138.000000   

        pot       hrmo  ...  rbc_normal  pc_normal  pcc_present  ba_present  \
0  4.627244  12.518156  ...           1          0            0           0   
1  4.627244  10.700000  ...           1          1            0           0   
2  4.400000  12.000000  ...           1          1            0           0   
3  3.200000   8.100000  ...           1          1            0           0   

   htn_yes  dm_yes  cad_yes  appet_yes  pe_yes  ane_yes  
0        0       0        0          1       1        0  
1        0       0        0          1       0        0  
2        0       0        0          1       0   

In [9]:
# Importing train_test_split from sklearn.model_selection
# This function is used for splitting the data into training and testing sets
from sklearn.model_selection import train_test_split

In [10]:
# Splitting data into training and testing sets
# Train and test split ratio is 70% and 30% respectively
X_train, X_test, y_train, y_test = train_test_split(independent, dependent, test_size=0.30, random_state=0)

In [11]:
# Standardizing the features
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [15]:
# Importing LogisticRegression from sklearn.linear_model
from sklearn.linear_model import LogisticRegression

In [16]:
# Importing GridSearchCV from sklearn.model_selection
from sklearn.model_selection import GridSearchCV

In [17]:
# Define the parameter grid for GridSearchCV
param_grid = {
    'penalty': ['l1', 'l2'],  # Regularization type: L1 (Lasso) or L2 (Ridge)
    'C': [0.01, 0.1, 1, 10, 100]  # Inverse of regularization strength
}

In [19]:
# Initialize GridSearchCV with f1_macro scoring
grid_search = GridSearchCV(LogisticRegression(), param_grid=param_grid, cv=5)

In [20]:
# Fit the model to the training data
grid_search.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
25 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug

GridSearchCV(cv=5, estimator=LogisticRegression(),
             param_grid={'C': [0.01, 0.1, 1, 10, 100], 'penalty': ['l1', 'l2']})

In [21]:
# Print the best hyperparameters found by GridSearchCV
best_logistic_regression = grid_search.best_estimator_

In [22]:
# Predict on the test set
test_predictions = best_logistic_regression.predict(X_test)

In [23]:
# Retrieve cross-validation results
cv_results = grid_search.cv_results_

In [24]:
# Importing necessary libraries for evaluation metrics
from sklearn.metrics import confusion_matrix  # Importing confusion matrix to evaluate classification performance
from sklearn.metrics import classification_report  # Importing classification report to summarize classification performance
from sklearn.metrics import roc_curve, roc_auc_score  # Importing ROC curve and AUC score for binary classification evaluation

In [25]:
# Confusion matrix
conf_matrix = confusion_matrix(y_test, test_predictions)
print("Confusion Matrix:")
print(conf_matrix)

Confusion Matrix:
[[45  0]
 [ 1 74]]


In [26]:
#Classification report
classification_rep = classification_report(y_test, test_predictions)
print("Classification Report:")
print(classification_rep)

Classification Report:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99        45
           1       1.00      0.99      0.99        75

    accuracy                           0.99       120
   macro avg       0.99      0.99      0.99       120
weighted avg       0.99      0.99      0.99       120



In [28]:
# ROC AUC score
test_probabilities = best_logistic_regression.predict_proba(X_test)[:, 1]
roc_auc = roc_auc_score(y_test, test_probabilities)
print(f"ROC AUC Score: {roc_auc}")

ROC AUC Score: 1.0
