<a href="https://colab.research.google.com/github/abhihaveri/HealthcareAbhishek/blob/main/Recheck_Model_performance.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
import joblib
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import cross_val_score

In [7]:
# Load the dataset
data = pd.read_csv('symbipredict_2022.csv')

# Display the first few rows of the dataset
print(data.head())

   itching  skin_rash  nodal_skin_eruptions  continuous_sneezing  shivering  \
0        1          1                     1                    0          0   
1        0          1                     1                    0          0   
2        1          0                     1                    0          0   
3        1          1                     0                    0          0   
4        1          1                     1                    0          0   

   chills  joint_pain  stomach_pain  acidity  ulcers_on_tongue  ...  \
0       0           0             0        0                 0  ...   
1       0           0             0        0                 0  ...   
2       0           0             0        0                 0  ...   
3       0           0             0        0                 0  ...   
4       0           0             0        0                 0  ...   

   blackheads  scurring  skin_peeling  silver_like_dusting  \
0           0         0             

In [8]:
# Identify columns with missing values
missing_values = data.isnull().sum()

In [9]:
#Seperating the target column as Y and symptoms as X
X = data.drop('prognosis', axis=1)
y = data['prognosis']

In [10]:
#'prognosis' is the target variable and it is categorical so we labelling it
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y_encoded = le.fit_transform(y)

In [11]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded)

In [12]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

**Random Forest Model**


In [14]:
# Initialize the model
rfc  = RandomForestClassifier(random_state=42)

In [16]:
# Define hyperparameters for tuning
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 5, 10]
}

In [17]:
# Perform grid search for hyperparameter tuning
rf_model = GridSearchCV(estimator=rfc, param_grid=param_grid, cv=2, n_jobs=-1)
rf_model.fit(X_train_scaled, y_train)

In [18]:
# Predict on the test set
rf_pred  = rf_model.predict(X_test_scaled)

In [19]:
# Evaluate Random Forest Model
rf_accuracy = accuracy_score(y_test, rf_pred)
print("Random Forest Accuracy:", rf_accuracy)
print(classification_report(y_test, rf_pred))

Random Forest Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        24
           1       1.00      1.00      1.00        24
           2       1.00      1.00      1.00        24
           3       1.00      1.00      1.00        24
           4       1.00      1.00      1.00        24
           5       1.00      1.00      1.00        24
           6       1.00      1.00      1.00        24
           7       1.00      1.00      1.00        25
           8       1.00      1.00      1.00        24
           9       1.00      1.00      1.00        24
          10       1.00      1.00      1.00        25
          11       1.00      1.00      1.00        24
          12       1.00      1.00      1.00        24
          13       1.00      1.00      1.00        24
          14       1.00      1.00      1.00        25
          15       1.00      1.00      1.00        24
          16       1.00      1.00      1.00        24

In [20]:
# Cross-validation
scores = cross_val_score(rfc, X_train, y_train, cv=10)
print("Cross-validation scores:", scores)
print("Average cross-validation score:", scores.mean())

Cross-validation scores: [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
Average cross-validation score: 1.0


In [21]:
# Calculate precision, recall, F1 score using the correct labels
classification_report_rf = classification_report(y_test, rf_pred, output_dict=True)

rf_precision = classification_report_rf['weighted avg']['precision']
rf_recall = classification_report_rf['weighted avg']['recall']
rf_f1_score = classification_report_rf['weighted avg']['f1-score']

# Calculate ROC-AUC score
rf_roc_auc = roc_auc_score(y_test, rf_model.predict_proba(X_test), multi_class='ovr')



In [22]:
# Evaluate Random Forest Model
print("Random Forest Accuracy:", rf_accuracy)
print("Random Forest Precision:", rf_precision)
print("Random Forest Recall:", rf_recall)
print("Random Forest F1 Score:", rf_f1_score)
print("Random Forest ROC-AUC:", rf_roc_auc)

Random Forest Accuracy: 1.0
Random Forest Precision: 1.0
Random Forest Recall: 1.0
Random Forest F1 Score: 1.0
Random Forest ROC-AUC: 0.9728702988920734


**Deep Learning Model**

In [23]:
# Train the Deep Learning model
dl_model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train_scaled.shape[1],)),
    Dropout(0.2),
    Dense(32, activation='relu'),
    Dropout(0.2),
    Dense(len(np.unique(y_train)), activation='softmax')
])
dl_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
dl_model.fit(X_train_scaled, y_train, epochs=50, batch_size=32, validation_split=0.2)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/50
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 8ms/step - accuracy: 0.1863 - loss: 3.3426 - val_accuracy: 0.9660 - val_loss: 1.0441
Epoch 2/50
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.8281 - loss: 1.0097 - val_accuracy: 1.0000 - val_loss: 0.1254
Epoch 3/50
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.9375 - loss: 0.3285 - val_accuracy: 1.0000 - val_loss: 0.0294
Epoch 4/50
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step - accuracy: 0.9713 - loss: 0.1761 - val_accuracy: 1.0000 - val_loss: 0.0108
Epoch 5/50
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.9770 - loss: 0.1228 - val_accuracy: 1.0000 - val_loss: 0.0056
Epoch 6/50
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.9838 - loss: 0.0851 - val_accuracy: 1.0000 - val_loss: 0.0028
Epoch 7/50
[1m100/100[0m 

<keras.src.callbacks.history.History at 0x7d20d25d9e10>

In [24]:
# Evaluate Deep Learning Model
dl_predictions = dl_model.predict(X_test)
dl_loss, dl_accuracy = dl_model.evaluate(X_test, y_test)
print("Deep Learning Model Accuracy:", dl_accuracy)

[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 1.0000 - loss: 0.2030
Deep Learning Model Accuracy: 1.0


**KNN Classifier**

In [29]:
from sklearn.neighbors import KNeighborsClassifier

# Initialize the KNN model
knn = KNeighborsClassifier()

# Define hyperparameters for tuning
param_grid = {
    'n_neighbors': [3, 5, 7, 9],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan', 'minkowski']
}

# Perform grid search for hyperparameter tuning
knn_model = GridSearchCV(estimator=knn, param_grid=param_grid, cv=2, n_jobs=-1)
knn_model.fit(X_train_scaled, y_train)

# Predict on the test set
knn_pred = knn_model.predict(X_test_scaled)

# Evaluate KNN Model
knn_accuracy = accuracy_score(y_test, knn_pred)
print("KNN Accuracy:", knn_accuracy)
print(classification_report(y_test, knn_pred))


KNN Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        24
           1       1.00      1.00      1.00        24
           2       1.00      1.00      1.00        24
           3       1.00      1.00      1.00        24
           4       1.00      1.00      1.00        24
           5       1.00      1.00      1.00        24
           6       1.00      1.00      1.00        24
           7       1.00      1.00      1.00        25
           8       1.00      1.00      1.00        24
           9       1.00      1.00      1.00        24
          10       1.00      1.00      1.00        25
          11       1.00      1.00      1.00        24
          12       1.00      1.00      1.00        24
          13       1.00      1.00      1.00        24
          14       1.00      1.00      1.00        25
          15       1.00      1.00      1.00        24
          16       1.00      1.00      1.00        24
         