In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

In [2]:
# Load the dataset
data = pd.read_csv('symbipredict_2022.csv')

# Display the first few rows of the dataset
print(data.head())

   itching  skin_rash  nodal_skin_eruptions  continuous_sneezing  shivering  \
0        1          1                     1                    0          0   
1        0          1                     1                    0          0   
2        1          0                     1                    0          0   
3        1          1                     0                    0          0   
4        1          1                     1                    0          0   

   chills  joint_pain  stomach_pain  acidity  ulcers_on_tongue  ...  \
0       0           0             0        0                 0  ...   
1       0           0             0        0                 0  ...   
2       0           0             0        0                 0  ...   
3       0           0             0        0                 0  ...   
4       0           0             0        0                 0  ...   

   blackheads  scurring  skin_peeling  silver_like_dusting  \
0           0         0             

In [3]:
# Identify columns with missing values
missing_values = data.isnull().sum()


In [4]:
missing_values

itching                 0
skin_rash               0
nodal_skin_eruptions    0
continuous_sneezing     0
shivering               0
                       ..
inflammatory_nails      0
blister                 0
red_sore_around_nose    0
yellow_crust_ooze       0
prognosis               0
Length: 133, dtype: int64

In [5]:
#'prognosis' is the target variable and it is categorical
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
data['prognosis'] = le.fit_transform(data['prognosis'])

In [6]:
X = data.drop('prognosis', axis=1)
y = data['prognosis']

In [7]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
# Standardize features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score

In [10]:
# Initialize the model
rfc = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
rfc.fit(X_train, y_train)

# Predict on the test set
y_pred = rfc.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Cross-validation
scores = cross_val_score(rfc, X_train, y_train, cv=5)
print("Cross-validation scores:", scores)
print("Average cross-validation score:", scores.mean())

Accuracy: 1.0
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        29
           1       1.00      1.00      1.00        25
           2       1.00      1.00      1.00        29
           3       1.00      1.00      1.00        26
           4       1.00      1.00      1.00        24
           5       1.00      1.00      1.00        29
           6       1.00      1.00      1.00        21
           7       1.00      1.00      1.00        20
           8       1.00      1.00      1.00        24
           9       1.00      1.00      1.00        20
          10       1.00      1.00      1.00        29
          11       1.00      1.00      1.00        21
          12       1.00      1.00      1.00        18
          13       1.00      1.00      1.00        25
          14       1.00      1.00      1.00        22
          15       1.00      1.00      1.00        31
          16       1.00      1.00      1.00

In [12]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Evaluate the model on the test set
y_pred = rfc.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 1.0
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        29
           1       1.00      1.00      1.00        25
           2       1.00      1.00      1.00        29
           3       1.00      1.00      1.00        26
           4       1.00      1.00      1.00        24
           5       1.00      1.00      1.00        29
           6       1.00      1.00      1.00        21
           7       1.00      1.00      1.00        20
           8       1.00      1.00      1.00        24
           9       1.00      1.00      1.00        20
          10       1.00      1.00      1.00        29
          11       1.00      1.00      1.00        21
          12       1.00      1.00      1.00        18
          13       1.00      1.00      1.00        25
          14       1.00      1.00      1.00        22
          15       1.00      1.00      1.00        31
          16       1.00      1.00      1.00

In [11]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

# Define the DNN model
model = Sequential()
model.add(Dense(128, activation='relu', input_shape=(X_train.shape[1],)))
model.add(Dropout(0.2))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(len(np.unique(y_train)), activation='softmax'))

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=50, batch_size=32, validation_data=(X_test, y_test))

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/50
[1m124/124[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.4617 - loss: 2.4745 - val_accuracy: 1.0000 - val_loss: 0.0442
Epoch 2/50
[1m124/124[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9951 - loss: 0.0965 - val_accuracy: 1.0000 - val_loss: 0.0046
Epoch 3/50
[1m124/124[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9990 - loss: 0.0305 - val_accuracy: 1.0000 - val_loss: 0.0015
Epoch 4/50
[1m124/124[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9974 - loss: 0.0185 - val_accuracy: 1.0000 - val_loss: 6.6851e-04
Epoch 5/50
[1m124/124[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9994 - loss: 0.0104 - val_accuracy: 1.0000 - val_loss: 3.7632e-04
Epoch 6/50
[1m124/124[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9991 - loss: 0.0092 - val_accuracy: 1.0000 - val_loss: 2.0180e-04
Epoch 7/50
[1m1

[1m124/124[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 1.0000 - loss: 6.6905e-05 - val_accuracy: 1.0000 - val_loss: 1.4202e-07
Epoch 50/50
[1m124/124[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 1.0000 - loss: 8.4221e-05 - val_accuracy: 1.0000 - val_loss: 6.4346e-08


<keras.src.callbacks.history.History at 0x238d9846e90>

In [12]:
# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Accuracy: {accuracy * 100:.2f}%')

[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 1.0000 - loss: 8.1566e-08
Test Accuracy: 100.00%


In [None]:
import shap

# Create a SHAP explainer for the Deep Learning model
explainer_dl = shap.KernelExplainer(model.predict, shap.sample(X_train, 100)) # Sample 100 background data points

# Get SHAP values for the test set
shap_values_dl = explainer_dl.shap_values(X_test)

# Plot the SHAP values for a specific instance
shap.force_plot(explainer_dl.expected_value[0], shap_values_dl[0,:], X_test.iloc[0,:], matplotlib=True)

In [29]:
# Save Random Forest model
joblib.dump(rfc, 'rf_model.joblib')

# Save Deep Learning model
model.save('dl_model.h5')

# Save the scaler
joblib.dump(scaler, 'scaler.joblib')



['scaler.joblib']