In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input tree and print the full path of every file.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/disease-prediction-data/Training.csv
/kaggle/input/disease-prediction-data/Testing.csv


This code snippet imports necessary libraries for building a neural network model using TensorFlow and scikit-learn for data preprocessing and evaluation.

In [2]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import set_random_seed, to_categorical

2024-06-05 05:25:23.881395: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-05 05:25:23.881527: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-05 05:25:24.048016: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


This code reads a CSV file containing training data for disease prediction from a specified path.

In [3]:
# Load the labelled training set from the Kaggle input directory.
training_csv_path = '/kaggle/input/disease-prediction-data/Training.csv'
training_data = pd.read_csv(training_csv_path)

This code removes the column 'Unnamed: 133' from the training data if it exists.

In [4]:
# Discard the 'Unnamed: 133' column when the CSV contains it; errors='ignore'
# makes the drop a no-op when the column is absent.
training_data = training_data.drop(columns=['Unnamed: 133'], errors='ignore')

This code creates a LabelEncoder object and encodes the 'prognosis' column in the training data, storing the encoded values in a new column named 'prognosis_encoded'.

In [5]:
# Map each prognosis string to an integer class id. The fitted encoder is kept
# so the same mapping can be applied to the test labels later.
label_encoder = LabelEncoder()
prognosis_codes = label_encoder.fit_transform(training_data['prognosis'])
training_data['prognosis_encoded'] = prognosis_codes


This code separates the features (X_train) from the target variable (y_train). X_train keeps every column except the original 'prognosis' column and its encoded copy 'prognosis_encoded'; y_train is the 'prognosis_encoded' column.


In [6]:
# Target: the integer-encoded prognosis. Features: every remaining column,
# i.e. everything except the raw label and its encoded copy.
label_columns = ['prognosis', 'prognosis_encoded']
y_train = training_data['prognosis_encoded']
X_train = training_data.drop(columns=label_columns)

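This code standardizes the features in X_train using StandardScaler so that each feature is centred on its mean and scaled by its standard deviation. Because the scaler is fitted here, before the train/validation split, its statistics are computed from all training rows, including those later held out for validation.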

In [7]:
# Fit the scaler on the feature table and replace X_train with the scaled
# values. The fitted scaler is kept so the test features can be transformed
# with the same statistics.
# NOTE(review): this runs before the train/validation split, so rows that later
# become the validation set also contribute to the fitted statistics.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(X_train)
X_train = scaled_features

In [8]:
# One-hot encode the integer class ids, matching the categorical_crossentropy
# loss used when the model is compiled.
# NOTE(review): y_train is rebound in place, so re-running this cell on its own
# would feed the already-encoded matrix back into to_categorical.
y_train = to_categorical(y_train)

This code splits the training data into training and validation sets (X_train, X_val, y_train, y_val) with a test size of 20% and a specified random state for reproducibility.

In [9]:
# Hold out 20% of the rows for validation; random_state pins the shuffle.
# NOTE(review): X_train / y_train are rebound to the reduced training part, so
# re-running this cell on its own splits the already-split data again.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=2003,
)


This code defines a sequential neural network model with input dimensionality determined by the number of features in X_train, followed by two dense layers with ReLU activation and a softmax output layer. It compiles the model using the Adam optimizer and categorical cross-entropy loss, then trains it for 50 epochs with a batch size of 32, validating on X_val and y_val. Finally, it evaluates the model's performance on the validation set and prints the validation loss and accuracy.

In [10]:
# Seed Python, NumPy and the Keras backend so weight initialisation and batch
# shuffling are repeatable across "Restart & Run All". The value matches the
# random_state used for the train/validation split.
# NOTE(review): some GPU kernels may still be non-deterministic — confirm if
# bit-exact reruns are required.
set_random_seed(2003)

# Feed-forward classifier: two ReLU hidden layers and a softmax output with one
# unit per one-hot column of y_train. The input width is declared with an
# explicit Input layer rather than the deprecated `input_dim=` argument on the
# first Dense layer, which triggers a Keras warning.
model = Sequential([
    Input(shape=(X_train.shape[1],)),
    Dense(128, activation='relu'),
    Dense(64, activation='relu'),
    Dense(y_train.shape[1], activation='softmax'),
])

# Adam with its default settings; categorical cross-entropy pairs with the
# one-hot encoded targets.
optimizer = Adam()
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

# Train for 50 epochs in mini-batches of 32, reporting validation metrics after
# every epoch.
model.fit(X_train, y_train, epochs=50, batch_size=32, validation_data=(X_val, y_val))

# Final loss / accuracy on the held-out validation split.
val_loss, val_accuracy = model.evaluate(X_val, y_val)
print(f"Validation Loss: {val_loss}")
print(f"Validation Accuracy: {val_accuracy}")

Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m123/123[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.6076 - loss: 1.9874 - val_accuracy: 1.0000 - val_loss: 0.0347
Epoch 2/50
[1m123/123[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 1.0000 - loss: 0.0218 - val_accuracy: 1.0000 - val_loss: 0.0086
Epoch 3/50
[1m123/123[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 1.0000 - loss: 0.0064 - val_accuracy: 1.0000 - val_loss: 0.0043
Epoch 4/50
[1m123/123[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 1.0000 - loss: 0.0031 - val_accuracy: 1.0000 - val_loss: 0.0025
Epoch 5/50
[1m123/123[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 1.0000 - loss: 0.0020 - val_accuracy: 1.0000 - val_loss: 0.0017
Epoch 6/50
[1m123/123[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 1.0000 - loss: 0.0013 - val_accuracy: 1.0000 - val_loss: 0.0012
Epoch 7/50
[1m123/123[0m [32m━━━━━━━

In [11]:
# Load the held-out test set from the Kaggle input directory.
testing_csv_path = '/kaggle/input/disease-prediction-data/Testing.csv'
test_data = pd.read_csv(testing_csv_path)

In [12]:
# Discard the 'Unnamed: 133' column when the test CSV contains it;
# errors='ignore' makes the drop a no-op when the column is absent.
test_data = test_data.drop(columns=['Unnamed: 133'], errors='ignore')

In [13]:
# Test labels are the raw prognosis strings; test features are every other column.
test_label_column = 'prognosis'
y_test = test_data[test_label_column]
X_test = test_data.drop(columns=[test_label_column])


In [14]:
# Scale the test features with the scaler already fitted on the training
# features (transform only; nothing is re-fitted on the test data).
X_test = scaler.transform(X_test)

This code encodes the test labels with the label encoder fitted on the training data, generates predictions (y_pred) for the test data (X_test) using the trained model, and then converts these predictions into class labels (y_pred_labels) by selecting the index of the highest probability for each prediction.

In [15]:
# Encode the test labels with the encoder fitted on the training prognoses so
# the integer ids use the same mapping as the training targets.
y_test_encoded = label_encoder.transform(y_test)
# Pin the one-hot width to the number of classes the encoder knows. Without
# num_classes, to_categorical sizes the matrix from the largest id present in
# the test labels, which is too narrow whenever the highest-indexed class is
# missing from the test file.
y_test_categorical = to_categorical(y_test_encoded, num_classes=len(label_encoder.classes_))
# Class probabilities from the trained network, one row per test sample.
y_pred = model.predict(X_test)
# Predicted class id = index of the largest probability in each row.
y_pred_labels = np.argmax(y_pred, axis=1)

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step


In [16]:
# Fraction of test samples whose predicted class id equals the encoded true label.
test_accuracy = accuracy_score(y_true=y_test_encoded, y_pred=y_pred_labels)
print(f"Test Accuracy: {test_accuracy}")


Test Accuracy: 1.0
