# Disease Prediction Using Machine Learning 

## Notebook Imports

In [145]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
%matplotlib inline
import seaborn as sns
from scipy.stats import mode

from sklearn.naive_bayes import BernoulliNB, GaussianNB
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

In [146]:
# Load the symptom/disease table used by the first model
data = pd.read_csv('data/disease.csv')

# Preview the first five rows of the dataset
data.head(n=5)

Unnamed: 0,cough,fever,shortness of breath,pain chest,diarrhea,vomiting,unresponsiveness,asthenia,dyspnea,pain abdominal,...,cicatrisation,mediastinal shift,impaired cognition,snuffle,chill,headache,guaiac positive,decreased body weight,sore to touch,disease
0,0,0,1,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,hypertensive disease
1,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,coronary heart disease
2,1,0,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,failure heart congestive
3,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,asthma
4,1,0,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,chronic obstructive airway disease


In [147]:
# Summarize the frame: row count, column names, non-null counts and dtypes
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25 entries, 0 to 24
Data columns (total 74 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   cough                      25 non-null     int64 
 1   fever                      25 non-null     int64 
 2   shortness of breath        25 non-null     int64 
 3   pain chest                 25 non-null     int64 
 4   diarrhea                   25 non-null     int64 
 5   vomiting                   25 non-null     int64 
 6   unresponsiveness           25 non-null     int64 
 7   asthenia                   25 non-null     int64 
 8   dyspnea                    25 non-null     int64 
 9   pain abdominal             25 non-null     int64 
 10  vertigo                    25 non-null     int64 
 11  apyrexial                  25 non-null     int64 
 12  sweat                      25 non-null     int64 
 13  nausea                     25 non-null     int64 
 14  dizziness   

In [148]:
# Descriptive statistics (count, mean, std, quartiles) for the numeric columns
data.describe()

Unnamed: 0,cough,fever,shortness of breath,pain chest,diarrhea,vomiting,unresponsiveness,asthenia,dyspnea,pain abdominal,...,bradycardia,cicatrisation,mediastinal shift,impaired cognition,snuffle,chill,headache,guaiac positive,decreased body weight,sore to touch
count,25.0,25.0,25.0,25.0,25.0,25.0,25.0,25.0,25.0,25.0,...,25.0,25.0,25.0,25.0,25.0,25.0,25.0,25.0,25.0,25.0
mean,0.2,0.32,0.6,0.2,0.12,0.24,0.04,0.2,0.2,0.2,...,0.12,0.04,0.04,0.04,0.04,0.12,0.08,0.12,0.04,0.04
std,0.408248,0.476095,0.5,0.408248,0.331662,0.43589,0.2,0.408248,0.408248,0.408248,...,0.331662,0.2,0.2,0.2,0.2,0.331662,0.276887,0.331662,0.2,0.2
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [149]:
# Helpers for turning user-entered symptoms into a binary (0, 1) feature vector.
# Every column except the last one is treated as a symptom name.
columns = data.columns[:-1].tolist()

# Number of symptom columns
total_sym = len(columns)

def to_valid_list(user_sym_list, symptom_columns=None):
    """Encode a user's symptoms as a 0/1 vector aligned with the symptom columns.

    Parameters
    ----------
    user_sym_list : str or iterable of str
        Either a comma-separated string (split with ``to_list``) or an
        already-split collection of symptom names (list, tuple, set, ...).
    symptom_columns : sequence of str, optional
        Column order to encode against. Defaults to the notebook-level
        ``columns`` list.

    Returns
    -------
    list of int
        One entry per symptom column: 1 if that column name appears in the
        input, 0 otherwise. Names that match no column are silently ignored.
    """
    if isinstance(user_sym_list, str):
        user_sym_list = to_list(user_sym_list)
    if symptom_columns is None:
        symptom_columns = columns

    # A set gives O(1) membership checks and also accepts one-shot iterables.
    reported = set(user_sym_list)
    return [1 if name in reported else 0 for name in symptom_columns]

def to_list(string):
    """Split a comma-separated symptom string into a list of clean names.

    Surrounding whitespace of any kind (spaces, tabs, newlines) is removed
    from each name, and empty entries produced by leading, trailing or
    doubled commas are dropped.

    Parameters
    ----------
    string : str
        Symptom names separated by commas, e.g. ``"cough, fever"``.

    Returns
    -------
    list of str
        The individual symptom names, in input order.
    """
    stripped = (part.strip() for part in string.split(","))
    return [part for part in stripped if part]

In [150]:
# Separate the symptom features (X) from the disease label (y)

y = data.loc[:, 'disease']
X = data.drop(columns='disease')

# Show both shapes as a sanity check
(X.shape, y.shape)

((25, 73), (25,))

In [151]:
def NaiveBayes(user_symptoms, disease):
    """Predict a disease from symptoms with a Bernoulli Naive Bayes model.

    The model is fitted on the ``data`` frame on every call.

    Parameters
    ----------
    user_symptoms : str or list of str
        Symptoms to encode, passed through ``to_valid_list``.
    disease : str
        ``'all_diseases'`` to report the most probable disease, or the name
        of one disease to report the probability the model assigns to it.

    Returns
    -------
    str
        A sentence naming the disease and its predicted probability.

    Raises
    ------
    KeyError
        If ``disease`` is neither ``'all_diseases'`` nor a class the model
        was trained on.
    """
    # Take the training data from `data` directly instead of the notebook-level
    # X / y: a later cell rebinds those names to a different dataset, which
    # would make this function silently train on the wrong table.
    features = data.drop(columns='disease')
    labels = data['disease']

    model = BernoulliNB()
    model.fit(features, labels)

    # Wrap the encoded symptoms in a one-row frame that carries the training
    # column names, so the model receives the same feature names it was fitted on.
    query = pd.DataFrame([to_valid_list(user_symptoms)], columns=features.columns)

    result = model.predict(query)[0]
    prob = pd.DataFrame(model.predict_proba(query), columns=model.classes_)

    if disease == 'all_diseases':
        return f'From symptoms given, you are likely to have {result}, with {prob.at[0,result]*100:.2f}% probability.'
    return f'From symptoms given, you are likely to have {disease} with {prob.at[0,disease]*100:.2f}% probability.'

## Making Predictions

In [152]:
# Ask for the most probable disease across all classes for these symptoms
symptoms = "cough, shortness of breath, wheezing"
print(NaiveBayes(symptoms, "all_diseases"))

From symptoms given, you are likely to have asthma, with 38.80% probability.


In [153]:
# Ask for the probability the model assigns to one named disease (pneumonia)
symptoms = "cough, shortness of breath, wheezing"
print(NaiveBayes(symptoms, "pneumonia"))

From symptoms given, you are likely to have pneumonia with 4.85% probability.


In [154]:
# Ask for the probability of chronic obstructive airway disease.
# NOTE(review): "sdyspnea" looks like a typo for "dyspnea"; names that match no
# column are ignored by to_valid_list, so this entry has no effect - confirm
# and re-run the cell if the spelling is corrected.
symptoms = "cough, sdyspnea, shortness of breath"
print(NaiveBayes(symptoms, "chronic obstructive airway disease"))

From symptoms given, you are likely to have chronic obstructive airway disease with 12.70% probability.


# Model two

# Using a different dataset to see whether it yields better model accuracy

In [155]:
# Load the second dataset; any column containing a missing value is dropped

training = pd.read_csv('data/training.csv').dropna(axis='columns')
testing = pd.read_csv('data/testing.csv').dropna(axis='columns')

In [156]:
# Preview the first rows of the training table
training.head()

Unnamed: 0,itching,skin_rash,nodal_skin_eruptions,continuous_sneezing,shivering,chills,joint_pain,stomach_pain,acidity,ulcers_on_tongue,...,blackheads,scurring,skin_peeling,silver_like_dusting,small_dents_in_nails,inflammatory_nails,blister,red_sore_around_nose,yellow_crust_ooze,prognosis
0,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
1,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
2,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
3,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
4,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection


In [158]:
# preprocess/encode data

# Encode the string disease labels in "prognosis" as integers.
# The dtype guard keeps this cell idempotent: re-running it after the column
# has already been encoded would otherwise fit a fresh encoder on the integers
# and replace encoder.classes_ (the disease names) with numbers.
if not pd.api.types.is_numeric_dtype(training["prognosis"]):
    encoder = LabelEncoder()
    training["prognosis"] = encoder.fit_transform(training["prognosis"])

In [159]:
# split data for training: the last column is the label, 20% of rows are held out

X, y = training.iloc[:, :-1], training.iloc[:, -1]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the shapes of both partitions
print(f"Train: {X_train.shape}, {y_train.shape}")
print(f"Test: {X_test.shape}, {y_test.shape}")

Train: (3936, 132), (3936,)
Test: (984, 132), (984,)


In [160]:
# models

def cv_scoring(estimator, X, y):
    """Scorer callable for cross-validation: accuracy of ``estimator`` on (X, y)."""
    predicted = estimator.predict(X)
    return accuracy_score(y, predicted)
 
# Initializing Models
models = {
    "SVC": SVC(),
    "Gaussian NB": GaussianNB(),
    "Random Forest": RandomForestClassifier(random_state=42)
}

# Producing cross validation score for the models.
# The scores are printed per model; previously they were computed and then
# overwritten on the next iteration without ever being shown.
for model_name, model in models.items():
    scores = cross_val_score(model, X, y, cv = 10, n_jobs = -1, scoring = cv_scoring)
    print(f"{model_name}: mean CV accuracy = {np.mean(scores):.4f} (std = {np.std(scores):.4f})")

In [161]:
# Ensemble learning: train three different classifiers on the same split.
# Each model's test-set predictions get their own name so that none of them
# is overwritten by the next model.


# Training SVM Classifier
svm_model = SVC()
svm_model.fit(X_train, y_train)
svm_preds = svm_model.predict(X_test)

# Training Naive Bayes Classifier
nb_model = GaussianNB()
nb_model.fit(X_train, y_train)
nb_preds = nb_model.predict(X_test)

# Training Random Forest Classifier
rf_model = RandomForestClassifier(random_state=18)
rf_model.fit(X_train, y_train)
rf_preds = rf_model.predict(X_test)

# `preds` is kept for cells that still read it; as before, it ends up holding
# the Random Forest predictions.
preds = rf_preds

In [162]:
# Largest class probability the Random Forest assigns anywhere on the test set
rf_model.predict_proba(X_test).max()

1.0

In [163]:
# printing accuracies

# Score every model against its OWN test-set predictions. The shared `preds`
# variable only holds the Random Forest output, so using it for all three
# lines reported the Random Forest test accuracy under the SVM and Naive
# Bayes labels.
svm_test_preds = svm_model.predict(X_test)
nb_test_preds = nb_model.predict(X_test)
rf_test_preds = rf_model.predict(X_test)

print(f"Training Accuracy of SVM Classifier: {accuracy_score(y_train, svm_model.predict(X_train))*100}")
print(f"Test Accuracy of SVM Classifier: {accuracy_score(y_test, svm_test_preds)*100}\n")

print(f"Training Accuracy of Naive Bayes Classifier: {accuracy_score(y_train, nb_model.predict(X_train))*100}")
print(f"Testing Accuracy of Naive Bayes Classifier: {accuracy_score(y_test, nb_test_preds)*100}\n")

print(f"Training Accuracy of Random Forest Classifier: {accuracy_score(y_train, rf_model.predict(X_train))*100}")
print(f"Testing Accuracy of Random Forest Classifier: {accuracy_score(y_test, rf_test_preds)*100}")
 

Training Accuracy of SVM Classifier: 100.0
Test Accuracy of SVM Classifier: 100.0

Training Accuracy of Naive Bayes Classifier: 100.0
Testing Accuracy of Naive Bayes Classifier: 100.0

Training Accuracy of Random Forest Classifier: 100.0
Testing Accuracy of Random Forest Classifier: 100.0


# Making Predictions

In [164]:
symptoms = X.columns.values

# Map each display name (underscores replaced by spaces, every word
# capitalized, e.g. "skin_rash" -> "Skin Rash") to its feature column position
symptom_index = {
    " ".join(word.capitalize() for word in column_name.split("_")): position
    for position, column_name in enumerate(symptoms)
}

# Lookup tables used when making predictions: symptom name -> column position,
# and the encoder's class labels
data_dict = dict(
    symptom_index=symptom_index,
    predictions_classes=encoder.classes_,
)

In [165]:
# function for prediction

def predict_disease(symptoms):
    """Predict a disease from symptoms using three models and a majority vote.

    Parameters
    ----------
    symptoms : str
        Comma-separated symptom display names exactly as they appear in
        ``data_dict["symptom_index"]`` (e.g. ``"Itching,Skin Rash"``).
        Whitespace around each name and empty entries are ignored.

    Returns
    -------
    str
        A multi-line report with the Random Forest, Naive Bayes and SVM
        predictions plus the ensembled (majority-vote) prediction.

    Raises
    ------
    KeyError
        If a symptom name is not a key of ``data_dict["symptom_index"]``.
    """
    # Local import keeps the cell runnable without touching the import cell.
    from collections import Counter

    symptom_index = data_dict["symptom_index"]

    # creating input data for the models
    input_data = [0] * len(symptom_index)
    for raw_name in symptoms.split(","):
        name = raw_name.strip()
        if not name:
            continue  # tolerate trailing or doubled commas
        if name not in symptom_index:
            raise KeyError(
                f"Unknown symptom {name!r}; expected one of the keys of data_dict['symptom_index']"
            )
        input_data[symptom_index[name]] = 1

    # reshaping the input data and converting it into suitable format for model predictions
    input_data = np.array(input_data).reshape(1,-1)

    # generating individual outputs
    classes = data_dict["predictions_classes"]
    rf_prediction = classes[rf_model.predict(input_data)[0]]
    nb_prediction = classes[nb_model.predict(input_data)[0]]
    svm_prediction = classes[svm_model.predict(input_data)[0]]

    # making final prediction by majority vote. Counter is used instead of
    # scipy.stats.mode, which no longer accepts non-numeric (string) input in
    # recent SciPy releases. On a three-way tie the first entry (Random
    # Forest) wins, because equal counts keep first-seen order.
    votes = Counter([rf_prediction, nb_prediction, svm_prediction])
    final_prediction = votes.most_common(1)[0][0]

    predictions = f'Model Predictions:\n\
                    From symptoms given, Random forest Predicted: {rf_prediction}\n\
                    From symptoms given, Naive Bayes Predicted: {nb_prediction}\n\
                    From symptoms given, SVM Predicted: {svm_prediction}\n\
                    From symptoms given, Ensembled Model Predicted: {final_prediction}'

    return predictions

In [166]:
# Testing the function with three comma-separated symptom display names

print(predict_disease('Itching,Skin Rash,Nodal Skin Eruptions'))

Model Predictions:
                    From symptoms given, Random forest Predicted: Fungal infection
                    From symptoms given, Naive Bayes Predicted: Fungal infection
                    From symptoms given, SVM Predicted: Fungal infection
                    From symptoms given, Ensembled Model Predicted: Fungal infection


In [167]:
# Testing the function with a second set of three symptoms

print(predict_disease('Skin Rash,Blackheads,Scurring'))

Model Predictions:
                    From symptoms given, Random forest Predicted: Acne
                    From symptoms given, Naive Bayes Predicted: Acne
                    From symptoms given, SVM Predicted: Acne
                    From symptoms given, Ensembled Model Predicted: Acne


In [168]:
# Testing the function with a set of four symptoms

print(predict_disease('Skin Rash,Joint Pain,Skin Peeling,Silver Like Dusting'))

Model Predictions:
                    From symptoms given, Random forest Predicted: Psoriasis
                    From symptoms given, Naive Bayes Predicted: Psoriasis
                    From symptoms given, SVM Predicted: Psoriasis
                    From symptoms given, Ensembled Model Predicted: Psoriasis
