### Data Pre-processing


About Dataset
1. Health Advisor
Context
* 1)Gender
* 2)Age
* 3)SBP (mm Hg)
* 4)DBP (mm Hg)
* 5)Heart_rate (bpm)
* 6)Glucose (mg/dL)
* 7)SpO2 (Oxygen level)
* 8)Temprature (F)
* 9)Health_status

Health_status
* 1=healthy
* 2=high BP
* 3=low BP
* 4=high sugar
* 5=low sugar
* 6=low oxygen
* 7=high temperature
* 8=heartbeat is high
* 9=risk

2. Disease


In [35]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [36]:
import pandas as pd
import numpy as np
import joblib
import json

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.impute import SimpleImputer
root_top = '/content/drive/MyDrive/D-A/TELE SEARCH/'



# Health Advisor

In [None]:
root = '/content/drive/MyDrive/D-A/TELE SEARCH/Advisor Model/'
df = pd.read_csv(root + 'Health Advisor.csv')

In [None]:
df.columns

Index(['sr no.', 'Gender', 'Age', 'SBP', 'HBP', 'heart_rate', 'Glucose',
       'SpO2', 'Temprature', 'Health_status', 'Unnamed: 10', 'Unnamed: 11'],
      dtype='object')

In [None]:
df = df[['SBP', 'HBP', 'heart_rate', 'Glucose', 'SpO2', 'Temprature', 'Health_status']]

Encode the gender column

In [None]:
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
df[['SBP', 'HBP', 'heart_rate', 'Glucose', 'SpO2', 'Temprature']] = imputer.fit_transform(df[['SBP', 'HBP', 'heart_rate', 'Glucose', 'SpO2', 'Temprature']])
df.head(5)

Unnamed: 0,SBP,HBP,heart_rate,Glucose,SpO2,Temprature,Health_status
0,145.0,84.0,116.0,128.0,98.0,97.8,1
1,150.0,90.0,110.0,120.0,99.0,97.8,2
2,146.0,85.0,124.0,140.0,97.0,98.0,1
3,180.0,109.0,123.0,145.0,98.0,99.0,8
4,145.0,87.0,123.0,205.0,97.0,96.7,4


Split Data into train_set and test_set

In [None]:
data = df
data.to_csv(root + 'cleaned.csv', index=False)

X = data.drop('Health_status', axis=1)
y = data['Health_status']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Unnamed: 0,SBP,HBP,heart_rate,Glucose,SpO2,Temprature,Health_status
0,145.0,84.0,116.0,128.0,98.0,97.8,1
1,150.0,90.0,110.0,120.0,99.0,97.8,2
2,146.0,85.0,124.0,140.0,97.0,98.0,1
3,180.0,109.0,123.0,145.0,98.0,99.0,8
4,145.0,87.0,123.0,205.0,97.0,96.7,4


Build a Random Forest model

In [None]:
random_forest_model = RandomForestClassifier(random_state=42)

random_forest_model.fit(X_train.values, y_train.values)

y_pred = random_forest_model.predict(X_test.values)

accuracy = accuracy_score(y_test, y_pred)

print('Accuracy:', accuracy)


Accuracy: 0.9365079365079365


Build a decision tree model

In [None]:

decision_tree_model = DecisionTreeRegressor(random_state=42)

decision_tree_model.fit(X_train.values, y_train.values)

y_pred = decision_tree_model.predict(X_test.values)

accuracy = accuracy_score(y_test, y_pred)

print('Accuracy:', accuracy)

Accuracy: 0.9206349206349206


Build a Naive Bayes model

In [None]:
nb_model = GaussianNB()

nb_model.fit(X_train.values, y_train.values)


y_pred = nb_model.predict(X_test.values)

accuracy = accuracy_score(y_test, y_pred)

print('Accuracy:', accuracy)

Accuracy: 0.8888888888888888


Build a SVM classifier model

In [None]:
svm = SVC()

svm.fit(X_train.values, y_train.values)

y_pred = svm.predict(X_test.values)

accuracy = accuracy_score(y_test, y_pred)

print('Accuracy:', accuracy)

Accuracy: 0.7936507936507936


Export trained models

In [None]:
joblib.dump(random_forest_model, root + 'random_forest_model.pkl')
joblib.dump(decision_tree_model, root + 'decision_tree_model.pkl')
joblib.dump(nb_model, root + 'nb_model.pkl')
joblib.dump(svm, root + 'svm.pkl')

['/content/drive/MyDrive/D-A/TELE SEARCH/svm.pkl']

Prediction

In [None]:

new_data = [[145.0,	84.0,	116.0,	128.0,	98.0,	100	]]

prediction_random_forest_model = random_forest_model.predict(new_data)
prediction_decision_tree_model = decision_tree_model.predict(new_data)
prediction_nb_model = nb_model.predict(new_data)
prediction_svm = svm.predict(new_data)

print('Predicted health status by RandomForestCSFModel:', prediction_random_forest_model[0])
print('Predicted health status by Decision Tree:', prediction_decision_tree_model[0])
print('Predicted health status by Naive Bayes:', prediction_nb_model[0])
print('Predicted health status by SVM:', prediction_svm[0])


Predicted health status by RandomForestCSFModel: 7
Predicted health status by Decision Tree: 7.0
Predicted health status by Naive Bayes: 7
Predicted health status by SVM: 1


# Disease Prediction

function support to format string

In [37]:
def format_column(key):
    if pd.notna(key):
        words = str(key).strip()
        return words.replace(" ", "")
    else:
        return key


def format_name(key):
    words = key.strip()
    words = words.replace(" ", "")
    words = words.split('_')
    return ' '.join(word.capitalize() for word in words)

dataset

In [38]:
root = '/content/drive/MyDrive/D-A/TELE SEARCH/Disease Model/'
df = pd.read_csv(root + "dataset_train.csv")
df_sevrity = pd.read_csv(root + "Symptom-severity.csv")
df.head()

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,Drug Reaction,itching,stomach_pain,burning_micturition,spotting_ urination,,,,,,,,,,,,,
1,GERD,stomach_pain,acidity,ulcers_on_tongue,vomiting,cough,chest_pain,,,,,,,,,,,
2,Jaundice,itching,vomiting,fatigue,weight_loss,high_fever,yellowish_skin,dark_urine,abdominal_pain,,,,,,,,,
3,Chronic cholestasis,itching,vomiting,yellowish_skin,nausea,loss_of_appetite,abdominal_pain,yellowing_of_eyes,,,,,,,,,,
4,Tuberculosis,chills,vomiting,fatigue,weight_loss,cough,high_fever,breathlessness,loss_of_appetite,mild_fever,yellowing_of_eyes,swelled_lymph_nodes,malaise,phlegm,chest_pain,blood_in_sputum,,


Add column "Symptoms" contain all symptoms symptoms as array



In [39]:
df["Symptoms"] = 0
records = df.shape[0]
def format_column(key):
    if pd.notna(key):
        words = str(key).strip()
        return words.replace(" ", "")
    else:
        return key

for i in range(records):
    values = df.iloc[i].tolist()
    values = [format_column(value) for value in values]

    if 0 in values:
        df["Symptoms"][i] = values[1:values.index(0)]
    else:
        df["Symptoms"][i] = values[1:]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Symptoms"][i] = values[1:]


Get all symptoms exist in dataset

In [40]:
column_values = df[['Symptom_1', 'Symptom_2', 'Symptom_3', 'Symptom_4',
       'Symptom_5', 'Symptom_6', 'Symptom_7', 'Symptom_8', 'Symptom_9',
       'Symptom_10', 'Symptom_11', 'Symptom_12', 'Symptom_13', 'Symptom_14',
       'Symptom_15', 'Symptom_16', 'Symptom_17']].values.ravel()


symps = pd.unique(column_values)
symps = symps.tolist()
symps = [i for i in symps if str(i) != "nan"]
symps = [format_column(i) for i in symps]


New suitable dataset format

In [48]:
symptoms = pd.DataFrame(columns = symps,index = df.index)
symptoms["Symptoms"] = df["Symptoms"]
for i in symps:
    symptoms[i] = symptoms.apply(lambda x:1 if i in x.Symptoms else 0, axis=1)
symptoms["Disease"] = df["Disease"]
symptoms = symptoms.drop("Symptoms",axis=1);                                                                                                                                                                                                                                                                                                      #symptoms = pd.read_csv(root + 'cleaned.csv')

symptoms.head()

Unnamed: 0,itching,stomach_pain,burning_micturition,spotting_urination,acidity,ulcers_on_tongue,vomiting,cough,chest_pain,fatigue,...,rusty_sputum,passage_of_gases,internal_itching,shivering,blister,watering_from_eyes,patches_in_throat,extra_marital_contacts,muscle_wasting,Disease
0,1,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Drug Reaction
1,0,1,0,0,1,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,GERD
2,1,0,0,0,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,Jaundice
3,1,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,Chronic cholestasis
4,0,0,0,0,0,0,1,1,1,1,...,0,0,0,0,0,0,0,0,0,Tuberculosis
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,(vertigo) Paroymsal Positional Vertigo
19996,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,Alcoholic hepatitis
19997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,Impetigo
19998,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Chicken pox


Split data to test and train

In [42]:
df_data = symptoms.drop('Disease' , axis =1)
label = symptoms["Disease"]
X_train, X_test, y_train, y_test = train_test_split(df_data, label, shuffle=True, train_size = 0.80)

Random Forest Classifier Model

In [43]:
rfc_model = RandomForestClassifier()

rfc_model.fit(X_train.values, y_train.values)

y_pred = rfc_model.predict(X_test.values)

accuracy = accuracy_score(y_test, y_pred)

print('Accuracy:', accuracy)

Accuracy: 0.9891435464414958


Build a SVM classifier model

In [46]:
svm = SVC()

svm.fit(X_train.values, y_train.values)

y_pred = svm.predict(X_test.values)

accuracy = accuracy_score(y_test, y_pred)

print('Accuracy:', accuracy)

Accuracy: 0.9886265724625194


Build a Naive Bayes model

In [47]:
nb_model = GaussianNB()

nb_model.fit(X_train.values, y_train.values)


y_pred = nb_model.predict(X_test.values)

accuracy = accuracy_score(y_test, y_pred)

print('Accuracy:', accuracy)

Accuracy: 0.965190418748923


Export model

In [45]:
joblib.dump(rfc_model, root + 'random_forest_model.pkl')
joblib.dump(nb_model, root + 'nb_model.pkl')
joblib.dump(svm, root + 'svm.pkl')

['/content/drive/MyDrive/D-A/TELE SEARCH/Disease Model/svm.pkl']

In [None]:
import random
rf_ds_model = joblib.load(root_top + "Disease Model" + '/random_forest_model.pkl')
nb_ds_model = joblib.load(root_top + "Disease Model" + '/nb_model.pkl')
svm_ds = joblib.load(root_top + "Disease Model" + '/svm.pkl')

a = [random.choices([1, 0], weights=[1 - 0.5, 0.5], k=131)]

rf_result1 = rf_ds_model.predict(a)[0]
rf_result2 = svm_ds.predict(a)[0]
rf_result3 = nb_ds_model.predict(a)[0]

print(rf_result1,"-",rf_result2,"-",rf_result3)

Hypoglycemia - Common Cold - Dimorphic hemmorhoids(piles)
