In [35]:
# Import
import pandas as pd
import numpy as np

from sklearn.utils import shuffle
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
import joblib
from joblib import load, dump
import tensorflow as tf

In [36]:
# main df 
main_df = pd.read_csv("./dataset/raw/dataset.csv")

In [37]:
# weight df
weights_df = pd.read_csv('./dataset/raw//Symptom-severity.csv')

In [38]:
main_df.head()

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,Fungal infection,itching,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,
1,Fungal infection,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
2,Fungal infection,itching,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
3,Fungal infection,itching,skin_rash,dischromic _patches,,,,,,,,,,,,,,
4,Fungal infection,itching,skin_rash,nodal_skin_eruptions,,,,,,,,,,,,,,


In [39]:
#shuffling cause to avoid overfitting
main_df = shuffle(main_df, random_state= 42)

In [40]:
# removing _ from text/str
for column in main_df.columns:
    main_df[column] = main_df[column].str.replace('_', ' ')

In [41]:
main_df.head()

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
373,Acne,skin rash,blackheads,scurring,,,,,,,,,,,,,,
4916,Acne,skin rash,pus filled pimples,blackheads,scurring,,,,,,,,,,,,,
1550,Hyperthyroidism,fatigue,mood swings,weight loss,restlessness,sweating,diarrhoea,fast heart rate,excessive hunger,muscle weakness,irritability,abnormal menstruation,,,,,,
3081,AIDS,muscle wasting,patches in throat,high fever,extra marital contacts,,,,,,,,,,,,,
3857,Chronic cholestasis,itching,vomiting,yellowish skin,nausea,loss of appetite,abdominal pain,yellowing of eyes,,,,,,,,,,


In [42]:
# extracting columns and data separate for stripping values
main_df_columns = main_df.columns
data = main_df[main_df_columns].values.flatten()

In [43]:
main_df_columns

Index(['Disease', 'Symptom_1', 'Symptom_2', 'Symptom_3', 'Symptom_4',
       'Symptom_5', 'Symptom_6', 'Symptom_7', 'Symptom_8', 'Symptom_9',
       'Symptom_10', 'Symptom_11', 'Symptom_12', 'Symptom_13', 'Symptom_14',
       'Symptom_15', 'Symptom_16', 'Symptom_17'],
      dtype='object')

In [44]:
data

array(['Acne', ' skin rash', ' blackheads', ..., nan, nan, nan],
      dtype=object)

In [45]:
# converting df values data into series for stripping white spaces
series_main_df_data = pd.Series(data)
series_main_df_data

0               Acne
1          skin rash
2         blackheads
3           scurring
4                NaN
            ...     
88555            NaN
88556            NaN
88557            NaN
88558            NaN
88559            NaN
Length: 88560, dtype: object

In [46]:
# striping white spaces 
series_main_df_data = series_main_df_data.str.strip()

In [47]:
# reshaping series into df shape to recreate df
series_main_df_data = series_main_df_data.values.reshape(main_df.shape)
series_main_df_data

array([['Acne', 'skin rash', 'blackheads', ..., nan, nan, nan],
       ['Acne', 'skin rash', 'pus filled pimples', ..., nan, nan, nan],
       ['Hyperthyroidism', 'fatigue', 'mood swings', ..., nan, nan, nan],
       ...,
       ['Dengue', 'skin rash', 'chills', ..., nan, nan, nan],
       ['Fungal infection', 'itching', 'skin rash', ..., nan, nan, nan],
       ['Drug Reaction', 'itching', 'skin rash', ..., nan, nan, nan]],
      dtype=object)

In [48]:
# creating dataframe from series and columns
stripped_main_df = pd.DataFrame(series_main_df_data, columns= main_df_columns)

In [49]:
stripped_main_df.head()

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,Acne,skin rash,blackheads,scurring,,,,,,,,,,,,,,
1,Acne,skin rash,pus filled pimples,blackheads,scurring,,,,,,,,,,,,,
2,Hyperthyroidism,fatigue,mood swings,weight loss,restlessness,sweating,diarrhoea,fast heart rate,excessive hunger,muscle weakness,irritability,abnormal menstruation,,,,,,
3,AIDS,muscle wasting,patches in throat,high fever,extra marital contacts,,,,,,,,,,,,,
4,Chronic cholestasis,itching,vomiting,yellowish skin,nausea,loss of appetite,abdominal pain,yellowing of eyes,,,,,,,,,,


In [50]:
#filling out Nan values by 0
filledna_main_df = stripped_main_df.fillna(0)

In [51]:
filledna_main_df.head()

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,Acne,skin rash,blackheads,scurring,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,Acne,skin rash,pus filled pimples,blackheads,scurring,0,0,0,0,0,0,0,0,0,0,0,0,0
2,Hyperthyroidism,fatigue,mood swings,weight loss,restlessness,sweating,diarrhoea,fast heart rate,excessive hunger,muscle weakness,irritability,abnormal menstruation,0,0,0,0,0,0
3,AIDS,muscle wasting,patches in throat,high fever,extra marital contacts,0,0,0,0,0,0,0,0,0,0,0,0,0
4,Chronic cholestasis,itching,vomiting,yellowish skin,nausea,loss of appetite,abdominal pain,yellowing of eyes,0,0,0,0,0,0,0,0,0,0


In [52]:
# replacing _ from ' ' in weight df
weights_df['Symptom'] = weights_df['Symptom'].str.replace("_"," ")

In [53]:
weights_df.head()

Unnamed: 0,Symptom,weight
0,itching,1
1,skin rash,3
2,nodal skin eruptions,4
3,continuous sneezing,4
4,shivering,5


In [54]:
#replacing values in place of symptoms in main_df with matching weight in weights_df
def replace_values_from_weight_to_main_df(value):
    match = weights_df.loc[weights_df['Symptom'] == value, 'weight']
    
    if not match.empty:
        return match.values[0]
    else:
        return value

In [55]:

final_transoformed_main_df = filledna_main_df.applymap(replace_values_from_weight_to_main_df)


  final_transoformed_main_df = filledna_main_df.applymap(replace_values_from_weight_to_main_df)


In [56]:
final_transoformed_main_df.head(20)

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,Acne,3,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,Acne,3,2,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0
2,Hyperthyroidism,4,3,3,5,3,6,5,4,2,2,6,0,0,0,0,0,0
3,AIDS,3,6,7,5,0,0,0,0,0,0,0,0,0,0,0,0,0
4,Chronic cholestasis,1,5,3,5,4,4,4,0,0,0,0,0,0,0,0,0,0
5,Hypertension,3,7,4,4,3,0,0,0,0,0,0,0,0,0,0,0,0
6,Hypoglycemia,5,4,4,3,3,5,5,4,4,4,2,4,0,0,0,0,0
7,Arthritis,2,4,5,2,0,0,0,0,0,0,0,0,0,0,0,0,0
8,Hepatitis B,1,4,2,3,4,4,4,4,4,6,5,2,0,0,0,0,0
9,Migraine,3,5,3,5,4,4,3,2,3,0,0,0,0,0,0,0,0


In [57]:
# writing processed dfs to use in fastAPI
weights_df.to_csv('./dataset/processed/weights_df.csv')
final_transoformed_main_df.to_csv('./dataset/processed/final_transoformed_main_df.csv')

In [58]:
# Independent varibles x
x = final_transoformed_main_df.iloc[:,1:].values
x

array([[3, 2, 2, ..., 0, 0, 0],
       [3, 2, 2, ..., 0, 0, 0],
       [4, 3, 3, ..., 0, 0, 0],
       ...,
       [3, 3, 3, ..., 0, 0, 0],
       [1, 3, 4, ..., 0, 0, 0],
       [1, 3, 5, ..., 0, 0, 0]], dtype=object)

In [59]:
# Dependent variable Y
y = final_transoformed_main_df['Disease'].values
y

array(['Acne', 'Acne', 'Hyperthyroidism', ..., 'Dengue',
       'Fungal infection', 'Drug Reaction'], dtype=object)

In [60]:
# splitting data
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2, random_state= 42)

In [61]:
# just checking shapes 
print(f"{x_train.shape}      {x_test.shape}")

(3936, 17)      (984, 17)


In [62]:

print(f"{y_train.shape}            {y_test.shape}")

(3936,)            (984,)


In [63]:
# model creation
model = Sequential()
# input layer
model.add(Dense(units=128, activation='relu', input_dim = x_train.shape[1] ))
model.add(Dropout(0.3))

# first hidden layer
model.add(Dense(units=64, activation='relu'))
model.add(Dropout(0.3))

# 2nd hidden layer
model.add(Dense(units= 32, activation='relu'))

# op layer
model.add(Dense(units= len(final_transoformed_main_df['Disease']), activation='softmax'))


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [64]:
model.summary()

In [65]:
# compiling model with Adam as an optimizer
optimizer = tf.keras.optimizers.Adam(learning_rate = 0.001)
model.compile(optimizer = optimizer, loss = 'sparse_categorical_crossentropy', metrics=['accuracy'])


In [66]:
# label encoding 
label_encoder = LabelEncoder()

# it is optional cause we replaced every categorical data into  numeric one 
for col in range(x_train.shape[1]):
    if isinstance(x_train[0,col], str):
        x_train[:, col] = label_encoder.fit_transform(x_train[:, col])
        x_test[:, col] = label_encoder.transform(x_test[:,col])

In [67]:
# encoding to dependent variable cause we did'nt make it numeric before
final_transoformed_main_df['Disease'] = final_transoformed_main_df['Disease'].astype(str)

label_encoder.fit(final_transoformed_main_df['Disease'].unique())
y_train = label_encoder.transform(y_train)
y_test = label_encoder.transform(y_test)

In [68]:
# replace '' with 0
x_train = np.where(x_train == '', 0, x_train)
x_test = np.where(x_test == '', 0, x_test)

# data type changing 
x_train = x_train.astype(np.float32)
x_test = x_test.astype(np.float32)

y_train =y_train.astype(np.int32)
y_test =y_test.astype(np.int32)

In [69]:
# model traingng
model.fit(x_train, y_train, epochs= 100, batch_size=32, validation_split=0.1, verbose=1)

Epoch 1/100
[1m111/111[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 7ms/step - accuracy: 0.0305 - loss: 5.9915 - val_accuracy: 0.1320 - val_loss: 2.9132
Epoch 2/100
[1m111/111[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.1314 - loss: 2.8268 - val_accuracy: 0.3299 - val_loss: 2.1553
Epoch 3/100
[1m111/111[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.2544 - loss: 2.1979 - val_accuracy: 0.4873 - val_loss: 1.6893
Epoch 4/100
[1m111/111[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.3946 - loss: 1.7966 - val_accuracy: 0.6523 - val_loss: 1.2603
Epoch 5/100
[1m111/111[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.5063 - loss: 1.4884 - val_accuracy: 0.6599 - val_loss: 1.1030
Epoch 6/100
[1m111/111[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.5618 - loss: 1.2991 - val_accuracy: 0.7437 - val_loss: 0.9634
Epoch 7/100
[1m111/11

<keras.src.callbacks.history.History at 0x2487bbb80e0>

In [70]:
# finding test accuracy
test_loss, test_accuracy = model.evaluate(x_test, y_test)

[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.9844 - loss: 0.0577


In [71]:
test_accuracy

0.9867886304855347

In [72]:
#saving label encoder cause we need it in FASTAPI after import model
joblib.dump(label_encoder, 'label_encoder_for_disease_prediction.pkl')

['label_encoder_for_disease_prediction.pkl']

In [73]:
# saving model in .keras (.h5 is being outdated)
model.save('diease_prediction_model.keras')

In [74]:
# predicting disease function 

def predict_disease(symptom_list, symptom_severity_df, model, label_encoder ):
    symptom_weights = []
    for symptom in symptom_list:
        symptom = symptom.strip().replace("_"," ")
        weight = symptom_severity_df[symptom_severity_df['Symptom'] == symptom]['weight'].values
        if weight.size > 0 :
            symptom_weights.append(weight[0])
        else:
            symptom_weights.append(0)
    
    input_vector = np.zeros(17)        
    for i, weight in enumerate(symptom_weights):
        if i < len(input_vector):
            input_vector[i] = weight
    input_vector = input_vector.reshape(1,-1)
    
    pred_probabilities = model.predict(input_vector)
    predicted_index = np.argmax(pred_probabilities, axis=1)
    predicted_disease = label_encoder.inverse_transform(predicted_index)
    return predicted_disease[0]

In [75]:
symptom_input = ["itching", "vomiting", "yellowish_skin", "nausea", "loss_of_appetite", "abdominal_pain"]
predicted_disease = predict_disease(symptom_input, weights_df, model, label_encoder)
print(f"Predicted Disease: {predicted_disease}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 93ms/step
Predicted Disease: Chronic cholestasis


In [76]:
#giving precaution list based on disease
precation_df = pd.read_csv('./dataset/raw/symptom_precaution.csv')

In [77]:
precation_df.head()

Unnamed: 0,Disease,Precaution_1,Precaution_2,Precaution_3,Precaution_4
0,Drug Reaction,stop irritation,consult nearest hospital,stop taking drug,follow up
1,Malaria,Consult nearest hospital,avoid oily food,avoid non veg food,keep mosquitos out
2,Allergy,apply calamine,cover area with bandage,,use ice to compress itching
3,Hypothyroidism,reduce stress,exercise,eat healthy,get proper sleep
4,Psoriasis,wash hands with warm soapy water,stop bleeding using pressure,consult doctor,salt baths


In [78]:
from pandasql import sqldf
query = f""" SELECT precation_df.precaution_1, precation_df.precaution_2, precation_df.precaution_3, precation_df.precaution_4
FROM precation_df
WHERE disease = '{predicted_disease}' """

In [79]:
result_df = sqldf(query, locals())

In [80]:
type(result_df)

pandas.core.frame.DataFrame

In [81]:
result_df

Unnamed: 0,Precaution_1,Precaution_2,Precaution_3,Precaution_4
0,cold baths,anti itch medicine,consult doctor,eat healthy


In [82]:
result_list = result_df.loc[0,:].tolist()

In [83]:
final_result_list = [precaution for precaution in result_list if precaution is not None]

In [84]:
final_result_list

['cold baths', 'anti itch medicine', 'consult doctor', 'eat healthy']