In [86]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import joblib

In [87]:
# Read the symptom/disease table; the path is relative to the working directory.
csv_path = 'dataset.csv'
data = pd.read_csv(csv_path)

# Show the first ten rows as a quick sanity check of the load.
data.head(n=10)

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,Fungal infection,itching,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,
1,Fungal infection,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
2,Fungal infection,itching,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
3,Fungal infection,itching,skin_rash,dischromic _patches,,,,,,,,,,,,,,
4,Fungal infection,itching,skin_rash,nodal_skin_eruptions,,,,,,,,,,,,,,
5,Fungal infection,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
6,Fungal infection,itching,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
7,Fungal infection,itching,skin_rash,dischromic _patches,,,,,,,,,,,,,,
8,Fungal infection,itching,skin_rash,nodal_skin_eruptions,,,,,,,,,,,,,,
9,Fungal infection,itching,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,


In [88]:
# Restrict the feature set to the first three symptom slots. To train on every
# symptom column instead, build the list from the frame:
#     symptom_columns = [col for col in data.columns if 'Symptom' in col]
symptom_columns = ['Symptom_1', 'Symptom_2', 'Symptom_3']

# Take an explicit copy of the column subset. Assigning into a frame that was
# produced by slicing another frame triggers pandas' SettingWithCopyWarning and
# makes it ambiguous which object is being modified; `.copy()` makes `data` an
# independent frame that is safe to write to.
data = data[symptom_columns + ['Disease']].copy()

# Missing symptom slots become an explicit category so they can be one-hot
# encoded like any other value.
# NOTE(review): the one-hot column names produced later (e.g. "Symptom_1_ acidity")
# suggest some raw values carry leading whitespace - consider normalising with
# `.str.strip()` here; confirm nothing downstream depends on the padded names.
data[symptom_columns] = data[symptom_columns].fillna('no_symptom')
data.head(10)

Unnamed: 0,Symptom_1,Symptom_2,Symptom_3,Disease
0,itching,skin_rash,nodal_skin_eruptions,Fungal infection
1,skin_rash,nodal_skin_eruptions,dischromic _patches,Fungal infection
2,itching,nodal_skin_eruptions,dischromic _patches,Fungal infection
3,itching,skin_rash,dischromic _patches,Fungal infection
4,itching,skin_rash,nodal_skin_eruptions,Fungal infection
5,skin_rash,nodal_skin_eruptions,dischromic _patches,Fungal infection
6,itching,nodal_skin_eruptions,dischromic _patches,Fungal infection
7,itching,skin_rash,dischromic _patches,Fungal infection
8,itching,skin_rash,nodal_skin_eruptions,Fungal infection
9,itching,skin_rash,nodal_skin_eruptions,Fungal infection


In [97]:
# Encode the disease names as integer class ids for the classifier.
#
# The column is overwritten in place, so running this cell a second time used to
# re-fit a fresh encoder on the already-encoded integers, leaving `le.classes_`
# holding integers instead of disease names (so `le.inverse_transform` could no
# longer recover a name). Only fit while the column still holds the raw labels;
# on a re-run the previously fitted `le` and the encoded column are kept as-is.
if not pd.api.types.is_numeric_dtype(data['Disease']):
    le = LabelEncoder()
    data['Disease'] = le.fit_transform(data['Disease'])
data

Unnamed: 0,Symptom_1,Symptom_2,Symptom_3,Disease
0,itching,skin_rash,nodal_skin_eruptions,15
1,skin_rash,nodal_skin_eruptions,dischromic _patches,15
2,itching,nodal_skin_eruptions,dischromic _patches,15
3,itching,skin_rash,dischromic _patches,15
4,itching,skin_rash,nodal_skin_eruptions,15
...,...,...,...,...
4915,vomiting,headache,nausea,0
4916,skin_rash,pus_filled_pimples,blackheads,2
4917,burning_micturition,bladder_discomfort,foul_smell_of urine,38
4918,skin_rash,joint_pain,skin_peeling,35


In [90]:
# Feature matrix: the selected symptom columns; target vector: the Disease column.
X, y = data[symptom_columns], data['Disease']

In [91]:
# One-hot encode each symptom column; every (column, value) pair becomes its
# own indicator column named "<column>_<value>".
X_encoded = pd.get_dummies(data=X, columns=symptom_columns)
X_encoded

Unnamed: 0,Symptom_1_ acidity,Symptom_1_ back_pain,Symptom_1_ bladder_discomfort,Symptom_1_ breathlessness,Symptom_1_ burning_micturition,Symptom_1_ chest_pain,Symptom_1_ chills,Symptom_1_ constipation,Symptom_1_ continuous_sneezing,Symptom_1_ cough,...,Symptom_3_ stomach_pain,Symptom_3_ sweating,Symptom_3_ swelling_joints,Symptom_3_ swelling_of_stomach,Symptom_3_ ulcers_on_tongue,Symptom_3_ vomiting,Symptom_3_ watering_from_eyes,Symptom_3_ weakness_of_one_body_side,Symptom_3_ weight_loss,Symptom_3_ yellowish_skin
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4915,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4916,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4917,False,False,False,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4918,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [92]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
split_options = {'test_size': 0.2, 'random_state': 42}
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, **split_options)

In [93]:
# Random forest of 100 trees, seeded so repeated runs build the same ensemble.
forest_params = {'n_estimators': 100, 'random_state': 42}
model = RandomForestClassifier(**forest_params)

# Train on the training split; `fit` is left as the last expression so the
# fitted estimator is still displayed as the cell output.
model.fit(X_train, y_train)


In [94]:
# Predict encoded disease ids for the held-out rows.
y_pred = model.predict(X=X_test)

In [95]:
# Fraction of held-out rows whose predicted label matches the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Model Accuracy: {accuracy * 100:.2f}%")

Model Accuracy: 95.22%


In [96]:
# Persistence is currently disabled. Uncommenting the two lines below writes the
# trained classifier (`model`) and the disease label encoder (`le`) to the
# working directory with joblib, under the file names shown.
# joblib.dump(model, 'disease_prediction_model.pkl')
# joblib.dump(le, 'label_encoder.pkl')