In [None]:
# Import necessary libraries
import os
import pickle

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

In [None]:
# Load the disease/symptom dataset from the working directory
df = pd.read_csv("DiseaseAndSymptoms.csv")
# Call head() so the cell renders the first rows; the bare attribute `df.head`
# only displays the bound-method repr instead of a preview table.
df.head()

In [None]:
# Gather the symptom columns (Symptom_1 ... Symptom_17) into one list per row
symptom_cols = ['Symptom_' + str(idx) for idx in range(1, 18)]


def _clean_symptoms(raw_values):
    """Return the non-missing entries of one row as whitespace-stripped strings."""
    return [str(value).strip() for value in raw_values if pd.notna(value)]


# Store the raw per-row values first (NaN included), then drop the NaNs
# and strip surrounding whitespace from what remains
df['all_symptoms'] = df[symptom_cols].values.tolist()
df['all_symptoms'] = df['all_symptoms'].map(_clean_symptoms)
df.head()


In [None]:
# Flatten the per-row symptom lists, then keep each distinct symptom once,
# sorted so that the vocabulary order is deterministic
all_symptoms_flat = []
for row_symptoms in df['all_symptoms']:
    all_symptoms_flat.extend(row_symptoms)

unique_symptoms = sorted(set(all_symptoms_flat))
print(f"Total unique symptoms: {len(unique_symptoms)}")


In [None]:
#  Create binary feature vector for symptoms presence
def symptoms_to_vector(symptom_list, vocabulary=None):
    """Encode a collection of symptoms as a 0/1 presence vector.

    Parameters
    ----------
    symptom_list : iterable of str
        Symptoms observed for one sample.
    vocabulary : sequence of str, optional
        Ordered symptom names that define the vector positions. When omitted,
        the notebook-level ``unique_symptoms`` list is used, which keeps
        existing single-argument calls working unchanged.

    Returns
    -------
    list of int
        One entry per vocabulary item: 1 if that symptom occurs in
        ``symptom_list``, otherwise 0. Symptoms that are not part of the
        vocabulary are ignored.
    """
    if vocabulary is None:
        vocabulary = unique_symptoms
    # Build the set once so each membership check is O(1) instead of a list scan
    present = set(symptom_list)
    return [1 if symptom in present else 0 for symptom in vocabulary]

# Encode every row's symptom list as a binary vector, then preview the frame
df['symptom_vector'] = df['all_symptoms'].map(symptoms_to_vector)
df.head()


In [None]:
# Feature matrix X: stack the per-row symptom vectors into one array
vector_rows = df['symptom_vector'].tolist()
X = np.array(vector_rows)

# Target vector y: fit the encoder on the Disease column, then map each
# disease name to its integer class id
le = LabelEncoder()
le.fit(df['Disease'])
y = le.transform(df['Disease'])


In [None]:
# Hold out 20% of the samples for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
# Build a 100-tree Random Forest with a fixed seed and fit it on the training split.
# fit() returns the estimator itself, so construction and training can be chained.
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
model


In [None]:
# Evaluate the model on the held-out test split
y_pred = model.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")

# Pass every encoded class id explicitly. Without `labels`, classification_report
# infers the classes from y_test/y_pred and raises ValueError when a class is
# absent from the test split, because the inferred count no longer matches
# the length of `target_names`.
all_class_ids = np.arange(len(le.classes_))
print(
    classification_report(
        y_test,
        y_pred,
        labels=all_class_ids,
        target_names=le.classes_,
        zero_division=0,  # report 0 for undefined metrics instead of warning
    )
)


In [None]:
# Create the Training directory if it doesn't exist.
# `os` is imported in the imports cell at the top of the notebook so that all
# dependencies are declared in one place.
os.makedirs("Training", exist_ok=True)


In [None]:
# Persist the trained model, the symptom vocabulary and the label encoder.
# NOTE(review): these files are written to the current working directory, not
# into the "Training" directory created earlier - confirm which location the
# code that loads them expects.
artifacts = {
    "model.pkl": model,
    "symptoms.pkl": unique_symptoms,
    "label_encoder.pkl": le,
}
for filename, artifact in artifacts.items():
    with open(filename, "wb") as f:
        pickle.dump(artifact, f)
