In [22]:
# Cell 1: Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import pickle
import os


In [23]:
# Cell 2: Read the disease/symptom CSV into a DataFrame and preview it
dataset_file = "Disease and symptoms dataset.csv"
df = pd.read_csv(dataset_file)

# Print (rows, columns) first; the bare last expression renders the top rows.
print(df.shape)
df.head()

(246945, 378)


Unnamed: 0,diseases,anxiety and nervousness,depression,shortness of breath,depressive or psychotic symptoms,sharp chest pain,dizziness,insomnia,abnormal involuntary movements,chest tightness,...,stuttering or stammering,problems with orgasm,nose deformity,lump over jaw,sore in nose,hip weakness,back swelling,ankle stiffness or tightness,ankle weakness,neck weakness
0,panic disorder,1,0,1,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,panic disorder,0,0,1,1,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,panic disorder,1,1,1,1,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,panic disorder,1,0,0,1,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
4,panic disorder,1,1,0,0,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0


In [24]:
# Cell 3: Split the frame into the label vector and the feature matrix
# Names of every column after the first one (the symptom/feature columns)
symptom_names = df.columns[1:].tolist()

# Column 0 holds the disease label for each row
y = df.iloc[:, 0].values

# All remaining columns are the binary symptom indicators
X = df.iloc[:, 1:].values

n_symptoms = len(symptom_names)
print(f"Total symptoms/features: {n_symptoms}")


Total symptoms/features: 377


In [25]:
# Cell 4: Convert the disease labels into integer class ids
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Sanity output: how many distinct labels were found, and the first ten ids.
n_classes = len(le.classes_)
first_ids = y_encoded[:10]
print(f"Total classes: {n_classes}")
print(f"Sample encoded labels: {first_ids}")


Total classes: 773
Sample encoded labels: [531 531 531 531 531 531 531 531 531 531]


In [26]:
# Cell 5: Hold out 20% of the rows as a test set (seeded so the split is repeatable)
split_arrays = train_test_split(X, y_encoded, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_arrays

n_train, n_test = X_train.shape[0], X_test.shape[0]
print(f"Train samples: {n_train}, Test samples: {n_test}")


Train samples: 197556, Test samples: 49389


In [27]:
# Keep a random 30% of the training split; the remaining 70% is discarded
# (bound to the throwaway name `_`) and is not used for fitting.
X_sub, _, y_sub, _ = train_test_split(
    X_train,
    y_train,
    train_size=0.3,
    random_state=42,
)

# Forest hyper-parameters gathered in one place, then fitted on the subset.
forest_params = dict(n_estimators=50, max_depth=20, random_state=42, n_jobs=-1)
model = RandomForestClassifier(**forest_params)
model.fit(X_sub, y_sub)


0,1,2
,n_estimators,50
,criterion,'gini'
,max_depth,20
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [28]:
# Evaluate the fitted forest on the held-out test split.
y_pred = model.predict(X_test)
print(f"Accuracy on test set: {accuracy_score(y_test, y_pred):.4f}")

# Per-class report over every class known to the label encoder, so the rows
# line up one-to-one with `le.classes_`.
# zero_division=0: for a class with no predicted samples (or no true samples
# in the test split) precision/recall is undefined. The default setting
# ("warn") already reports those cells as 0 but also emits an
# UndefinedMetricWarning for each affected metric, which floods the cell
# output. Passing 0 explicitly keeps the same numbers without the warnings.
print(
    classification_report(
        y_test, y_pred,
        labels=range(len(le.classes_)),
        target_names=le.classes_,
        zero_division=0,
    )
)


Accuracy on test set: 0.6084
                                                          precision    recall  f1-score   support

                               abdominal aortic aneurysm       0.00      0.00      0.00        20
                                        abdominal hernia       1.00      0.65      0.79        81
                                         abscess of nose       1.00      0.62      0.77        58
                                     abscess of the lung       0.00      0.00      0.00         6
                                  abscess of the pharynx       0.00      0.00      0.00        63
                                    acanthosis nigricans       0.00      0.00      0.00         6
                                               acariasis       0.00      0.00      0.00         5
                                               achalasia       0.00      0.00      0.00        20
                                                    acne       0.82      0.39      0.53 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [29]:
# Persist the trained model together with the objects needed to use it later:
# the ordered symptom names and the fitted label encoder.
os.makedirs("Training/new/", exist_ok=True)

# Destination path -> object to pickle; written in this order.
artifacts = {
    'Training/new/model.pkl': model,
    'Training/new/symptoms.pkl': symptom_names,
    'Training/new/label_encoder.pkl': le,
}

for artifact_path, artifact in artifacts.items():
    with open(artifact_path, 'wb') as handle:
        pickle.dump(artifact, handle)

print("Model and artifacts saved successfully!")

Model and artifacts saved successfully!
