üß† Importing Required Libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

üì• Loading and Previewing the Dataset

In [3]:
df=pd.read_csv('/content/Final_Augmented_dataset_Diseases_and_Symptoms.csv')
df

Unnamed: 0,diseases,anxiety and nervousness,depression,shortness of breath,depressive or psychotic symptoms,sharp chest pain,dizziness,insomnia,abnormal involuntary movements,chest tightness,...,stuttering or stammering,problems with orgasm,nose deformity,lump over jaw,sore in nose,hip weakness,back swelling,ankle stiffness or tightness,ankle weakness,neck weakness
0,panic disorder,1,0,1,1,0,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,panic disorder,0,0,1,1,0,1,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,panic disorder,1,1,1,1,0,1,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,panic disorder,1,0,0,1,0,1,1,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,panic disorder,1,1,0,0,0,0,1,1,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2699,vaginitis,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2700,vaginitis,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2701,vaginitis,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2702,vaginitis,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


üßπ Data Cleaning

In [4]:
df.columns = df.columns.str.strip()

In [6]:
df.dropna(inplace=True)

In [5]:
duplicate_rows = df[df.duplicated()]
print(f"Number of duplicated rows: {duplicate_rows.shape[0]}")

Number of duplicated rows: 889


In [9]:
df.drop_duplicates(inplace=True)

In [11]:
duplicate_rows = df[df.duplicated()]
print(f"Number of duplicated rows: {duplicate_rows.shape[0]}")

Number of duplicated rows: 0


üß™ Train-Test Split

In [18]:
x = df.drop('diseases', axis=1)
y = df['diseases']

In [19]:
x = pd.DataFrame(x, columns=df.drop('diseases', axis=1).columns)
y = pd.Series(y, name='diseases')

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

 üîé Ensure all feature values are binary (0 or 1) only.

In [22]:
if ((x == 0) | (x == 1)).all().all():
    print("Not Found")
else:
    print("Found")


Not Found


üå≤ Training the Random Forest Classifier

In [23]:
model = RandomForestClassifier(n_estimators=100, random_state=42)

In [24]:
model.fit(x_train, y_train)

In [25]:
y_pred = model.predict(x_test)

üß† Model Evaluation

In [26]:
accuracy = accuracy_score(y_test, y_pred)
print(f"‚úÖ Accuracy: {accuracy * 100:.2f}%")

‚úÖ Accuracy: 98.90%


üß™ Sample Predictions: Comparing Model Output with Actual Diseases

In [27]:
sample_indices = np.random.choice(len(x_test), size=5, replace=False)

column_names = df.drop('diseases', axis=1).columns

results_df = pd.DataFrame(x_test.iloc[sample_indices], columns=column_names)
results_df['Actual'] = y_test.iloc[sample_indices].values
results_df['Predicted'] = model.predict(x_test.iloc[sample_indices])

print(results_df)

      anxiety and nervousness  depression  shortness of breath  \
1434                        0           0                    0   
67                          0           1                    1   
520                         1           0                    1   
69                          1           0                    0   
2648                        0           0                    0   

      depressive or psychotic symptoms  sharp chest pain  dizziness  insomnia  \
1434                                 0                 0          0         0   
67                                   1                 0          0         0   
520                                  0                 0          0         1   
69                                   0                 0          0         1   
2648                                 0                 0          0         0   

      abnormal involuntary movements  chest tightness  palpitations  ...  \
1434                               0    

 üîç Detecting Overfitting by Comparing Train and Test Accuracy

In [28]:
y_train_pred = model.predict(x_train)
y_test_pred = model.predict(x_test)

train_acc = accuracy_score(y_train, y_train_pred)
test_acc = accuracy_score(y_test, y_test_pred)

print(f"‚úÖ Train Accuracy: {train_acc * 100:.2f}%")
print(f"‚úÖ Test Accuracy : {test_acc * 100:.2f}%")

gap = train_acc - test_acc
if gap > 0.1:
    print("‚ö†Ô∏è  Possible overfitting detected.")
else:
    print("‚úÖ No major overfitting detected.")

‚úÖ Train Accuracy: 99.86%
‚úÖ Test Accuracy : 98.90%
‚úÖ No major overfitting detected.


üß† Disease-wise Evaluation and Doctor Alerts Based on Confidence

In [29]:
report = classification_report(y_test, y_pred, output_dict=True)
df_report = pd.DataFrame(report).transpose()

df_report = df_report.drop(['accuracy', 'macro avg', 'weighted avg'])

sample_threshold = 10
f1_threshold = 0.6

for disease, row in df_report.iterrows():
    f1 = row['f1-score']
    precision = row['precision']
    recall = row['recall']
    support = row['support']

    print("="*60)
    print(f"ü¶† Disease: {disease}")
    print(f"üìä F1-Score: {f1:.2f}")
    print(f"üéØ Precision: {precision:.2f}")
    print(f"üìà Recall: {recall:.2f}")
    print(f"üß™ Number of Samples: {int(support)}")

    if support < sample_threshold or f1 < f1_threshold:
        print("‚ö†Ô∏è Message:Doctor, please verify this result by yourself, due to low sample count.")
    else:
        print("‚úÖ Message: You can rely on this result. The model performed well and was trained on enough data.")


ü¶† Disease: atrophic vaginitis
üìä F1-Score: 0.98
üéØ Precision: 1.00
üìà Recall: 0.97
üß™ Number of Samples: 31
‚úÖ Message: You can rely on this result. The model performed well and was trained on enough data.
ü¶† Disease: cellulitis or abscess of mouth
üìä F1-Score: 1.00
üéØ Precision: 1.00
üìà Recall: 1.00
üß™ Number of Samples: 9
‚ö†Ô∏è Message:Doctor, please verify this result by yourself, due to low sample count.
ü¶† Disease: cryptorchidism
üìä F1-Score: 1.00
üéØ Precision: 1.00
üìà Recall: 1.00
üß™ Number of Samples: 1
‚ö†Ô∏è Message:Doctor, please verify this result by yourself, due to low sample count.
ü¶† Disease: eye alignment disorder
üìä F1-Score: 1.00
üéØ Precision: 1.00
üìà Recall: 1.00
üß™ Number of Samples: 61
‚úÖ Message: You can rely on this result. The model performed well and was trained on enough data.
ü¶† Disease: fracture of the hand
üìä F1-Score: 0.97
üéØ Precision: 0.95
üìà Recall: 1.00
üß™ Number of Samples: 18
‚úÖ Message: You can 