In [252]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report


# Load the dataset
data = pd.read_csv("training.csv")
# print(data)

In [253]:
# Separate features (X) and target variable (y)
X = data.iloc[:, :-1]  # Select all columns except the last one
y = data.iloc[:, -1]   # Select only the last column

# print("Symptoms (X):")
# print(X)
# print("\nPrognosis (y):")
# print(y)

# print(set(y))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

clf = DecisionTreeClassifier(ccp_alpha=0.01)
clf = clf.fit(X_train, y_train)

predictions = clf.predict(X_test)
print(predictions)

# Create a DataFrame with predictions
predictions_df = pd.DataFrame({'Predictions': predictions})

# Concatenate predictions with X_test values and replace 1s with corresponding symptom names
result_df = pd.concat([predictions_df, X_test.apply(lambda x: ', '.join(X.columns[x == 1]), axis=1).reset_index(drop=True)], axis=1)

# Save the concatenated DataFrame to a CSV file
result_df.to_csv('predictions_with_features.csv', index=False)

# Print a message to confirm
print("Predictions with features saved to predictions_with_features.csv")

['Hepatitis B' 'Varicose veins' 'Alcoholic hepatitis' ...
 'Hyperthyroidism' '(vertigo) Paroymsal  Positional Vertigo'
 'Osteoarthristis']
Predictions with features saved to predictions_with_features.csv


In [254]:
print(clf.predict_proba(X_test))

# Calculate accuracy
accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)
print(classification_report(y_test, predictions, target_names=set(y)))

feature_names = list(X.columns)
feature_importance = pd.DataFrame(clf.feature_importances_, index = feature_names).sort_values(0, ascending=False)
print(feature_importance)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 1. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
Accuracy: 0.9375
                                         precision    recall  f1-score   support

           Paralysis (brain hemorrhage)       1.00      0.98      0.99        42
                            Hepatitis E       1.00      0.89      0.94        44
                           Heart attack       1.00      0.85      0.92        46
                          Drug Reaction       1.00      0.93      0.96        42
                       Bronchial Asthma       1.00      0.86      0.92        35
                                Allergy       1.00      0.94      0.97        33
                               Migraine       1.00      0.94      0.97        52
                    Chronic cholestasis       0.91      0.95      0.93        43
                   Cervical spondylosis       1.00      1.00      1.00        39
                

In [282]:
# Symptoms provided by the user

user_symptoms = ['continuous_sneezing', 'shivering', 'chills', 'watering_from_eyes', 'rusty_sputum']

# Initialize user_input dictionary with all symptoms set to 0
user_input = {symptom: [0] for symptom in X.columns}

# Set values to 1 for symptoms provided by the user
for symptom in user_symptoms:
    if symptom in user_input:
        user_input[symptom] = [1]

# Convert user_input to DataFrame
user_df = pd.DataFrame(user_input)

# Make predictions using the trained model
predicted_disease = clf.predict(user_df)

# Display the predicted output to the user
print("Based on the provided symptoms, the predicted disease is:", predicted_disease)

Based on the provided symptoms, the predicted disease is: ['Pneumonia']


In [283]:
# Have to get metrics for user input prediction