In [4]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


In [5]:

# Step 1: Create a synthetic dataset
# A seeded generator is used so that a fresh "Restart & Run All" rebuilds the
# exact same rows; the previous unseeded draws changed on every kernel start.
SEED = 42
N_SAMPLES = 100
rng = np.random.default_rng(SEED)

# Every column is drawn independently of the others, so the symptoms and the
# disease carry no information about the recommended medicine. Any model
# trained on this frame should be expected to score close to chance (3 classes).
data = {
    "Symptom1": rng.integers(0, 2, N_SAMPLES),  # binary flag: 0 or 1
    "Symptom2": rng.integers(0, 2, N_SAMPLES),
    "Symptom3": rng.integers(0, 2, N_SAMPLES),
    "Disease": rng.choice(["Infection", "Allergy", "Diabetes"], N_SAMPLES),
    "Recommended_Medicine": rng.choice(["MedicineA", "MedicineB", "MedicineC"], N_SAMPLES),
}

df = pd.DataFrame(data)


In [6]:

# Step 2: Encode categorical variables
# Each string column is replaced in place by its integer category codes.
# NOTE(review): the original label names are overwritten here, so the
# code -> label mapping is not kept anywhere after this cell runs.
for categorical_column in ("Disease", "Recommended_Medicine"):
    df[categorical_column] = df[categorical_column].astype("category").cat.codes

# Step 3: Split into features (X) and target (y)
# The three symptom flags plus the encoded disease are the predictors;
# the encoded medicine is what the model has to predict.
feature_columns = ["Symptom1", "Symptom2", "Symptom3", "Disease"]
X = df[feature_columns]
y = df["Recommended_Medicine"]


In [16]:

# Hold out 25% of the rows for testing; the fixed random_state keeps the
# partition identical from run to run for the same X and y.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)


In [17]:

# Step 4: Train the model
# Baseline random forest with default hyperparameters; only the seed is fixed.
# `fit` returns the estimator itself, so construction and training are chained.
model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Step 5: Evaluate the model
# Score the baseline on the held-out rows and report accuracy to two decimals.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Model Accuracy: {accuracy:.2f}")


Model Accuracy: 0.40


In [18]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Hyperparameter values to try: 3 * 4 * 3 * 3 * 3 = 324 combinations,
# each evaluated with 5-fold cross-validation on the training split.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=['sqrt', 'log2', None],
)

# Base estimator that the search configures with each candidate combination.
rf_model = RandomForestClassifier(random_state=42)

# Exhaustive search scored by accuracy; n_jobs=-1 asks for all available cores.
# `fit` returns the search object, so it is configured and run in one expression.
grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
).fit(X_train, y_train)

# Keep the winning estimator and report the combination that produced it.
best_rf_model = grid_search.best_estimator_
print("Best parameters:", grid_search.best_params_)


Best parameters: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 100}


In [19]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Score the grid-searched estimator on the held-out test rows.
# Note: this rebinds `y_pred` and `accuracy`, replacing the baseline's values.
y_pred = best_rf_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Test Accuracy:", accuracy)

# Per-class precision / recall / F1, followed by the raw confusion counts.
per_class_report = classification_report(y_test, y_pred)
print("Classification Report:")
print(per_class_report)

confusion_counts = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(confusion_counts)


Test Accuracy: 0.32
Classification Report:
              precision    recall  f1-score   support

           0       0.50      0.17      0.25        12
           1       0.31      0.67      0.42         6
           2       0.25      0.29      0.27         7

    accuracy                           0.32        25
   macro avg       0.35      0.37      0.31        25
weighted avg       0.38      0.32      0.30        25

Confusion Matrix:
[[2 5 5]
 [1 4 1]
 [1 4 2]]


In [20]:

# Step 6: Save the model for later use
# NOTE(review): `model` is the baseline forest from Step 4; the grid-searched
# `best_rf_model` is not what gets persisted here — confirm this is intended.
import pickle

model_path = "medicine_prediction_model.pkl"
with open(model_path, "wb") as model_file:
    # Binary mode is required because pickle writes bytes.
    pickle.dump(model, model_file)

print("Model trained and saved successfully!")


Model trained and saved successfully!
