<a href="https://colab.research.google.com/github/aasmik/Medicine-Disease-Predictor/blob/main/data_processing%26model_development.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
import joblib

print("Script started.")

# File path to your dataset (change if needed)
file_path = r"/content/MEDICAL_DATASET.csv"

# 1. Fail fast with a readable message when the dataset is missing,
#    instead of letting read_csv raise further down.
if not os.path.exists(file_path):
    raise FileNotFoundError(f"Dataset file not found at: {file_path}")
print(f"Dataset found at: {file_path}")

# 2. Load dataset
df = pd.read_csv(file_path)
print("\nInitial Dataset Info:")
# DataFrame.info() writes its summary to stdout itself and returns None.
# Wrapping it in print() only appended a stray "None" line to the output,
# so it is called directly.
df.info()

print("\nFirst few rows:")
print(df.head())

# 3. Drop rows with any missing value, then exact duplicate rows
df_cleaned = df.dropna().drop_duplicates()

# 4. Standardize column names: trimmed, lower-case, spaces -> underscores
df_cleaned.columns = df_cleaned.columns.str.strip().str.lower().str.replace(' ', '_')

# 5. Convert 'strength' column to numeric
if 'strength' in df_cleaned.columns:
    # Capture the first number in the text, including an optional fractional
    # part. A bare r'(\d+)' stops at the decimal point, so "2.5 mg" would be
    # stored as 2 and "0.5 mg" as 0.
    strength_number = df_cleaned['strength'].astype(str).str.extract(r'(\d+(?:\.\d+)?)', expand=False)
    # Values with no number at all become NaN under errors='coerce' and are
    # then replaced by 0.
    df_cleaned['strength'] = pd.to_numeric(strength_number, errors='coerce').fillna(0)

# 6. Save cleaned dataset
os.makedirs('data', exist_ok=True)
df_cleaned.to_csv("data/cleaned_medicine_dataset.csv", index=False)
print("\nCleaned data saved to 'data/cleaned_medicine_dataset.csv'")

# 7. Split off the target column, then integer-encode every text feature
X = df_cleaned.drop(columns=['indication'])
y = df_cleaned['indication']

# One LabelEncoder per object-dtype feature column, keyed by column name.
text_columns = X.select_dtypes(include=['object']).columns
label_encoders = {col: LabelEncoder() for col in text_columns}
for col, le in label_encoders.items():
    # Fit the column's own encoder and overwrite the text with its codes.
    X[col] = le.fit_transform(X[col])

# The target gets a dedicated encoder, kept apart from the feature encoders.
le_target = LabelEncoder()
y_encoded = le_target.fit_transform(y)

# 8. Hold out 20% of the rows for testing; random_state fixes the shuffle
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    random_state=42,
)

# 9. Build a 100-tree random forest and fit it on the training portion
#    (fit() returns the estimator itself, so it can be chained).
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# 10. Evaluate model
train_pred = model.predict(X_train)
test_pred = model.predict(X_test)

print(f"\nTraining Accuracy: {accuracy_score(y_train, train_pred):.4f}")
print(f"Test Accuracy: {accuracy_score(y_test, test_pred):.4f}")

print("\nClassification Report (Test Data):")

# Report every class that occurs in the test labels OR in the predictions.
# Taking the labels from y_test alone dropped classes the model predicted but
# that are absent from the test split, so those wrong predictions never got a
# row of their own, and scikit-learn printed a "micro avg" row in place of
# "accuracy" because the label list was only a subset.
unique_labels = np.union1d(y_test, test_pred)

# labels and target_names must have the same length; indexing classes_ with the
# same label array keeps them aligned. zero_division=0 suppresses warnings for
# classes that have no true or no predicted samples.
print(classification_report(y_test, test_pred, labels=unique_labels, target_names=le_target.classes_[unique_labels], zero_division=0))


# 11. Persist the fitted model and both sets of encoders under model/
os.makedirs('model', exist_ok=True)

# (object, destination) pairs, written in this order.
artifacts_to_save = (
    (model, 'model/random_forest_model.joblib'),
    (label_encoders, 'model/label_encoders_dict.joblib'),
    (le_target, 'model/label_encoder_target.joblib'),
)
for artifact, destination in artifacts_to_save:
    joblib.dump(artifact, destination)

print("\nModel and encoders saved in 'model/' folder.")
print("Script finished.")


Script started.
Dataset found at: /content/MEDICAL_DATASET.csv

Initial Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 201 entries, 0 to 200
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Name          201 non-null    object
 1   Category      201 non-null    object
 2   Dosage Form   201 non-null    object
 3   Strength      201 non-null    object
 4   Manufacturer  201 non-null    object
 5   Indication    201 non-null    object
dtypes: object(6)
memory usage: 9.6+ KB
None

First few rows:
           Name      Category Dosage Form Strength           Manufacturer  \
0      Metophen    Antifungal      Tablet   346 mg      Merck & Co., Inc.   
1     Cefcillin  Antidiabetic    Ointment   517 mg       Roche Holding AG   
2    Ibuprophen    Antifungal    Ointment   967 mg            AbbVie Inc.   
3  Ibupronazole    Antiseptic       Cream   747 mg  Eli Lilly and Company   
4   Amoxicillin     Analges