<a href="https://colab.research.google.com/github/aasmik/Identifying-Diseases-from-Medication-History/blob/main/Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [11]:
#DATA COLLECTION AND PREPROCESSING
import pandas as pd
import numpy as np
#LOAD THE EXCEL FILE INTO DATA FRAME
file_path = 'MEDICAL_DATASET.xlsx'
df = pd.read_excel(file_path)
#DISPLAY DATASET STRUCTURE INCLUDING NO OF NON NULL VALUES AND DATA TYPES FOR EACH COL
print("Initial Dataset Info:")
print(df.info())
#DISPLAYS 1ST 5 ROWS TO GET THE PREVIEW OF THE DATA
print("\nFirst few rows:")
print(df.head())
#REMOVES ANY ROWS WITH MISSING VALUES USING dropna()
df_cleaned = df.dropna()
#TO REMOVE DUPLICATE ROWS TO AVOID DATA REDUNDANCY
df_cleaned = df_cleaned.drop_duplicates()
#STANDARDIZES COL NAMES BY STRIPPING THE EXTRA SPACES,CONVERTING TO LOWERCASE,REPLACING SPACES WITH UNDERSCORE
df_cleaned.columns = df_cleaned.columns.str.strip().str.lower().str.replace(' ', '_')
#SAVES THE CLEANED DATASET AS A CSV FILE WITHOUT ROW INDICES
df_cleaned.to_csv('MEDICINE_DATASET.csv', index=False)
# SAVE THE DATASET
print("\nMEDICINE_DATASET as 'MEDICINE_DATASET.csv'")





Initial Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Name          100 non-null    object
 1   Category      100 non-null    object
 2   Dosage Form   100 non-null    object
 3   Strength      100 non-null    object
 4   Manufacturer  100 non-null    object
 5   Indication    100 non-null    object
dtypes: object(6)
memory usage: 4.8+ KB
None

First few rows:
           Name      Category Dosage Form Strength           Manufacturer  \
0      Metophen    Antifungal      Tablet   346 mg      Merck & Co., Inc.   
1     Cefcillin  Antidiabetic    Ointment   517 mg       Roche Holding AG   
2    Ibuprophen    Antifungal    Ointment   967 mg            AbbVie Inc.   
3  Ibupronazole    Antiseptic       Cream   747 mg  Eli Lilly and Company   
4   Amoxicillin     Analgesic       Cream   594 mg             Amgen Inc.   

   Indication 

In [31]:
#MODEL DEVELOPMENT
#IMPORTS REQUIRED MODULES FROM SCIKIT-LEARN
#TRAIN_TEST_SPLIT: SPLITS DATA INTO TRAINING AND TESTING SETS
#LABELENCODER: CONVERTS CATEGORICAL TEXT INTO NUMBERS
#RANDOMFORESTCLASSIFIER, DECISIONTREECLASSIFIER, SVC: MACHINE LEARNING MODELS
#CLASSIFICATION_REPORT, ACCURACY_SCORE: TO EVALUATE MODEL PERFORMANCE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report, accuracy_score
#READ THE CLEANED DATASET
df = pd.read_csv('MEDICINE_DATASET.csv')
#ENCODES ALL CATEGORICAL(TEXT) COL INTO NUMERIC FORMAT
label_encoders = {}
for column in df.columns:
    if df[column].dtype == 'object':
        le = LabelEncoder()
        df[column] = le.fit_transform(df[column])
        label_encoders[column] = le
#SPLIT THE DATA INTO X AND Y
X = df.drop('indication', axis=1)
y = df['indication']
#  APPLY SMOTE TO BALANCE THE DATA
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)
# CONFIRM SMOTE WORKED PROPERLY
print("AFTER RESAMPLING (CLASS COUNTS):")
print(pd.Series(y_resampled).value_counts())
# SPLIT DATA INTO TRAINING AND TESTING SETS
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)
# TRAIN A RANDOM FOREST MODEL
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
rf_preds = rf_model.predict(X_test)
# EVALUATE RANDOM FOREST
print("\nRANDOM FOREST REPORT:\n")
print(classification_report(y_test, rf_preds, zero_division=0))
print("ACCURACY SCORE:", accuracy_score(y_test, rf_preds))







AFTER RESAMPLING (CLASS COUNTS):
indication
1    21
5    21
0    21
3    21
6    21
7    21
2    21
4    21
Name: count, dtype: int64

RANDOM FOREST REPORT:

              precision    recall  f1-score   support

           0       0.57      0.67      0.62         6
           1       1.00      0.40      0.57         5
           2       0.67      1.00      0.80         2
           3       0.75      0.75      0.75         4
           4       0.57      1.00      0.73         4
           5       0.00      0.00      0.00         6
           6       0.29      1.00      0.44         2
           7       1.00      0.60      0.75         5

    accuracy                           0.59        34
   macro avg       0.61      0.68      0.58        34
weighted avg       0.61      0.59      0.55        34

ACCURACY SCORE: 0.5882352941176471
