In [10]:
import numpy as np 
import pandas as pd 
 
import os
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier,export_graphviz

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import joblib
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
 
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix ,f1_score


In [11]:
data_path =os.path.join('..','data')
df = pd.read_csv(os.path.join(data_path, 'drug200.csv'))
df


Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,F,HIGH,HIGH,25.355,DrugY
1,47,M,LOW,HIGH,13.093,drugC
2,47,M,LOW,HIGH,10.114,drugC
3,28,F,NORMAL,HIGH,7.798,drugX
4,61,F,LOW,HIGH,18.043,DrugY
...,...,...,...,...,...,...
195,56,F,LOW,HIGH,11.567,drugC
196,16,M,LOW,HIGH,12.006,drugC
197,52,M,NORMAL,HIGH,9.894,drugX
198,23,M,NORMAL,NORMAL,14.020,drugX


In [12]:
print(df['Cholesterol'].value_counts())
print('-----------------------------')
print(df['BP'].value_counts())
print('-----------------------------')
print(df['Sex'].value_counts())


Cholesterol
HIGH      103
NORMAL     97
Name: count, dtype: int64
-----------------------------
BP
HIGH      77
LOW       64
NORMAL    59
Name: count, dtype: int64
-----------------------------
Sex
M    104
F     96
Name: count, dtype: int64


In [13]:
df.describe()    

Unnamed: 0,Age,Na_to_K
count,200.0,200.0
mean,44.315,16.084485
std,16.544315,7.223956
min,15.0,6.269
25%,31.0,10.4455
50%,45.0,13.9365
75%,58.0,19.38
max,74.0,38.247


In [14]:
df.info()

<class 'pandas.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Age          200 non-null    int64  
 1   Sex          200 non-null    str    
 2   BP           200 non-null    str    
 3   Cholesterol  200 non-null    str    
 4   Na_to_K      200 non-null    float64
 5   Drug         200 non-null    str    
dtypes: float64(1), int64(1), str(4)
memory usage: 9.5 KB


In [15]:
df.isnull().sum()

Age            0
Sex            0
BP             0
Cholesterol    0
Na_to_K        0
Drug           0
dtype: int64

In [16]:
y = df["Drug"]
X = df.drop("Drug", axis=1)


In [17]:
numeric_cols = ['Age', 'Na_to_K']
categorical_cols = ['Sex', 'BP', 'Cholesterol']

x_train, x_test, y_train, y_test = train_test_split(
    X, y, train_size=0.7, random_state=42
)

In [18]:


num_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

cat_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(drop="first", handle_unknown="ignore"))
])

preprocessor = ColumnTransformer(transformers=[
    ("num", num_pipeline, numeric_cols),
    ("cat", cat_pipeline, categorical_cols)
])

model_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", DecisionTreeClassifier(random_state=42))
])

model_pipeline.fit(x_train, y_train)

In [19]:
 

y_pred = model_pipeline.predict(x_test)

 

print("Accuracy:", accuracy_score(y_test, y_pred))



Accuracy: 1.0


In [20]:
joblib.dump(model_pipeline, os.path.join('..','models','drug_classifier.pkl'))

['..\\models\\drug_classifier.pkl']

In [21]:
prd = model_pipeline.predict(x_test.head(1))
prd[0]

'drugX'

In [25]:
 
from utils.DrugData import DrugValidData

data = DrugValidData(
    Age=25,
    Sex="M",
    BP="NORMAL",
    Cholesterol="NORMAL",
    Na_to_K=12.0
)

data_new = pd.DataFrame([data.model_dump()] )
data_new
   


Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K
0,25,M,NORMAL,NORMAL,12.0


In [26]:
from utils.config import model_drug_classifier
prd = model_drug_classifier.predict(data_new)
prd[0]

'drugX'