<a href="https://colab.research.google.com/github/anandodayil/drug_classification/blob/main/drug_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Downloading data from "https://www.kaggle.com/datasets/prathamtripathi/drug-classification" to local drive
# Then uploading drug classification file to google drive
# Mounting google colab and google drive

from google.colab import drive
drive.mount('/content/drive')

# Importing data from google drive to google colab
# Importing pandas to create dataframe and numpy for array

import numpy as np
import pandas as pd
path = "/content/drive/MyDrive/Classification /Drug Classification/drug200.csv" # To get the path click on folder icone on the left side of the screen
drug_df = pd.read_csv(path)



Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Showing drug_df dataframe 

drug_df

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,F,HIGH,HIGH,25.355,DrugY
1,47,M,LOW,HIGH,13.093,drugC
2,47,M,LOW,HIGH,10.114,drugC
3,28,F,NORMAL,HIGH,7.798,drugX
4,61,F,LOW,HIGH,18.043,DrugY
...,...,...,...,...,...,...
195,56,F,LOW,HIGH,11.567,drugC
196,16,M,LOW,HIGH,12.006,drugC
197,52,M,NORMAL,HIGH,9.894,drugX
198,23,M,NORMAL,NORMAL,14.020,drugX


In [None]:
# To check for empty values

print(drug_df.isna().sum())
print(drug_df.isnull().sum())

Age            0
Sex            0
BP             0
Cholesterol    0
Na_to_K        0
Drug           0
dtype: int64
Age            0
Sex            0
BP             0
Cholesterol    0
Na_to_K        0
Drug           0
dtype: int64


In [None]:
# Lets divide the data into "Qualitative" and "Quantitative" columns
quali = ["Sex", "BP", "Cholesterol"]
quanti = ["Age", "Na_to_K"]

# In qualitative data "Sex" is Nominal data and "BP" and "Cholesterol" is Ordinal data
nominal = ["Sex"]
ordinal = ["BP", "Cholesterol"]

# In quantitative data "Age" is Discrete data and "Na_to_K" is Continuous data
discrete = ["Age"]
continuous = ["Na_to_K"]

In [None]:
# To check unique values in catogorical data

for column in quali:
    print(column)
    print(drug_df[column].value_counts())
    print("==============================")

Sex
M    104
F     96
Name: Sex, dtype: int64
BP
HIGH      77
LOW       64
NORMAL    59
Name: BP, dtype: int64
Cholesterol
HIGH      103
NORMAL     97
Name: Cholesterol, dtype: int64


In [None]:
# To convert Nominal data we use "OneHotEncoding" 

from sklearn.preprocessing import OneHotEncoder
oh_encoder = OneHotEncoder()
oh_encoder.fit(drug_df[nominal])
oh_encoder_df = pd.DataFrame(oh_encoder.transform(drug_df[nominal]).toarray())

oh_encoder_df.rename(columns = {0 : "Female", 1 : "Male"}, inplace = True)

# And to convert Ordinal data we use "replace" ## if you know better method to do this please share

dict_1 = {"LOW" : 0, "NORMAL" : 1, "HIGH" : 2}
drug_df["BP"].replace(dict_1, inplace = True)

dict_2 = {"NORMAL" : 0, "HIGH" : 1}
drug_df['Cholesterol'].replace(dict_2, inplace = True)

In [None]:
# Creating X and y variable

X = drug_df.drop(['Drug', 'Sex'], axis = 1)
X = X.join(oh_encoder_df)
y = drug_df.Drug

In [None]:
X

Unnamed: 0,Age,BP,Cholesterol,Na_to_K,Female,Male
0,23,2,1,25.355,1.0,0.0
1,47,0,1,13.093,0.0,1.0
2,47,0,1,10.114,0.0,1.0
3,28,1,1,7.798,1.0,0.0
4,61,0,1,18.043,1.0,0.0
...,...,...,...,...,...,...
195,56,0,1,11.567,1.0,0.0
196,16,0,1,12.006,0.0,1.0
197,52,1,1,9.894,0.0,1.0
198,23,1,0,14.020,0.0,1.0


In [None]:
y

0      DrugY
1      drugC
2      drugC
3      drugX
4      DrugY
       ...  
195    drugC
196    drugC
197    drugX
198    drugX
199    drugX
Name: Drug, Length: 200, dtype: object

In [None]:
# To convert y to numerical values
from sklearn.preprocessing import LabelEncoder

l_encoder = LabelEncoder()
y = l_encoder.fit_transform(y)
np.unique(y, return_counts = True)

(array([0, 1, 2, 3, 4]), array([91, 23, 16, 16, 54]))

In [None]:
X.shape

(200, 6)

In [None]:
y.shape

(200,)

In [None]:

from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import LeavePOut

model = DecisionTreeClassifier()


lpo_validation = LeavePOut(2) # C(n, p) really expensive to calculate
loo_validation = LeaveOneOut()
skfold_validation = StratifiedKFold(7)


from sklearn.model_selection import cross_val_score

result = cross_val_score(model, X, y, cv = skfold_validation)

print(result)

[1.         1.         1.         1.         0.96428571 0.96428571
 1.        ]


In [None]:
result.mean() *100

98.9795918367347

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)


In [None]:
X_scaled.shape

(200, 6)

In [None]:
model = svm.SVC()


lpo_validation = LeavePOut(2) # C(n, p) really expensive to calculate
loo_validation = LeaveOneOut()
skfold_validation = StratifiedKFold(7)


from sklearn.model_selection import cross_val_score

result = cross_val_score(model, X_scaled, y, cv = skfold_validation)

print(result)

[0.96551724 0.93103448 0.96551724 0.96551724 0.85714286 0.96428571
 0.96428571]


In [None]:
#Using pca to reduce features
from sklearn.decomposition import PCA

pca = PCA(n_components = 5)

X_pca = pca.fit_transform(X_scaled)

# X_pca = pca.transform(X)
# X_test_pca = pca.transform(X_test_scaled)

print(f"{X_pca.shape}")

(200, 5)


In [None]:
model = svm.SVC()


lpo_validation = LeavePOut(3) # C(n, p) really expensive to calculate
loo_validation = LeaveOneOut()
skfold_validation = StratifiedKFold(5)


from sklearn.model_selection import cross_val_score

result = cross_val_score(model, X_pca, y, cv = skfold_validation)

print(result)

[0.925 0.975 0.95  0.875 1.   ]


In [None]:

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

def print_score(clf, X_train, y_train, X_test, y_test, train=True):
    if train:
        pred = clf.predict(X_train)
        clf_report = pd.DataFrame(classification_report(y_train, pred, output_dict=True))
        print("Train Result:\n================================================")
        print(f"train Accuracy Score: {accuracy_score(y_train, pred) * 100:.2f}%")
        print("_______________________________________________")
        print(f"CLASSIFICATION REPORT:\n{clf_report}")
        print("_______________________________________________")
        print(f"Confusion Matrix: \n {confusion_matrix(y_train, pred)}\n")
        
    elif train==False:
        pred = clf.predict(X_test)
        clf_report = pd.DataFrame(classification_report(y_test, pred, output_dict=True))
        print("Test Result:\n================================================")        
        print(f"test Accuracy Score: {accuracy_score(y_test, pred) * 100:.2f}%")
        print("_______________________________________________")
        print(f"CLASSIFICATION REPORT:\n{clf_report}")
        print("_______________________________________________")
        print(f"Confusion Matrix: \n {confusion_matrix(y_test, pred)}\n")

In [None]:

from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# clf = RandomForestClassifier(7)
# clf.fit(X_train_pca, y_train)

# clf = DecisionTreeClassifier()
# clf.fit(X_train_pca, y_train)

# clf = GaussianNB()
# clf.fit(X_train_pca, y_train)

# clf = KNeighborsClassifier(5)
# clf.fit(X_train_pca, y_train)

# clf = svm.SVC(C= 1, gamma= 0.1, kernel= 'rbf')
# clf.fit(X_train, y_train)

# clf = LogisticRegression(solver='liblinear')
# clf.fit(X_train_pca, y_train)

# print_score(clf, X_train_pca, y_train, X_test_pca, y_test, train=True)
# print_score(clf, X_train_pca, y_train, X_test_pca, y_test, train=False)

In [None]:

from sklearn.model_selection import GridSearchCV
 
# defining parameter range
param_grid = {"n_neighbors" : [x for x in range(5, 30)]
              
              }
 
grid = GridSearchCV( KNeighborsClassifier(), param_grid, refit = True, verbose = 3)
 
# fitting the model for grid search
grid.fit(X_scaled, y)

Fitting 5 folds for each of 25 candidates, totalling 125 fits
[CV 1/5] END .....................n_neighbors=5;, score=0.800 total time=   0.0s
[CV 2/5] END .....................n_neighbors=5;, score=0.800 total time=   0.0s
[CV 3/5] END .....................n_neighbors=5;, score=0.775 total time=   0.0s
[CV 4/5] END .....................n_neighbors=5;, score=0.775 total time=   0.0s
[CV 5/5] END .....................n_neighbors=5;, score=0.825 total time=   0.0s
[CV 1/5] END .....................n_neighbors=6;, score=0.775 total time=   0.0s
[CV 2/5] END .....................n_neighbors=6;, score=0.775 total time=   0.0s
[CV 3/5] END .....................n_neighbors=6;, score=0.800 total time=   0.0s
[CV 4/5] END .....................n_neighbors=6;, score=0.750 total time=   0.0s
[CV 5/5] END .....................n_neighbors=6;, score=0.875 total time=   0.0s
[CV 1/5] END .....................n_neighbors=7;, score=0.750 total time=   0.0s
[CV 2/5] END .....................n_neighbors=7

GridSearchCV(estimator=KNeighborsClassifier(),
             param_grid={'n_neighbors': [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
                                         16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
                                         26, 27, 28, 29]},
             verbose=3)

In [None]:
print(grid.best_params_)

{'n_neighbors': 8}


In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import LeavePOut

model = DecisionTreeClassifier()


lpo_validation = LeavePOut(2) # C(n, p) really expensive to calculate
loo_validation = LeaveOneOut()
skfold_validation = StratifiedKFold(5)


from sklearn.model_selection import cross_val_score

result = cross_val_score(model, X_scaled, y, cv = skfold_validation)

print(result.mean())

0.985


In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size = 0.2)


# clf = RandomForestClassifier(7)
# clf.fit(X_train_pca, y_train)

clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)

# clf = GaussianNB()
# clf.fit(X_train, y_train)

# clf = KNeighborsClassifier(8)
# clf.fit(X_train, y_train)

# clf = svm.SVC(C= 100, gamma= 0.01, kernel= 'rbf')
# clf.fit(X_train, y_train)

# clf = LogisticRegression(solver='liblinear')
# clf.fit(X_train_pca, y_train)

print_score(clf, X_train, y_train, X_test, y_test, train=True)
print_score(clf, X_train, y_train, X_test, y_test, train=False)

Train Result:
train Accuracy Score: 100.00%
_______________________________________________
CLASSIFICATION REPORT:
              0     1     2     3     4  accuracy  macro avg  weighted avg
precision   1.0   1.0   1.0   1.0   1.0       1.0        1.0           1.0
recall      1.0   1.0   1.0   1.0   1.0       1.0        1.0           1.0
f1-score    1.0   1.0   1.0   1.0   1.0       1.0        1.0           1.0
support    70.0  19.0  13.0  15.0  43.0       1.0      160.0         160.0
_______________________________________________
Confusion Matrix: 
 [[70  0  0  0  0]
 [ 0 19  0  0  0]
 [ 0  0 13  0  0]
 [ 0  0  0 15  0]
 [ 0  0  0  0 43]]

Test Result:
test Accuracy Score: 100.00%
_______________________________________________
CLASSIFICATION REPORT:
              0    1    2    3     4  accuracy  macro avg  weighted avg
precision   1.0  1.0  1.0  1.0   1.0       1.0        1.0           1.0
recall      1.0  1.0  1.0  1.0   1.0       1.0        1.0           1.0
f1-score    1.0  1.0 

In [None]:
a = np.array([[23, "F", "LOW", 'NORMAL', 20]])
dff = pd.DataFrame(a)

In [None]:

oh_encoder_df = pd.DataFrame(oh_encoder.transform(dff[[1]]).toarray())

oh_encoder_df.rename(columns = {0 : "Female", 1 : "Male"}, inplace = True)


  "X does not have valid feature names, but"


In [None]:
dff.drop([1], axis = 1, inplace = True)

In [None]:
dff = dff.join(oh_encoder_df)

In [None]:

dict_1 = {"LOW" : 0, "NORMAL" : 1, "HIGH" : 2}
dff[2].replace(dict_1, inplace = True)

dict_2 = {"NORMAL" : 0, "HIGH" : 1}
dff[3].replace(dict_2, inplace = True)

In [None]:
j = scaler.transform(dff)

  "X does not have valid feature names, but"


In [None]:
j

array([[-1.29159102, -1.2722144 , -1.03046381,  0.5433783 ,  1.040833  ,
        -1.040833  ]])

In [None]:
clf.predict(j)

array([0])