#Drug Classification

In [39]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [40]:
df = pd.read_csv('drug200.csv')

In [41]:
df.head()

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,F,HIGH,HIGH,25.355,DrugY
1,47,M,LOW,HIGH,13.093,drugC
2,47,M,LOW,HIGH,10.114,drugC
3,28,F,NORMAL,HIGH,7.798,drugX
4,61,F,LOW,HIGH,18.043,DrugY


column names:
* Age : age of the patient
* Sex : sex of the patient
* BP : blood pressure level
* Cholesterol : cholesterol level
* Na_to_K : sodium to potasium ratio in blood
* Drug : drug type


In [42]:
df.shape

(200, 6)

In [43]:
df.isnull().value_counts()

Age    Sex    BP     Cholesterol  Na_to_K  Drug 
False  False  False  False        False    False    200
dtype: int64

In [44]:
df.Drug.unique()

array(['DrugY', 'drugC', 'drugX', 'drugA', 'drugB'], dtype=object)

####Econoding string variables to numerical variables

In [45]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['Sex'] = le.fit_transform(df['Sex'])
df['Cholesterol'] = le.fit_transform(df['Cholesterol'])
df['Na_to_K'] = le.fit_transform(df['Na_to_K'])
df['BP'] = le.fit_transform(df['BP'])
df['Drug'] = le.fit_transform(df['Drug'])
df.head()

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,0,0,0,167,0
1,47,1,1,0,89,3
2,47,1,1,0,43,3
3,28,0,2,0,10,4
4,61,0,1,0,133,0


####Splitting the data into training and testing set to evaluate our models

In [46]:
from sklearn.model_selection import train_test_split
X = df.iloc[:, :-1].values
Y = df.iloc[:, -1].values
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2)

###K-Nearest Neighbor classifier

In [47]:
from sklearn.neighbors import KNeighborsClassifier
kn = KNeighborsClassifier(n_neighbors = 6)
kn.fit(x_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=6, p=2,
                     weights='uniform')

In [48]:
from sklearn.metrics import confusion_matrix, accuracy_score
y_pred = kn.predict(x_test)
print(confusion_matrix(y_pred, y_test))
print(accuracy_score(y_pred, y_test))

[[17  1  1  0  0]
 [ 0  2  0  2  1]
 [ 0  0  2  0  2]
 [ 0  0  1  0  0]
 [ 0  4  0  1  6]]
0.675


this accuracy score varies due to the randomness of the split of our dataset into training and testing set.

Using Cross Validation to find the average R² of the model

In [49]:
from sklearn.model_selection import cross_val_score
rscore = cross_val_score(kn, X, Y , cv = 10)
print(rscore)
print(rscore.mean())

[0.65 0.6  0.65 0.5  0.7  0.55 0.65 0.55 0.55 0.55]
0.595


to find the best parameters for our model we will use grid search.

In [50]:
from sklearn.model_selection import GridSearchCV
params = [{'n_neighbors': [4,5,6,7,8,10,11,12]}]
gs = GridSearchCV(kn, params, cv = 10)
gs.fit(X, Y)
g = gs.best_estimator_
print(g)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=7, p=2,
                     weights='uniform')


In [51]:
knc = KNeighborsClassifier(n_neighbors = 7)
knc.fit(x_train, y_train)
rscore = cross_val_score(knc, X, Y , cv = 10)
print(rscore.mean())

0.6599999999999999


###Support Vector Machines classifier

In [52]:
from sklearn.svm import SVC
sc = SVC(kernel = 'rbf')
sc.fit(x_train, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [53]:
y_pred = sc.predict(x_test)
print(confusion_matrix(y_pred, y_test))
print(accuracy_score(y_pred, y_test))

[[17  1  1  0  1]
 [ 0  0  0  0  0]
 [ 0  0  0  0  0]
 [ 0  0  0  0  0]
 [ 0  6  3  3  8]]
0.625


In [62]:
param = [{'kernel':['linear', 'sigmoid', 'rbf']}]
gsc = GridSearchCV(sc, param, cv = 10)
gsc.fit(X, Y)
gsc.best_estimator_

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [63]:
svc = SVC(kernel = 'linear')
rscore = cross_val_score(svc, X, Y , cv = 10)
print(rscore.mean())

0.9749999999999999


###Decision Tree Classifier

In [54]:
from sklearn.tree import DecisionTreeClassifier
tc = DecisionTreeClassifier(criterion = 'entropy', max_depth = 5)
tc.fit(x_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='entropy',
                       max_depth=5, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [55]:
y_pred = tc.predict(x_test)
print(confusion_matrix(y_pred, y_test))
print(accuracy_score(y_pred, y_test))

[[17  0  0  0  0]
 [ 0  7  0  0  0]
 [ 0  0  4  0  0]
 [ 0  0  0  3  0]
 [ 0  0  0  0  9]]
1.0


###Conclusion
As we can see the decision tree ans SVM models do an amazing job at classifying patients based on their features to predict what drug will be recommended to them, however decision tree classifier is best as it perfectley predicts the dependent variable.