In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder
from category_encoders import OrdinalEncoder
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate

In [None]:
# Load the raw data; the path is relative to the notebook's working directory.
dataset = pd.read_csv("drug200.csv")

In [None]:
# Number of missing values per column.
dataset.isnull().sum()

In [None]:
# Preview the first rows of the raw frame.
dataset.head()

In [None]:
# Summary statistics for every column; include="all" also covers non-numeric columns.
dataset.describe(include="all")

In [None]:
# Heatmap of the raw feature values (one row per record, target excluded).
# Sex, BP and Cholesterol still hold string labels at this point (they are only
# encoded further down), and a heatmap needs numeric cells, so the plot is
# restricted to the numeric columns to keep this cell runnable on a fresh kernel.
numeric_features = dataset.drop('Drug', axis=1).select_dtypes(include='number')

plt.figure(figsize=(20, 10))
sns.heatmap(numeric_features, annot=True)
plt.show()

In [None]:
# Re-index by (Na_to_K, Age, Drug) and list the distinct values held by each
# level of the resulting MultiIndex.
ds_idx = dataset.set_index(["Na_to_K", "Age", "Drug"])

multi_index = ds_idx.index
for position, name in enumerate(multi_index.names):
    print(name, ":", multi_index.levels[position])

In [None]:
# Column dtypes, non-null counts and memory usage.
dataset.info()

In [None]:
# Select every record with Sex == "M" and BP == "HIGH" through a (Sex, BP)
# MultiIndex. The index is sorted before the label-based lookup.
# This relies on Sex and BP still being string labels (they are encoded later).
idx = pd.IndexSlice
idx_slice = dataset.set_index(["Sex", "BP"])

sorted_by_sex_bp = idx_slice.sort_index()
sorted_by_sex_bp.loc[idx["M", "HIGH"], :]

In [None]:
# Per-category row counts for the two categorical columns that get an
# ordinal encoding below.
for category_column in ('BP', 'Cholesterol'):
    print(dataset.groupby(category_column).count())

In [None]:
# Replace the Sex labels with integer codes; LabelEncoder numbers the classes
# in sorted label order. The fitted encoder is kept in `LE`.
LE = LabelEncoder().fit(dataset['Sex'])
dataset['Sex'] = LE.transform(dataset['Sex'])

In [None]:
# Blood pressure is ordinal, so encode it with an explicit rank
# (LOW < NORMAL < HIGH) instead of an arbitrary label order.
bp_levels = {'LOW': 1, 'NORMAL': 2, 'HIGH': 3}
dict_bp = [{'col': 'BP', 'mapping': bp_levels}]

oe_bp = OrdinalEncoder(mapping=dict_bp, cols='BP')
dataset = oe_bp.fit_transform(dataset)

In [None]:
# Cholesterol is ordinal as well: NORMAL < HIGH.
dict_chol = [{'col': 'Cholesterol', 'mapping': {'NORMAL': 1, 'HIGH': 2}}]

# `cols` must name the column the mapping targets; it previously said 'BP',
# a copy-paste leftover from the blood-pressure encoder.
oe_chol = OrdinalEncoder(cols=['Cholesterol'], mapping=dict_chol)
dataset = oe_chol.fit_transform(dataset)

In [None]:
# Rescale Na_to_K with min-max scaling.
# NOTE(review): the scaler is fitted on the full dataset, i.e. before the
# train/test split below, so the test rows influence the scaling range.
na_to_k_scaler = MinMaxScaler()
dataset[['Na_to_K']] = na_to_k_scaler.fit_transform(dataset[['Na_to_K']])

In [None]:
# Separate the features from the target (Drug), then hold out 25% of the rows
# for testing. The fixed random_state keeps the split reproducible.
y_raw = dataset['Drug']
X_raw = dataset.drop(columns='Drug')

X_train, X_test, y_train, y_test = train_test_split(
    X_raw,
    y_raw,
    test_size=0.25,
    random_state=0,
)

In [None]:
from sklearn.tree import DecisionTreeClassifier

# The tree is used for a prediction at the end of the notebook, so it has to be
# fitted here; otherwise that cell only works with leftover kernel state and
# fails after Restart & Run All. The seed makes the fitted tree reproducible.
dtc = DecisionTreeClassifier(random_state=0)
dtc.fit(X_train, y_train)
dtc.get_params()

In [None]:
from sklearn.naive_bayes import GaussianNB

# Base estimator for the grid search below; show its default hyper-parameters.
gauss = GaussianNB()
gauss.get_params()

In [None]:
# Tune GaussianNB's var_smoothing over 10 log-spaced values in [1e-9, 1e-1],
# using all available cores. `fit` returns the search object itself, so
# `model` and `grid` refer to the same fitted GridSearchCV.
var_smoothing_grid = {'var_smoothing': np.logspace(-9, -1, 10)}
grid = GridSearchCV(estimator=gauss, param_grid=var_smoothing_grid, n_jobs=-1)
model = grid.fit(X_train, y_train)

In [None]:
# Predict the drug for every held-out row with the fitted grid search.
prediction = model.predict(X_test)

In [101]:
# Sanity-check the encoded and scaled test features.
X_test.head()

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K
18,23,1,1,2,0.032178
170,28,0,2,2,0.206705
107,42,1,1,2,0.429795
98,20,1,3,1,0.918444
177,25,1,2,2,0.398461


In [None]:
# Hold-out accuracy. scikit-learn metrics take (y_true, y_pred) in that order;
# accuracy is symmetric, so the value is unchanged, but the call now matches
# the documented signature.
acc = accuracy_score(y_test, prediction)
print("Score model: {}".format(acc))

In [None]:
# 5-fold cross-validated accuracy on the full dataset. `model` is the
# GridSearchCV object, so the var_smoothing search is re-run inside every fold.
# NOTE(review): X_raw's Na_to_K column was scaled using the whole dataset.
scores = cross_validate(
    estimator=model,
    X=X_raw,
    y=y_raw,
    scoring='accuracy',
    cv=5,
)
scores

In [None]:
from yellowbrick.classifier import ConfusionMatrix

# Confusion matrix of the tuned model: the visualizer is fitted on the
# training split, scored on the held-out split, then rendered.
cm = ConfusionMatrix(model)
cm.fit(X_train, y_train)
cm.score(X_test, y_test)
cm.show()

In [None]:
from yellowbrick.classifier import ClassificationReport

# Classification report of the tuned model on the held-out split;
# support=True is passed so the report includes per-class support.
cr = ClassificationReport(model, support=True)
cr.fit(X_train, y_train)
cr.score(X_test, y_test)
cr.show()

In [113]:
# A single unseen record, in the training column order
# (Age, Sex, BP, Cholesterol, Na_to_K), using the same encodings as above.
# NOTE(review): 0.9 is presumably a min-max scaled Na_to_K value - confirm.
new_dataset = np.array([61, 1, 1, 2, 0.9])

# Wrap the sample in a DataFrame carrying the training column names so the
# features are matched by name instead of being passed as an anonymous array.
new_sample = pd.DataFrame(new_dataset.reshape(1, -1), columns=X_train.columns)

dtc.predict(new_sample)

array(['DrugY'], dtype=object)