### Dicision Tree Task

### 약물 A, B, C, X, Y
##### 다중 분류(Multiclass Classification)
- 의학 연구원으로서 동일한 질병을 앓고 있는 일련의 환자에 대한 데이터를 수집했다.
- 치료 과정 동안 각 환자는 5가지 약물, 즉 약물 A, 약물 B, 약물 c, 약물 x 및 y 중 하나에 반응했다.
-  미래에 동일한 질병을 앓는 환자에게 어떤 약물이 적합할 수 있는지 알아보기 위한 모델을 구축한다.

In [1]:
import pandas as pd

drug_df = pd.read_csv('./datasets/drugs.csv')
drug_df

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,F,HIGH,HIGH,25.355,drugY
1,47,M,LOW,HIGH,13.093,drugC
2,47,M,LOW,HIGH,10.114,drugC
3,28,F,NORMAL,HIGH,7.798,drugX
4,61,F,LOW,HIGH,18.043,drugY
...,...,...,...,...,...,...
195,56,F,LOW,HIGH,11.567,drugC
196,16,M,LOW,HIGH,12.006,drugC
197,52,M,NORMAL,HIGH,9.894,drugX
198,23,M,NORMAL,NORMAL,14.020,drugX


In [2]:
drug_df.isna().sum()

Age            0
Sex            0
BP             0
Cholesterol    0
Na_to_K        0
Drug           0
dtype: int64

In [3]:
drug_df['Sex'] = drug_df['Sex'].replace({'F': 0, 'M': 1})
drug_df['BP'] = drug_df['BP'].replace({'HIGH': 2, 'NORMAL': 1, 'LOW': 0})
drug_df['Cholesterol'] = drug_df['Cholesterol'].replace({'HIGH': 2, 'NORMAL': 1, 'LOW': 0})
drug_df

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,0,2,2,25.355,drugY
1,47,1,0,2,13.093,drugC
2,47,1,0,2,10.114,drugC
3,28,0,1,2,7.798,drugX
4,61,0,0,2,18.043,drugY
...,...,...,...,...,...,...
195,56,0,0,2,11.567,drugC
196,16,1,0,2,12.006,drugC
197,52,1,1,2,9.894,drugX
198,23,1,1,1,14.020,drugX


### 데이터 분리

In [4]:
target = drug_df.iloc[:, -1]
print(type(target))
target

<class 'pandas.core.series.Series'>


0      drugY
1      drugC
2      drugC
3      drugX
4      drugY
       ...  
195    drugC
196    drugC
197    drugX
198    drugX
199    drugX
Name: Drug, Length: 200, dtype: object

In [5]:
feature = drug_df.iloc[:, :-1]
feature

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K
0,23,0,2,2,25.355
1,47,1,0,2,13.093
2,47,1,0,2,10.114
3,28,0,1,2,7.798
4,61,0,0,2,18.043
...,...,...,...,...,...
195,56,0,0,2,11.567
196,16,1,0,2,12.006
197,52,1,1,2,9.894
198,23,1,1,1,14.020


In [16]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

dtc = DecisionTreeClassifier()

x_train, x_test, y_train, y_test = train_test_split(feature, target, test_size=0.25)
print(x_train)
print(x_test)
print(y_train)
print(y_test)
dtc.fit(x_train.values, y_train)

     Age  Sex  BP  Cholesterol  Na_to_K
26    31    1   2            2   30.366
176   48    1   2            1   10.446
187   47    1   2            2   10.403
101   45    0   2            2   12.854
138   51    1   2            1   11.343
..   ...  ...  ..          ...      ...
122   34    1   1            2   22.456
177   25    1   1            2   19.011
20    57    1   0            1   19.128
82    32    0   0            2    9.712
173   41    0   0            1   18.739

[150 rows x 5 columns]
     Age  Sex  BP  Cholesterol  Na_to_K
128   47    1   0            1   33.542
104   22    1   2            1   28.294
38    39    0   1            1    9.709
50    58    0   2            2   19.416
110   50    1   2            2    7.490
149   22    1   0            2    8.151
18    23    1   0            2    7.298
178   39    1   1            2   15.969
189   64    1   2            1   20.932
120   28    1   1            2   27.064
152   55    1   1            1    7.261
157   53    1   

In [7]:
dtc.predict(x_test.values)

array(['drugC', 'drugX', 'drugY', 'drugX', 'drugY', 'drugX', 'drugA',
       'drugX', 'drugY', 'drugX', 'drugC', 'drugX', 'drugY', 'drugA',
       'drugX', 'drugY', 'drugY', 'drugY', 'drugX', 'drugY', 'drugC',
       'drugA', 'drugX', 'drugY', 'drugY', 'drugY', 'drugY', 'drugY',
       'drugY', 'drugA', 'drugX', 'drugX', 'drugY', 'drugY', 'drugY',
       'drugX', 'drugY', 'drugX', 'drugY', 'drugC', 'drugX', 'drugY',
       'drugY', 'drugX', 'drugY', 'drugY', 'drugY', 'drugY', 'drugX',
       'drugX'], dtype=object)

In [8]:
from sklearn.metrics import accuracy_score

accuracy_score(y_test, dtc.predict(x_test.values))

1.0

In [31]:
feature_names = drug_df.columns.drop('Drug').tolist()
feature_importances = dtc.feature_importances_
print(feature_names, feature_importances)

['Age', 'Sex', 'BP', 'Cholesterol', 'Na_to_K'] [0.14494398 0.         0.28200527 0.11717576 0.45587499]


In [10]:
for name, value in zip(iris.feature_names,dtc.feature_importances_):
    print(f'{name}: {round(value * 100, 2)}')

sepal length (cm): 16.27
sepal width (cm): 0.0
petal length (cm): 27.42
petal width (cm): 11.24


In [12]:
import matplotlib.pyplot as plt
import seaborn as sns

sns.barplot(x=dtc.feature_importances_, y=iris.feature_names)
plt.show()

  order = pd.unique(vector)


KeyError: 'sepal length (cm)'

In [None]:
import numpy as np

# Classifier의 Decision Boundary를 시각화 하는 함수
def visualize_boundary(model, X, y):
    fig,ax = plt.subplots()
    
    # 학습 데이타 scatter plot으로 나타내기
    ax.scatter(X[:, 0], X[:, 1], c=y, s=25, cmap='rainbow', edgecolor='k',
               clim=(y.min(), y.max()), zorder=3)
    ax.axis('tight')
    ax.axis('off')
    xlim_start , xlim_end = ax.get_xlim()
    ylim_start , ylim_end = ax.get_ylim()
    
    # 호출 파라미터로 들어온 training 데이타로 model 학습 . 
    model.fit(X, y)
    # meshgrid 형태인 모든 좌표값으로 예측 수행. 
    xx, yy = np.meshgrid(np.linspace(xlim_start,xlim_end, num=200),np.linspace(ylim_start,ylim_end, num=200))
    Z = model.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)
    
    # contourf() 를 이용하여 class boundary 를 visualization 수행. 
    n_classes = len(np.unique(y))
    contours = ax.contourf(xx, yy, Z, alpha=0.3,
                           levels=np.arange(n_classes + 1) - 0.5,
                           cmap='rainbow',
                           zorder=1)