# 의사결정트리 ID3 구현
## 데이터 로드

In [1]:
import pandas as pd
import numpy as np

In [2]:
#https://www.kaggle.com/datasets/uom190346a/disease-symptoms-and-patient-profile-dataset
df = pd.read_csv('data.csv')
df.head(5)

Unnamed: 0,Disease,Fever,Cough,Fatigue,Difficulty Breathing,Age,Gender,Blood Pressure,Cholesterol Level,Outcome Variable
0,Influenza,Yes,No,Yes,Yes,19,Female,Low,Normal,Positive
1,Common Cold,No,Yes,Yes,No,25,Female,Normal,Normal,Negative
2,Eczema,No,Yes,Yes,No,25,Female,Normal,Normal,Negative
3,Asthma,Yes,Yes,No,Yes,25,Male,Normal,Normal,Positive
4,Asthma,Yes,Yes,No,Yes,25,Male,Normal,Normal,Positive


## 정보 획득량
E(S)-E(S|X)끼리 비교할 때 E(S)가 동일하므로 제거해도 같은 결과가 나온다.

In [3]:
def IG_rel(df, attr, target):
    rig = 0
    cate_p = df.value_counts(attr, normalize=True)
    for index, item in cate_p.items():
        target_p = df[df[attr] == index].value_counts(target, normalize=True).to_numpy()
        rig += item*np.sum(target_p*np.log2(target_p))

    return rig

## 의사 결정 트리
재귀적으로 서브 트리를 생성하는 방식. 분류 또한 재귀적으로 수행된다.

In [105]:
class DTree:
    def __init__(self, df, target) -> None:
        self.target = df[target].value_counts().idxmax()
        if df.columns.size == 1 or df[target].nunique() == 1:
            self.column_name = None
            return
        ig_max = df.columns.to_series().drop(target).apply(lambda idx: IG_rel(df, idx, target)).idxmax()

        self.column_name = ig_max
        self.childs = {}
        for cat in np.unique(df[ig_max]):
            sub_df = df[df[ig_max] == cat].drop(columns=[ig_max])
            self.childs[cat] = DTree(sub_df, target)

    def classify(self, data):    
        if self.column_name is None:
            return self.target
        else:
            col = data[self.column_name].item() if type(data) is pd.DataFrame else data[self.column_name]
            return self.childs[col].classify(data) if col in self.childs else self.target

## 학습 및 검증

In [115]:
categ_df = df.drop(columns=['Age'])
target = 'Outcome Variable'

test_set = np.random.choice(df.index, 50, replace=False)
test_df = categ_df.loc[test_set]
train_df = categ_df.drop(test_set)

mtree = DTree(categ_df, target)

res = test_df.apply(lambda row: (mtree.classify(row) == row[target]), axis='columns')
print('정확도: {}%'.format(100*np.sum(res)/len(res)))

정확도: 100.0%
