## 엔트로피 
### 1. Entropy
- 엔트로피 공식
    - T = p+ + p- (p+는 목표 대상의 개수, p-는 나머지 대상의 개수)
    - E(p+, p-) = - (p+ / T) * log2(p+ / T) - (p- / T) * log2(p- / T)
### 2. Information Gain
- Information Gain이 가장 큰 속성을 의사결정트리의 분할 기준으로 선정
    - Information Gain = 전체 데이터의 Entropy - 해당 속성으로 분할한 뒤의 가중 평균 Entropy

In [3]:
import pandas as pd
import numpy as np
print(pd.__version__)

0.24.2


In [2]:
# Toy tree-health dataset, written one observation per row.
# Feature values are kept as the strings "True"/"False", not booleans.
_feature_columns = ["no_insects", "no_dead", "no_wilting", "no_diseases"]
_rows = [
    ("True", "True", "True", "True", "Good"),
    ("True", "True", "True", "True", "Good"),
    ("True", "False", "True", "False", "Poor"),
    ("False", "True", "True", "True", "Good"),
    ("True", "True", "True", "True", "Good"),
    ("True", "True", "True", "True", "Good"),
    ("True", "False", "False", "False", "Poor"),
    ("True", "False", "True", "False", "Poor"),
    ("True", "True", "True", "True", "Good"),
    ("False", "False", "True", "True", "Poor"),
]
data = pd.DataFrame(_rows, columns=_feature_columns + ["tree_health"])
# Descriptive features
features = data[_feature_columns]
# Target feature
target = data["tree_health"]
print(data)


  no_insects no_dead no_wilting no_diseases tree_health
0       True    True       True        True        Good
1       True    True       True        True        Good
2       True   False       True       False        Poor
3      False    True       True        True        Good
4       True    True       True        True        Good
5       True    True       True        True        Good
6       True   False      False       False        Poor
7       True   False       True       False        Poor
8       True    True       True        True        Good
9      False   False       True        True        Poor


In [10]:
# Entropy
def entropy(target_col):
    """Return the Shannon entropy (in bits) of the labels in ``target_col``.

    Prints the distinct labels and their counts as a side effect.

    Args:
        target_col: Array-like of class labels accepted by ``np.unique``.

    Returns:
        ``-sum(p * log2(p))`` over the relative frequency ``p`` of each
        distinct label; ``0.0`` when there is at most one distinct label.
    """
    elements, counts = np.unique(target_col, return_counts=True)
    print('elements : ', elements)
    print('counts : ', counts)
    # Relative frequency of every distinct label, computed in one step.
    probabilities = counts / counts.sum()
    # Negating an all-zero sum (single-class or empty input) yields -0.0;
    # adding 0.0 turns that into a plain 0.0 and leaves other values as-is.
    return -np.sum(probabilities * np.log2(probabilities)) + 0.0

In [11]:
# Entropy of the whole target column, reported to five decimal places.
_target_entropy = entropy(target)
print('H(x) = ', round(_target_entropy, 5))

elements :  ['Good' 'Poor']
counts :  [6 4]
H(x) =  0.97095


In [24]:
# Information gain
def InfoGain(data, split_attribute_name, target_name):
    """Return the information gain of splitting ``data`` on one attribute.

    The gain is the entropy of the target column minus the weighted
    average entropy of the target within each distinct value of the split
    attribute. Intermediate values are printed as a side effect.

    Args:
        data: DataFrame holding both the split attribute and the target.
        split_attribute_name: Column whose distinct values define the split.
        target_name: Column holding the class labels.

    Returns:
        ``entropy(target) - sum(weight_v * entropy(target | attribute == v))``
        where ``weight_v`` is the fraction of rows with value ``v``.
    """
    # Entropy of the target over the whole dataset.
    total_entropy = entropy(data[target_name])
    print("Entropy(D) = ", round(total_entropy, 5))

    # Weighted entropy of the target after the split.
    split_column = data[split_attribute_name]
    vals, counts = np.unique(split_column, return_counts=True)
    print('vals : ', vals)
    print('counts W: ', counts)
    total_rows = counts.sum()
    # Select the target labels with a boolean mask so that every matching row
    # is used; a where(...).dropna() selection would also discard rows that
    # merely have a missing value in some unrelated column.
    Weighted_Entropy = np.sum([
        (count / total_rows)
        * entropy(data.loc[split_column == val, target_name])
        for val, count in zip(vals, counts)
    ])
    print('H(',split_attribute_name,')= ', round(Weighted_Entropy, 5))

    # Information gain.
    Information_Gain = total_entropy - Weighted_Entropy
    return Information_Gain
    

In [25]:
# Report the information gain for three candidate split attributes
# (no_dead is not evaluated here). Every line except the last is followed
# by an extra '\n' argument, which leaves a blank line between reports.
_split_attributes = ["no_insects", "no_wilting", "no_diseases"]
for _position, _attribute in enumerate(_split_attributes):
    _gain = round(InfoGain(data, _attribute, "tree_health"), 5)
    _label = 'InfoGain( ' + _attribute + ' ) = '
    if _position < len(_split_attributes) - 1:
        print(_label, _gain, '\n')
    else:
        print(_label, _gain)

elements :  ['Good' 'Poor']
counts :  [6 4]
Entropy(D) =  0.97095
vals :  ['False' 'True']
counts W:  [2 8]
elements :  ['Good' 'Poor']
counts :  [1 1]
elements :  ['Good' 'Poor']
counts :  [5 3]
H( no_insects )=  0.96355
InfoGain( no_insects ) =  0.0074 

elements :  ['Good' 'Poor']
counts :  [6 4]
Entropy(D) =  0.97095
vals :  ['False' 'True']
counts W:  [1 9]
elements :  ['Poor']
counts :  [1]
elements :  ['Good' 'Poor']
counts :  [6 3]
H( no_wilting )=  0.82647
InfoGain( no_wilting ) =  0.14448 

elements :  ['Good' 'Poor']
counts :  [6 4]
Entropy(D) =  0.97095
vals :  ['False' 'True']
counts W:  [3 7]
elements :  ['Poor']
counts :  [3]
elements :  ['Good' 'Poor']
counts :  [6 1]
H( no_diseases )=  0.41417
InfoGain( no_diseases ) =  0.55678
