In [8]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


In [9]:

import torch

from data import get_data

In [10]:
# df = pd.DataFrame([[1,True,'Yes'],[2,True,'Yes'],[3,False,'No'],[4,True,'No'],[5,False,'Yes'],
#                    [6,False,'No'],[7,True,'Yes'],[8,False,'No'],[9,True,'No'],[10,True,'Yes'],
#                    ],
#                    columns=['num','income_over_1000','tv_at_home'])
# df.head()
# goal_label = 'tv_at_home'

In [11]:
# df = get_data('drug200.csv')
# goal_label = 'Drug'

In [13]:
df = get_data('titanic/train.csv')
df = df.drop(labels=['Name','Ticket'], axis=1)
goal_label = 'Survived'


In [14]:
len(df)

891

In [15]:
df.dropna()
len(df)

891

In [16]:
unique_vals = {}
for col in df.columns:
    if col == 'id':
        continue
    col_df = df[col]
    unique_vals[col] = col_df.unique()

In [17]:
unique_vals.keys()

dict_keys(['PassengerId', 'Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin', 'Embarked'])

In [18]:
input_data = df[[key for key in unique_vals.keys() if key != goal_label]].copy()
labels = df[[goal_label]].copy

In [19]:
input_data

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,1,3,male,22.0,1,0,7.2500,,S
1,2,1,female,38.0,1,0,71.2833,C85,C
2,3,3,female,26.0,0,0,7.9250,,S
3,4,1,female,35.0,1,0,53.1000,C123,S
4,5,3,male,35.0,0,0,8.0500,,S
...,...,...,...,...,...,...,...,...,...
886,887,2,male,27.0,0,0,13.0000,,S
887,888,1,female,19.0,0,0,30.0000,B42,S
888,889,3,female,,1,2,23.4500,,S
889,890,1,male,26.0,0,0,30.0000,C148,C


In [20]:
labels()

Unnamed: 0,Survived
0,0
1,1
2,1
3,1
4,0
...,...
886,0
887,1
888,0
889,1


In [21]:
class_to_idx = {}
idx_to_class = {}
for key in unique_vals:
    if isinstance(unique_vals[key][0], str):
        num_classes = len(unique_vals[key])
        class_to_idx[key] = {unique_vals[key][idx]:idx for idx in range(num_classes)}
        idx_to_class[key] = {idx: unique_vals[key][idx] for idx in range(num_classes)}
        # one_hot = torch.nn.functional.one_hot(torch.arange(0,num_classes)).float()
        # one_hot_encodings[key] = one_hot
class_to_idx

{'Sex': {'male': 0, 'female': 1}, 'Embarked': {'S': 0, 'C': 1, 'Q': 2, nan: 3}}

In [22]:
idx_to_class

{'Sex': {0: 'male', 1: 'female'}, 'Embarked': {0: 'S', 1: 'C', 2: 'Q', 3: nan}}

In [23]:
for key in class_to_idx:
    df[key] = df[key].map(class_to_idx[key])
df

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,1,0,3,0,22.0,1,0,7.2500,,0
1,2,1,1,1,38.0,1,0,71.2833,C85,1
2,3,1,3,1,26.0,0,0,7.9250,,0
3,4,1,1,1,35.0,1,0,53.1000,C123,0
4,5,0,3,0,35.0,0,0,8.0500,,0
...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,0,27.0,0,0,13.0000,,0
887,888,1,1,1,19.0,0,0,30.0000,B42,0
888,889,0,3,1,,1,2,23.4500,,0
889,890,1,1,0,26.0,0,0,30.0000,C148,1


In [24]:
### Information Gain - Classification

def entropy(probs):
    entropy = 0.
    for prob in probs:
        entropy -= (prob)*np.log2(prob) if prob > 0 else 0.
    return entropy


def compute_probs(values: list,
                  tar_vals: list,
                  df: pd.DataFrame,
                  col_idx: str,
                  tar_idx: str,
                  ):
    counts = []
    for val in values:
        tar_counts = []
        for tar_val in tar_vals:
            tar_counts.append(len(df[(df[col_idx]==val) & (df[tar_idx]==tar_val)]))
        counts.append(tar_counts)
    probs = []
    for val_count in counts:
        probs.append([v / sum(val_count) for v in val_count])
    return probs, counts


def bucket_probs(tar_values: list,
                 df: pd.DataFrame,
                 col_idx: str,
                 num_buckets: int = 2,
                 ):
    # num_buckets == 2, average split
    data = df.dropna(subset=[col_idx])
    counts = []
    if num_buckets == 2:
        split_vals = [data[col_idx].min(),data[col_idx].mean()] # ,data[col_idx].max()
    else:
        split_vals = [] # split into n buckets
    for val0,val1 in zip(split_vals[:-1],split_vals[1:]):
        tar_counts = []
        for val in tar_values:
            tar_counts.append(len(data[data[col_idx].between(val0,val1)==val]))
        counts.append(tar_counts)
    probs = []
    for val_count in counts:
        probs.append([v / sum(val_count) for v in val_count])
    return probs, counts, split_vals


def information_gain(df: pd.DataFrame,
                     split_idx: int,
                     tar_idx: int,
                     max_split_nodes: int = 5,
                     ):
    data_num = df[tar_idx].count()
    # parent entropy
    tar_vals = df[tar_idx].unique()
    if len(tar_vals) <= max_split_nodes:
        counts = []
        for val in tar_vals:
            tar_probs = counts.append(len(df[df[tar_idx]==val]))
        tar_probs = counts / data_num
    else: # for regression, we can take the average or bucket  
        pass
    parent_entropy = entropy(tar_probs) # find int to column map in pandas
    # compute split entropies
    split_vals = df[split_idx].unique()
    if len(split_vals) <= max_split_nodes:
        # print(f'raw count for {split_idx}...')
        split_probs, split_counts = compute_probs(values=split_vals,
                                                  tar_vals=tar_vals,  
                                                  df=df, 
                                                  col_idx=split_idx,
                                                  tar_idx=tar_idx,
                                                  )
    else: # for regression, we can take the average or bucket for 'val'
        # print(f'bucket count for {split_idx}...')
        split_probs, split_counts, split_vals = bucket_probs(tar_values=tar_vals,
                                                             df=df,
                                                             col_idx=split_idx,
                                                             num_buckets=2,
                                                             )
    split_entropy = 0.
    for idx, prob in enumerate(split_probs):
        split_entropy += entropy(prob) * (sum(split_counts[idx]) / data_num)
    # information gain
    # print(f'#####\nparent entropy: {parent_entropy}\nsplit entropy: {split_entropy}')
    return parent_entropy - split_entropy, split_vals

In [93]:
def best_split(df: pd.DataFrame,
               goal_label: str,
               ):
    best_col = ''
    best_ig = 0.
    best_split_vals = []
    for col in df.columns:
        if col not in [goal_label,'PassengerId','Cabin']:
            ig, split_vals = information_gain(df=df, split_idx=col, tar_idx=goal_label)
            if ig > best_ig:
                best_ig = ig
                best_col = col
                best_split_vals = split_vals
            # print(f"{col+' information gain:':35} {ig:.5f}")
    return best_col, np.sort(best_split_vals)

In [94]:
### Gini Index

def gini(probs):
    gini = 0.
    for prob in probs:
        gini -= prob**2
    return 1-gini



In [95]:
class Node:
    def __init__(self, 
                 node_idxs: list,
                 prev_depth: int = 0,
                 split_idx: int = 0,
                 split_vals: tuple = None,
                 is_terminal: bool = False,
                 ):
        self.split_idx = split_idx
        self.split_vals = split_vals
        self.node_idxs = node_idxs
        self.children = []
        self.depth = prev_depth+1
        self.is_terminal = is_terminal


In [102]:
class DecisionTree:
    def __init__(self,
                 data: pd.DataFrame,
                 goal_label: str, 
                 max_depth: int = 5,
                 min_feats: int = 10,
                 num_class: int = 0,
                 ):
        all_idxs = np.array(range(len(data)))
        self.root = Node(node_idxs=all_idxs)
        self.max_depth = max_depth
        
        self.min_feats = min_feats
        self.num_class = num_class

        self.goal_label = goal_label

        self.grow_tree(data=data,
                       )
    
    def grow_tree(self, 
                  data: pd.DataFrame,
                  ):
        # node_queue = [self.root]
        # while len(node_queue) > 0:
        self.split_node(node=self.root,
                        data=data,
                        )
        

    def split_node(self,
                   node: Node,
                   data: pd.DataFrame,
                   ):
        # print(node.split_idx, len(node.node_idxs))
        
        curr_node = node if node else self.root
        if curr_node.depth >= self.max_depth:
            curr_node.is_terminal = True
            # print('Reached maximum depth!')
            return
        
        if len(curr_node.node_idxs) < self.min_feats:
            curr_node.is_terminal = True
            # print('Not enough data features to split node!')
            return
            
        node_data = data.iloc[curr_node.node_idxs]
        
        split_col, split_vals = best_split(df=node_data,
                                           goal_label=self.goal_label,
                                           )
        
        # print(split_col, split_vals)
        for val0,val1 in zip(split_vals[:],np.append(split_vals[1:],split_vals[-1]+1e-6)):
            data_idxs = data[data[split_col].between(val0,val1,inclusive='left')].index.to_numpy()
            child = Node(split_idx=split_col,
                         split_vals=(val0,val1),
                         prev_depth=curr_node.depth,
                         node_idxs=data_idxs,
                         )
            curr_node.children.append(child)
            
            # child_data = data.iloc[data_idxs]
            self.split_node(child,
                            data.drop([split_col], axis=1), #child_data,
                            )
            # print('next child...')
        
    def enumerate(self):
        nodes = [ self.root ]
        while len(nodes) > 0:
            node = nodes.pop(0)
            # print(f"{' '*node.depth}[{node.split_idx}] - [{node.split_vals}]")
            if not node.is_terminal:
                print(f"{' '*node.depth}[{node.split_idx}] - [{node.split_vals}]")
                for child in node.children[::-1]:
                    nodes.insert(0,child)
            else:
                print(f"{' '*node.depth}[{node.split_idx}] (terminal) - [{node.split_vals}]")


In [107]:
dt = DecisionTree(data=df,
                  goal_label=goal_label,
                  max_depth=4,
                  min_feats=50,
                  )

In [108]:
dt.enumerate()

 [0] - [None]
  [Sex] - [(0, 1.0)]
   [Parch] - [(0.0, 0.23570190641247835)]
    [Age] (terminal) - [(5.0, 32.17850287907869)]
    [Age] (terminal) - [(32.17850287907869, 32.17850387907869)]
   [Parch] (terminal) - [(0.23570190641247835, 0.23570290641247835)]
  [Sex] - [(1, 1.000001)]
   [Pclass] - [(1, 2.0)]
    [Age] (terminal) - [(0.92, 38.233440860215055)]
    [Age] (terminal) - [(38.233440860215055, 38.23344186021505)]
   [Pclass] - [(2, 3.0)]
    [Parch] (terminal) - [(0, 1.0)]
    [Parch] (terminal) - [(1, 2.0)]
    [Parch] (terminal) - [(2, 3.0)]
    [Parch] (terminal) - [(3, 3.000001)]
   [Pclass] - [(3, 3.000001)]
    [Age] (terminal) - [(0.42, 25.14061971830986)]
    [Age] (terminal) - [(25.14061971830986, 25.14062071830986)]


In [56]:
np.append(np.array([1,2,3]),4)

array([1, 2, 3, 4])

In [None]:
from sklearn import tree
classifier = tree.DecisionTreeClassifier()

In [None]:
X = df[['Age','Sex','BP','Cholesterol','Na_to_K']]
Y = df[['Drug']]

In [None]:
classifier = classifier.fit(X,Y)

In [None]:
[idx_to_class['Drug'][idx] for idx in classifier.predict(X.iloc[0:3])]

['DrugY', 'drugC', 'drugC']

In [None]:
labels().iloc[0:3]

Unnamed: 0,Drug
0,DrugY
1,drugC
2,drugC


In [None]:
idx_to_class

{'Sex': {0: 'F', 1: 'M'},
 'BP': {0: 'HIGH', 1: 'LOW', 2: 'NORMAL'},
 'Cholesterol': {0: 'HIGH', 1: 'NORMAL'},
 'Drug': {0: 'DrugY', 1: 'drugC', 2: 'drugX', 3: 'drugA', 4: 'drugB'}}