In [1]:
import pandas as pd
import numpy as np
import pprint

In [2]:
data = pd.read_csv('train.csv')
# print(data.iloc[:,0:2]) #iloc to index the columns

# frac - specifies 80%
# random_state - it saves the current state of the split. Used for reproducability
# Spliting data int Train-Validation[80-20]
train = data.sample(frac = 0.8, random_state = 200)
validation = data.drop(train.index)
categorical_train_data = train.iloc[:, 5:10]
categorical_validation_data = validation.iloc[:, 5:10]

In [3]:
def calc_entropy(table):
    target = table.keys()[-1] # target -> Label's column
    entropy = 0
    values = table[target].unique() # Finds the unique values in target variable, i.e. [0, 1]
    for value in values:
        '''
            total count of 0's         total count of 1's
            -------------------   ,    ---------------------  = frac
            total numbers of rows      total number of rows
        '''
        frac = table[target].value_counts()[value]/len(table[target])
        if frac == 0:
            return 0
        entropy += -frac*np.log2(frac)
    return(entropy) # returns entropy of whole dataset

In [4]:
def calc_info_attribute(table, attribute):
    target = table.keys()[-1]
    variables = table[attribute].unique() # get unique variables in specified attribute
    target_variables = table[target].unique() # get unique in target, i.e. [0, 1]
    info = 0
    for v in variables:
        entropy = 0
        for target_v in target_variables:
            n = len(table[attribute][table[attribute] == v][table[target] == target_v])
            d = len(table[attribute][table[attribute] == v])
            frac1 = n/(d + np.finfo(float).eps) # eps - smallest representable number such that 1.0 + eps != 1.0
            if frac1 == 0:
                entropy += 0
            else:
                entropy += -frac1 * np.log2(frac1)
        frac2 = d/len(table)
        info += -frac2 * entropy
    return abs(info)

In [5]:
def winner_attribute(table, attribute_list):
    info_gain = []
#     print(attribute_list)
    if len(attribute_list) == 1:
        return attribute_list[0]
    for attribute in attribute_list:
        info_gain.append(calc_entropy(table) - calc_info_attribute(table, attribute))
    return attribute_list[np.argmax(info_gain)]

In [6]:
def build_tree(table, prev_table, attribute_list, tree = None):
    if len(table['left'].unique()) <= 1:
        return {'leaf' : table['left'].unique()[0]}
    elif len(table) == 0:
        return {'leaf': np.unique(table['left'])[np.argmax(np.unique(table['left'], return_counts = True)[1])]}
    elif len(attribute_list) == 0:
        return {'leaf': np.unique(table['left'])[np.argmax(np.unique(table['left'], return_counts = True)[1])]}
    node = winner_attribute(table, attribute_list)
    attribute_list.remove(node)
    if tree is None:
        tree = {}
        tree[node] = {}
    for v in table[node].unique():
        mod_table = table.where(table[node] == v).dropna()
        tree[node][v] = build_tree(mod_table, table, attribute_list[:])
    return tree

t = build_tree(categorical_train_data, categorical_train_data, ['Work_accident', 'promotion_last_5years', 'sales', 'salary'])
pprint.pprint(t)

{'salary': {'high': {'sales': {'IT': {'Work_accident': {0.0: {'promotion_last_5years': {0.0: {'leaf': 0.0}}},
                                                        1.0: {'leaf': 0.0}}},
                               'RandD': {'Work_accident': {0.0: {'promotion_last_5years': {0.0: {'leaf': 0.0},
                                                                                           1.0: {'leaf': 0.0}}},
                                                           1.0: {'leaf': 0.0}}},
                               'accounting': {'Work_accident': {0.0: {'promotion_last_5years': {0.0: {'leaf': 0.0},
                                                                                                1.0: {'leaf': 0.0}}},
                                                                1.0: {'leaf': 0.0}}},
                               'hr': {'Work_accident': {0.0: {'promotion_last_5years': {0.0: {'leaf': 0.0}}},
                                                        1.0: {'leaf': 0.0}}},

In [7]:
def pred(query, tree):
    try:
        if list(tree.keys())[0] == 'leaf':
            return tree['leaf']
        val = query[list(tree.keys())[0]]
        return pred(query, tree[list(tree.keys())[0]][val])
    except:
        return 0

In [8]:
predicted = []
def prediction(tree, data):
    for index, row in data.iterrows():
        predicted.append(pred(row, tree))
prediction(t, categorical_validation_data)

In [12]:
def calc_metrics(predictions, actual):
    tp, tn, fp, fn = 0, 0, 0, 0
    for i in range(len(actual)):
        if actual[i] == predictions[i] and actual[i] == 1:
            tp += 1
        elif actual[i] == predictions[i] and actual[i] == 0:
            tn += 1
        elif actual[i] != predictions[i] and actual[i] == 0:
            fp += 1
        else:
            fn += 1
    accuracy = (tn + tp)/(tn + tp + fp + fn)
    precision = tp/(tp + fp)
    recall = tp/(tp + fn)
    a = 1/precision
    b = 1/recall
    f1 = 2/(a + b)
    print('Accuracy - ', accuracy)
    print('Precision - ', precision)
    print('Recall - ', recall)
    print('F1 Score - ', f1)
    print('True Positive - ', tp)
    print('True Negative - ', tn)
    print('False Positive - ', fp)
    print('False Negative - ', fn)

target = categorical_validation_data['left'].tolist()

calc_metrics(predicted, target)

Accuracy -  0.7789145907473309
Precision -  1.0
Recall -  0.002008032128514056
F1 Score -  500.00000000000006
True Positive -  1
True Negative -  1750
False Positive -  0
False Negative -  497
