In [4]:
import pandas as pd
import numpy as np
import pprint

In [5]:
data = pd.read_csv('train.csv')

# frac - specifies 80%
# random_state - it saves the current state of the split. Used for reproducability
# Spliting data int Train-Validation[80-20]
train = data.sample(frac = 0.8, random_state = 200)
validation = data.drop(train.index)
categorical_train_data = train.iloc[:, 5:10]
categorical_validation_data = validation.iloc[:, 5:10]

In [6]:
def calc_entropy(table):
    target = table.keys()[-1] # target -> Label's column
    entropy = 0
    values = table[target].unique() # Finds the unique values in target variable, i.e. [0, 1]
    for value in values:
        '''
            total count of 0's         total count of 1's
            -------------------   ,    ---------------------  = frac
            total numbers of rows      total number of rows
        '''
        frac = table[target].value_counts()[value]/len(table[target])
        if frac == 0:
            return 0
        entropy += -frac*np.log2(frac)
    return(entropy) # returns entropy of whole dataset
print(calc_entropy(categorical_train_data))

0.7986105014037017


In [4]:
def calc_info_attribute(table, attribute):
    target = table.keys()[-1]
    variables = table[attribute].unique() # get unique variables in specified attribute
    target_variables = table[target].unique() # get unique in target, i.e. [0, 1]
    info = 0
    for v in variables:
        entropy = 0
        for target_v in target_variables:
            n = len(table[attribute][table[attribute] == v][table[target] == target_v])
            d = len(table[attribute][table[attribute] == v])
            frac1 = n/(d + np.finfo(float).eps) # eps - smallest representable number such that 1.0 + eps != 1.0
            if frac1 == 0:
                entropy += 0
            else:
                entropy += -frac1 * np.log2(frac1)
        frac2 = d/len(table)
        info += -frac2 * entropy
    return abs(info)

In [5]:
def winner_attribute(table, attribute_list):
    info_gain = []
#     print(attribute_list)
    if len(attribute_list) == 1:
        return attribute_list[0]
    for attribute in attribute_list:
        info_gain.append(calc_entropy(table) - calc_info_attribute(table, attribute))
    return attribute_list[np.argmax(info_gain)]

In [6]:
def build_tree(table, prev_table, attribute_list, tree = None):
    if len(table['left'].unique()) <= 1:
        return {'leaf' : table['left'].unique()[0]}
    elif len(table) == 0:
        return {'leaf': np.unique(table['left'])[np.argmax(np.unique(table['left'], return_counts = True)[1])]}
    elif len(attribute_list) == 0:
        return {'leaf': np.unique(table['left'])[np.argmax(np.unique(table['left'], return_counts = True)[1])]}
    node = winner_attribute(table, attribute_list)
    attribute_list.remove(node)
    if tree is None:
        tree = {}
        tree[node] = {}
    for v in table[node].unique():
        mod_table = table.where(table[node] == v).dropna()
        tree[node][v] = build_tree(mod_table, table, attribute_list[:])
    return tree

t = build_tree(categorical_train_data, categorical_train_data, ['Work_accident', 'promotion_last_5years', 'sales', 'salary'])
pprint.pprint(t)

{'salary': {'high': {'sales': {'IT': {'Work_accident': {0.0: {'promotion_last_5years': {0.0: {'leaf': 0.0}}},
                                                        1.0: {'leaf': 0.0}}},
                               'RandD': {'Work_accident': {0.0: {'promotion_last_5years': {0.0: {'leaf': 0.0},
                                                                                           1.0: {'leaf': 0.0}}},
                                                           1.0: {'leaf': 0.0}}},
                               'accounting': {'Work_accident': {0.0: {'promotion_last_5years': {0.0: {'leaf': 0.0},
                                                                                                1.0: {'leaf': 0.0}}},
                                                                1.0: {'leaf': 0.0}}},
                               'hr': {'Work_accident': {0.0: {'promotion_last_5years': {0.0: {'leaf': 0.0}}},
                                                        1.0: {'leaf': 0.0}}},

In [7]:
def pred(query, tree):
    try:
        if list(tree.keys())[0] == 'leaf':
            return tree['leaf']
        val = query[list(tree.keys())[0]]
        return pred(query, tree[list(tree.keys())[0]][val])
    except:
        return 0

In [8]:
predicted = []
def prediction(tree, data):
    for index, row in data.iterrows():
        predicted.append(pred(row, tree))
prediction(t, categorical_validation_data)

In [9]:
def calc_metrics(predictions, actual):
    tp, tn, fp, fn = 0, 0, 0, 0
    for i in range(len(actual)):
        if actual[i] == predictions[i] and actual[i] == 1:
            tp += 1
        elif actual[i] == predictions[i] and actual[i] == 0:
            tn += 1
        elif actual[i] != predictions[i] and actual[i] == 0:
            fp += 1
        else:
            fn += 1
    accuracy = (tn + tp)/(tn + tp + fp + fn)
    precision = tp/(tp + fp)
    recall = tp/(tp + fn)
    a = 1/precision
    b = 1/recall
    f1 = 2/(a + b)
    print('Accuracy - ', accuracy)
    print('Precision - ', precision)
    print('Recall - ', recall)
    print('F1 Score - ', f1)
    print('True Positive - ', tp)
    print('True Negative - ', tn)
    print('False Positive - ', fp)
    print('False Negative - ', fn)

target = categorical_validation_data['left'].tolist()

calc_metrics(predicted, target)

Accuracy -  0.7789145907473309
Precision -  1.0
Recall -  0.002008032128514056
F1 Score -  0.004008016032064128
True Positive -  1
True Negative -  1750
False Positive -  0
False Negative -  497


In [10]:
def make_bins(data):
    category_dict = {}
    label_dict = {}
    average_dict = {}
    for attribute in ['satisfaction_level', 'last_evaluation', 'number_project', 'average_montly_hours', 'time_spend_company']:
        max_counts = []
        data.sort_values(attribute, inplace = True)
        
        for val in data[attribute].unique():
            t=data[data[attribute]==val]
            max_counts.append([val, np.unique(t['left'])[np.argmax(np.unique(t['left'], return_counts = True)[1])]])
        category_dict[attribute] = max_counts
#     print(category_dict)
    
    for key in category_dict.keys():
        label = 0
        label_list = []
        average_list = []
        for i in range(len(category_dict[key])-1):
            if category_dict[key][i][1] == category_dict[key][i+1][1]:
                label_list.append(label)
            else:
                print()
                label += 1
                label_list.append(label)
                average_list.append([(category_dict[key][i][0] + category_dict[key][i+1][0])/2, label])
        average_dict[key] = average_list
        label_dict[key] = label_list
#         break
#     print(label_dict)
    return average_dict
make_bins(train)




















{'average_montly_hours': [[126.5, 1],
  [132.5, 2],
  [274.5, 3],
  [279.5, 4],
  [280.5, 5],
  [282.5, 6],
  [287.5, 7]],
 'last_evaluation': [[0.445, 1], [0.475, 2], [0.995, 3]],
 'number_project': [[2.5, 1], [5.5, 2]],
 'satisfaction_level': [[0.11499999999999999, 1],
  [0.355, 2],
  [0.46499999999999997, 3]],
 'time_spend_company': [[4.5, 1], [5.5, 2]]}

In [11]:
def numerical_to_categorical(data):
    average_dict = make_bins(data)
    print (average_dict)
    for key in average_dict.keys():
        data.sort_values(key, inplace = True)
        new_column = []
        i = 0
        for index, row in data.iterrows():
            if row[key] <= average_dict[key][i][0]:
                new_column.append(average_dict[key][i][1]-1)
            elif i < len(average_dict[key])-1:
                i = i + 1
                new_column.append(average_dict[key][i][1]-1)
            else:
                new_column.append(average_dict[key][i][1])
        data.drop(key, axis = 1)
        column = pd.Series(new_column)
        data[key] = column.values
    return data, average_dict

In [12]:
def numerical_to_categorical_validation(data, average_dict):
    for key in average_dict.keys():
        data.sort_values(key, inplace = True)
        new_column = []
        i = 0
        for index, row in data.iterrows():
            if row[key] <= average_dict[key][i][0]:
                new_column.append(average_dict[key][i][1]-1)
            elif i < len(average_dict[key])-1:
                i = i + 1
                new_column.append(average_dict[key][i][1]-1)
            else:
                new_column.append(average_dict[key][i][1])
        data.drop(key, axis = 1)
        column = pd.Series(new_column)
        data[key] = column.values
    return data


In [13]:
discrete_train_data, average_dict = numerical_to_categorical(train)
print(discrete_train_data)
tree = build_tree(discrete_train_data, discrete_train_data, ['satisfaction_level', 'last_evaluation', 'number_project', 'average_montly_hours', 'time_spend_company', 'Work_accident', 'promotion_last_5years', 'sales', 'salary'])
# pprint.pprint(t)


















{'last_evaluation': [[0.445, 1], [0.475, 2], [0.995, 3]], 'satisfaction_level': [[0.11499999999999999, 1], [0.355, 2], [0.46499999999999997, 3]], 'number_project': [[2.5, 1], [5.5, 2]], 'average_montly_hours': [[126.5, 1], [132.5, 2], [274.5, 3], [279.5, 4], [280.5, 5], [282.5, 6], [287.5, 7]], 'time_spend_company': [[4.5, 1], [5.5, 2]]}
       satisfaction_level  last_evaluation  number_project  \
7482                    3                2               1   
9300                    3                2               1   
8585                    3                2               1   
5279                    3                1               1   
1506                    3                2               1   
1531                    3                2               0   
4216                    3                2               1   
11195                   3                2               1   
9598                    3                2               1   
1282                   

In [14]:
discrete_validation_data = numerical_to_categorical_validation(validation, average_dict)
print(discrete_validation_data)
predicted = []
prediction(tree, discrete_validation_data)
target = validation['left'].tolist()
calc_metrics(predicted, target)

       satisfaction_level  last_evaluation  number_project  \
3286                    3                2               1   
6924                    3                2               0   
9029                    3                2               1   
6847                    3                2               1   
8769                    3                2               1   
1790                    3                2               1   
9758                    3                2               0   
8919                    3                2               1   
2529                    3                2               0   
5381                    3                2               1   
10364                   3                2               1   
10586                   1                2               1   
659                     3                2               1   
7095                    3                2               1   
4879                    3                1               2   
2748    

Accuracy -  0.958185053380783
Precision -  0.8992094861660079
Recall -  0.9136546184738956
F1 Score -  0.9063745019920318
True Positive -  455
True Negative -  1699
False Positive -  51
False Negative -  43
