In [33]:
import pandas as pd
import numpy as np

# Ask for the path of the data file at run time (execution pauses until the user answers).
file = input("Enter file name: ")

# Load the data set; swap pd.read_csv for another reader if the file is not a CSV.
# The functions below treat the LAST column of this frame as the class label.
df = pd.read_csv(file)

def calc_gini_S(df):

    """
    Gini impurity of the whole data set S, measured on its label column.

    The label column is taken to be the last column of `df`.

        gini(S) = 1 - sum over classes of f**2

    where f is the relative frequency of a class among the counted labels
    (value_counts() leaves missing labels out by default).
    """

    # class -> number of rows carrying that class label
    class_counts = df[df.columns[-1]].value_counts()
    n_counted = class_counts.sum()

    # squared relative frequency of every class, in value_counts order
    squared_freqs = [
        (class_counts.loc[cls] / n_counted) ** 2
        for cls in class_counts.index
    ]

    return 1 - np.sum(squared_freqs)
  
  
def calc_gini_index(feature, df = None):

    """
    Gini index of a single feature, broken down by the feature's values.

    The label column is taken to be the last column of `df`. Rows are
    partitioned by the distinct values ("groups") of `feature`; for each group

        gini(group) = 1 - sum over label classes of (class count / group total)**2

    and the feature-level index is the size-weighted sum of the group values:

        sum over groups of (group total / N) * gini(group)

    where N is the number of rows counted by `df[feature].value_counts()`.
    Any set of label classes is supported (not only 'Yes'/'No').

    Parameters
    ----------
    feature : column label of the feature to evaluate
    df : DataFrame, optional
        Data to evaluate. When omitted, the module-level `df` is looked up
        at call time, so a `df` re-loaded in a later cell is picked up
        instead of a copy frozen when this function was defined.

    Returns
    -------
    (groups_dict, feature_gini_index)
        groups_dict maps each group to
        {'total': rows in group, <label>: rows with that label, ...,
         'gini_index': gini(group)};
        feature_gini_index is the weighted sum described above.

    Raises
    ------
    ValueError
        If `df` is omitted and no module-level `df` is defined.

    Note: a label literally named 'total' or 'gini_index' would collide with
    the bookkeeping keys of the per-group dict.
    """

    if df is None:
        # The parameter shadows the global name, so go through globals().
        df = globals().get("df")
        if df is None:
            raise ValueError(
                "calc_gini_index: no DataFrame was passed and no module-level 'df' is defined"
            )

    # label classes, in value_counts order (most frequent first)
    label_col = df.columns[-1]
    labs = df[label_col].value_counts().index

    # size of every group (distinct value) of the feature
    val_counts = df[feature].value_counts()
    N = val_counts.sum()

    groups_dict = {}
    for group, total in zip(val_counts.index, val_counts.to_numpy()):
        # labels of the rows that belong to this group, selected once
        group_labels = df.loc[df[feature] == group, label_col]

        info = {'total': total}
        for lab in labs:
            info[lab] = int((group_labels == lab).sum())

        # impurity over ALL observed label classes, not a fixed pair
        probs_sq = [(info[lab] / total) ** 2 for lab in labs]
        info['gini_index'] = 1 - np.sum(probs_sq)

        groups_dict[group] = info

    # weight each group's impurity by its share of the counted rows
    feature_gini_index = 0
    for info in groups_dict.values():
        feature_gini_index += (info['total'] / N) * info['gini_index']

    return groups_dict, feature_gini_index

    
def gini_split(feature, df = df):

    """
    Weighted Gini impurity that remains after splitting S on `feature`.

        gini_split(S, T) = sum over subsets i of (N_i / N) * gini(S_i)

    T is the splitting feature. Every group reported by calc_gini_index
    is one subset S_i of size N_i, and N is the number of rows in `df`.
    Each subset's impurity is weighted by its share of the rows.
    """

    n_rows = df.shape[0]
    per_group = calc_gini_index(feature, df = df)[0]

    # one weighted term per group: (share of rows) * (impurity of the group)
    weighted_terms = [
        (info['total'] / n_rows) * info['gini_index']
        for info in per_group.values()
    ]

    return np.sum(weighted_terms)
  
def gini_gain(S, split):
    """
    Gini gain of a split: the impurity of the full set minus the
    weighted impurity left after splitting.

    S     -- gini(S) of the whole data set
    split -- gini split value of the candidate feature
    """

    gain = S - split
    return gain
  
def main(df):
    """
    Print a Gini report for `df`.

    The last column is the label; every other column is treated as a
    feature. For each feature the Gini index (with its per-group details),
    the Gini split and the Gini gain are printed, followed by the three
    measures sorted from highest to lowest value.
    """

    S = calc_gini_S(df)
    print(f"Gini Index (S) {df.columns[-1]}:\n{S}\n")

    # feature -> score, one dict per measure, filled in column order
    index_by_feature = {}
    split_by_feature = {}
    gain_by_feature = {}

    for feature in df.columns[:-1]:
        group_info, feature_score = calc_gini_index(feature, df = df)
        print(f"Gini Index for {feature}: {feature_score}\n")
        index_by_feature[feature] = feature_score

        for group, info in group_info.items():
            print(f"Gini Index Info for {feature} {group}\n{(group, info)}\n")

        split = gini_split(feature, df = df)
        gain = gini_gain(S, split)
        split_by_feature[feature] = split
        gain_by_feature[feature] = gain
        print(f"\nGini Split for {feature}\n{split}\n")
        print(f"Gini Gain for {feature}\n{gain}")
        print("--------------------\n")

    print("\n\nSorted Results\n")

    # heading text (including its leading newline) paired with the scores to list
    sections = (
        ("---Gini Index---", index_by_feature),
        ("\n---Gini Split---", split_by_feature),
        ("\n---Gini Gain---", gain_by_feature),
    )
    for heading, scores in sections:
        print(heading)
        ranked = sorted(scores.items(), reverse = True, key = lambda item: item[1])
        for name, score in ranked:
            print(name, score)
    
# Run the full Gini report on the DataFrame loaded above.
main(df)

Enter file name: play_tennis.csv
Gini Index (S) Play Tennis:
0.4591836734693877

Gini Index for Day: 0.0

Gini Index Info for Day 1
(1, {'total': 1, 'Yes': 0, 'No': 1, 'gini_index': 0.0})

Gini Index Info for Day 2
(2, {'total': 1, 'Yes': 0, 'No': 1, 'gini_index': 0.0})

Gini Index Info for Day 3
(3, {'total': 1, 'Yes': 1, 'No': 0, 'gini_index': 0.0})

Gini Index Info for Day 4
(4, {'total': 1, 'Yes': 1, 'No': 0, 'gini_index': 0.0})

Gini Index Info for Day 5
(5, {'total': 1, 'Yes': 1, 'No': 0, 'gini_index': 0.0})

Gini Index Info for Day 6
(6, {'total': 1, 'Yes': 0, 'No': 1, 'gini_index': 0.0})

Gini Index Info for Day 7
(7, {'total': 1, 'Yes': 1, 'No': 0, 'gini_index': 0.0})

Gini Index Info for Day 8
(8, {'total': 1, 'Yes': 0, 'No': 1, 'gini_index': 0.0})

Gini Index Info for Day 9
(9, {'total': 1, 'Yes': 1, 'No': 0, 'gini_index': 0.0})

Gini Index Info for Day 10
(10, {'total': 1, 'Yes': 1, 'No': 0, 'gini_index': 0.0})

Gini Index Info for Day 11
(11, {'total': 1, 'Yes': 1, 'No': 0

In [None]:
# play_tennis.csv