In [1]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

In [2]:
training_data = {
                'home_owner': ['yes', 'no', 'no', 'yes', 'no', 'no', 'yes', 'no', 'no', 'no'], 
                'marital_status': ['Single', 'Married', 'Single', 'Married', 'Married', 'Married', 'Married', 'Single', 
                                  'Married', 'Single'],
                 'annual_income' : [125, 100, 70, 120, 95, 60, 220, 85, 75, 90],
                'default': ['no', 'no', 'no', 'no', 'yes', 'no', 'no', 'yes', 'no', 'yes']
                }
df = pd.DataFrame(training_data)
df['annual_income'] = df['annual_income'].astype(int)
df

Unnamed: 0,home_owner,marital_status,annual_income,default
0,yes,Single,125,no
1,no,Married,100,no
2,no,Single,70,no
3,yes,Married,120,no
4,no,Married,95,yes
5,no,Married,60,no
6,yes,Married,220,no
7,no,Single,85,yes
8,no,Married,75,no
9,no,Single,90,yes


In [3]:
def is_numeric(value):
    return isinstance(value, int) or isinstance(value, float) or isinstance(value, np.int64)

is_numeric(1), is_numeric('hello'), is_numeric(1.0)

(True, False, True)

In [4]:
class Question:
    def __init__(self, col_name, col_value):
        self.col_name = col_name
        self.col_value = col_value
        
    def match(self, row):
        new_col_val = row[self.col_name]
#         print(new_col_val)
        if is_numeric(new_col_val):
            return new_col_val <= self.col_value
        else:
            return new_col_val == self.col_value
        
    def to_string(self):   
        question_str = (self.col_name + " <= " + str(self.col_value)) if is_numeric(self.col_value)  else (self.col_name + " == " + self.col_value)
        return question_str
        
        
income = Question('annual_income', 80) 
print('False == ', income.match(df.iloc[0]))
print('True == ', income.match(df.iloc[2]))
print(income.to_string())
marital_status = Question('marital_status', 'Single') 
print('True == ', marital_status.match(df.iloc[0]))
print('False == ', marital_status.match(df.iloc[1]))
print(marital_status.to_string())

False ==  False
True ==  True
annual_income <= 80
True ==  True
False ==  False
marital_status == Single


In [5]:
'''
Partition the data based on test condition given as question
'''
def partition(rows, question):

    true_rows, false_rows = [], []
        
    for i in range(0, len(rows)):
        row = df.iloc[i]

        if question.match(row):
            true_rows.append(row)
        else:
            false_rows.append(row)
            
    return(true_rows, false_rows)   

## Test Case 
true_rows, false_rows = partition(df, income)

print(pd.DataFrame(true_rows))
pd.DataFrame(false_rows)

  home_owner marital_status  annual_income default
2         no         Single             70      no
5         no        Married             60      no
8         no        Married             75      no


Unnamed: 0,home_owner,marital_status,annual_income,default
0,yes,Single,125,no
1,no,Married,100,no
3,yes,Married,120,no
4,no,Married,95,yes
6,yes,Married,220,no
7,no,Single,85,yes
9,no,Single,90,yes


In [6]:

    '''
        gini = 1 - frequency(class_0)^2 - frequency(class_1)^2
        
        gini values are between 0 and 1; 
            >> 0 means all records belong to the same class [This is the desired]        

        Assumptions: 
            >> method assumes that label default is the target and has values 'yes | no'     
            
    '''
    def gini_index(df):
        gini = 0
        
        total_records = len(df)

        if total_records > 0:
            records_class_0 = len (df[df['default'] == 'no'])
            records_class_1 =len (df[df['default'] == 'yes'])

            gini = 1 - (records_class_0/total_records)**2 - (records_class_1/total_records)**2
        
        return gini
    
    
    '''
        Determine Quality of Split based on gini 
        Assumptions: 
            >> Binary splits 
    '''
    def gini_information_gain(parent_df, child_1_df, child_2_df):
        
        parent_gini = gini_index(parent_df)
        
        child_1_gini = gini_index(child_1_df)
        
        child_2_gini = gini_index(child_2_df)
        
        gain = parent_gini - (len(child_1_df)/len(parent_df)) * child_1_gini - (len(child_2_df)/len(parent_df)) * child_2_gini
        
        return gain  

In [7]:
'''Test gini_index method'''
gini_index(df)

0.42000000000000004

In [8]:
'''Test gini_gain method'''

child_1 = df[0:4]
child_2 = df[4: ]


print('child_1 gini', gini_index(child_1))
print('child_2 gini', gini_index(child_2))
print('parent gini', gini_index(df))
print('Gini Gain', gini_information_gain(df, child_1, child_2))

child_1 gini 0.0
child_2 gini 0.5
parent gini 0.42000000000000004
Gini Gain 0.12000000000000005


In [24]:
'''
    How do you find the best questions? For now, take the unique values of all columns for string check for equality 
    and for numerics check for <= col value
'''

def find_best_split(df):
    
    best_gain = 1
    best_col = None
    best_question = None
    yes_df = no_df = None

    cols = [col for col in df.columns if col != 'default']
    
    for col in cols:
        unique_vals = df[col].unique()

        for val in unique_vals:
            col_question = Question(col, val)

            true_rows, false_rows = partition(df, col_question)

            y_df = pd.DataFrame(true_rows)
            n_df = pd.DataFrame(false_rows)

                
            split_gain = gini_information_gain(df, y_df, n_df)

            if split_gain != 0 and split_gain <= best_gain:
                best_gain = split_gain
                best_col = col
                best_question = col_question
                yes_df = y_df
                no_df = n_df
        
    return  yes_df, no_df, best_question   
    
    
    
    

In [25]:
yes_df, no_df, best_question  = find_best_split(df)
print(best_question.to_string())
print(gini_information_gain(df, yes_df,no_df))

annual_income <= 85


Unnamed: 0,home_owner,marital_status,annual_income,default
2,no,Single,70,no
5,no,Married,60,no
7,no,Single,85,yes
8,no,Married,75,no


In [14]:
class MyDecisionTree:
      
    
    def create_node():
        pass

    def tree():
        pass
    
    def fit(self, X_train, y_train):
        self.X_train = X_train
        self.y_train = y_train
        
        
    def predict(self, X_test):
        predictions = []
        
        return predictions
    
    

In [15]:
myDT = MyDecisionTree()

In [16]:
# iris = load_iris()

# X = iris.data
# y = iris.target

X = df.iloc[:, 0:3]
y = df.iloc[:, -1]

print(X.shape, y.shape)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

(10, 3) (10,)


In [17]:
myDT = MyDecisionTree()

myDT.fit(X_train, y_train)

y_pred = myDT.predict(X_test)