# In [41]:
from collections import Counter

import numpy as np
import pandas as pd

# Gini Impurity Calculation
def gini_impurity(yes_count, total_count):
    """Return the Gini impurity of a two-class group.

    Args:
        yes_count: Number of members belonging to the positive class.
        total_count: Total number of members in the group.

    Returns:
        ``1 - p_yes**2 - p_no**2`` where the two proportions are taken over
        ``total_count``; ``0`` when the group is empty.
    """
    if total_count != 0:
        p_yes = yes_count / total_count
        p_no = (total_count - yes_count) / total_count
        return 1 - (p_yes ** 2) - (p_no ** 2)
    # An empty group has no impurity (and would otherwise divide by zero).
    return 0

# Calculate Gini Impurity for a given feature
def calculate_gini(df, feature, target='salary_more_then_100k'):
    """Return the weighted Gini impurity of splitting ``df`` on ``feature``.

    Each distinct value of ``feature`` forms one group; the group's impurity
    (rows where ``target == 1`` count as the positive class) is weighted by
    the fraction of rows that fall into it.

    Args:
        df: DataFrame containing the ``feature`` and ``target`` columns.
        feature: Name of the column to split on.
        target: Name of the label column.

    Returns:
        The sum of the weighted group impurities; ``0`` when ``df`` has no
        rows.
    """
    n_rows = len(df)

    def weighted_impurity(level):
        # Rows of df whose feature equals this level.
        group = df[df[feature] == level]
        positives = len(group[group[target] == 1])
        return (len(group) / n_rows) * gini_impurity(positives, len(group))

    return sum(weighted_impurity(level) for level in df[feature].unique())

# Split a DataFrame based on a given feature and value
def split_data(df, feature, value):
    """Return the rows of ``df`` whose ``feature`` column equals ``value``.

    Args:
        df: DataFrame to filter.
        feature: Name of the column to compare.
        value: Value the column must equal for a row to be kept.

    Returns:
        The matching rows, with all columns and the original index labels.
    """
    matches = df[feature] == value
    return df[matches]

# Node class for Decision Tree
class Node:
    """A single node of the decision tree.

    Attributes are plain public fields:
        feature: Column name this node splits on (``None`` by default).
        value: Stored as given; not read by the code visible in this file.
        result: Prediction held by the node (``None`` by default).
        children: Dict of child nodes, created empty for every instance.
    """

    def __init__(self, feature=None, value=None, result=None):
        self.feature, self.value, self.result = feature, value, result
        # A fresh dict per node, so children are never shared between nodes.
        self.children = dict()

# Decision Tree Algorithm
def decision_tree(df, features, target='salary_more_then_100k'):
    """Recursively build a multi-way decision tree using Gini impurity.

    At each level the feature with the lowest weighted Gini impurity is
    chosen (ties go to the feature listed first), one child is created per
    distinct value of that feature, and the feature is removed from the
    candidates passed to the children.

    Args:
        df: Training rows; must contain every column in ``features`` plus
            the ``target`` column.
        features: List of candidate column names to split on. It is not
            modified.
        target: Name of the label column. Defaults to the column name this
            file was originally written for.

    Returns:
        A ``Node``. Leaves have ``result`` set (the single remaining label,
        or the most frequent label when no features are left); internal
        nodes have ``feature`` set and ``children`` keyed by feature value.
    """
    labels = df[target]

    # Pure group: every row carries the same label, so stop with a leaf.
    if labels.nunique(dropna=False) == 1:
        return Node(result=labels.iloc[0])

    best_gini = float('inf')
    best_feature = None
    for feature in features or ():
        gini = calculate_gini(df, feature, target)
        if gini < best_gini:
            best_gini = gini
            best_feature = feature

    if best_feature is None:
        # Nothing left to split on: predict the most frequent label.
        return Node(result=labels.mode()[0])

    # The candidate list for the children is the same for every branch, so
    # build it once. Filtering (rather than removing one occurrence) also
    # keeps a duplicated feature name from being offered after its column
    # has been dropped.
    remaining_features = [name for name in features if name != best_feature]

    node = Node(feature=best_feature)
    for value in df[best_feature].unique():
        subset = split_data(df, best_feature, value)
        if len(subset) == 0:
            # Happens when the value matches no row under `==` (e.g. NaN);
            # fall back to the most frequent label at this level.
            node.children[value] = Node(result=labels.mode()[0])
        else:
            node.children[value] = decision_tree(
                subset.drop(columns=[best_feature]),
                remaining_features,
                target,
            )

    return node

# Load the training data and fit the tree on the chosen feature columns.
features = ['company', 'job', 'degree']
df = pd.read_csv("salaries.csv")
tree = decision_tree(df, features)

def _majority_leaf_label(node):
    """Return the most common leaf ``result`` beneath ``node``, or None if there are no leaves."""
    counts = Counter()
    pending = [node]
    while pending:
        current = pending.pop()
        if current.result is not None:
            counts[current.result] += 1
        else:
            pending.extend(current.children.values())
    if not counts:
        return None
    return counts.most_common(1)[0][0]


def classify(tree, sample):
    """Predict the label of ``sample`` by walking ``tree`` from its root.

    Args:
        tree: Root node. A node whose ``result`` is not ``None`` is treated
            as a leaf; any other node is read through its ``feature`` and
            ``children`` attributes.
        sample: Mapping from feature name to feature value (e.g. a dict).
            It does not need to contain the target column.

    Returns:
        The ``result`` of the leaf reached. If the sample's value for a
        node's feature has no matching child (a value never seen when the
        tree was built), the most common leaf label beneath that node is
        returned instead, or ``None`` when that node has no leaves at all.

    Raises:
        KeyError: If a dict ``sample`` lacks a feature the tree splits on.
    """
    node = tree
    while node.result is None:
        value = sample[node.feature]
        if value not in node.children:
            # The previous fallback read the target column from the sample
            # and called .mode() on it, which always raised; vote among the
            # leaves of this subtree instead.
            return _majority_leaf_label(node)
        node = node.children[value]
    return node.result

# Classify one hand-written sample with the fitted tree and print the label.
sample_data = {
    'company': 'google',
    'job': 'business manager',
    'degree': 'masters',
}
result = classify(tree, sample_data)
print("Predicted salary_more_then_100k:", result)

# Output: Predicted salary_more_then_100k: 1
