Not: Veri setini bu linkten indirebilirsiniz: https://www.kaggle.com/datasets/parisrohan/credit-score-classification

In [None]:
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# Probability grid (100 points) shared by the impurity curves plotted below.
# NOTE(review): the endpoints stop short of 0 and 1, presumably so that
# entropy() never evaluates log2(0) -- confirm; the two margins differ
# (0.001 vs. 0.0001).
ps = np.linspace(0.001,.9999,100)

def gini_index(ps):
    """Return the two-class Gini impurity 2*p*(1-p) for probability ``ps``.

    ``ps`` may be a scalar or a numpy array; the arithmetic is element-wise.
    """
    complement = 1 - ps
    doubled = 2 * ps
    return doubled * complement

def misclass(ps):
    """Return the two-class misclassification error 1 - max(p, 1-p).

    ``ps`` may be a scalar or a numpy array; the result is element-wise.
    """
    majority_share = np.maximum(ps, 1 - ps)
    return 1 - majority_share

def entropy(ps):
    """Return the binary entropy (in bits) of probability ``ps``.

    Computes ``-p*log2(p) - (1-p)*log2(1-p)`` element-wise. A term whose
    probability is exactly 0 contributes 0 (the limit of ``x*log2(x)``), so
    ``entropy(0)`` and ``entropy(1)`` are 0 instead of NaN.

    Args:
        ps: scalar or array-like of probabilities.

    Returns:
        A numpy scalar for scalar input, otherwise a numpy array of the same
        shape. Values outside [0, 1] still yield NaN.
    """
    ps = np.asarray(ps, dtype=float)
    qs = 1 - ps
    # log2(0) is evaluated before np.where discards it; silence only that.
    with np.errstate(divide='ignore', invalid='ignore'):
        p_term = np.where(ps == 0, 0.0, -ps * np.log2(ps))
        q_term = np.where(qs == 0, 0.0, -qs * np.log2(qs))
    return p_term + q_term

In [None]:
# Overlay the three impurity measures on the same probability grid.
for impurity_fn, curve_label in (
    (gini_index, 'gini-index'),
    (misclass, 'misclass'),
    (entropy, 'entropy'),
):
    plt.plot(ps, impurity_fn(ps), label=curve_label)
plt.legend()
plt.show()

In [None]:
import pandas as pd 
# Load the training data; 'train.csv' is a relative path, so it is resolved
# against the current working directory.
train_df = pd.read_csv('train.csv')
# Row count before any cleaning.
len(train_df)

In [None]:
# Drop every row that contains at least one missing value.
train_df = train_df.dropna()

In [None]:
# Row count after the rows with missing values were dropped.
len(train_df)

In [None]:
# List the column names available in the loaded data.
train_df.columns

In [None]:
# Name of the target column; node_entropy() reads this module-level name.
y_column = 'Credit_Score'

# Columns kept for the decision-tree experiments: four features plus the target.
dt_columns = [
    'Outstanding_Debt',
    'Monthly_Inhand_Salary',
    'Occupation',
    'Age',
    y_column,
]

# .copy() makes train_df an independent frame, so the column assignments made
# later in the notebook do not raise pandas' SettingWithCopyWarning.
train_df = train_df[dt_columns].copy()

def convert_to_binary_classification(row):
    """Collapse the credit score of ``row`` to two classes.

    ``row`` is anything indexable by the key 'Credit_Score'. The value is
    returned unchanged when it equals 'Poor'; every other value maps to "Good".
    """
    label = row['Credit_Score']
    return label if label == 'Poor' else "Good"

def convert_to_float(val) -> float:
    """Convert a raw cell value to a float rounded to 3 decimals.

    Strings have leading/trailing underscores stripped before parsing (e.g.
    '809.98_' -> 809.98). Any non-string value (float, int, numpy scalar) is
    passed straight to ``float()``, so numeric inputs no longer fail on the
    missing ``.strip`` method.

    Raises:
        ValueError: if a string does not parse as a number after stripping.
        TypeError: if a non-string value cannot be converted by ``float()``.
    """
    if isinstance(val, str):
        val = val.strip('_')
    return round(float(val), 3)

# Collapse the target to two classes; the helper expects a whole row, so this
# stays a row-wise apply (the function is passed directly, no lambda needed).
train_df['Credit_Score'] = train_df.apply(convert_to_binary_classification, axis=1)

# Parse the numeric columns. Mapping over the single column avoids building a
# full row Series for every record just to read one value from it.
for col in ['Outstanding_Debt', 'Monthly_Inhand_Salary', 'Age']:
    train_df[col] = train_df[col].map(convert_to_float)

# Clamp ages into the range [18, 90].
train_df['Age'] = train_df['Age'].clip(lower=18, upper=90)


In [None]:
# Inspect the distinct Age values present in the frame.
train_df['Age'].unique()

In [None]:
# Show only the rows whose Credit_Score is 'Poor'.
train_df.query("Credit_Score=='Poor'")

In [None]:
# Spot-check the first Outstanding_Debt value.
train_df['Outstanding_Debt'].iloc[0]

In [None]:
# Random subset used for plotting. The size is capped at the frame length
# because DataFrame.sample raises ValueError when n exceeds the row count
# (sampling is without replacement). No random_state is given, so the subset
# differs between runs.
sample_size = min(10000, len(train_df))
viz_df = train_df.sample(n=sample_size)

In [None]:
# Visualize
from plotnine import *
from plotnine.data import *

# Scatter of salary against age, coloured by credit class, one panel per occupation.
(
    ggplot(data=viz_df)
    + geom_point(mapping=aes(x='Monthly_Inhand_Salary', y='Age', color="Credit_Score"))
    + facet_wrap('Occupation')
)

In [None]:
# se    -> draw the confidence band around the smoothed line
# level -> confidence level of that band (0.95 by default; 0.99 is used here)
(
    ggplot(data=viz_df)
    + geom_smooth(
        mapping=aes(
            x='Monthly_Inhand_Salary',
            y='Outstanding_Debt',
            color="Credit_Score",
        ),
        se=True,
        level=0.99,
    )
)

In [None]:
# Declare the x/y mapping once in ggplot() so the layer does not have to
# repeat the same columns; the layer only adds the colour mapping.
(
    ggplot(data=viz_df, mapping=aes(x='Monthly_Inhand_Salary', y='Outstanding_Debt'))
    + geom_boxplot(mapping=aes(color='Credit_Score'))
)

## Decision Tree from Scratch

In [None]:
# Small random sample for the hand-worked decision-tree steps below. The size
# is capped at the frame length because DataFrame.sample raises ValueError
# when n exceeds the row count. No random_state is given, so the sample (and
# every entropy computed from it) differs between runs.
sample_df = train_df.sample(n=min(100, len(train_df)))

In [None]:
# Scatter of salary against age for the small sample, coloured by credit class.
(
    ggplot(data=sample_df)
    + geom_point(mapping=aes(x='Monthly_Inhand_Salary', y='Age', color="Credit_Score"))
)

In [None]:
# binary classification 
from typing import Literal, Tuple


def node_entropy(node_df: pd.DataFrame) -> float:
    """Return the Shannon entropy (in bits) of the labels in ``node_df``.

    The label column is taken from the module-level ``y_column``. Each distinct
    label contributes ``-p * log2(p)``, where ``p`` is its share of the rows.
    A frame with no rows has no labels and yields 0.
    """
    n_rows = len(node_df)
    labels = node_df[y_column]

    entropy_bits = 0
    for cls in labels.unique():
        share = len(node_df[labels == cls]) / n_rows
        # A label that matches no row (share 0) contributes nothing.
        if share <= 0:
            continue
        entropy_bits -= share * np.log2(share)

    return entropy_bits


def categorical_split(node_df: pd.DataFrame,col_name:str) -> Tuple[float,None]:
    """Return the weighted entropy after splitting ``node_df`` on every category.

    One child is formed per distinct value of ``col_name``; each child's
    entropy (via ``node_entropy``) is weighted by its share of the rows. For
    every category the value, its entropy and its weight are printed.

    Returns:
        ``(weighted_entropy, None)`` -- the second slot is always None.
    """
    n_rows = len(node_df)
    weighted_sum = 0

    for category in node_df[col_name].unique():
        subset = node_df[node_df[col_name] == category]
        weight = len(subset) / n_rows
        subset_entropy = node_entropy(subset)
        print(category, subset_entropy, weight)
        weighted_sum += weight * subset_entropy

    return weighted_sum, None

def binary_split(node_df:pd.DataFrame,col_name:str) -> Tuple[float,float]:
    """Find the threshold on ``col_name`` with the lowest weighted child entropy.

    Every distinct value of the column is tried, in ascending order, as a
    threshold: rows with ``value <= threshold`` go left, the rest go right.
    Ties keep the smallest threshold because only a strictly lower impurity
    replaces the current best.

    Returns:
        ``(min_impurity, best_threshold)``. When the column has no values the
        result is ``(inf, None)``.
    """
    n_rows = len(node_df)
    column = node_df[col_name]
    best_impurity, best_threshold = float('inf'), None

    for threshold in sorted(set(column)):
        below = node_df[column <= threshold]
        above = node_df[column > threshold]
        weighted = (node_entropy(below) * len(below) + node_entropy(above) * len(above)) / n_rows
        if weighted < best_impurity:
            best_impurity, best_threshold = weighted, threshold

    return best_impurity, best_threshold


In [None]:
# Entropy of the whole sample, i.e. the impurity before any split.
node_entropy(sample_df)

In [None]:
# Manual split on salary at 7500, followed by the size-weighted average of the
# two child entropies.
split1 = sample_df.query('Monthly_Inhand_Salary <= 7500')
split2 = sample_df.query('Monthly_Inhand_Salary > 7500')
sum(node_entropy(part) * len(part) for part in (split1, split2)) / len(sample_df)

In [None]:
# Column names available on the split1 subset.
split1.columns

In [None]:
# Scatter of salary against outstanding debt inside split1, coloured by credit class.
(
    ggplot(data=split1)
    + geom_point(mapping=aes(x='Monthly_Inhand_Salary', y='Outstanding_Debt', color="Credit_Score"))
)

In [None]:
# Entropy of the split1 child on its own.
node_entropy(split1)

In [None]:
# Second-level manual split: divide split1 on outstanding debt at 2000 and take
# the size-weighted average of the two child entropies.
split11 = split1.query('Outstanding_Debt <= 2000')
split12 = split1.query('Outstanding_Debt > 2000')
sum(node_entropy(part) * len(part) for part in (split11, split12)) / len(split1)

In [None]:
# Entropy of the split11 child on its own.
node_entropy(split11)

In [None]:
# Entropy of the split12 child on its own.
node_entropy(split12)

In [None]:
# Scatter of age against outstanding debt inside split11, coloured by credit class.
(
    ggplot(data=split11)
    + geom_point(mapping=aes(x='Age', y='Outstanding_Debt', color="Credit_Score"))
)

In [None]:
# Class counts among the sampled rows whose Occupation is 'Doctor'.
sample_df[sample_df['Occupation']=='Doctor']['Credit_Score'].value_counts()

In [None]:
# Weighted entropy after splitting the sample on Occupation; the call also
# prints each category with its entropy and weight.
categorical_split(sample_df,'Occupation')

In [None]:
# Entropy of the unsplit sample, shown again as the baseline for comparison.
node_entropy(sample_df)

In [None]:
# Split the whole sample on outstanding debt at 2000. This rebinds split1 and
# split2: split1 is the high-debt side here.
split1 = sample_df[sample_df['Outstanding_Debt'] > 2000]
split2 = sample_df[sample_df['Outstanding_Debt'] <= 2000]

In [None]:
# Size-weighted average of the entropies of the current split1 and split2.
sum(node_entropy(part) * len(part) for part in (split1, split2)) / len(sample_df)