In [1]:
from tqdm import tqdm

In [2]:
import pandas as pd
# Load the dataset from 'Final_data.csv'; the path is relative to the
# directory the notebook kernel is running in.
final_data = pd.read_csv('Final_data.csv')

In [3]:
# Target vector: the 'loan_status' column; feature matrix: every other column.
y = final_data['loan_status']
X = final_data.drop(columns='loan_status')

In [4]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as a test set (test_size=0.2); the fixed
# random_state=0 makes the shuffle, and therefore the split, reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [5]:
import numpy as np
class XGBoostClassifier:
    """Multi-class gradient-boosted decision trees with second-order splits.

    Each boosting round fits one regression tree per class to the gradient
    and (diagonal) Hessian of the softmax cross-entropy loss, in the style of
    XGBoost: splits maximise the regularised structure-score gain and each
    leaf stores the Newton step ``-G / (H + reg_lambda)``.

    Parameters
    ----------
    n_estimators : int
        Number of boosting rounds (each round adds one tree per class).
    max_depth : int
        Maximum number of split levels per tree; 0 yields single-leaf trees.
    learning_rate : float
        Shrinkage applied to every tree's output.
    reg_lambda : float
        L2 regularisation added to the Hessian sum in gains and leaf values.
    verbose : bool
        When True, ``fit`` shows a ``tqdm`` progress bar over the rounds.

    Attributes
    ----------
    estimators : list
        One entry per boosting round; each entry is a list with one tree
        (nested dict) per class.
    classes_ : numpy.ndarray
        Sorted unique labels seen in ``fit``; column ``k`` of
        ``predict_proba`` corresponds to ``classes_[k]``.
    n_classes : int
        Number of distinct labels.
    y_encoded : numpy.ndarray
        One-hot encoding of the training labels, shape (n_samples, n_classes).
    """

    # Small constant that keeps denominators non-zero when reg_lambda == 0.
    _EPS = 1e-8

    def __init__(self, n_estimators=3, max_depth=3, learning_rate=0.1,
                 reg_lambda=1.0, verbose=True):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.learning_rate = learning_rate
        self.reg_lambda = reg_lambda
        self.verbose = verbose
        self.estimators = []
        self.classes_ = None
        self.n_classes = None
        self.n_features_ = None
        self.y_encoded = None

    def _softmax(self, x):
        """Row-wise softmax of a 2-D score array (max-shifted for stability)."""
        e_x = np.exp(x - np.max(x, axis=1, keepdims=True))
        return e_x / np.sum(e_x, axis=1, keepdims=True)

    def _log_loss_grad(self, y_true, y_pred):
        """Gradient of softmax cross-entropy w.r.t. the raw scores.

        ``y_true`` is the one-hot target matrix and ``y_pred`` the predicted
        probabilities, both of shape (n_samples, n_classes).
        """
        return y_pred - y_true

    def _log_loss_hess(self, y_true, y_pred):
        """Diagonal Hessian of softmax cross-entropy w.r.t. the raw scores.

        ``y_pred`` must already be probabilities; ``y_true`` is accepted for
        signature symmetry with ``_log_loss_grad`` and is not used.
        """
        return y_pred * (1.0 - y_pred)

    def _split_data(self, X, y, feature_idx, threshold):
        """Partition rows of ``X`` and ``y`` by ``X[:, feature_idx] < threshold``.

        Returns ``(X_left, y_left, X_right, y_right)`` where the left part
        holds the rows strictly below the threshold.
        """
        mask = X[:, feature_idx] < threshold
        return X[mask], y[mask], X[~mask], y[~mask]

    def _gain(self, gradients, hessians):
        """Structure score ``0.5 * G^2 / (H + reg_lambda)`` of a single node."""
        g_sum = np.sum(gradients)
        h_sum = np.sum(hessians)
        return 0.5 * g_sum ** 2 / (h_sum + self.reg_lambda + self._EPS)

    def _leaf_value(self, gradients, hessians):
        """Newton-step output ``-G / (H + reg_lambda)`` for a leaf."""
        denom = np.sum(hessians) + self.reg_lambda + self._EPS
        return float(-np.sum(gradients) / denom)

    def _best_split(self, X, gradients, hessians):
        """Find the split with the largest strictly positive gain.

        For every feature the rows are sorted once and prefix sums of the
        gradients/Hessians give the left-child statistics for all candidate
        thresholds at the same time. Candidate thresholds are the observed
        feature values, and the left child receives rows with
        ``value < threshold``.

        Returns ``(feature_idx, threshold)`` or ``None`` when no split
        improves on leaving the node as a leaf.
        """
        n_samples, n_features = X.shape
        if n_samples < 2:
            return None

        g_total = np.sum(gradients)
        h_total = np.sum(hessians)
        reg = self.reg_lambda + self._EPS
        parent_score = self._gain(gradients, hessians)

        best_gain = 0.0
        best_split = None
        for feature_idx in range(n_features):
            column = X[:, feature_idx]
            order = np.argsort(column, kind="stable")
            values = column[order]
            # Position i is a valid cut when the next sorted value is larger,
            # so that "value < values[i + 1]" selects exactly the first i + 1 rows.
            cuts = np.flatnonzero(values[:-1] < values[1:])
            if cuts.size == 0:
                continue

            g_left = np.cumsum(gradients[order])[cuts]
            h_left = np.cumsum(hessians[order])[cuts]
            g_right = g_total - g_left
            h_right = h_total - h_left
            gains = (0.5 * (g_left ** 2 / (h_left + reg)
                            + g_right ** 2 / (h_right + reg))
                     - parent_score)

            top = int(np.argmax(gains))
            if gains[top] > best_gain:
                best_gain = gains[top]
                best_split = (feature_idx, values[cuts[top] + 1])
        return best_split

    def _build_tree(self, X, gradients, hessians, depth=0):
        """Recursively grow one regression tree on per-sample gradient statistics.

        Internal nodes are dicts with keys ``feature_idx``, ``threshold``,
        ``left``, ``right`` and ``depth``; leaves are dicts with keys
        ``prediction`` and ``depth``.
        """
        leaf = {
            'prediction': self._leaf_value(gradients, hessians),
            'depth': depth,
        }
        if depth >= self.max_depth:
            return leaf

        split = self._best_split(X, gradients, hessians)
        if split is None:
            return leaf

        feature_idx, threshold = split
        # Carry gradients and Hessians together through the same row partition.
        stats = np.column_stack((gradients, hessians))
        X_left, stats_left, X_right, stats_right = self._split_data(
            X, stats, feature_idx, threshold)
        return {
            'feature_idx': feature_idx,
            'threshold': threshold,
            'left': self._build_tree(
                X_left, stats_left[:, 0], stats_left[:, 1], depth + 1),
            'right': self._build_tree(
                X_right, stats_right[:, 0], stats_right[:, 1], depth + 1),
            'depth': depth,
        }

    def _predict_tree(self, tree, X):
        """Evaluate a single tree for every row of ``X`` (vectorised routing)."""
        output = np.zeros(X.shape[0])
        pending = [(tree, np.arange(X.shape[0]))]
        while pending:
            node, rows = pending.pop()
            if rows.size == 0:
                continue
            if 'prediction' in node:
                output[rows] = node['prediction']
                continue
            goes_left = X[rows, node['feature_idx']] < node['threshold']
            pending.append((node['left'], rows[goes_left]))
            pending.append((node['right'], rows[~goes_left]))
        return output

    def fit(self, X, y):
        """Train the ensemble on features ``X`` and class labels ``y``.

        ``X`` may be any 2-D numeric array-like (including a DataFrame) and
        ``y`` any 1-D array-like of labels. Re-fitting discards previously
        trained trees. Returns ``self``.

        Raises
        ------
        ValueError
            If ``X`` is not 2-D, is empty, or its row count differs from ``y``.
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y).ravel()
        if X.ndim != 2:
            raise ValueError("X must be 2-dimensional (n_samples, n_features).")
        if X.shape[0] == 0:
            raise ValueError("Cannot fit on an empty dataset.")
        if y.shape[0] != X.shape[0]:
            raise ValueError("X and y must contain the same number of samples.")

        # One-hot encode the target labels.
        self.classes_, y_index = np.unique(y, return_inverse=True)
        self.n_classes = len(self.classes_)
        self.n_features_ = X.shape[1]
        self.y_encoded = np.eye(self.n_classes)[y_index]
        self.estimators = []

        # Raw (pre-softmax) scores of the ensemble built so far.
        scores = np.zeros((X.shape[0], self.n_classes))

        rounds = range(self.n_estimators)
        if self.verbose:
            rounds = tqdm(rounds, desc="Training progress")

        for _ in rounds:
            # Gradient and Hessian of the log loss at the current predictions.
            proba = self._softmax(scores)
            gradients = self._log_loss_grad(self.y_encoded, proba)
            hessians = self._log_loss_hess(self.y_encoded, proba)

            round_trees = []
            for class_idx in range(self.n_classes):
                tree = self._build_tree(
                    X, gradients[:, class_idx], hessians[:, class_idx], depth=0)
                # Update the scores with the shrunken tree output.
                scores[:, class_idx] += (
                    self.learning_rate * self._predict_tree(tree, X))
                round_trees.append(tree)
            self.estimators.append(round_trees)

        return self

    def predict_proba(self, X):
        """Return class probabilities of shape (n_samples, n_classes).

        Raises
        ------
        AttributeError
            If the model has not been fitted.
        ValueError
            If ``X`` is not 2-D or its feature count differs from training.
        """
        if self.n_classes is None:
            raise AttributeError(
                "XGBoostClassifier is not fitted yet; call fit() first.")
        X = np.asarray(X, dtype=float)
        if X.ndim != 2 or X.shape[1] != self.n_features_:
            raise ValueError(
                "X must be 2-dimensional with %d features." % self.n_features_)

        # Accumulate the shrunken output of every tree into raw class scores.
        scores = np.zeros((X.shape[0], self.n_classes))
        for round_trees in self.estimators:
            for class_idx, tree in enumerate(round_trees):
                scores[:, class_idx] += (
                    self.learning_rate * self._predict_tree(tree, X))

        # Normalise the scores into probabilities with the softmax function.
        return self._softmax(scores)

    def predict(self, X):
        """Return the most probable class label for every row of ``X``."""
        proba = self.predict_proba(X)
        return self.classes_[np.argmax(proba, axis=1)]
    
    

In [None]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Encode the class labels as integers.
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)

# Use OneHotEncoder to one-hot encode the integer labels.
# The dense-output keyword was renamed from `sparse` to `sparse_output` in
# scikit-learn 1.2 and the old name was removed in 1.4, so try the current
# spelling first and fall back to the old one on older releases.
try:
    onehot_encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    onehot_encoder = OneHotEncoder(sparse=False)
y_train_encoded = y_train_encoded.reshape(-1, 1)
y_train_onehot = onehot_encoder.fit_transform(y_train_encoded)

# Convert the one-hot encoding back into integer class labels.
y_train_restored = np.argmax(y_train_onehot, axis=1)

# Create and train the XGBoost model.
xgb_classifier = XGBoostClassifier()
xgb_classifier.fit(X_train, y_train_restored)

Training progress:   0%|          | 0/3 [00:00<?, ?it/s]