In [None]:
# Load the necessary libraries

In [72]:
%matplotlib notebook
%matplotlib inline
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import accuracy_score, accuracy_score, roc_curve, auc
from sklearn.model_selection import train_test_split

In [34]:
# Prepare the data

In [48]:
# Load the breast-cancer dataset and hold out 20% of the rows for testing.
# The fixed random_state keeps the train/test partition identical across runs.
data = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(
    data.data,
    data.target,
    test_size=0.2,
    random_state=42,
)

In [49]:
# Define the decision tree classes (node + classifier)

In [50]:
class DecisionNode:
    """One node of a binary decision tree.

    The same class represents both kinds of node:

    * an internal node stores the split (``feature_index``, ``threshold``)
      and its two children (``true_branch``, ``false_branch``);
    * a leaf stores only the predicted label in ``value``.

    All fields default to ``None``, so a node is built by passing just the
    fields that apply to it.
    """

    def __init__(self, feature_index=None, threshold=None, value=None, true_branch=None, false_branch=None):
        # Split description
        self.feature_index, self.threshold = feature_index, threshold
        # Predicted label
        self.value = value
        # Child subtrees
        self.true_branch, self.false_branch = true_branch, false_branch

class DecisionTreeClassifier:
    """Binary decision tree classifier grown greedily on Gini impurity.

    At every node each unique value of each feature is tried as a ``<=``
    threshold, and the split with the lowest weighted Gini impurity is kept.

    Parameters
    ----------
    max_depth : int or None, default=None
        Maximum depth of the tree (the root is at depth 0). ``None`` places
        no limit on the depth.
    min_samples_split : int, default=2
        A node holding fewer samples than this becomes a leaf.

    Attributes
    ----------
    root : DecisionNode
        Root of the fitted tree; set by :meth:`fit`.
    """

    def __init__(self, max_depth=None, min_samples_split=2):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split

    def fit(self, X, y):
        """Grow the tree on training data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Feature matrix.
        y : array-like of shape (n_samples,)
            Class label of every row of ``X``.
        """
        # Coerce once so list / DataFrame inputs support the boolean-mask
        # indexing used while splitting.
        X = np.asarray(X)
        y = np.asarray(y)
        self.root = self._fit_tree(X, y, depth=0)

    def _make_leaf(self, y):
        """Return a leaf predicting the most frequent label in ``y``."""
        return DecisionNode(value=Counter(y).most_common(1)[0][0])

    def _fit_tree(self, X, y, depth):
        """Recursively build the subtree for the samples ``(X, y)``.

        Returns a leaf when the depth limit is reached, when fewer than
        ``min_samples_split`` samples remain, when all labels agree, or when
        no threshold separates the samples; otherwise returns an internal
        node whose children are built from the two sides of the best split.
        """
        n_samples, n_features = X.shape
        n_labels = len(np.unique(y))

        # ``max_depth=None`` means "unlimited". The comparison must be guarded
        # because ``depth >= None`` raises TypeError.
        depth_exhausted = self.max_depth is not None and depth >= self.max_depth
        if depth_exhausted or n_samples < self.min_samples_split or n_labels == 1:
            return self._make_leaf(y)

        best_feature, best_threshold = None, None
        best_gini = 1.0  # upper bound: any real split scores strictly below 1

        for feature_index in range(n_features):
            column = X[:, feature_index]  # invariant across the threshold loop
            for threshold in np.unique(column):
                y_left = y[column <= threshold]
                y_right = y[column > threshold]

                # A split that leaves one side empty separates nothing.
                if len(y_left) == 0 or len(y_right) == 0:
                    continue

                gini = self.gini_impurity(y_left, y_right)

                if gini < best_gini:
                    best_gini = gini
                    best_feature = feature_index
                    best_threshold = threshold

        # No candidate produced two non-empty sides (e.g. all rows identical).
        if best_feature is None:
            return self._make_leaf(y)

        true_indices = X[:, best_feature] <= best_threshold
        false_indices = X[:, best_feature] > best_threshold

        true_branch = self._fit_tree(X[true_indices], y[true_indices], depth + 1)
        false_branch = self._fit_tree(X[false_indices], y[false_indices], depth + 1)

        return DecisionNode(best_feature, best_threshold, value=None, true_branch=true_branch, false_branch=false_branch)

    def predict(self, X):
        """Return a list with the predicted label for every row of ``X``."""
        return [self._predict_tree(x, self.root) for x in X]

    def _predict_tree(self, x, node):
        """Walk from ``node`` down to a leaf for the sample ``x`` and return its label."""
        # A node carrying a value is a leaf.
        if node.value is not None:
            return node.value
        if x[node.feature_index] <= node.threshold:
            return self._predict_tree(x, node.true_branch)
        else:
            return self._predict_tree(x, node.false_branch)

    @staticmethod
    def _node_gini(labels):
        """Gini impurity ``1 - sum(p_c ** 2)`` of a single set of labels."""
        _, counts = np.unique(labels, return_counts=True)
        return 1.0 - sum((counts / len(labels)) ** 2)

    def gini_impurity(self, y_left, y_right):
        """Weighted Gini impurity of a two-way split.

        Each side's impurity is weighted by the fraction of samples that fall
        on that side; 0.0 means both sides are pure.
        """
        n_left, n_right = len(y_left), len(y_right)
        n_total = n_left + n_right
        p_left = n_left / n_total
        p_right = n_right / n_total
        return p_left * self._node_gini(y_left) + p_right * self._node_gini(y_right)


In [51]:
# Define the Random Forest class

In [52]:
class RandomForestClassifier:
    """Ensemble of decision trees trained on bootstrap samples, combined by majority vote.

    Every tree is fitted on a sample of the training rows drawn with
    replacement (same size as the training set). Note that no per-split
    feature subsampling is performed here: the randomness between trees comes
    from the bootstrap samples only.

    Parameters
    ----------
    n_trees : int, default=100
        Number of trees in the ensemble.
    max_depth : int or None, default=None
        Forwarded to every tree.
    min_samples_split : int, default=2
        Forwarded to every tree.
    random_state : int or None, default=None
        Seed for the bootstrap sampling. With ``None`` the samples are drawn
        from NumPy's global generator, so ``np.random.seed`` still applies.

    Attributes
    ----------
    trees : list
        The fitted trees; rebuilt from scratch by every call to :meth:`fit`.
    """

    def __init__(self, n_trees=100, max_depth=None, min_samples_split=2, random_state=None):
        self.n_trees = n_trees
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.random_state = random_state
        self.trees = []

    def fit(self, X, y):
        """Fit ``n_trees`` trees, each on its own bootstrap sample of ``(X, y)``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Feature matrix.
        y : array-like of shape (n_samples,)
            Class label of every row of ``X``.
        """
        # Coerce once so list / DataFrame inputs support integer-array indexing.
        X = np.asarray(X)
        y = np.asarray(y)
        n_samples = len(X)

        # A private generator when seeded; otherwise the global one.
        rng = np.random if self.random_state is None else np.random.RandomState(self.random_state)

        # Start from an empty ensemble so refitting replaces the old trees
        # instead of piling new ones on top of them.
        self.trees = []
        for _ in range(self.n_trees):
            tree = DecisionTreeClassifier(max_depth=self.max_depth, min_samples_split=self.min_samples_split)
            # Create a bootstrap sample
            indices = rng.choice(n_samples, size=n_samples, replace=True)
            # Fit the tree on the bootstrap sample
            tree.fit(X[indices], y[indices])
            self.trees.append(tree)

    def predict(self, X):
        """Return a list with the majority-vote label for every row of ``X``.

        Ties are resolved in favour of the smallest label, because
        ``np.unique`` returns labels sorted and ``np.argmax`` keeps the first
        maximum.
        """
        # Shape (n_trees, n_samples): one row of predictions per tree.
        tree_predictions = np.array([tree.predict(X) for tree in self.trees])
        final_predictions = []
        for i in range(tree_predictions.shape[1]):
            labels, counts = np.unique(tree_predictions[:, i], return_counts=True)
            final_predictions.append(labels[np.argmax(counts)])
        return final_predictions


In [53]:
# Test the from-scratch implementations on the held-out test split

In [54]:
# Train the decision tree (depth capped at 5) on the training split
dt_classifier = DecisionTreeClassifier(max_depth=5)
dt_classifier.fit(X_train, y_train)

# Score it on the held-out split: fraction of test rows labelled correctly
predictions = dt_classifier.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.93


In [55]:
# Create and fit the Random Forest classifier.
# The forest draws its bootstrap samples from NumPy's global random generator,
# so seed it first; otherwise the accuracy below changes from run to run.
np.random.seed(42)
rf_classifier = RandomForestClassifier(n_trees=100, max_depth=5)
rf_classifier.fit(X_train, y_train)

# Make predictions
predictions = rf_classifier.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.96


In [56]:
# Let's cheat: compare against scikit-learn's built-in implementations

In [57]:
# scikit-learn's estimators have the same names as the from-scratch classes
# defined in this notebook. Import them under aliases: a plain import would
# rebind `RandomForestClassifier` / `DecisionTreeClassifier`, and re-running
# the earlier cells afterwards would silently use scikit-learn's versions
# (and fail on the `n_trees` argument).
from sklearn.ensemble import RandomForestClassifier as SklearnRandomForestClassifier
from sklearn.tree import DecisionTreeClassifier as SklearnDecisionTreeClassifier

# Scikit-learn's RandomForestClassifier
sklearn_rf_classifier = SklearnRandomForestClassifier(n_estimators=100, max_depth=5, random_state=42)
sklearn_rf_classifier.fit(X_train, y_train)
sklearn_rf_predictions = sklearn_rf_classifier.predict(X_test)
sklearn_rf_accuracy = accuracy_score(y_test, sklearn_rf_predictions)

# Scikit-learn's DecisionTreeClassifier
sklearn_dt_classifier = SklearnDecisionTreeClassifier(max_depth=5, random_state=42)
sklearn_dt_classifier.fit(X_train, y_train)
sklearn_dt_predictions = sklearn_dt_classifier.predict(X_test)
sklearn_dt_accuracy = accuracy_score(y_test, sklearn_dt_predictions)

print("Scikit-learn Random Forest Accuracy:", sklearn_rf_accuracy)
print("Scikit-learn Decision Tree Accuracy:", sklearn_dt_accuracy)

Scikit-learn Random Forest Accuracy: 0.9649122807017544
Scikit-learn Decision Tree Accuracy: 0.9473684210526315


In [58]:
# Let's see the predictions as ROC curves, with the AUC of each model

In [62]:
# ROC curve and AUC for each scikit-learn model.
# predict_proba returns one column per class; column 1 is used as the score
# of the positive class.

# --- Random forest ---
sklearn_rf_probabilities = sklearn_rf_classifier.predict_proba(X_test)
sklearn_rf_positive_probs = sklearn_rf_probabilities[:, 1]
sklearn_rf_fpr, sklearn_rf_tpr, _ = roc_curve(y_test, sklearn_rf_positive_probs)
sklearn_rf_roc_auc = auc(sklearn_rf_fpr, sklearn_rf_tpr)

# --- Decision tree ---
sklearn_dt_probabilities = sklearn_dt_classifier.predict_proba(X_test)
sklearn_dt_positive_probs = sklearn_dt_probabilities[:, 1]
sklearn_dt_fpr, sklearn_dt_tpr, _ = roc_curve(y_test, sklearn_dt_positive_probs)
sklearn_dt_roc_auc = auc(sklearn_dt_fpr, sklearn_dt_tpr)

In [85]:
# Plot ROC curves
# Draw on an explicit figure/axes with the backend selected in the import cell.
# Switching to the `notebook` backend mid-notebook left only a
# "Javascript object" placeholder in the saved output instead of the figure.
fig, ax = plt.subplots()
ax.plot(sklearn_rf_fpr, sklearn_rf_tpr, color='darkorange', lw=2, label='RF (AUC = %0.2f)' % sklearn_rf_roc_auc)
ax.plot(sklearn_dt_fpr, sklearn_dt_tpr, color='navy', lw=2, label='DT (AUC = %0.2f)' % sklearn_dt_roc_auc)
# Diagonal reference line: the ROC curve of a random classifier
ax.plot([0, 1], [0, 1], color='gray', lw=2, linestyle='--')
ax.set_xlim(0.0, 1.0)
ax.set_ylim(0.0, 1.05)  # small head-room so the curves do not touch the frame
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve')
ax.legend(loc="lower right")
plt.show()


<IPython.core.display.Javascript object>