> Classification & Regression Tree.

In [2]:
# Perform one-hot encoding for categorical variables
data_encoded = pd.get_dummies(
    data, columns=['Interactive', 'Practical Knowledge', 'Common Skills'])

# Separate features (X) and target variable (y)
X = data_encoded.drop('Job Offers', axis=1)
y = data_encoded['Job Offers']

In [19]:
import pandas as pd
import numpy as np

class Node:
    def __init__(self, feature_index=None, threshold=None, value=None, true_branch=None, false_branch=None):
        self.feature_index = feature_index
        self.threshold = threshold
        self.value = value
        self.true_branch = true_branch
        self.false_branch = false_branch

class Tree:
    def __init__(self, max_depth=None, min_samples_split=2):
        self.root = None
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split

    def fit(self, X, y):
        self.root = self.build_tree(X, y, depth=0)

    def predict(self, X):
        return [self.predict_sample(x, self.root) for x in X]
    
    def predict_sample(self, sample, node):
        print(f"Node: {node.feature_index}, {node.threshold}, {node.value}")
        if node.value is not None:
            return node.value
        if node.feature_index is None or node.feature_index >= len(sample):
            print("Invalid feature index or sample length")
            return None
        if sample[node.feature_index] <= node.threshold:
            print("Going to true branch")
            return self.predict_sample(sample, node.true_branch)
        else:
            print("Going to false branch")
            return self.predict_sample(sample, node.false_branch)

    def build_tree(self, X, y, depth):
        num_samples, num_features = X.shape
        num_classes = len(set(y))

        if depth >= self.max_depth or num_samples < self.min_samples_split or num_classes == 1:
            return Node(value=self.calculate_leaf_value(y))

        best_split = self.find_best_splitting_subset(X, y)

        if best_split is None:
            return Node(value=self.calculate_leaf_value(y))

        feature_index, threshold = best_split
        true_samples_mask = X[:, feature_index] <= threshold

        true_branch = self.build_tree(X[true_samples_mask], y[true_samples_mask], depth + 1)
        false_branch = self.build_tree(X[~true_samples_mask], y[~true_samples_mask], depth + 1)

        return Node(feature_index=feature_index, threshold=threshold, true_branch=true_branch, false_branch=false_branch)

    def find_best_splitting_subset(self, X, y):
        best_gini = float('inf')
        best_split = None
        num_samples, num_features = X.shape

        for feature_index in range(num_features):
            thresholds = sorted(set(X[:, feature_index]))

            for threshold in thresholds:
                true_samples_mask = X[:, feature_index] <= threshold
                y_true = y[true_samples_mask]
                y_false = y[~true_samples_mask]

                gini = self.gini_impurity(y_true, y_false)
                if gini < best_gini:
                    best_gini = gini
                    best_split = (feature_index, threshold)

        return best_split

    def gini_impurity(self, y_true, y_false):
        num_samples = len(y_true) + len(y_false)
        impurity = 0

        for y_subset in (y_true, y_false):
            subset_size = len(y_subset)
            if subset_size == 0:
                continue
            class_counts = {c: sum(y_subset == c) for c in set(y_subset)}
            subset_impurity = 1 - sum((count / subset_size) ** 2 for count in class_counts.values())
            impurity += (subset_size / num_samples) * subset_impurity

        return impurity

    def calculate_leaf_value(self, y):
        class_counts = {c: sum(y == c) for c in set(y)}
        return max(class_counts, key=class_counts.get)

# Example usage:

# Load data
data = pd.read_csv('demo.csv')
data_encoded = pd.get_dummies(data, columns=['Interactive', 'Practical Knowledge', 'Common Skills'])
X = data_encoded.drop('Job Offers', axis=1)
y = data_encoded['Job Offers']

# Create and fit the tree
decision_tree = Tree(max_depth=3, min_samples_split=2)
decision_tree.fit(X.to_numpy(), y.to_numpy())

# Define test cases
test_cases = [
    [3.5, 1, 0, 0, 1, 0],  # CGPA=3.5, Interactive=Yes, Practical Knowledge=Good, Common Skills=Good
    [2.7, 0, 1, 1, 0, 0],  # CGPA=2.7, Interactive=No, Practical Knowledge=Moderate, Common Skills=Average
    [3.9, 1, 1, 0, 0, 1],  # CGPA=3.9, Interactive=Yes, Practical Knowledge=Very Good, Common Skills=Poor
]

# Predict and print results
for idx, test_case in enumerate(test_cases):
    prediction = decision_tree.predict([test_case])
    print(f"Test Case {idx + 1}: {'Offer' if prediction[0] else 'No Offer'}")


Node: 0, 3.3, None
Going to false branch
Node: None, None, Yes
Test Case 1: Offer
Node: 0, 3.3, None
Going to true branch
Node: 6, False, None
Invalid feature index or sample length
Test Case 2: No Offer
Node: 0, 3.3, None
Going to false branch
Node: None, None, Yes
Test Case 3: Offer


In [22]:
# Assuming you have a test dataset X_test and y_test
X_test = np.array([
    [3.5, 1, 0, 0, 1, 0],  # Sample 1
    [2.8, 0, 1, 1, 0, 0],  # Sample 2
    [3.9, 1, 1, 0, 0, 1],  # Sample 3
])

y_test = np.array([1, 0, 1])  # Corresponding labels for the test samples

# Create and fit the decision tree model
decision_tree = Tree(max_depth=3, min_samples_split=2)
decision_tree.fit(X.to_numpy(), y.to_numpy())

# Use the decision tree model to predict on the test dataset
predictions = decision_tree.predict(X_test)

# Calculate accuracy
accuracy = calculate_accuracy(y_test, predictions)

print(f"Accuracy: {accuracy * 100:.2f}%")


Node: 0, 3.3, None
Going to false branch
Node: None, None, Yes
Node: 0, 3.3, None
Going to true branch
Node: 6, False, None
Invalid feature index or sample length
Node: 0, 3.3, None
Going to false branch
Node: None, None, Yes
Accuracy: 0.00%
