In [6]:
import numpy as np
X = np.array([[4.57, 5.79, 3.87, 5.52],
              [4.67, 4.8 , 6.14, 5.75],
              [4.13, 4.87, 4.47, 4.21],
              [5.93, 4.09, 4.17, 3.44],
              [4.64, 4.04, 4.04, 4.57],
              [3.19, 3.31, 4.36, 5.88],
              [6.58, 5.16, 6.7 , 5.46],
              [4.85, 5.03, 5.01, 5.15],
              [4.65, 6.12, 6.43, 4.36],
              [4.28, 6.04, 4.69, 4.21]])
y = np.array([11.8, 53.2, -125.9, -109.6, -115.5, -134.9, 178.8, 3., 19.2, -63.7])
# Calculate SST
y_mean = np.mean(y)
SST = np.sum((y - y_mean) ** 2)

# Tree One Split: x1 <= 4.8
region1_tree1_indices = np.where(X[:, 0] <= 4.5)[0]
region2_tree1_indices = np.where(X[:, 0] > 4.5)[0]

region1_tree1_y = y[region1_tree1_indices]
region2_tree1_y = y[region2_tree1_indices]

region1_tree1_mean = np.mean(region1_tree1_y)
region2_tree1_mean = np.mean(region2_tree1_y)

SSE_tree1 = np.sum((region1_tree1_y - region1_tree1_mean) ** 2) + np.sum((region2_tree1_y - region2_tree1_mean) ** 2)

# Tree Two Split: x2 <= 4.9
region1_tree2_indices = np.where(X[:, 2] <= 4.4)[0]
region2_tree2_indices = np.where(X[:, 2] > 4.4)[0]

region1_tree2_y = y[region1_tree2_indices]
region2_tree2_y = y[region2_tree2_indices]

region1_tree2_mean = np.mean(region1_tree2_y)
region2_tree2_mean = np.mean(region2_tree2_y)

SSE_tree2 = np.sum((region1_tree2_y - region1_tree2_mean) ** 2) + np.sum((region2_tree2_y - region2_tree2_mean) ** 2)

SST, SSE_tree1, SSE_tree2

(90732.584, 63436.42380952381, 67769.14333333334)

In [7]:
import numpy as np

# Data
x = np.array([0.6, 0.8, 1.9, 2.0, 2.4, 2.5, 3.6, 4.3, 4.6, 6.3, 8.5, 9.0])
y = np.array([2, 1, 0, 1, 2, 1, 1, 0, 0, 0, 0, 0])

# Helper function to calculate proportions
def get_proportions(y):
    counts = np.array([np.sum(y == i) for i in np.unique(y)])
    return counts / len(y)

# Helper function to calculate Gini impurity
def gini_impurity(y):
    proportions = get_proportions(y)
    return 1 - np.sum(proportions ** 2)

# Split at x <= 2.3
left_indices = np.where(x <= 2.3)[0]
right_indices = np.where(x > 2.3)[0]

y_left = y[left_indices]
y_right = y[right_indices]

# Calculating Gini impurities
root_gini = gini_impurity(y)
left_gini = gini_impurity(y_left)
right_gini = gini_impurity(y_right)

# Weighted average of left and right Gini impurities (Tree Gini)
tree_gini = (len(y_left) / len(y)) * left_gini + (len(y_right) / len(y)) * right_gini

root_gini, left_gini, right_gini, tree_gini


(0.6111111111111112, 0.625, 0.53125, 0.5625)

In [8]:
import numpy as np

# Data
x = np.array([0.1, 0.3, 0.9, 1.7, 2.3, 4.0, 4.1, 6.0, 7.8, 8.5, 8.6, 9.6])
y = np.array([2, 1, 0, 2, 2, 2, 2, 0, 0, 2, 0, 1])

# Helper function to calculate proportions
def get_proportions(y):
    counts = np.array([np.sum(y == i) for i in np.unique(y)])
    return counts / len(y)

# Helper function to calculate entropy
def entropy(y):
    proportions = get_proportions(y)
    return -np.sum(proportions * np.log2(proportions + 1e-9))  # Add small value to avoid log(0)

# Split data at x <= 3.8
left_indices = np.where(x <= 3.8)[0]
right_indices = np.where(x > 3.8)[0]

y_left = y[left_indices]
y_right = y[right_indices]

# Calculate entropies
root_entropy = entropy(y)
left_entropy = entropy(y_left)
right_entropy = entropy(y_right)

# Weighted average entropy for the tree (Tree Entropy)
tree_entropy = (len(y_left) / len(y)) * left_entropy + (len(y_right) / len(y)) * right_entropy

print("Root Node Entropy:", root_entropy)
print("Left Node Entropy:", left_entropy)
print("Right Node Entropy:", right_entropy)
print("Tree Entropy:", tree_entropy)


Root Node Entropy: 1.4591479126991596
Left Node Entropy: 1.3709505901265835
Right Node Entropy: 1.4488156313970995
Tree Entropy: 1.4163718642010512


In [22]:
from scipy.stats import norm
mean = 7
std = 3.2
x1 = 4.55
x2 = 8.98
x3 = 3.23
x4 = 7.42
p1 = (x1 - mean) / std
p2 = (x2 - mean) / std
p2 = 1 - norm.cdf(p2)
p3_1 = (x3 - mean) / std
p3_2 = (x4 - mean) / std
p3 = norm.cdf(p3_2) - norm.cdf(p3_1)
p3

0.43283783975218554

In [33]:
import numpy as np
X = np.array([[ 3.08,  7.47],
              [ 2.41,  7.86],
              [ 1.52,  8.96],
              [ 2.93,  7.13],
              [  np.nan,  9.25],
              [ 3.  ,   np.nan],
              [ 1.21,  9.44],
              [ 1.32,  9.29],
              [ 2.94,  7.53],
              [ 3.85, 10.59]])

mean1 = np.nanmean(X[:, 0])
mean1
mean2 = np.nanmean(X[:, 1])
mean2

8.613333333333333

In [3]:
import numpy as np
from sklearn.model_selection import KFold, cross_val_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import make_scorer, mean_absolute_error

# Training data
data = np.array([
    [6.09, 0.28, 0.63],
    [5.08, 0.50, 0.12],
    [6.23, 0.68, 0.32],
    [6.51, 0.47, 0.58],
    [4.86, 0.19, 0.34]
])

# Splitting features (X) and target (y)
y = data[:, :-1]  # Features: first two columns
x = data[:, -1]   # Target: last column

# Initialize KNN regressor
knn = KNeighborsRegressor(n_neighbors=4)

# Set up 5-Fold Cross Validation
kf = KFold(n_splits=5, shuffle=True, random_state=1)

# Define MAE scorer
mae_scorer = make_scorer(mean_absolute_error)

# Perform cross-validation and calculate MAE
mae_scores = cross_val_score(knn, X, y, cv=kf, scoring=mae_scorer)

# Calculate mean MAE across all folds
mean_mae = mae_scores.mean()

print("5-Fold Cross-Validated MAE:", mean_mae)


5-Fold Cross-Validated MAE: 0.48649999999999993


In [5]:
import numpy as np
x = np.array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10])
y = np.array([ 3. ,  4.6,  8.2, 10.4, 11.1, 14. , 16.2, 18.3, 19.5, 20.6])
ymean = y.mean()
ymean

12.59