<a href="https://colab.research.google.com/github/abdulla41mamun/CSE711-SymbolicMachineLearning/blob/main/Adaboost.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import pandas as pd
import numpy as np

# --- Helper Functions ---

# A function to calculate the weighted GINI impurity of a single node
def calculate_gini(y, sample_weight):
    """Return the weighted Gini impurity of one node.

    Each distinct label in ``y`` contributes the share of the node's total
    weight carried by the rows with that label; the impurity is one minus
    the sum of the squared shares.

    Args:
        y: pandas Series of labels for the rows in the node.
        sample_weight: pandas Series of row weights; it is indexed with
            boolean masks built from ``y``, so both must share an index.

    Returns:
        The impurity, or ``0`` when the node carries no weight at all.
    """
    labels = y.unique()
    weight_sum = sample_weight.sum()

    # A node without any weight has nothing to be impure about.
    if weight_sum == 0:
        return 0

    gini = 1.0
    for label in labels:
        label_mask = y == label
        share = sample_weight[label_mask].sum() / weight_sum
        gini -= share ** 2
    return gini

# A function to find the best stump by iterating through features
def find_best_stump_and_show_gini(df, y, sample_weight, features=None):
    """Find the best single-feature stump and print each feature's GINI score.

    For every candidate feature the rows are split by feature value. The
    impurity of each split (from ``calculate_gini``) is averaged using the
    split's share of the total sample weight, and the feature with the
    smallest average is chosen. On a tie, the feature listed first wins.

    Args:
        df: DataFrame containing the candidate feature columns.
        y: Series of labels. The stump's vote for a feature value is the sign
            of the weight-weighted label sum, so numeric labels are required.
        sample_weight: Series of per-row weights. ``y`` and ``sample_weight``
            are indexed with boolean masks built from ``df``, so all three
            must share an index.
        features: Optional iterable of column names to evaluate. Defaults to
            ``['Outlook', 'Temperature', 'Humidity', 'Wind']``.

    Returns:
        A dict ``{'feature': name, 'predictions': {value: 1 or -1}}`` with one
        prediction per distinct value of the chosen feature.

    Raises:
        ValueError: If ``features`` is given but empty.
    """
    if features is None:
        features = ['Outlook', 'Temperature', 'Humidity', 'Wind']
    else:
        features = list(features)
    if not features:
        raise ValueError("features must contain at least one column name")

    print("\n--- 🔎 Step 1: Calculating GINI Impurity for Each Feature ---")

    # The total weight is the same for every feature and branch; compute once.
    total_weight = sample_weight.sum()
    gini_scores = {}

    for feature in features:
        weighted_gini = 0.0
        for value in df[feature].unique():
            subset_indices = df[feature] == value
            subset_weights = sample_weight[subset_indices]
            branch_weight = subset_weights.sum() / total_weight
            branch_gini = calculate_gini(y[subset_indices], subset_weights)
            weighted_gini += branch_weight * branch_gini

        gini_scores[feature] = weighted_gini
        print(f"GINI({feature}) = {weighted_gini:.3f}")

    # Lowest weighted impurity wins.
    best_feature = min(gini_scores, key=gini_scores.get)

    # Each branch predicts the sign of its weighted label sum (ties go to +1).
    best_column = df[best_feature]
    predictions = {}
    for value in best_column.unique():
        subset_indices = best_column == value
        weighted_vote = y[subset_indices].dot(sample_weight[subset_indices])
        predictions[value] = 1 if weighted_vote >= 0 else -1

    return {'feature': best_feature, 'predictions': predictions}

# --- Main Script ---

# Fix the NumPy RNG state so random draws repeat from run to run
np.random.seed(42)

# The full 14-row "Play Tennis" table, stored one list per column
data = {
    'Outlook': [
        'Sunny', 'Sunny', 'Overcast', 'Rain', 'Rain', 'Rain', 'Overcast',
        'Sunny', 'Sunny', 'Rain', 'Sunny', 'Overcast', 'Overcast', 'Rain',
    ],
    'Temperature': [
        'Hot', 'Hot', 'Hot', 'Mild', 'Cool', 'Cool', 'Cool',
        'Mild', 'Cool', 'Mild', 'Mild', 'Mild', 'Hot', 'Mild',
    ],
    'Humidity': [
        'High', 'High', 'High', 'High', 'Normal', 'Normal', 'Normal',
        'High', 'Normal', 'Normal', 'Normal', 'High', 'Normal', 'High',
    ],
    'Wind': [
        'Weak', 'Strong', 'Weak', 'Weak', 'Weak', 'Strong', 'Strong',
        'Weak', 'Weak', 'Weak', 'Strong', 'Strong', 'Weak', 'Strong',
    ],
    'Play Tennis': [
        'No', 'No', 'Yes', 'Yes', 'Yes', 'No', 'Yes',
        'No', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'No',
    ],
}
df = pd.DataFrame(data)
N = len(df)

# Encode the target as +1 for 'Yes' and -1 for everything else
y = (df['Play Tennis'] == 'Yes').map({True: 1, False: -1})

# Every sample starts out with the same weight, 1/N
df['sample_weight'] = np.full(N, 1 / N)

print("--- Initial Dataset with Equal Weights ---")
print(df)

# =================================================================
# ITERATION 1
# =================================================================
print("\n" + "="*25 + " ITERATION 1 " + "="*25)

# Step 1: Find the best stump and show GINI calculations
stump_1 = find_best_stump_and_show_gini(df, y, df['sample_weight'])
print(f"✅ Best feature chosen: '{stump_1['feature']}' (Lowest GINI)")
print(f"   Stump's classification rules: {stump_1['predictions']}")

# Step 2: Calculate Error and Alpha
print("\n--- 🧮 Step 2: Calculating Error and 'Amount of Say' (alpha) ---")
# Look up the stump's prediction for each row from its feature value
predictions = df[stump_1['feature']].map(stump_1['predictions'])
misclassified_mask = (predictions != y)
num_incorrect = misclassified_mask.sum()
num_correct = N - num_incorrect

# Total error is the summed weight of the rows the stump got wrong
total_error_1 = df.loc[misclassified_mask, 'sample_weight'].sum()

print(f"The stump misclassified {num_incorrect} samples.")
# The messages use N rather than a literal 14 so they stay true if the dataset changes
print(f"Total Error = Sum of weights of misclassified samples = {num_incorrect} * (1/{N}) = {total_error_1:.4f}")

# epsilon keeps the division finite when the stump makes no mistakes
epsilon = 1e-10
alpha_1 = 0.5 * np.log((1 - total_error_1) / (total_error_1 + epsilon))
print(f"Amount of Say (alpha_1) = 0.5 * ln((1 - {total_error_1:.2f}) / {total_error_1:.2f}) = {alpha_1:.4f}")

# Step 3: Update and Normalize Weights
print("\n--- ⚖️ Step 3: Updating and Normalizing Sample Weights ---")
# Calculate new un-normalized weights. Every row currently weighs 1/N, so
# there are only two possible new values: shrunk (correct) or grown (incorrect).
new_weight_correct = (1/N) * np.exp(-alpha_1)
new_weight_incorrect = (1/N) * np.exp(alpha_1)
print(f"New weight for a CORRECTLY classified sample = (1/{N}) * e^(-{alpha_1:.2f}) = {new_weight_correct:.4f}")
print(f"New weight for an INCORRECTLY classified sample = (1/{N}) * e^({alpha_1:.2f}) = {new_weight_incorrect:.4f}")

# Calculate the SUM of all new weights to use for normalization
total_new_weight_sum = (num_correct * new_weight_correct) + (num_incorrect * new_weight_incorrect)
print(f"\nSum of all new weights = ({num_correct} * {new_weight_correct:.4f}) + ({num_incorrect} * {new_weight_incorrect:.4f}) = {total_new_weight_sum:.4f}")

# Calculate normalized weights (dividing by the sum makes them add up to 1)
normalized_weight_correct = new_weight_correct / total_new_weight_sum
normalized_weight_incorrect = new_weight_incorrect / total_new_weight_sum
print(f"\nNormalized weight (Correct) = {new_weight_correct:.4f} / {total_new_weight_sum:.4f} = {normalized_weight_correct:.4f}")
print(f"Normalized weight (Incorrect) = {new_weight_incorrect:.4f} / {total_new_weight_sum:.4f} = {normalized_weight_incorrect:.4f}")

# Apply these normalized weights to the dataframe
df['normalized_weight'] = np.where(misclassified_mask, normalized_weight_incorrect, normalized_weight_correct)

# Step 4: Create New Dataset via Resampling
print("\n--- 🎲 Step 4: Creating New Dataset with Random Sampling ---")
df['cumulative_weight'] = df['normalized_weight'].cumsum()
print("Cumulative Weight Table for 'Roulette Wheel' Selection:")
print(df[['Play Tennis', 'normalized_weight', 'cumulative_weight']].round(4))

# Generate one random number in [0, 1) per sample to draw
random_numbers = np.random.rand(N)
print(f"\nGenerated {N} random numbers: \n{np.round(random_numbers, 4)}")

# Find which sample gets selected for each random number: the first row whose
# cumulative weight is strictly greater than the draw. side='right' returns
# exactly that row position in the non-decreasing cumulative column.
cumulative = df['cumulative_weight'].to_numpy()
positions = np.searchsorted(cumulative, random_numbers, side='right')
# Floating-point rounding can leave the final cumulative weight slightly below
# 1.0; a draw above it would match no row, so fall back to the last row.
new_indices = np.minimum(positions, len(cumulative) - 1).tolist()

print("\nMapping random numbers to new dataset samples:")
for i in range(N):
    print(f"  Random # {i+1} ({random_numbers[i]:.4f}) -> selects original sample at index {new_indices[i]}")

# Create the new dataframe: take the drawn rows by position, keep only the
# original data columns, and renumber the rows from 0
df_new = df.iloc[new_indices].copy()
df_new = df_new.drop(columns=['sample_weight', 'normalized_weight', 'cumulative_weight'])
df_new = df_new.reset_index(drop=True)

print("\n✅ New Resampled Dataset for Iteration 2:")
print(df_new)
print("\nThis new dataset, which over-samples the 'hard' cases, would now be used to train the second stump.")

--- Initial Dataset with Equal Weights ---
     Outlook Temperature Humidity    Wind Play Tennis  sample_weight
0      Sunny         Hot     High    Weak          No       0.071429
1      Sunny         Hot     High  Strong          No       0.071429
2   Overcast         Hot     High    Weak         Yes       0.071429
3       Rain        Mild     High    Weak         Yes       0.071429
4       Rain        Cool   Normal    Weak         Yes       0.071429
5       Rain        Cool   Normal  Strong          No       0.071429
6   Overcast        Cool   Normal  Strong         Yes       0.071429
7      Sunny        Mild     High    Weak          No       0.071429
8      Sunny        Cool   Normal    Weak         Yes       0.071429
9       Rain        Mild   Normal    Weak         Yes       0.071429
10     Sunny        Mild   Normal  Strong         Yes       0.071429
11  Overcast        Mild     High  Strong         Yes       0.071429
12  Overcast         Hot   Normal    Weak         Yes       