<a href="https://colab.research.google.com/github/abdulla41mamun/CSE711-SymbolicMachineLearning/blob/main/Lecture5_GradientBoostingRegressor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
import pandas as pd
import numpy as np

# Configure how pandas renders DataFrames when printed: floats with seven
# decimals, no cap on the number of columns shown, and a 1000-character width.
pd.options.display.float_format = '{:.7f}'.format
pd.options.display.max_columns = None
pd.options.display.width = 1000

def _best_age_split(df, residual_col, split_points):
    """Find the 'Age' threshold with the lowest total sum of squared residuals.

    For every candidate threshold the rows are divided into ``Age < split``
    and ``Age >= split``. Each side is scored by the squared deviation of
    ``residual_col`` from that side's own mean, and the two scores are added.
    One summary line per candidate is printed as a side effect.

    Args:
        df: DataFrame containing an 'Age' column and ``residual_col``.
        residual_col: Name of the residual column the tree is fitted to.
        split_points: Candidate thresholds, evaluated in the given order.

    Returns:
        A ``(best_split, min_ssr)`` tuple. When ``split_points`` is empty the
        result is ``(None, inf)``.
    """
    min_ssr = float('inf')
    best_split = None

    for split in split_points:
        left = df.loc[df['Age'] < split, residual_col]
        right = df.loc[df['Age'] >= split, residual_col]

        avg_left = left.mean()
        avg_right = right.mean()

        # Each side predicts its own mean, so its SSR is the squared
        # deviation from that mean.
        ssr_left = ((left - avg_left)**2).sum()
        ssr_right = ((right - avg_right)**2).sum()
        total_ssr = ssr_left + ssr_right

        print(f"Split at Age < {split}: Left Avg={avg_left:.7f}, Right Avg={avg_right:.7f}, Total SSR={total_ssr:.7f}")

        # Strict '<' keeps the earliest threshold when two candidates tie.
        if total_ssr < min_ssr:
            min_ssr = total_ssr
            best_split = split

    return best_split, min_ssr


def run_regression_example():
    """Print a two-tree gradient boosting regression walkthrough.

    Uses a fixed six-row dataset (Age, Place of living, IQ-score). The
    initial prediction is the mean IQ-score. Two single-split trees on 'Age'
    are then fitted to the pseudo-residuals, and each tree's leaf means are
    added to the running prediction scaled by a learning rate of 0.1.
    'Place of living' is displayed but never used as a split feature.

    Slide numbers in the printed headings refer to the accompanying
    presentation. Everything is written to stdout; nothing is returned.
    """
    print("="*50)
    print(" Gradient Boosting for Regression Example")
    print("="*50)

    # --- Step 1: Initial Data and First Prediction ---
    # As per Slide 2
    data = {
        'Age': [8, 10, 12, 30, 35, 60],
        'Place of living': ['Village', 'City', 'Village', 'Village', 'City', 'City'],
        'IQ-score': [1, 3, 1, 4, 5, 2]
    }
    df = pd.DataFrame(data)
    print("\n--- Initial Dataset (Slide 2) ---\n")
    print(df)

    # Calculate the initial prediction: the average of the target variable
    # As per Slide 3
    initial_prediction = df['IQ-score'].mean()
    df['Predicted_IQ'] = initial_prediction
    df['Pseudo_Residual_1'] = df['IQ-score'] - df['Predicted_IQ']

    print("\n\n--- Step 1: Initial Prediction and Residuals (Slide 3) ---")
    print(f"Initial Prediction (Average IQ-Score): {initial_prediction:.7f}")
    print("\nDataFrame with Initial Prediction and Residuals:\n")
    print(df[['Age', 'Place of living', 'IQ-score', 'Predicted_IQ', 'Pseudo_Residual_1']])

    # --- Step 2: Build the First Decision Tree ---
    # Find the 'Age' split that minimizes the Sum of Squared Residuals (SSR)
    # As per Slide 5
    print("\n\n--- Step 2: Building the First Tree (Finding best split for 'Age') (Slide 5) ---")
    ages = sorted(df['Age'].unique())
    # Candidate thresholds are the midpoints between consecutive distinct ages.
    split_points = [(lower + upper) / 2 for lower, upper in zip(ages, ages[1:])]

    print("\nCalculating Sum of Squared Residuals (SSR) for each potential split:")
    best_split, min_ssr = _best_age_split(df, 'Pseudo_Residual_1', split_points)

    print(f"\nBest Split for 'Age' is at: {best_split} with a minimum SSR of {min_ssr:.7f} (Matches Slide 8)")

    # Calculate leaf node output values (average of residuals in each leaf)
    # As per Slide 10
    left_leaf_value_1 = df.loc[df['Age'] < best_split, 'Pseudo_Residual_1'].mean()
    right_leaf_value_1 = df.loc[df['Age'] >= best_split, 'Pseudo_Residual_1'].mean()
    print("\nFirst Tree Leaf Values (Slide 10):")
    print(f"  - Leaf 1 (Age < {best_split}): {left_leaf_value_1:.7f}")
    print(f"  - Leaf 2 (Age >= {best_split}): {right_leaf_value_1:.7f}")

    # --- Step 3: Update Predictions and Calculate New Residuals ---
    # As per Slide 11
    learning_rate = 0.1
    df['Tree1_Output'] = np.where(df['Age'] < best_split, left_leaf_value_1, right_leaf_value_1)
    df['Predicted_IQ_2'] = df['Predicted_IQ'] + learning_rate * df['Tree1_Output']
    df['Pseudo_Residual_2'] = df['IQ-score'] - df['Predicted_IQ_2']

    print(f"\n\n--- Step 3: Update Predictions with First Tree (Learning Rate = {learning_rate}) (Slide 11) ---")
    print("\nDataFrame with Updated Predictions and New Residuals:\n")
    print(df[['IQ-score', 'Predicted_IQ', 'Tree1_Output', 'Predicted_IQ_2', 'Pseudo_Residual_2']])

    # --- Step 4: Build the Second Decision Tree ---
    # Same candidate thresholds, now scored against the updated residuals.
    # As per Slide 12
    print("\n\n--- Step 4: Building the Second Tree (Finding best split for 'Age') (Slide 12) ---")

    print("\nCalculating SSR for each potential split using new residuals:")
    best_split_2, min_ssr_2 = _best_age_split(df, 'Pseudo_Residual_2', split_points)

    print(f"\nBest Split for 'Age' in Second Tree is at: {best_split_2} with a minimum SSR of {min_ssr_2:.7f} (Matches Slide 12)")

    # Calculate leaf node output values for the second tree
    # As per Slide 14
    left_leaf_value_2 = df.loc[df['Age'] < best_split_2, 'Pseudo_Residual_2'].mean()
    right_leaf_value_2 = df.loc[df['Age'] >= best_split_2, 'Pseudo_Residual_2'].mean()
    print("\nSecond Tree Leaf Values (Slide 14):")
    print(f"  - Leaf 1 (Age < {best_split_2}): {left_leaf_value_2:.7f}")
    print(f"  - Leaf 2 (Age >= {best_split_2}): {right_leaf_value_2:.7f}")

    # --- Step 5: Final Prediction ---
    # As per Slide 16
    df['Tree2_Output'] = np.where(df['Age'] < best_split_2, left_leaf_value_2, right_leaf_value_2)
    df['Predicted_IQ_Final'] = df['Predicted_IQ_2'] + learning_rate * df['Tree2_Output']

    print("\n\n--- Step 5: Final Predictions after Second Tree (Slide 16) ---")
    print("\nFinal DataFrame:\n")
    print(df[['Age', 'IQ-score', 'Predicted_IQ', 'Predicted_IQ_2', 'Predicted_IQ_Final']])


def run_classification_example():
    """
    This function implements the Gradient Boosting for Classification example
    as shown in the presentation slides.
    """
    print("\n\n" + "="*50)
    print(" Gradient Boosting for Classification Example")
    print("="*50)

    # --- Step 1: Initial Data and First Prediction ---
    # As per Slide 17
    data = {
        'Age': [8, 10, 12, 30, 35, 60],
        'Place of living': ['Village', 'City', 'Village', 'Village', 'City', 'City'],
        'IQ-score_cat': ['Low', 'Low', 'Low', 'High', 'High', 'Low']
    }
    df = pd.DataFrame(data)
    # Encode target variable: Low = 1, High = 0
    df['Y'] = np.where(df['IQ-score_cat'] == 'Low', 1, 0)
    print("\n--- Initial Dataset (Slide 17) ---\n")
    print(df)

    # Calculate initial log(odds)
    # As per Slide 19
    num_low = df[df['Y'] == 1].shape[0]
    num_high = df[df['Y'] == 0].shape[0]
    initial_log_odds = np.log(num_low / num_high)

    # Calculate initial probability
    initial_prob = np.exp(initial_log_odds) / (1 + np.exp(initial_log_odds))

    df['Log_Odds_1'] = initial_log_odds
    df['Prob_1'] = initial_prob
    df['Residual_1'] = df['Y'] - df['Prob_1']

    print(f"\n\n--- Step 1: Initial Prediction and Residuals (Slide 19) ---")
    print(f"Initial Log(Odds): {initial_log_odds:.7f}")
    print(f"Initial Probability: {initial_prob:.7f}")
    print("\nDataFrame with Initial Prediction and Residuals:\n")
    print(df[['Age', 'Y', 'Log_Odds_1', 'Prob_1', 'Residual_1']])

    # --- Step 2: Build the First Decision Tree ---
    # As per Slide 20
    print("\n\n--- Step 2: Building the First Tree (Finding best split for 'Age') (Slide 20) ---")
    ages = sorted(df['Age'].unique())
    split_points = [(ages[i] + ages[i+1]) / 2 for i in range(len(ages)-1)]
    min_ssr = float('inf')
    best_split = None

    print("\nCalculating Sum of Squared Residuals (SSR) for each potential split:")
    for split in split_points:
        left_mask = df['Age'] < split
        right_mask = df['Age'] >= split

        avg_left = df.loc[left_mask, 'Residual_1'].mean()
        avg_right = df.loc[right_mask, 'Residual_1'].mean()

        ssr_left = ((df.loc[left_mask, 'Residual_1'] - avg_left)**2).sum()
        ssr_right = ((df.loc[right_mask, 'Residual_1'] - avg_right)**2).sum()
        total_ssr = ssr_left + ssr_right

        print(f"Split at Age < {split}: Left Avg={avg_left:.7f}, Right Avg={avg_right:.7f}, Total SSR={total_ssr:.7f}")

        if total_ssr < min_ssr:
            min_ssr = total_ssr
            best_split = split

    print(f"\nBest Split for 'Age' is at: {best_split} with a minimum SSR of {min_ssr:.7f} (Matches Slide 21)")

    # --- Step 3: Calculate Leaf Output Values ---
    # As per Slide 22
    print("\n\n--- Step 3: Calculating Leaf Output Values (Slide 22) ---")
    left_mask = df['Age'] < best_split
    right_mask = df['Age'] >= best_split

    numerator_left = df.loc[left_mask, 'Residual_1'].sum()
    denominator_left = (df.loc[left_mask, 'Prob_1'] * (1 - df.loc[left_mask, 'Prob_1'])).sum()
    leaf_value_left = numerator_left / denominator_left

    numerator_right = df.loc[right_mask, 'Residual_1'].sum()
    denominator_right = (df.loc[right_mask, 'Prob_1'] * (1 - df.loc[right_mask, 'Prob_1'])).sum()
    leaf_value_right = numerator_right / denominator_right

    print(f"First Tree Leaf Values:")
    print(f"  - Leaf 1 (Age < {best_split}): {leaf_value_left:.7f} (Matches ~1.49-1.50 on Slide 23)")
    print(f"  - Leaf 2 (Age >= {best_split}): {leaf_value_right:.7f} (Matches ~-1.52 on Slide 23)")

    # --- Step 4: Update Log(Odds) and Probabilities ---
    # As per Slide 24
    learning_rate = 0.6 # From slide 23
    df['Tree1_Output'] = np.where(df['Age'] < best_split, leaf_value_left, leaf_value_right)
    df['Log_Odds_2'] = df['Log_Odds_1'] + learning_rate * df['Tree1_Output']
    df['Prob_2'] = np.exp(df['Log_Odds_2']) / (1 + np.exp(df['Log_Odds_2']))
    df['Residual_2'] = df['Y'] - df['Prob_2']

    print(f"\n\n--- Step 4: Update Log(Odds) and Probabilities (Learning Rate = {learning_rate}) (Slide 24) ---")
    print("\nFinal DataFrame with updated probabilities:\n")
    # To match the slide's final table format
    final_output = df[['Age', 'Place of living', 'IQ-score_cat', 'Y', 'Prob_2', 'Residual_2', 'Log_Odds_2']]
    final_output = final_output.rename(columns={
        'IQ-score_cat': 'IQ-score',
        'Y': 'Encoded IQ-score (Y)',
        'Prob_2': 'Predicted Probability',
        'Residual_2': 'Residual',
        'Log_Odds_2': 'log(Odds)'
    })
    print(final_output)


# Run both walkthroughs, regression first, only when executed as a script.
if __name__ == "__main__":
    for example in (run_regression_example, run_classification_example):
        example()


 Gradient Boosting for Regression Example

--- Initial Dataset (Slide 2) ---

   Age Place of living  IQ-score
0    8         Village         1
1   10            City         3
2   12         Village         1
3   30         Village         4
4   35            City         5
5   60            City         2


--- Step 1: Initial Prediction and Residuals (Slide 3) ---
Initial Prediction (Average IQ-Score): 2.6666667

DataFrame with Initial Prediction and Residuals:

   Age Place of living  IQ-score  Predicted_IQ  Pseudo_Residual_1
0    8         Village         1     2.6666667         -1.6666667
1   10            City         3     2.6666667          0.3333333
2   12         Village         1     2.6666667         -1.6666667
3   30         Village         4     2.6666667          1.3333333
4   35            City         5     2.6666667          2.3333333
5   60            City         2     2.6666667         -0.6666667


--- Step 2: Building the First Tree (Finding best split for 'Age')