<a href="https://colab.research.google.com/github/bcdanl/320-code/blob/main/danl_320_script_2025_0224.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Classwork 8 - Dummy Variable Linear Regression

In [1]:
import pandas as pd
import numpy as np
from tabulate import tabulate  # for table summary
import scipy.stats as stats
import matplotlib.pyplot as plt
import statsmodels.api as sm  # for lowess smoothing

from pyspark.sql import SparkSession
from pyspark.sql.functions import rand, col, pow, mean, when, log
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression

spark = SparkSession.builder.master("local[*]").getOrCreate()

# UDF for Regression Table

In [2]:
def regression_table(model, assembler):
    """
    Creates a formatted regression table from a fitted LinearRegression model and its VectorAssembler,
    and inserts a dashed horizontal line after the Intercept row. The table includes separate columns
    for the 95% confidence interval lower and upper bounds for each coefficient (computed at the 5% significance level)
    and an "Observations" row (using model.summary.numInstances) above the R² row.
    The RMSE row is placed as the last row.

    The columns are ordered as:
        Metric | Value | Significance | Std. Error | p-value | 95% CI Lower | 95% CI Upper

    For the "Value", "Std. Error", "95% CI Lower", and "95% CI Upper" columns, commas are inserted every three digits,
    with 3 decimal places (except for Observations which is formatted as an integer with commas).

    Parameters:
        model: A fitted LinearRegression model (with a .summary attribute).
        assembler: The VectorAssembler used to assemble the features for the model.

    Returns:
        A formatted string containing the regression table.
    """
    # Extract coefficients and standard errors as NumPy arrays
    coeffs = model.coefficients.toArray()
    std_errors_all = np.array(model.summary.coefficientStandardErrors)

    # Check if the intercept's standard error is included (one extra element)
    if len(std_errors_all) == len(coeffs) + 1:
        intercept_se = std_errors_all[0]
        std_errors = std_errors_all[1:]
    else:
        intercept_se = None
        std_errors = std_errors_all

    # Compute t-statistics for feature coefficients (t = beta / SE(beta))
    # t_stats = coeffs / std_errors
    t_stats = model.summary.tValues

    # Degrees of freedom: number of instances minus number of predictors minus 1 (for intercept)
    df = model.summary.numInstances - len(coeffs) - 1

    # Compute the t-critical value for a 95% confidence interval (two-tailed, 5% significance)
    t_critical = stats.t.ppf(0.975, df)

    # Compute two-tailed p-values for each feature coefficient
    # p_values = [2 * (1 - stats.t.cdf(np.abs(t), df)) for t in t_stats]
    p_values = model.summary.pValues

    # Function to assign significance stars based on p-value
    def significance_stars(p):
        if p < 0.01:
            return "***"
        elif p < 0.05:
            return "**"
        elif p < 0.1:
            return "*"
        else:
            return ""

    # Build the table rows.
    # Order: Metric, Value, Significance, Std. Error, p-value, 95% CI Lower, 95% CI Upper.
    table = []
    for feature, beta, se, p in zip(assembler.getInputCols(), coeffs, std_errors, p_values):
        ci_lower = beta - t_critical * se
        ci_upper = beta + t_critical * se
        table.append([
            "Beta: " + feature,       # Metric name
            beta,                     # Beta estimate (Value)
            significance_stars(p),    # Significance stars
            se,                       # Standard error
            p,                        # p-value
            ci_lower,                 # 95% CI lower bound
            ci_upper                  # 95% CI upper bound
        ])

    # Compute and add the intercept row with its SE, p-value, significance, and CI (if available)
    if intercept_se is not None:
        intercept_t = model.intercept / intercept_se
        intercept_p = 2 * (1 - stats.t.cdf(np.abs(intercept_t), df))
        intercept_sig = significance_stars(intercept_p)
        ci_intercept_lower = model.intercept - t_critical * intercept_se
        ci_intercept_upper = model.intercept + t_critical * intercept_se
    else:
        intercept_se = ""
        intercept_p = ""
        intercept_sig = ""
        ci_intercept_lower = ""
        ci_intercept_upper = ""

    table.append([
        "Intercept",
        model.intercept,
        intercept_sig,
        intercept_se,
        intercept_p,
        ci_intercept_lower,
        ci_intercept_upper
    ])

    # Append overall model metrics:
    # Insert an Observations row using model.summary.numInstances,
    # then an R² row, and finally the RMSE row as the last row.
    table.append(["Observations", model.summary.numInstances, "", "", "", "", ""])
    table.append(["R²", model.summary.r2, "", "", "", "", ""])
    table.append(["RMSE", model.summary.rootMeanSquaredError, "", "", "", "", ""])

    # Format the table.
    # For the "Value" (index 1), "Std. Error" (index 3), "95% CI Lower" (index 5), and "95% CI Upper" (index 6) columns,
    # format with commas and 3 decimal places, except for Observations which should be an integer with commas.
    # For the p-value (index 4), format to 3 decimal places.
    formatted_table = []
    for row in table:
        formatted_row = []
        for i, item in enumerate(row):
            if row[0] == "Observations" and i == 1 and isinstance(item, (int, float, np.floating)) and item != "":
                # Format Observations as integer with commas, no decimals.
                formatted_row.append(f"{int(item):,}")
            elif isinstance(item, (int, float, np.floating)) and item != "":
                if i in [1, 3, 5, 6]:
                    formatted_row.append(f"{item:,.3f}")
                elif i == 4:
                    formatted_row.append(f"{item:.3f}")
                else:
                    formatted_row.append(f"{item:.3f}")
            else:
                formatted_row.append(item)
        formatted_table.append(formatted_row)

    # Generate the table string using tabulate.
    table_str = tabulate(
        formatted_table,
        headers=["Metric", "Value", "Sig.", "Std. Error", "p-value", "95% CI Lower", "95% CI Upper"],
        tablefmt="pretty",
        colalign=("left", "right", "center", "right", "right", "right", "right")
    )

    # Insert a dashed line after the Intercept row for clarity.
    lines = table_str.split("\n")
    dash_line = '-' * len(lines[0])
    for i, line in enumerate(lines):
        if "Intercept" in line and not line.strip().startswith('+'):
            lines.insert(i+1, dash_line)
            break

    return "\n".join(lines)

# Example usage:
# print(regression_table(MODEL, ASSEMBLER))

## UDF for dummies

In [3]:
def add_dummy_variables(var_name, reference_level, category_order=None):
    """
    Creates dummy variables for the specified column in the global DataFrames dtrain and dtest.
    Allows manual setting of category order.

    Parameters:
        var_name (str): The name of the categorical column (e.g., "borough_name").
        reference_level (int): Index of the category to be used as the reference (dummy omitted).
        category_order (list, optional): List of categories in the desired order. If None, categories are sorted.

    Returns:
        dummy_cols (list): List of dummy column names excluding the reference category.
        ref_category (str): The category chosen as the reference.
    """
    global dtrain, dtest

    # Get distinct categories from the training set.
    categories = dtrain.select(var_name).distinct().rdd.flatMap(lambda x: x).collect()

    # Convert booleans to strings if present.
    categories = [str(c) if isinstance(c, bool) else c for c in categories]

    # Use manual category order if provided; otherwise, sort categories.
    if category_order:
        # Ensure all categories are present in the user-defined order
        missing = set(categories) - set(category_order)
        if missing:
            raise ValueError(f"These categories are missing from your custom order: {missing}")
        categories = category_order
    else:
        categories = sorted(categories)

    # Validate reference_level
    if reference_level < 0 or reference_level >= len(categories):
        raise ValueError(f"reference_level must be between 0 and {len(categories) - 1}")

    # Define the reference category
    ref_category = categories[reference_level]
    print("Reference category (dummy omitted):", ref_category)

    # Create dummy variables for all categories
    for cat in categories:
        dummy_col_name = var_name + "_" + str(cat).replace(" ", "_")
        dtrain = dtrain.withColumn(dummy_col_name, when(col(var_name) == cat, 1).otherwise(0))
        dtest = dtest.withColumn(dummy_col_name, when(col(var_name) == cat, 1).otherwise(0))

    # List of dummy columns, excluding the reference category
    dummy_cols = [var_name + "_" + str(cat).replace(" ", "_") for cat in categories if cat != ref_category]

    return dummy_cols, ref_category


# Example usage without category_order:
# dummy_cols_year, ref_category_year = add_dummy_variables('year', 0)

# Example usage with category_order:
# custom_order_wkday = ['sunday', 'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday']
# dummy_cols_wkday, ref_category_wkday = add_dummy_variables('wkday', reference_level=0, category_order = custom_order_wkday)

# Classwork 8 - Dummy Variable Regression

## Q1

In [4]:
# 1. Read CSV data from URL
df_pd = pd.read_csv('https://bcdanl.github.io/data/bikeshare_cleaned.csv')
df = spark.createDataFrame(df_pd)
df.show()

+---+----+-----+----+---+--------+-------+-------+--------------------+------------------+-----------------+------------------+
|cnt|year|month|date| hr|   wkday|holiday|seasons|        weather_cond|              temp|              hum|         windspeed|
+---+----+-----+----+---+--------+-------+-------+--------------------+------------------+-----------------+------------------+
| 16|2011|    1|   1|  0|saturday|      0| spring| Clear or Few Cloudy| -1.33460918694128|0.947345243330896|  -1.5538438052971|
| 40|2011|    1|   1|  1|saturday|      0| spring| Clear or Few Cloudy| -1.43847500990342|0.895512927978679|  -1.5538438052971|
| 32|2011|    1|   1|  2|saturday|      0| spring| Clear or Few Cloudy| -1.43847500990342|0.895512927978679|  -1.5538438052971|
| 13|2011|    1|   1|  3|saturday|      0| spring| Clear or Few Cloudy| -1.33460918694128|0.636351351217591|  -1.5538438052971|
|  1|2011|    1|   1|  4|saturday|      0| spring| Clear or Few Cloudy| -1.33460918694128|0.636351351217

In [7]:
df.count()

17376

In [6]:
(
    df
    .select('year')
    .distinct()
    .show()
)

+----+
|year|
+----+
|2011|
|2012|
+----+



In [10]:
(
    df
    .select('hr')
    .distinct()
    .orderBy('hr')
    .show(25)
)

+---+
| hr|
+---+
|  0|
|  1|
|  2|
|  3|
|  4|
|  5|
|  6|
|  7|
|  8|
|  9|
| 10|
| 11|
| 12|
| 13|
| 14|
| 15|
| 16|
| 17|
| 18|
| 19|
| 20|
| 21|
| 22|
| 23|
+---+



In [11]:
df.describe().show()

+-------+-----------------+------------------+------------------+------------------+------------------+---------+--------------------+-------+-------------------+--------------------+--------------------+--------------------+
|summary|              cnt|              year|             month|              date|                hr|    wkday|             holiday|seasons|       weather_cond|                temp|                 hum|           windspeed|
+-------+-----------------+------------------+------------------+------------------+------------------+---------+--------------------+-------+-------------------+--------------------+--------------------+--------------------+
|  count|            17376|             17376|             17376|             17376|             17376|    17376|               17376|  17376|              17376|               17376|               17376|               17376|
|   mean|189.4829650092081| 2011.502532228361| 6.538731583793738|15.682895948434622|11.546731123

In [None]:
df.describe().show()

In [12]:
(
    df
    .select('holiday')
    .distinct()
    .show()
)

+-------+
|holiday|
+-------+
|      0|
|      1|
+-------+



In [14]:
df.groupBy("holiday").count().show()

+-------+-----+
|holiday|count|
+-------+-----+
|      0|16876|
|      1|  500|
+-------+-----+



In [16]:
df.groupBy("weather_cond").count().show(truncate = False)

+------------------------+-----+
|weather_cond            |count|
+------------------------+-----+
|Light Snow or Light Rain|1419 |
|Clear or Few Cloudy     |11413|
|Mist or Cloudy          |4544 |
+------------------------+-----+



In [17]:
df.groupBy("hr").count().show()

+---+-----+
| hr|count|
+---+-----+
| 19|  728|
|  0|  726|
| 22|  728|
|  7|  727|
|  6|  725|
|  9|  727|
| 17|  730|
|  5|  717|
|  1|  723|
| 10|  727|
|  3|  697|
| 12|  728|
|  8|  727|
| 11|  727|
|  2|  715|
|  4|  697|
| 13|  729|
| 18|  727|
| 14|  729|
| 21|  728|
+---+-----+
only showing top 20 rows



In [19]:
df.groupBy('seasons').count().show()

+-------+-----+
|seasons|count|
+-------+-----+
| winter| 4232|
| summer| 4409|
| spring| 4239|
|   fall| 4496|
+-------+-----+



## Q2

In [18]:
dtrain, dtest = df.randomSplit([0.6, 0.4], seed = 1234)


## Q3-Q4

In [20]:
dummy_cols_year, ref_category_year = add_dummy_variables('year', 0)
dummy_cols_month, ref_category_month = add_dummy_variables('month', 0)
dummy_cols_hr, ref_category_hr = add_dummy_variables('hr', 0)

# Example usage with category_order:
custom_order_wkday = ['sunday', 'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday']
dummy_cols_wkday, ref_category_wkday = add_dummy_variables('wkday', reference_level=0, category_order = custom_order_wkday)

dummy_cols_holiday, ref_category_holiday = add_dummy_variables('holiday', 0)

# Example usage with category_order:
custom_order_seasons = ['spring', 'summer', 'fall', 'winter']
dummy_cols_seasons, ref_category_seasons = add_dummy_variables('seasons', 0, custom_order_seasons)

dummy_cols_weather_cond, ref_category_weather_cond = add_dummy_variables('weather_cond', 0)


Reference category (dummy omitted): 2011
Reference category (dummy omitted): 1
Reference category (dummy omitted): 0
Reference category (dummy omitted): sunday
Reference category (dummy omitted): 0
Reference category (dummy omitted): spring
Reference category (dummy omitted): Clear or Few Cloudy


In [21]:
dtrain.show()

+---+----+-----+----+---+---------+-------+-------+--------------------+------------------+------------------+------------------+---------+---------+-------+-------+-------+-------+-------+-------+-------+-------+-------+--------+--------+--------+----+----+----+----+----+----+----+----+----+----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+------------+------------+-------------+---------------+--------------+------------+--------------+---------+---------+--------------+--------------+------------+--------------+--------------------------------+-------------------------------------+---------------------------+
|cnt|year|month|date| hr|    wkday|holiday|seasons|        weather_cond|              temp|               hum|         windspeed|year_2011|year_2012|month_1|month_2|month_3|month_4|month_5|month_6|month_7|month_8|month_9|month_10|month_11|month_12|hr_0|hr_1|hr_2|hr_3|hr_4|hr_5|hr_6|hr_7|hr_8|hr_9|hr_10|hr_11|hr_12|hr_13|hr_14|hr_15|hr_16|h

In [24]:
conti_cols = ["temp", "hum", "windspeed"]
assembler_predictors = conti_cols + dummy_cols_year + dummy_cols_month + dummy_cols_hr + dummy_cols_wkday + dummy_cols_holiday + dummy_cols_seasons + dummy_cols_weather_cond

assembler = VectorAssembler(
    inputCols = assembler_predictors,
    outputCol = "predictors"
)
dtrain_dum = assembler.transform(dtrain)
dtest_dum  = assembler.transform(dtest)
model_dum = LinearRegression(featuresCol="predictors", labelCol="cnt").fit(dtrain_dum)
dtest_dum = model_dum.transform(dtest_dum) # Q4. adding prediction column to test data

# For model_dum and assembler:
print( regression_table(model_dum, assembler) )

+---------------------------------------------+---------+------+------------+---------+--------------+--------------+
| Metric                                      |   Value | Sig. | Std. Error | p-value | 95% CI Lower | 95% CI Upper |
+---------------------------------------------+---------+------+------------+---------+--------------+--------------+
| Beta: temp                                  |  45.307 | ***  |      1.395 |   0.000 |       42.573 |       48.042 |
| Beta: hum                                   | -17.193 | ***  |      1.095 |   0.000 |      -19.339 |      -15.046 |
| Beta: windspeed                             |  -4.617 | ***  |      2.029 |   0.000 |       -8.595 |       -0.639 |
| Beta: year_2012                             |  85.361 | ***  |      5.129 |   0.000 |       75.308 |       95.414 |
| Beta: month_2                               |   8.508 |  *   |      5.765 |   0.097 |       -2.792 |       19.809 |
| Beta: month_3                               |  18.322 