# A08 Bootstrapping

### 1. Regresión lineal simple

In [127]:
import numpy as np
import pandas as pd
import statsmodels.api as sm

In [129]:
adv = pd.read_csv("Advertising.csv") 
adv.head()

Unnamed: 0.1,Unnamed: 0,TV,radio,newspaper,sales
0,1,230.1,37.8,69.2,22.1
1,2,44.5,39.3,45.1,10.4
2,3,17.2,45.9,69.3,9.3
3,4,151.5,41.3,58.5,18.5
4,5,180.8,10.8,58.4,12.9


In [131]:
X_adv = adv[['TV', 'radio', 'newspaper']]   
X_adv = sm.add_constant(X_adv)             
y_adv = adv['sales']

In [133]:
ols_adv = sm.OLS(y_adv, X_adv).fit()
print(ols_adv.summary())

                            OLS Regression Results                            
Dep. Variable:                  sales   R-squared:                       0.897
Model:                            OLS   Adj. R-squared:                  0.896
Method:                 Least Squares   F-statistic:                     570.3
Date:                Thu, 20 Nov 2025   Prob (F-statistic):           1.58e-96
Time:                        17:42:24   Log-Likelihood:                -386.18
No. Observations:                 200   AIC:                             780.4
Df Residuals:                     196   BIC:                             793.6
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          2.9389      0.312      9.422      0.0

In [135]:
beta_ols_adv = ols_adv.params      
se_ols_adv   = ols_adv.bse         
print(beta_ols_adv)
print(se_ols_adv)

const        2.938889
TV           0.045765
radio        0.188530
newspaper   -0.001037
dtype: float64
const        0.311908
TV           0.001395
radio        0.008611
newspaper    0.005871
dtype: float64


### 2. Regresión logística simple

In [138]:
import numpy as np
import pandas as pd
import statsmodels.api as sm

default = pd.read_csv("Default.csv")
default.head()

Unnamed: 0,default,student,balance,income
0,No,No,729.526495,44361.625074
1,No,Yes,817.180407,12106.1347
2,No,No,1073.549164,31767.13895
3,No,No,529.250605,35704.49394
4,No,No,785.655883,38463.49588


In [140]:
default["default_bin"] = (default["default"] == "Yes").astype(int)
default["student_bin"] = (default["student"] == "Yes").astype(int)
X_def = default[["student_bin", "balance", "income"]]
X_def = sm.add_constant(X_def)

y_def = default["default_bin"]

logit_def = sm.Logit(y_def, X_def).fit()
print(logit_def.summary())

Optimization terminated successfully.
         Current function value: 0.078577
         Iterations 10
                           Logit Regression Results                           
Dep. Variable:            default_bin   No. Observations:                10000
Model:                          Logit   Df Residuals:                     9996
Method:                           MLE   Df Model:                            3
Date:                Thu, 20 Nov 2025   Pseudo R-squ.:                  0.4619
Time:                        17:42:26   Log-Likelihood:                -785.77
converged:                       True   LL-Null:                       -1460.3
Covariance Type:            nonrobust   LLR p-value:                3.257e-292
                  coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------
const         -10.8690      0.492    -22.079      0.000     -11.834      -9.904
student_bin    -0.6468   

In [142]:
beta_logit_def = logit_def.params
se_logit_def   = logit_def.bse

print(beta_logit_def)
print(se_logit_def)

const         -10.869045
student_bin    -0.646776
balance         0.005737
income          0.000003
dtype: float64
const          0.492273
student_bin    0.236257
balance        0.000232
income         0.000008
dtype: float64


### Bootstrap para Advertising

In [145]:
n_boot = 1000
n_adv = adv.shape[0]

boot_betas_adv = np.zeros((n_boot, 4))

rng = np.random.default_rng(123)

for b in range(n_boot):
    idx = rng.integers(0, n_adv, n_adv)
    sample = adv.iloc[idx]

    X_b = sm.add_constant(sample[['TV', 'radio', 'newspaper']])
    y_b = sample['sales']   

    model_b = sm.OLS(y_b, X_b).fit()
    boot_betas_adv[b, :] = model_b.params.values

boot_mean_adv = boot_betas_adv.mean(axis=0)
boot_se_adv   = boot_betas_adv.std(axis=0, ddof=1)

print("Media coeficientes Advertising bootstrap:", boot_mean_adv)
print("Desviación estándar coeficientes Advertising bootstrap:", boot_se_adv)

Media coeficientes Advertising bootstrap: [ 2.94664138e+00  4.56559526e-02  1.89032304e-01 -9.31146995e-04]
Desviación estándar coeficientes Advertising bootstrap: [0.32677421 0.00190832 0.01067933 0.0064047 ]


### Bootstrap para Default

In [147]:
n_boot = 1000
n_def  = default.shape[0]
boot_betas_def = np.zeros((n_boot, 4))

rng = np.random.default_rng(123)

for b in range(n_boot):
    idx = rng.integers(0, n_def, n_def)   
    sample = default.iloc[idx]

    X_b = sample[["student_bin", "balance", "income"]]
    X_b = sm.add_constant(X_b)
    
    y_b = sample["default_bin"]

    try:
        model_b = sm.Logit(y_b, X_b).fit(disp=0)
        boot_betas_def[b, :] = model_b.params.values
    except:
        continue

boot_mean_def = boot_betas_def.mean(axis=0)
boot_se_def   = boot_betas_def.std(axis=0, ddof=1)

print("Media coeficientes Default bootstrap:")
print(boot_mean_def)

print("Desviación estándar coeficientes Default bootstrap:")
print(boot_se_def)

Media coeficientes Default bootstrap:
[-1.09133623e+01 -6.44331727e-01  5.76067369e-03  2.97711275e-06]
Desviación estándar coeficientes Default bootstrap:
[5.02920147e-01 2.35825921e-01 2.38567923e-04 8.21123444e-06]


### Tabla comparativa método teórico vs bootstrap

In [149]:
import pandas as pd

# Construir tabla comparativa
tabla = pd.DataFrame({
    "Modelo": [
        "Advertising",
        "Advertising",
        "Advertising",
        "Advertising",
        "Default",
        "Default",
        "Default",
        "Default"
    ],
    "Tipo": [
        "Coef OLS", 
        "SE OLS",
        "Media Bootstrap",
        "SE Bootstrap",
        "Coef Logit",
        "SE Logit",
        "Media Bootstrap",
        "SE Bootstrap"
    ],
    "const": [
        beta_ols_adv.values[0],
        se_ols_adv.values[0],
        boot_mean_adv[0],
        boot_se_adv[0],
        beta_logit_def.values[0],
        se_logit_def.values[0],
        boot_mean_def[0],
        boot_se_def[0]
    ],
    "v1": [
        beta_ols_adv.values[1],
        se_ols_adv.values[1],
        boot_mean_adv[1],
        boot_se_adv[1],
        beta_logit_def.values[1],
        se_logit_def.values[1],
        boot_mean_def[1],
        boot_se_def[1]
    ],
    "v2": [
        beta_ols_adv.values[2],
        se_ols_adv.values[2],
        boot_mean_adv[2],
        boot_se_adv[2],
        beta_logit_def.values[2],
        se_logit_def.values[2],
        boot_mean_def[2],
        boot_se_def[2]
    ],
    "v3": [
        beta_ols_adv.values[3],
        se_ols_adv.values[3],
        boot_mean_adv[3],
        boot_se_adv[3],
        beta_logit_def.values[3],
        se_logit_def.values[3],
        boot_mean_def[3],
        boot_se_def[3]
    ]
})

tabla

Unnamed: 0,Modelo,Tipo,const,v1,v2,v3
0,Advertising,Coef OLS,2.938889,0.045765,0.18853,-0.001037
1,Advertising,SE OLS,0.311908,0.001395,0.008611,0.005871
2,Advertising,Media Bootstrap,2.946641,0.045656,0.189032,-0.000931
3,Advertising,SE Bootstrap,0.326774,0.001908,0.010679,0.006405
4,Default,Coef Logit,-10.869045,-0.646776,0.005737,3e-06
5,Default,SE Logit,0.492273,0.236257,0.000232,8e-06
6,Default,Media Bootstrap,-10.913362,-0.644332,0.005761,3e-06
7,Default,SE Bootstrap,0.50292,0.235826,0.000239,8e-06


### L2 + Advertising + bootstrap

In [151]:
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.model_selection import KFold

X_adv_np = adv[['TV', 'radio', 'newspaper']].values
y_adv_np = adv['sales'].values

alphas = np.logspace(-4, 4, 50)

cv = KFold(n_splits=5, shuffle=True, random_state=123)

ridge_cv = RidgeCV(alphas=alphas, cv=cv, store_cv_values=False)
ridge_cv.fit(X_adv_np, y_adv_np)

best_alpha = ridge_cv.alpha_
print("Mejor alpha (Ridge, Advertising):", best_alpha)

ridge_final = Ridge(alpha=best_alpha)
ridge_final.fit(X_adv_np, y_adv_np)

beta0_ridge = ridge_final.intercept_
beta1_ridge = ridge_final.coef_[0]
print("Coeficientes Ridge Advertising (final):", beta0_ridge, beta1_ridge)



Mejor alpha (Ridge, Advertising): 159.98587196060572
Coeficientes Ridge Advertising (final): 2.951339866002259 0.04576434181258043


In [152]:
n_boot = 1000
n_adv  = adv.shape[0]

boot_betas_ridge = np.zeros((n_boot, 4))

rng = np.random.default_rng(123)

for b in range(n_boot):
    idx = rng.integers(0, n_adv, n_adv)
    sample = adv.iloc[idx]

    X_b = sample[['TV', 'radio', 'newspaper']].values
    y_b = sample['sales'].values

    model_b = Ridge(alpha=best_alpha)
    model_b.fit(X_b, y_b)

    boot_betas_ridge[b, 0] = model_b.intercept_
    boot_betas_ridge[b, 1:] = model_b.coef_

boot_mean_ridge = boot_betas_ridge.mean(axis=0)
boot_se_ridge   = boot_betas_ridge.std(axis=0, ddof=1)

print("Media coeficientes Ridge (bootstrap):", boot_mean_ridge)
print("Desviación estándar coeficientes Ridge (bootstrap):", boot_se_ridge)

Media coeficientes Ridge (bootstrap): [ 2.95927679e+00  4.56558544e-02  1.88230659e-01 -7.35261920e-04]
Desviación estándar coeficientes Ridge (bootstrap): [0.32576627 0.00190708 0.01063885 0.00639232]
