##  “Are there synergies and tradeoffs in sustainable heating from cleaner stoves and home insulation? Evidence from air pollution control policies in southern Chile” (Ref.: ENEECO-D-24-01813)



###  Table 5

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler
from linearmodels.panel import PanelOLS
# Set datetime string
from datetime import datetime
from sklearn.metrics import pairwise_distances
datetime_string = datetime.now().strftime("%Y%m%d_%H%M%S")

####   Load the dataset

In [2]:
# Read the Stata file, keep the rows with tper < 2, and restrict the frame
# to the ID, T2 and covariate columns listed below.
file_path = "BDTemuco.dta"  # Update with the correct path
variables_to_keep = [
    "ID", "T2", "Education", "GenderHHead1women", "Age",
    "DwellingType1single", "NSize", "Anypersonolder60", "Perc_concrete_built",
]
df = pd.read_stata(file_path)
df = df.loc[df["tper"] < 2, variables_to_keep]

# Fill missing entries of the listed columns with that column's median.
columns_to_impute = ["Education", "GenderHHead1women", "Age", "DwellingType1single"]
for col in columns_to_impute:
    median_value = df[col].median()
    df[col] = df[col].fillna(median_value)

#### Function to extract results

In [3]:
# Helper that summarises the T2ON term of a fitted model
def extract_t2on_results(results, model_name):
    """
    Collect the estimate for the "T2ON" term into a flat dictionary.

    Parameters
    ----------
    results : object
        Fitted-model results exposing ``params``, ``std_errors``, ``tstats``
        and ``pvalues``, each indexable by term name (used below with
        PanelOLS results).
    model_name : str
        Label stored under the "Model" key.

    Returns
    -------
    dict
        Keys: "Model", "INSULATION*ON Coefficient", "Std. Error", "T-stat",
        "P-value", "CI Lower", "CI Upper". The interval bounds are the
        coefficient minus/plus 1.96 standard errors.
    """
    term = "T2ON"
    coef = results.params[term]
    se = results.std_errors[term]
    margin = 1.96 * se  # half-width of the reported interval

    return {
        "Model": model_name,
        "INSULATION*ON Coefficient": coef,
        "Std. Error": se,
        "T-stat": results.tstats[term],
        "P-value": results.pvalues[term],
        "CI Lower": coef - margin,
        "CI Upper": coef + margin,
    }


# Empty frame that later cells extend with one T2ON row per model
t2on_results_df = pd.DataFrame()


In [4]:
# Helper that summarises the T1T2ON term of a fitted model
def extract_t1t2on_results(results, model_name):
    """
    Collect the estimate for the "T1T2ON" term into a flat dictionary.

    Parameters
    ----------
    results : object
        Fitted-model results exposing ``params``, ``std_errors``, ``tstats``
        and ``pvalues``, each indexable by term name (used below with
        PanelOLS results).
    model_name : str
        Label stored under the "Model" key.

    Returns
    -------
    dict
        Keys: "Model", "PELLET*INSULATION*ON Coefficient", "Std. Error",
        "T-stat", "P-value", "CI Lower", "CI Upper". The interval bounds are
        the coefficient minus/plus 1.96 standard errors.
    """
    term = "T1T2ON"
    coef = results.params[term]
    se = results.std_errors[term]
    margin = 1.96 * se  # half-width of the reported interval

    return {
        "Model": model_name,
        "PELLET*INSULATION*ON Coefficient": coef,
        "Std. Error": se,
        "T-stat": results.tstats[term],
        "P-value": results.pvalues[term],
        "CI Lower": coef - margin,
        "CI Upper": coef + margin,
    }


# Empty frame that later cells extend with one T1T2ON row per model
t1t2on_results_df = pd.DataFrame()


#### Table 5, column 3. The Effect of Pellet Stoves on PM2.5 Concentration (hourly observations) - without matching

In [5]:
# Model 1: entity-fixed-effects regression on the panel, without matching weights.

# Dependent and independent variables of the regression. Later cells reuse
# these two names when they build their own formula.
dependent_var = "log_PMi"
independent_vars = ["ON", "T2ON","T1T2ON", "log_PMo", "log_To", "DayExp", "Cycle"]

# Load the dataset and keep only the identifiers and the model variables
file_path = "BDTemuco.dta"  # Update with the correct path
df = pd.read_stata(file_path)
relevant_vars = ["ID","IDHour","T2ON","T1T2ON", "log_PMo", "log_To", "DayExp", "Cycle","ON", "log_PMi"]
df = df[relevant_vars]
# Drop rows with a missing value in any model variable. The list is derived
# from the variables above so the two cannot drift apart.
df = df.dropna(subset=[dependent_var] + independent_vars)

# Save the estimation sample
panel_data_path = f"PanelBDTemuco_{datetime_string}.csv"
df.to_csv(panel_data_path, index=False)

# Index by (entity, time) = (ID, IDHour), as required for the panel estimator
df = df.sort_values(by=["ID", "IDHour"])
df = df.set_index(["ID", "IDHour"])

# Fixed-effects model with entity effects only (no time effects);
# PanelOLS is imported in the notebook's import cell.
model = PanelOLS.from_formula(
    f"{dependent_var} ~ { ' + '.join(independent_vars)} + EntityEffects",
    data=df,
    drop_absorbed=True
)
# Standard errors clustered by entity
results = model.fit(cov_type="clustered", cluster_entity=True)

# Extract T2ON results for the current model
t2on_result = extract_t2on_results(results, model_name="model 1 - Without Matching")
t2on_results_df = pd.concat([t2on_results_df, pd.DataFrame([t2on_result])], ignore_index=True)

# Extract T1T2ON results for the current model
t1t2on_result = extract_t1t2on_results(results, model_name="model 1 - Without Matching")
t1t2on_results_df = pd.concat([t1t2on_results_df, pd.DataFrame([t1t2on_result])], ignore_index=True)

print(results.summary)

                          PanelOLS Estimation Summary                           
Dep. Variable:                log_PMi   R-squared:                        0.4839
Estimator:                   PanelOLS   R-squared (Between):              0.7620
No. Observations:               17687   R-squared (Within):               0.4839
Date:                Sun, Dec 08 2024   R-squared (Overall):              0.7593
Time:                        02:09:02   Log-likelihood                   -2784.3
Cov. Estimator:             Clustered                                           
                                        F-statistic:                      2317.6
Entities:                         379   P-value                           0.0000
Avg Obs:                       46.668   Distribution:                 F(7,17301)
Min Obs:                       7.0000                                           
Max Obs:                       49.000   F-statistic (robust):             144.51
                            

In [6]:
# Model 2: nearest-neighbour matching on an estimated propensity score for T2,
# followed by a weighted entity-fixed-effects regression on the hourly panel.
neighbors = 2  # number of control neighbours requested per treated row

# ---- Covariate data for the propensity model ----
file_path = "BDTemuco.dta"  # Update with the correct path
variables_to_keep = ["ID","T2","Education", "GenderHHead1women", "Age", "DwellingType1single", "NSize", "Anypersonolder60", "Perc_concrete_built"]
df = pd.read_stata(file_path)
# Keep rows with tper < 2 and the columns above; copy so the column
# assignments below act on an independent frame.
df = df.loc[df["tper"] < 2, variables_to_keep].copy()

# Impute missing values with the median of each column
columns_to_impute = ["Education", "GenderHHead1women", "Age", "DwellingType1single"]
for col in columns_to_impute:
    median_value = df[col].median()
    df[col] = df[col].fillna(median_value)

# ---- Propensity scores ----
covariates = ["Education", "GenderHHead1women", "Age", "DwellingType1single", "NSize", "Anypersonolder60", "Perc_concrete_built"]
X = sm.add_constant(df[covariates])
y = df["T2"]
# statsmodels logit fit; its in-sample predictions are stored in "logoddsT2".
# NOTE(review): predict() appears to return probabilities rather than
# log-odds, and this column is not used again in this cell - confirm intent.
logit_model = sm.Logit(y, X).fit()
df["logoddsT2"] = logit_model.predict(X)
# The score used for matching comes from scikit-learn's logistic regression
# fitted on standardised regressors.
# NOTE(review): LogisticRegression runs with default settings, which
# presumably include regularisation - confirm this is wanted here.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
logit_model = LogisticRegression(max_iter=500)
df["propensity_score"] = logit_model.fit(X_scaled, y).predict_proba(X_scaled)[:, 1]

# ---- Matching ----
treated = df[df["T2"] == 1]
control = df[df["T2"] == 0]

# For each treated row, find the `neighbors` closest control rows on the
# propensity score and keep those within the caliper.
caliper = 0.01
nn = NearestNeighbors(n_neighbors=neighbors, metric="euclidean")
nn.fit(control[["propensity_score"]])
distances, indices = nn.kneighbors(treated[["propensity_score"]])
# `indices` holds positions within `control`; translate them to index labels.
matches = [
    (treated.index[i], control.index[control_pos])
    for i, dists in enumerate(distances)
    for control_pos in indices[i][dists <= caliper]
]
matched_pairs = pd.DataFrame(matches, columns=["treated_index", "control_index"])

# ---- Weights ----
# Every treated row gets weight 1, including treated rows with no control
# inside the caliper. Each matched control receives 1/k from every treated
# row it is matched to, where k is that treated row's number of kept controls.
df["_weight"] = 0.0
df.loc[treated.index, "_weight"] = 1.0
# sort=False keeps the groups in order of first appearance.
for _, controls in matched_pairs.groupby("treated_index", sort=False)["control_index"]:
    df.loc[controls, "_weight"] += 1 / len(controls)
# Rows left with zero weight are kept with a tiny positive weight instead of
# being dropped; weights are not rescaled.
df["weight"] = df["_weight"].where(df["_weight"] > 0, 1e-08)

# Save weights to file
# NOTE(review): the file name depends only on datetime_string, so every cell
# that uses this pattern writes to the same file.
weights_path = f"Weights_{datetime_string}.csv"
df[["ID", "weight"]].to_csv(weights_path, index=False)

# ---- Hourly panel ----
file_path = "BDTemuco.dta"  # Update with the correct path
df = pd.read_stata(file_path)
relevant_vars = ["ID","IDHour","T2ON","T1T2ON", "log_PMo", "log_To", "DayExp", "Cycle","ON", "log_PMi"]
df = df[relevant_vars]
df = df.dropna(subset=["log_PMi","ON", "T2ON","T1T2ON", "log_PMo", "log_To", "DayExp", "Cycle"])
# Attach one weight per ID; validate guards against a duplicated ID in the
# weights file silently multiplying panel rows.
weights_df = pd.read_csv(weights_path)
df = pd.merge(df, weights_df, on="ID", how="left", validate="many_to_one")
# Save merged dataset
panel_data_path = f"PanelBDTemuco_{datetime_string}.csv"
df.to_csv(panel_data_path, index=False)
# Index by (entity, time) = (ID, IDHour) for the panel estimator
df = df.sort_values(by=["ID", "IDHour"])
df = df.set_index(["ID", "IDHour"])

# ---- Weighted fixed-effects regression ----
# dependent_var and independent_vars are defined in the "without matching" cell.
model = PanelOLS.from_formula(
    f"{dependent_var} ~ { ' + '.join(independent_vars)} + EntityEffects",
    data=df,
    weights=df["weight"],
    drop_absorbed=True
)
results = model.fit(cov_type="clustered", cluster_entity=True)

# Extract T2ON results for the current model
t2on_result = extract_t2on_results(results, model_name="model 2, Nearest Neighbors Matching, n = 2")
t2on_results_df = pd.concat([t2on_results_df, pd.DataFrame([t2on_result])], ignore_index=True)

# Extract T1T2ON results for the current model
t1t2on_result = extract_t1t2on_results(results, model_name="model 2, Nearest Neighbors Matching, n = 2")
t1t2on_results_df = pd.concat([t1t2on_results_df, pd.DataFrame([t1t2on_result])], ignore_index=True)
print(results.summary)

Optimization terminated successfully.
         Current function value: 0.654594
         Iterations 5
                          PanelOLS Estimation Summary                           
Dep. Variable:                log_PMi   R-squared:                        0.4865
Estimator:                   PanelOLS   R-squared (Between):              0.7582
No. Observations:               17687   R-squared (Within):               0.4865
Date:                Sun, Dec 08 2024   R-squared (Overall):              0.7554
Time:                        02:09:02   Log-likelihood                   -2623.1
Cov. Estimator:             Clustered                                           
                                        F-statistic:                      2341.5
Entities:                         379   P-value                           0.0000
Avg Obs:                       46.668   Distribution:                 F(7,17301)
Min Obs:                       7.0000                                           
Max Obs

In [7]:
# Model 3: nearest-neighbour matching on an estimated propensity score for T2,
# followed by a weighted entity-fixed-effects regression on the hourly panel.
neighbors = 3  # number of control neighbours requested per treated row

# ---- Covariate data for the propensity model ----
file_path = "BDTemuco.dta"  # Update with the correct path
variables_to_keep = ["ID","T2","Education", "GenderHHead1women", "Age", "DwellingType1single", "NSize", "Anypersonolder60", "Perc_concrete_built"]
df = pd.read_stata(file_path)
# Keep rows with tper < 2 and the columns above; copy so the column
# assignments below act on an independent frame.
df = df.loc[df["tper"] < 2, variables_to_keep].copy()

# Impute missing values with the median of each column
columns_to_impute = ["Education", "GenderHHead1women", "Age", "DwellingType1single"]
for col in columns_to_impute:
    median_value = df[col].median()
    df[col] = df[col].fillna(median_value)

# ---- Propensity scores ----
covariates = ["Education", "GenderHHead1women", "Age", "DwellingType1single", "NSize", "Anypersonolder60", "Perc_concrete_built"]
X = sm.add_constant(df[covariates])
y = df["T2"]
# statsmodels logit fit; its in-sample predictions are stored in "logoddsT2".
# NOTE(review): predict() appears to return probabilities rather than
# log-odds, and this column is not used again in this cell - confirm intent.
logit_model = sm.Logit(y, X).fit()
df["logoddsT2"] = logit_model.predict(X)
# The score used for matching comes from scikit-learn's logistic regression
# fitted on standardised regressors.
# NOTE(review): LogisticRegression runs with default settings, which
# presumably include regularisation - confirm this is wanted here.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
logit_model = LogisticRegression(max_iter=500)
df["propensity_score"] = logit_model.fit(X_scaled, y).predict_proba(X_scaled)[:, 1]

# ---- Matching ----
treated = df[df["T2"] == 1]
control = df[df["T2"] == 0]

# For each treated row, find the `neighbors` closest control rows on the
# propensity score and keep those within the caliper.
caliper = 0.01
nn = NearestNeighbors(n_neighbors=neighbors, metric="euclidean")
nn.fit(control[["propensity_score"]])
distances, indices = nn.kneighbors(treated[["propensity_score"]])
# `indices` holds positions within `control`; translate them to index labels.
matches = [
    (treated.index[i], control.index[control_pos])
    for i, dists in enumerate(distances)
    for control_pos in indices[i][dists <= caliper]
]
matched_pairs = pd.DataFrame(matches, columns=["treated_index", "control_index"])

# ---- Weights ----
# Every treated row gets weight 1, including treated rows with no control
# inside the caliper. Each matched control receives 1/k from every treated
# row it is matched to, where k is that treated row's number of kept controls.
df["_weight"] = 0.0
df.loc[treated.index, "_weight"] = 1.0
# sort=False keeps the groups in order of first appearance.
for _, controls in matched_pairs.groupby("treated_index", sort=False)["control_index"]:
    df.loc[controls, "_weight"] += 1 / len(controls)
# Rows left with zero weight are kept with a tiny positive weight instead of
# being dropped; weights are not rescaled.
df["weight"] = df["_weight"].where(df["_weight"] > 0, 1e-08)

# Save weights to file
# NOTE(review): the file name depends only on datetime_string, so every cell
# that uses this pattern writes to the same file.
weights_path = f"Weights_{datetime_string}.csv"
df[["ID", "weight"]].to_csv(weights_path, index=False)

# ---- Hourly panel ----
file_path = "BDTemuco.dta"  # Update with the correct path
df = pd.read_stata(file_path)
relevant_vars = ["ID","IDHour","T2ON","T1T2ON", "log_PMo", "log_To", "DayExp", "Cycle","ON", "log_PMi"]
df = df[relevant_vars]
df = df.dropna(subset=["log_PMi","ON", "T2ON","T1T2ON", "log_PMo", "log_To", "DayExp", "Cycle"])
# Attach one weight per ID; validate guards against a duplicated ID in the
# weights file silently multiplying panel rows.
weights_df = pd.read_csv(weights_path)
df = pd.merge(df, weights_df, on="ID", how="left", validate="many_to_one")
# Save merged dataset
panel_data_path = f"PanelBDTemuco_{datetime_string}.csv"
df.to_csv(panel_data_path, index=False)
# Index by (entity, time) = (ID, IDHour) for the panel estimator
df = df.sort_values(by=["ID", "IDHour"])
df = df.set_index(["ID", "IDHour"])

# ---- Weighted fixed-effects regression ----
# dependent_var and independent_vars are defined in the "without matching" cell.
model = PanelOLS.from_formula(
    f"{dependent_var} ~ { ' + '.join(independent_vars)} + EntityEffects",
    data=df,
    weights=df["weight"],
    drop_absorbed=True
)
results = model.fit(cov_type="clustered", cluster_entity=True)

# Extract T2ON results for the current model
t2on_result = extract_t2on_results(results, model_name="model 3, Nearest Neighbors Matching, n = 3")
t2on_results_df = pd.concat([t2on_results_df, pd.DataFrame([t2on_result])], ignore_index=True)

# Extract T1T2ON results for the current model
t1t2on_result = extract_t1t2on_results(results, model_name="model 3, Nearest Neighbors Matching, n = 3")
t1t2on_results_df = pd.concat([t1t2on_results_df, pd.DataFrame([t1t2on_result])], ignore_index=True)
print(results.summary)

Optimization terminated successfully.
         Current function value: 0.654594
         Iterations 5
                          PanelOLS Estimation Summary                           
Dep. Variable:                log_PMi   R-squared:                        0.4856
Estimator:                   PanelOLS   R-squared (Between):              0.7549
No. Observations:               17687   R-squared (Within):               0.4856
Date:                Sun, Dec 08 2024   R-squared (Overall):              0.7522
Time:                        02:09:02   Log-likelihood                   -2584.1
Cov. Estimator:             Clustered                                           
                                        F-statistic:                      2333.3
Entities:                         379   P-value                           0.0000
Avg Obs:                       46.668   Distribution:                 F(7,17301)
Min Obs:                       7.0000                                           
Max Obs

In [8]:
# Model 4: nearest-neighbour matching on an estimated propensity score for T2,
# followed by a weighted entity-fixed-effects regression on the hourly panel.
neighbors = 4  # number of control neighbours requested per treated row

# ---- Covariate data for the propensity model ----
file_path = "BDTemuco.dta"  # Update with the correct path
variables_to_keep = ["ID","T2","Education", "GenderHHead1women", "Age", "DwellingType1single", "NSize", "Anypersonolder60", "Perc_concrete_built"]
df = pd.read_stata(file_path)
# Keep rows with tper < 2 and the columns above; copy so the column
# assignments below act on an independent frame.
df = df.loc[df["tper"] < 2, variables_to_keep].copy()

# Impute missing values with the median of each column
columns_to_impute = ["Education", "GenderHHead1women", "Age", "DwellingType1single"]
for col in columns_to_impute:
    median_value = df[col].median()
    df[col] = df[col].fillna(median_value)

# ---- Propensity scores ----
covariates = ["Education", "GenderHHead1women", "Age", "DwellingType1single", "NSize", "Anypersonolder60", "Perc_concrete_built"]
X = sm.add_constant(df[covariates])
y = df["T2"]
# statsmodels logit fit; its in-sample predictions are stored in "logoddsT2".
# NOTE(review): predict() appears to return probabilities rather than
# log-odds, and this column is not used again in this cell - confirm intent.
logit_model = sm.Logit(y, X).fit()
df["logoddsT2"] = logit_model.predict(X)
# The score used for matching comes from scikit-learn's logistic regression
# fitted on standardised regressors.
# NOTE(review): LogisticRegression runs with default settings, which
# presumably include regularisation - confirm this is wanted here.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
logit_model = LogisticRegression(max_iter=500)
df["propensity_score"] = logit_model.fit(X_scaled, y).predict_proba(X_scaled)[:, 1]

# ---- Matching ----
treated = df[df["T2"] == 1]
control = df[df["T2"] == 0]

# For each treated row, find the `neighbors` closest control rows on the
# propensity score and keep those within the caliper.
caliper = 0.01
nn = NearestNeighbors(n_neighbors=neighbors, metric="euclidean")
nn.fit(control[["propensity_score"]])
distances, indices = nn.kneighbors(treated[["propensity_score"]])
# `indices` holds positions within `control`; translate them to index labels.
matches = [
    (treated.index[i], control.index[control_pos])
    for i, dists in enumerate(distances)
    for control_pos in indices[i][dists <= caliper]
]
matched_pairs = pd.DataFrame(matches, columns=["treated_index", "control_index"])

# ---- Weights ----
# Every treated row gets weight 1, including treated rows with no control
# inside the caliper. Each matched control receives 1/k from every treated
# row it is matched to, where k is that treated row's number of kept controls.
df["_weight"] = 0.0
df.loc[treated.index, "_weight"] = 1.0
# sort=False keeps the groups in order of first appearance.
for _, controls in matched_pairs.groupby("treated_index", sort=False)["control_index"]:
    df.loc[controls, "_weight"] += 1 / len(controls)
# Rows left with zero weight are kept with a tiny positive weight instead of
# being dropped; weights are not rescaled.
df["weight"] = df["_weight"].where(df["_weight"] > 0, 1e-08)

# Save weights to file
# NOTE(review): the file name depends only on datetime_string, so every cell
# that uses this pattern writes to the same file.
weights_path = f"Weights_{datetime_string}.csv"
df[["ID", "weight"]].to_csv(weights_path, index=False)

# ---- Hourly panel ----
file_path = "BDTemuco.dta"  # Update with the correct path
df = pd.read_stata(file_path)
relevant_vars = ["ID","IDHour","T2ON","T1T2ON", "log_PMo", "log_To", "DayExp", "Cycle","ON", "log_PMi"]
df = df[relevant_vars]
df = df.dropna(subset=["log_PMi","ON", "T2ON","T1T2ON", "log_PMo", "log_To", "DayExp", "Cycle"])
# Attach one weight per ID; validate guards against a duplicated ID in the
# weights file silently multiplying panel rows.
weights_df = pd.read_csv(weights_path)
df = pd.merge(df, weights_df, on="ID", how="left", validate="many_to_one")
# Save merged dataset
panel_data_path = f"PanelBDTemuco_{datetime_string}.csv"
df.to_csv(panel_data_path, index=False)
# Index by (entity, time) = (ID, IDHour) for the panel estimator
df = df.sort_values(by=["ID", "IDHour"])
df = df.set_index(["ID", "IDHour"])

# ---- Weighted fixed-effects regression ----
# dependent_var and independent_vars are defined in the "without matching" cell.
model = PanelOLS.from_formula(
    f"{dependent_var} ~ { ' + '.join(independent_vars)} + EntityEffects",
    data=df,
    weights=df["weight"],
    drop_absorbed=True
)
results = model.fit(cov_type="clustered", cluster_entity=True)

# Extract T2ON results for the current model
t2on_result = extract_t2on_results(results, model_name="model 4, Nearest Neighbors Matching, n = 4")
t2on_results_df = pd.concat([t2on_results_df, pd.DataFrame([t2on_result])], ignore_index=True)

# Extract T1T2ON results for the current model
t1t2on_result = extract_t1t2on_results(results, model_name="model 4, Nearest Neighbors Matching, n = 4")
t1t2on_results_df = pd.concat([t1t2on_results_df, pd.DataFrame([t1t2on_result])], ignore_index=True)
print(results.summary)

Optimization terminated successfully.
         Current function value: 0.654594
         Iterations 5
                          PanelOLS Estimation Summary                           
Dep. Variable:                log_PMi   R-squared:                        0.4867
Estimator:                   PanelOLS   R-squared (Between):              0.7535
No. Observations:               17687   R-squared (Within):               0.4867
Date:                Sun, Dec 08 2024   R-squared (Overall):              0.7508
Time:                        02:09:02   Log-likelihood                   -2668.2
Cov. Estimator:             Clustered                                           
                                        F-statistic:                      2343.9
Entities:                         379   P-value                           0.0000
Avg Obs:                       46.668   Distribution:                 F(7,17301)
Min Obs:                       7.0000                                           
Max Obs

In [9]:
# Model 5: nearest-neighbour matching on an estimated propensity score for T2,
# followed by a weighted entity-fixed-effects regression on the hourly panel.
neighbors = 5  # number of control neighbours requested per treated row

# ---- Covariate data for the propensity model ----
file_path = "BDTemuco.dta"  # Update with the correct path
variables_to_keep = ["ID","T2","Education", "GenderHHead1women", "Age", "DwellingType1single", "NSize", "Anypersonolder60", "Perc_concrete_built"]
df = pd.read_stata(file_path)
# Keep rows with tper < 2 and the columns above; copy so the column
# assignments below act on an independent frame.
df = df.loc[df["tper"] < 2, variables_to_keep].copy()

# Impute missing values with the median of each column
columns_to_impute = ["Education", "GenderHHead1women", "Age", "DwellingType1single"]
for col in columns_to_impute:
    median_value = df[col].median()
    df[col] = df[col].fillna(median_value)

# ---- Propensity scores ----
covariates = ["Education", "GenderHHead1women", "Age", "DwellingType1single", "NSize", "Anypersonolder60", "Perc_concrete_built"]
X = sm.add_constant(df[covariates])
y = df["T2"]
# statsmodels logit fit; its in-sample predictions are stored in "logoddsT2".
# NOTE(review): predict() appears to return probabilities rather than
# log-odds, and this column is not used again in this cell - confirm intent.
logit_model = sm.Logit(y, X).fit()
df["logoddsT2"] = logit_model.predict(X)
# The score used for matching comes from scikit-learn's logistic regression
# fitted on standardised regressors.
# NOTE(review): LogisticRegression runs with default settings, which
# presumably include regularisation - confirm this is wanted here.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
logit_model = LogisticRegression(max_iter=500)
df["propensity_score"] = logit_model.fit(X_scaled, y).predict_proba(X_scaled)[:, 1]

# ---- Matching ----
treated = df[df["T2"] == 1]
control = df[df["T2"] == 0]

# For each treated row, find the `neighbors` closest control rows on the
# propensity score and keep those within the caliper.
caliper = 0.01
nn = NearestNeighbors(n_neighbors=neighbors, metric="euclidean")
nn.fit(control[["propensity_score"]])
distances, indices = nn.kneighbors(treated[["propensity_score"]])
# `indices` holds positions within `control`; translate them to index labels.
matches = [
    (treated.index[i], control.index[control_pos])
    for i, dists in enumerate(distances)
    for control_pos in indices[i][dists <= caliper]
]
matched_pairs = pd.DataFrame(matches, columns=["treated_index", "control_index"])

# ---- Weights ----
# Every treated row gets weight 1, including treated rows with no control
# inside the caliper. Each matched control receives 1/k from every treated
# row it is matched to, where k is that treated row's number of kept controls.
df["_weight"] = 0.0
df.loc[treated.index, "_weight"] = 1.0
# sort=False keeps the groups in order of first appearance.
for _, controls in matched_pairs.groupby("treated_index", sort=False)["control_index"]:
    df.loc[controls, "_weight"] += 1 / len(controls)
# Rows left with zero weight are kept with a tiny positive weight instead of
# being dropped; weights are not rescaled.
df["weight"] = df["_weight"].where(df["_weight"] > 0, 1e-08)

# Save weights to file
# NOTE(review): the file name depends only on datetime_string, so every cell
# that uses this pattern writes to the same file.
weights_path = f"Weights_{datetime_string}.csv"
df[["ID", "weight"]].to_csv(weights_path, index=False)

# ---- Hourly panel ----
file_path = "BDTemuco.dta"  # Update with the correct path
df = pd.read_stata(file_path)
relevant_vars = ["ID","IDHour","T2ON","T1T2ON", "log_PMo", "log_To", "DayExp", "Cycle","ON", "log_PMi"]
df = df[relevant_vars]
df = df.dropna(subset=["log_PMi","ON", "T2ON","T1T2ON", "log_PMo", "log_To", "DayExp", "Cycle"])
# Attach one weight per ID; validate guards against a duplicated ID in the
# weights file silently multiplying panel rows.
weights_df = pd.read_csv(weights_path)
df = pd.merge(df, weights_df, on="ID", how="left", validate="many_to_one")
# Save merged dataset
panel_data_path = f"PanelBDTemuco_{datetime_string}.csv"
df.to_csv(panel_data_path, index=False)
# Index by (entity, time) = (ID, IDHour) for the panel estimator
df = df.sort_values(by=["ID", "IDHour"])
df = df.set_index(["ID", "IDHour"])

# ---- Weighted fixed-effects regression ----
# dependent_var and independent_vars are defined in the "without matching" cell.
model = PanelOLS.from_formula(
    f"{dependent_var} ~ { ' + '.join(independent_vars)} + EntityEffects",
    data=df,
    weights=df["weight"],
    drop_absorbed=True
)
results = model.fit(cov_type="clustered", cluster_entity=True)

# Extract T2ON results for the current model
t2on_result = extract_t2on_results(results, model_name="model 5, Nearest Neighbors Matching, n = 5")
t2on_results_df = pd.concat([t2on_results_df, pd.DataFrame([t2on_result])], ignore_index=True)

# Extract T1T2ON results for the current model
t1t2on_result = extract_t1t2on_results(results, model_name="model 5, Nearest Neighbors Matching, n = 5")
t1t2on_results_df = pd.concat([t1t2on_results_df, pd.DataFrame([t1t2on_result])], ignore_index=True)
print(results.summary)

Optimization terminated successfully.
         Current function value: 0.654594
         Iterations 5
                          PanelOLS Estimation Summary                           
Dep. Variable:                log_PMi   R-squared:                        0.4869
Estimator:                   PanelOLS   R-squared (Between):              0.7564
No. Observations:               17687   R-squared (Within):               0.4869
Date:                Sun, Dec 08 2024   R-squared (Overall):              0.7536
Time:                        02:09:03   Log-likelihood                   -2671.6
Cov. Estimator:             Clustered                                           
                                        F-statistic:                      2345.1
Entities:                         379   P-value                           0.0000
Avg Obs:                       46.668   Distribution:                 F(7,17301)
Min Obs:                       7.0000                                           
Max Obs

In [10]:
# Model 6: nearest-neighbour matching on an estimated propensity score for T2,
# followed by a weighted entity-fixed-effects regression on the hourly panel.
neighbors = 6  # number of control neighbours requested per treated row

# ---- Covariate data for the propensity model ----
file_path = "BDTemuco.dta"  # Update with the correct path
variables_to_keep = ["ID","T2","Education", "GenderHHead1women", "Age", "DwellingType1single", "NSize", "Anypersonolder60", "Perc_concrete_built"]
df = pd.read_stata(file_path)
# Keep rows with tper < 2 and the columns above; copy so the column
# assignments below act on an independent frame.
df = df.loc[df["tper"] < 2, variables_to_keep].copy()

# Impute missing values with the median of each column
columns_to_impute = ["Education", "GenderHHead1women", "Age", "DwellingType1single"]
for col in columns_to_impute:
    median_value = df[col].median()
    df[col] = df[col].fillna(median_value)

# ---- Propensity scores ----
covariates = ["Education", "GenderHHead1women", "Age", "DwellingType1single", "NSize", "Anypersonolder60", "Perc_concrete_built"]
X = sm.add_constant(df[covariates])
y = df["T2"]
# statsmodels logit fit; its in-sample predictions are stored in "logoddsT2".
# NOTE(review): predict() appears to return probabilities rather than
# log-odds, and this column is not used again in this cell - confirm intent.
logit_model = sm.Logit(y, X).fit()
df["logoddsT2"] = logit_model.predict(X)
# The score used for matching comes from scikit-learn's logistic regression
# fitted on standardised regressors.
# NOTE(review): LogisticRegression runs with default settings, which
# presumably include regularisation - confirm this is wanted here.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
logit_model = LogisticRegression(max_iter=500)
df["propensity_score"] = logit_model.fit(X_scaled, y).predict_proba(X_scaled)[:, 1]

# ---- Matching ----
treated = df[df["T2"] == 1]
control = df[df["T2"] == 0]

# For each treated row, find the `neighbors` closest control rows on the
# propensity score and keep those within the caliper.
caliper = 0.01
nn = NearestNeighbors(n_neighbors=neighbors, metric="euclidean")
nn.fit(control[["propensity_score"]])
distances, indices = nn.kneighbors(treated[["propensity_score"]])
# `indices` holds positions within `control`; translate them to index labels.
matches = [
    (treated.index[i], control.index[control_pos])
    for i, dists in enumerate(distances)
    for control_pos in indices[i][dists <= caliper]
]
matched_pairs = pd.DataFrame(matches, columns=["treated_index", "control_index"])

# ---- Weights ----
# Every treated row gets weight 1, including treated rows with no control
# inside the caliper. Each matched control receives 1/k from every treated
# row it is matched to, where k is that treated row's number of kept controls.
df["_weight"] = 0.0
df.loc[treated.index, "_weight"] = 1.0
# sort=False keeps the groups in order of first appearance.
for _, controls in matched_pairs.groupby("treated_index", sort=False)["control_index"]:
    df.loc[controls, "_weight"] += 1 / len(controls)
# Rows left with zero weight are kept with a tiny positive weight instead of
# being dropped; weights are not rescaled.
df["weight"] = df["_weight"].where(df["_weight"] > 0, 1e-08)

# Save weights to file
# NOTE(review): the file name depends only on datetime_string, so every cell
# that uses this pattern writes to the same file.
weights_path = f"Weights_{datetime_string}.csv"
df[["ID", "weight"]].to_csv(weights_path, index=False)

# ---- Hourly panel ----
file_path = "BDTemuco.dta"  # Update with the correct path
df = pd.read_stata(file_path)
relevant_vars = ["ID","IDHour","T2ON","T1T2ON", "log_PMo", "log_To", "DayExp", "Cycle","ON", "log_PMi"]
df = df[relevant_vars]
df = df.dropna(subset=["log_PMi","ON", "T2ON","T1T2ON", "log_PMo", "log_To", "DayExp", "Cycle"])
# Attach one weight per ID; validate guards against a duplicated ID in the
# weights file silently multiplying panel rows.
weights_df = pd.read_csv(weights_path)
df = pd.merge(df, weights_df, on="ID", how="left", validate="many_to_one")
# Save merged dataset
panel_data_path = f"PanelBDTemuco_{datetime_string}.csv"
df.to_csv(panel_data_path, index=False)
# Index by (entity, time) = (ID, IDHour) for the panel estimator
df = df.sort_values(by=["ID", "IDHour"])
df = df.set_index(["ID", "IDHour"])

# ---- Weighted fixed-effects regression ----
# dependent_var and independent_vars are defined in the "without matching" cell.
model = PanelOLS.from_formula(
    f"{dependent_var} ~ { ' + '.join(independent_vars)} + EntityEffects",
    data=df,
    weights=df["weight"],
    drop_absorbed=True
)
results = model.fit(cov_type="clustered", cluster_entity=True)

# Extract T2ON results for the current model
t2on_result = extract_t2on_results(results, model_name="model 6, Nearest Neighbors Matching, n = 6")
t2on_results_df = pd.concat([t2on_results_df, pd.DataFrame([t2on_result])], ignore_index=True)

# Extract T1T2ON results for the current model
t1t2on_result = extract_t1t2on_results(results, model_name="model 6, Nearest Neighbors Matching, n = 6")
t1t2on_results_df = pd.concat([t1t2on_results_df, pd.DataFrame([t1t2on_result])], ignore_index=True)
print(results.summary)

Optimization terminated successfully.
         Current function value: 0.654594
         Iterations 5
                          PanelOLS Estimation Summary                           
Dep. Variable:                log_PMi   R-squared:                        0.4874
Estimator:                   PanelOLS   R-squared (Between):              0.7536
No. Observations:               17687   R-squared (Within):               0.4874
Date:                Sun, Dec 08 2024   R-squared (Overall):              0.7508
Time:                        02:09:03   Log-likelihood                   -2683.2
Cov. Estimator:             Clustered                                           
                                        F-statistic:                      2349.8
Entities:                         379   P-value                           0.0000
Avg Obs:                       46.668   Distribution:                 F(7,17301)
Min Obs:                       7.0000                                           
Max Obs

In [11]:
# Show the T2ON ("INSULATION*ON") estimates accumulated so far, one row per model
t2on_results_df

Unnamed: 0,Model,INSULATION*ON Coefficient,Std. Error,T-stat,P-value,CI Lower,CI Upper
0,model 1 - Without Matching,0.109599,0.033108,3.310386,0.000934,0.044708,0.17449
1,"model 2, Nearest Neighbors Matching, n = 2",0.092023,0.035372,2.601591,0.009287,0.022694,0.161352
2,"model 3, Nearest Neighbors Matching, n = 3",0.093646,0.035034,2.672977,0.007525,0.024979,0.162313
3,"model 4, Nearest Neighbors Matching, n = 4",0.095632,0.034476,2.773893,0.005545,0.02806,0.163205
4,"model 5, Nearest Neighbors Matching, n = 5",0.097205,0.034687,2.802342,0.005079,0.029218,0.165191
5,"model 6, Nearest Neighbors Matching, n = 6",0.096841,0.034693,2.791368,0.005254,0.028843,0.164839


In [12]:
# Show the T1T2ON ("PELLET*INSULATION*ON") estimates accumulated so far, one row per model
t1t2on_results_df

Unnamed: 0,Model,PELLET*INSULATION*ON Coefficient,Std. Error,T-stat,P-value,CI Lower,CI Upper
0,model 1 - Without Matching,-0.167646,0.035638,-4.704128,3e-06,-0.237496,-0.097795
1,"model 2, Nearest Neighbors Matching, n = 2",-0.167365,0.035715,-4.686112,3e-06,-0.237367,-0.097363
2,"model 3, Nearest Neighbors Matching, n = 3",-0.167322,0.035698,-4.687084,3e-06,-0.237291,-0.097353
3,"model 4, Nearest Neighbors Matching, n = 4",-0.167273,0.035689,-4.687018,3e-06,-0.237223,-0.097323
4,"model 5, Nearest Neighbors Matching, n = 5",-0.167417,0.035683,-4.691743,3e-06,-0.237356,-0.097478
5,"model 6, Nearest Neighbors Matching, n = 6",-0.167243,0.035693,-4.68561,3e-06,-0.237201,-0.097285


In [13]:
# Kernel propensity-score matching (Gaussian kernel, bandwidth = 0.1) followed by a
# weighted entity-fixed-effects regression of log_PMi.
# Requires `extract_t2on_results`, `extract_t1t2on_results`, `t2on_results_df`,
# `t1t2on_results_df` and `datetime_string` from earlier cells.

# Load the dataset once; both stages below start from this raw frame
file_path = "BDTemuco.dta"  # Update with the correct path
panel_raw = pd.read_stata(file_path)

# --- Stage 1: propensity score of T2 on the tper < 2 subsample ---
variables_to_keep = ["ID","T2","Education", "GenderHHead1women", "Age", "DwellingType1single", "NSize", "Anypersonolder60", "Perc_concrete_built"]
df = panel_raw.loc[panel_raw["tper"] < 2, variables_to_keep].copy()

# Impute missing values with the column median
columns_to_impute = ["Education", "GenderHHead1women", "Age", "DwellingType1single"]
for col in columns_to_impute:
    df[col] = df[col].fillna(df[col].median())

# Logistic regression on standardized covariates to estimate propensity scores
covariates = ["Education", "GenderHHead1women", "Age", "DwellingType1single", "NSize", "Anypersonolder60", "Perc_concrete_built"]
X = df[covariates]
y = df["T2"]
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
logit_model = LogisticRegression(max_iter=500)
df["propensity_score"] = logit_model.fit(X_scaled, y).predict_proba(X_scaled)[:, 1]

# Separate treated and control groups
treated = df[df["T2"] == 1]
control = df[df["T2"] == 0]

# Gaussian kernel on the propensity-score distance; rows = treated, columns = control
bandwidth = 0.1  # Adjust bandwidth as needed
distances = pairwise_distances(treated[["propensity_score"]], control[["propensity_score"]], metric="euclidean")
kernel_weights = np.exp(-distances**2 / (2 * bandwidth**2))

# Treated units get full weight; each control unit gets the sum of its kernel
# weights over all treated units, i.e. the column sums of `kernel_weights`
# (this replaces an element-by-element double loop over df.loc).
df["_weight"] = 0.0
df.loc[treated.index, "_weight"] = 1.0
df.loc[control.index, "_weight"] = kernel_weights.sum(axis=0)

# Replace non-positive weights with a small positive value
df["weight"] = df["_weight"].where(df["_weight"] > 0, 1e-08)

# Save weights to a file.
# NOTE(review): `datetime_string` is fixed when the import cell runs, so every
# specification writes to the same Weights_/PanelBDTemuco_ files and overwrites them.
weights_path = f"Weights_{datetime_string}.csv"
df[["ID", "weight"]].to_csv(weights_path, index=False)

# --- Stage 2: weighted fixed-effects panel regression on the full panel ---
dependent_var = "log_PMi"
independent_vars = ["ON", "T2ON","T1T2ON", "log_PMo", "log_To", "DayExp", "Cycle"]
relevant_vars = ["ID","IDHour","T2ON","T1T2ON", "log_PMo", "log_To", "DayExp", "Cycle","ON", "log_PMi"]
# Keep only the model columns and drop rows missing any model variable
df = panel_raw[relevant_vars].dropna(subset=[dependent_var] + independent_vars)

# Merge weights into the panel.
# NOTE(review): assumes the weights file holds one row per ID; duplicated IDs
# would duplicate panel rows in this merge - confirm.
weights_df = pd.read_csv(weights_path)
df = pd.merge(df, weights_df, on="ID", how="left")

# Save merged dataset
panel_data_path = f"PanelBDTemuco_{datetime_string}.csv"
df.to_csv(panel_data_path, index=False)

# Prepare for panel regression: (entity, time) MultiIndex
df = df.sort_values(by=["ID", "IDHour"])
df = df.set_index(["ID", "IDHour"])

# Fixed Effects Model with matching weights, standard errors clustered by entity
model = PanelOLS.from_formula(
    f"{dependent_var} ~ { ' + '.join(independent_vars)} + EntityEffects",
    data=df,
    weights=df["weight"],
    drop_absorbed=True
)
results = model.fit(cov_type="clustered", cluster_entity=True)

# Label derived from `bandwidth` so it cannot drift from the value actually used
model_label = f"model Kernel Matching, bandwidth = {bandwidth}"

# Extract T2ON results for the current model
t2on_result = extract_t2on_results(results, model_name=model_label)
t2on_results_df = pd.concat([t2on_results_df, pd.DataFrame([t2on_result])], ignore_index=True)

# Extract T1T2ON results for the current model
t1t2on_result = extract_t1t2on_results(results, model_name=model_label)
t1t2on_results_df = pd.concat([t1t2on_results_df, pd.DataFrame([t1t2on_result])], ignore_index=True)



In [14]:
# Kernel propensity-score matching (Gaussian kernel, bandwidth = 0.2) followed by a
# weighted entity-fixed-effects regression of log_PMi.
# Requires `extract_t2on_results`, `extract_t1t2on_results`, `t2on_results_df`,
# `t1t2on_results_df` and `datetime_string` from earlier cells.

# Load the dataset once; both stages below start from this raw frame
file_path = "BDTemuco.dta"  # Update with the correct path
panel_raw = pd.read_stata(file_path)

# --- Stage 1: propensity score of T2 on the tper < 2 subsample ---
variables_to_keep = ["ID","T2","Education", "GenderHHead1women", "Age", "DwellingType1single", "NSize", "Anypersonolder60", "Perc_concrete_built"]
df = panel_raw.loc[panel_raw["tper"] < 2, variables_to_keep].copy()

# Impute missing values with the column median
columns_to_impute = ["Education", "GenderHHead1women", "Age", "DwellingType1single"]
for col in columns_to_impute:
    df[col] = df[col].fillna(df[col].median())

# Logistic regression on standardized covariates to estimate propensity scores
covariates = ["Education", "GenderHHead1women", "Age", "DwellingType1single", "NSize", "Anypersonolder60", "Perc_concrete_built"]
X = df[covariates]
y = df["T2"]
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
logit_model = LogisticRegression(max_iter=500)
df["propensity_score"] = logit_model.fit(X_scaled, y).predict_proba(X_scaled)[:, 1]

# Separate treated and control groups
treated = df[df["T2"] == 1]
control = df[df["T2"] == 0]

# Gaussian kernel on the propensity-score distance; rows = treated, columns = control
bandwidth = 0.2  # Adjust bandwidth as needed
distances = pairwise_distances(treated[["propensity_score"]], control[["propensity_score"]], metric="euclidean")
kernel_weights = np.exp(-distances**2 / (2 * bandwidth**2))

# Treated units get full weight; each control unit gets the sum of its kernel
# weights over all treated units, i.e. the column sums of `kernel_weights`
# (this replaces an element-by-element double loop over df.loc).
df["_weight"] = 0.0
df.loc[treated.index, "_weight"] = 1.0
df.loc[control.index, "_weight"] = kernel_weights.sum(axis=0)

# Replace non-positive weights with a small positive value
df["weight"] = df["_weight"].where(df["_weight"] > 0, 1e-08)

# Save weights to a file.
# NOTE(review): `datetime_string` is fixed when the import cell runs, so every
# specification writes to the same Weights_/PanelBDTemuco_ files and overwrites them.
weights_path = f"Weights_{datetime_string}.csv"
df[["ID", "weight"]].to_csv(weights_path, index=False)

# --- Stage 2: weighted fixed-effects panel regression on the full panel ---
dependent_var = "log_PMi"
independent_vars = ["ON", "T2ON","T1T2ON", "log_PMo", "log_To", "DayExp", "Cycle"]
relevant_vars = ["ID","IDHour","T2ON","T1T2ON", "log_PMo", "log_To", "DayExp", "Cycle","ON", "log_PMi"]
# Keep only the model columns and drop rows missing any model variable
df = panel_raw[relevant_vars].dropna(subset=[dependent_var] + independent_vars)

# Merge weights into the panel.
# NOTE(review): assumes the weights file holds one row per ID; duplicated IDs
# would duplicate panel rows in this merge - confirm.
weights_df = pd.read_csv(weights_path)
df = pd.merge(df, weights_df, on="ID", how="left")

# Save merged dataset
panel_data_path = f"PanelBDTemuco_{datetime_string}.csv"
df.to_csv(panel_data_path, index=False)

# Prepare for panel regression: (entity, time) MultiIndex
df = df.sort_values(by=["ID", "IDHour"])
df = df.set_index(["ID", "IDHour"])

# Fixed Effects Model with matching weights, standard errors clustered by entity
model = PanelOLS.from_formula(
    f"{dependent_var} ~ { ' + '.join(independent_vars)} + EntityEffects",
    data=df,
    weights=df["weight"],
    drop_absorbed=True
)
results = model.fit(cov_type="clustered", cluster_entity=True)

# Label derived from `bandwidth` so it cannot drift from the value actually used
model_label = f"model Kernel Matching, bandwidth = {bandwidth}"

# Extract T2ON results for the current model
t2on_result = extract_t2on_results(results, model_name=model_label)
t2on_results_df = pd.concat([t2on_results_df, pd.DataFrame([t2on_result])], ignore_index=True)

# Extract T1T2ON results for the current model
t1t2on_result = extract_t1t2on_results(results, model_name=model_label)
t1t2on_results_df = pd.concat([t1t2on_results_df, pd.DataFrame([t1t2on_result])], ignore_index=True)


In [15]:
# Kernel propensity-score matching (Gaussian kernel, bandwidth = 0.3) followed by a
# weighted entity-fixed-effects regression of log_PMi.
# Requires `extract_t2on_results`, `extract_t1t2on_results`, `t2on_results_df`,
# `t1t2on_results_df` and `datetime_string` from earlier cells.

# Load the dataset once; both stages below start from this raw frame
file_path = "BDTemuco.dta"  # Update with the correct path
panel_raw = pd.read_stata(file_path)

# --- Stage 1: propensity score of T2 on the tper < 2 subsample ---
variables_to_keep = ["ID","T2","Education", "GenderHHead1women", "Age", "DwellingType1single", "NSize", "Anypersonolder60", "Perc_concrete_built"]
df = panel_raw.loc[panel_raw["tper"] < 2, variables_to_keep].copy()

# Impute missing values with the column median
columns_to_impute = ["Education", "GenderHHead1women", "Age", "DwellingType1single"]
for col in columns_to_impute:
    df[col] = df[col].fillna(df[col].median())

# Logistic regression on standardized covariates to estimate propensity scores
covariates = ["Education", "GenderHHead1women", "Age", "DwellingType1single", "NSize", "Anypersonolder60", "Perc_concrete_built"]
X = df[covariates]
y = df["T2"]
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
logit_model = LogisticRegression(max_iter=500)
df["propensity_score"] = logit_model.fit(X_scaled, y).predict_proba(X_scaled)[:, 1]

# Separate treated and control groups
treated = df[df["T2"] == 1]
control = df[df["T2"] == 0]

# Gaussian kernel on the propensity-score distance; rows = treated, columns = control
bandwidth = 0.3  # Adjust bandwidth as needed
distances = pairwise_distances(treated[["propensity_score"]], control[["propensity_score"]], metric="euclidean")
kernel_weights = np.exp(-distances**2 / (2 * bandwidth**2))

# Treated units get full weight; each control unit gets the sum of its kernel
# weights over all treated units, i.e. the column sums of `kernel_weights`
# (this replaces an element-by-element double loop over df.loc).
df["_weight"] = 0.0
df.loc[treated.index, "_weight"] = 1.0
df.loc[control.index, "_weight"] = kernel_weights.sum(axis=0)

# Replace non-positive weights with a small positive value
df["weight"] = df["_weight"].where(df["_weight"] > 0, 1e-08)

# Save weights to a file.
# NOTE(review): `datetime_string` is fixed when the import cell runs, so every
# specification writes to the same Weights_/PanelBDTemuco_ files and overwrites them.
weights_path = f"Weights_{datetime_string}.csv"
df[["ID", "weight"]].to_csv(weights_path, index=False)

# --- Stage 2: weighted fixed-effects panel regression on the full panel ---
dependent_var = "log_PMi"
independent_vars = ["ON", "T2ON","T1T2ON", "log_PMo", "log_To", "DayExp", "Cycle"]
relevant_vars = ["ID","IDHour","T2ON","T1T2ON", "log_PMo", "log_To", "DayExp", "Cycle","ON", "log_PMi"]
# Keep only the model columns and drop rows missing any model variable
df = panel_raw[relevant_vars].dropna(subset=[dependent_var] + independent_vars)

# Merge weights into the panel.
# NOTE(review): assumes the weights file holds one row per ID; duplicated IDs
# would duplicate panel rows in this merge - confirm.
weights_df = pd.read_csv(weights_path)
df = pd.merge(df, weights_df, on="ID", how="left")

# Save merged dataset
panel_data_path = f"PanelBDTemuco_{datetime_string}.csv"
df.to_csv(panel_data_path, index=False)

# Prepare for panel regression: (entity, time) MultiIndex
df = df.sort_values(by=["ID", "IDHour"])
df = df.set_index(["ID", "IDHour"])

# Fixed Effects Model with matching weights, standard errors clustered by entity
model = PanelOLS.from_formula(
    f"{dependent_var} ~ { ' + '.join(independent_vars)} + EntityEffects",
    data=df,
    weights=df["weight"],
    drop_absorbed=True
)
results = model.fit(cov_type="clustered", cluster_entity=True)

# Label derived from `bandwidth` so it cannot drift from the value actually used
model_label = f"model Kernel Matching, bandwidth = {bandwidth}"

# Extract T2ON results for the current model
t2on_result = extract_t2on_results(results, model_name=model_label)
t2on_results_df = pd.concat([t2on_results_df, pd.DataFrame([t2on_result])], ignore_index=True)

# Extract T1T2ON results for the current model
t1t2on_result = extract_t1t2on_results(results, model_name=model_label)
t1t2on_results_df = pd.concat([t1t2on_results_df, pd.DataFrame([t1t2on_result])], ignore_index=True)


In [16]:
# Keep the unmatched baseline and the kernel-matching rows only. The
# nearest-neighbour rows are removed by label instead of by hard-coded row
# positions, so re-running this cell is a no-op rather than a KeyError (or a
# silent removal of the wrong rows once the index has been rebuilt).
t2on_results_df = t2on_results_df[
    ~t2on_results_df["Model"].str.contains("Nearest Neighbors Matching", regex=False, na=False)
]

In [17]:
# Keep the unmatched baseline and the kernel-matching rows only. The
# nearest-neighbour rows are removed by label instead of by hard-coded row
# positions, so re-running this cell is a no-op rather than a KeyError (or a
# silent removal of the wrong rows once the index has been rebuilt).
t1t2on_results_df = t1t2on_results_df[
    ~t1t2on_results_df["Model"].str.contains("Nearest Neighbors Matching", regex=False, na=False)
]

In [18]:
# Show the remaining T2ON ("INSULATION*ON") rows after the nearest-neighbour rows were dropped
t2on_results_df

Unnamed: 0,Model,INSULATION*ON Coefficient,Std. Error,T-stat,P-value,CI Lower,CI Upper
0,model 1 - Without Matching,0.109599,0.033108,3.310386,0.000934,0.044708,0.17449
6,"model Kernel Matching, bandwidth = 0.1",0.103907,0.033623,3.090365,0.002002,0.038006,0.169808
7,"model Kernel Matching, bandwidth = 0.2",0.105536,0.033467,3.15339,0.001617,0.03994,0.171132
8,"model Kernel Matching, bandwidth = 0.3",0.106563,0.033388,3.191654,0.001417,0.041122,0.172003


In [19]:
# Show the remaining T1T2ON ("PELLET*INSULATION*ON") rows after the nearest-neighbour rows were dropped
t1t2on_results_df

Unnamed: 0,Model,PELLET*INSULATION*ON Coefficient,Std. Error,T-stat,P-value,CI Lower,CI Upper
0,model 1 - Without Matching,-0.167646,0.035638,-4.704128,3e-06,-0.237496,-0.097795
6,"model Kernel Matching, bandwidth = 0.1",-0.165751,0.036233,-4.574534,5e-06,-0.236768,-0.094734
7,"model Kernel Matching, bandwidth = 0.2",-0.165818,0.036168,-4.584681,5e-06,-0.236707,-0.094929
8,"model Kernel Matching, bandwidth = 0.3",-0.165908,0.036125,-4.59257,4e-06,-0.236714,-0.095103


In [20]:
# Show the T2ON ("INSULATION*ON") results table again (no changes since the previous display)
t2on_results_df

Unnamed: 0,Model,INSULATION*ON Coefficient,Std. Error,T-stat,P-value,CI Lower,CI Upper
0,model 1 - Without Matching,0.109599,0.033108,3.310386,0.000934,0.044708,0.17449
6,"model Kernel Matching, bandwidth = 0.1",0.103907,0.033623,3.090365,0.002002,0.038006,0.169808
7,"model Kernel Matching, bandwidth = 0.2",0.105536,0.033467,3.15339,0.001617,0.03994,0.171132
8,"model Kernel Matching, bandwidth = 0.3",0.106563,0.033388,3.191654,0.001417,0.041122,0.172003


In [21]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler
from linearmodels.panel import PanelOLS
# Set datetime string
from datetime import datetime
from sklearn.metrics import pairwise_distances
datetime_string = datetime.now().strftime("%Y%m%d_%H%M%S")



In [22]:
# Nearest-neighbour propensity-score matching (4 neighbours, caliper 0.01).
# Leaves `df` (covariates, propensity score, weights) and `matched_pairs`
# in scope for the balance analysis and writes the weights to a CSV file.
neighbors = 4

# Load the dataset and keep the tper < 2 subsample with the matching variables
file_path = "BDTemuco.dta"  # Update with the correct path
df = pd.read_stata(file_path)
variables_to_keep = ["ID","T2","Education", "GenderHHead1women", "Age", "DwellingType1single", "NSize", "Anypersonolder60", "Perc_concrete_built"]
df = df.loc[df["tper"] < 2, variables_to_keep].copy()

# Impute missing values with the median of each column
columns_to_impute = ["Education", "GenderHHead1women", "Age", "DwellingType1single"]
for col in columns_to_impute:
    df[col] = df[col].fillna(df[col].median())

# Logistic regression to estimate propensity scores
covariates = ["Education", "GenderHHead1women", "Age", "DwellingType1single", "NSize", "Anypersonolder60", "Perc_concrete_built"]
X = sm.add_constant(df[covariates])
y = df["T2"]
logit_model = sm.Logit(y, X).fit()
# NOTE(review): despite the column name, the default `predict` of a fitted
# statsmodels Logit appears to return probabilities, not log-odds - confirm.
# The column is not used anywhere below in this cell.
df["logoddsT2"] = logit_model.predict(X)

# The propensity score actually used for matching comes from scikit-learn,
# fitted on the standardized design matrix
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
logit_model = LogisticRegression(max_iter=500)
df["propensity_score"] = logit_model.fit(X_scaled, y).predict_proba(X_scaled)[:, 1]

# Separate treated and control groups
treated = df[df["T2"] == 1]
control = df[df["T2"] == 0]

# For every treated unit find its nearest controls on the propensity score
# and keep only those within the caliper (0.01)
caliper = 0.01
nn = NearestNeighbors(n_neighbors=neighbors, metric="euclidean")
nn.fit(control[["propensity_score"]])
distances, indices = nn.kneighbors(treated[["propensity_score"]])
matches4 = []
for i, dists in enumerate(distances):
    # `indices` are positions within `control`; translate them to index labels
    for control_pos in indices[i][dists <= caliper]:
        matches4.append((treated.index[i], control.index[control_pos]))

# Generate matched DataFrame (one row per treated/control pair)
matched_pairs = pd.DataFrame(matches4, columns=["treated_index", "control_index"])

# Assign weights: every treated unit gets 1.0 (whether or not it found a match
# within the caliper); each matched treated unit spreads a total weight of 1
# evenly over its matched controls
df["_weight"] = 0.0
df.loc[treated.index, "_weight"] = 1.0
for _, controls in matched_pairs.groupby("treated_index", sort=False)["control_index"]:
    df.loc[controls, "_weight"] += 1 / len(controls)

# Replace non-positive weights with a small positive value
df["weight"] = df["_weight"].where(df["_weight"] > 0, 1e-08)
#df["weight"] = df["weight"] / df["weight"].mean()

# Save weights to file.
# NOTE(review): `datetime_string` is fixed when the import cell runs, so every
# specification writes to (and overwrites) the same file name.
weights_path = f"Weights_{datetime_string}.csv"
df[["ID", "weight"]].to_csv(weights_path, index=False)

Optimization terminated successfully.
         Current function value: 0.654594
         Iterations 5


In [23]:
# Function to calculate Standardized Mean Difference (SMD)
def calculate_smd(treated, control, variable):
    """
    Standardized mean difference of `variable` between two groups.

    Computed as (mean_treated - mean_control) divided by the pooled standard
    deviation sqrt((var_treated + var_control) / 2), using pandas' default
    sample variance.

    Parameters
    ----------
    treated, control : pandas.DataFrame
        Frames holding the treated and control observations.
    variable : str
        Name of the column to compare.

    Returns
    -------
    float
        The SMD. When the pooled standard deviation is exactly zero (the
        variable is constant within both groups) the result is 0.0 if the
        means coincide and a signed infinity otherwise, instead of a NaN/inf
        produced by a division by zero with a RuntimeWarning.
    """
    mean_diff = treated[variable].mean() - control[variable].mean()
    std_pooled = np.sqrt(
        (treated[variable].var() + control[variable].var()) / 2
    )
    if std_pooled == 0:
        # Degenerate case: no within-group variation to standardize by
        return 0.0 if mean_diff == 0 else float(np.sign(mean_diff) * np.inf)
    return mean_diff / std_pooled

# Covariates whose balance between treatment arms is assessed
covariates = ["Education", "GenderHHead1women", "Age", "DwellingType1single", "NSize", "Anypersonolder60", "Perc_concrete_built"]

# Before matching: the full treated and control samples
treated_before = df.loc[df["T2"].eq(1)]
control_before = df.loc[df["T2"].eq(0)]

# After matching: one control row per matched pair, so a control that was
# matched to several treated units appears several times.
# NOTE(review): the treated group is left unchanged, i.e. treated units without
# a match inside the caliper are still included - confirm this is intended.
matched_control_indices = matched_pairs["control_index"].to_numpy()
control_after = control_before.loc[matched_control_indices]
treated_after = treated_before

# One record per covariate with its SMD before and after matching
smd_results = [
    {
        "Covariate": cov,
        "SMD Before": calculate_smd(treated_before, control_before, cov),
        "SMD After": calculate_smd(treated_after, control_after, cov),
    }
    for cov in covariates
]

# Tabular form of the balance results
smd_df = pd.DataFrame(smd_results)


In [25]:
# Display the balance table (bare expression so the notebook renders the DataFrame richly)
smd_df

             Covariate  SMD Before  SMD After
0            Education   -0.038377   0.013572
1    GenderHHead1women    0.146895   0.021193
2                  Age    0.150600   0.038270
3  DwellingType1single   -0.431322  -0.086573
4                NSize   -0.223597  -0.062134
5     Anypersonolder60   -0.018375  -0.007768
6  Perc_concrete_built   -0.108438   0.048322


In [None]:
# NOTE: everything up to this point ("hasta acá") has been checked and is good.

In [None]:
# Nearest-neighbour propensity-score matching (4 neighbours, caliper 0.01)
# followed by a weighted entity-fixed-effects regression.
# Relies on `dependent_var`, `independent_vars`, `extract_t2on_results`,
# `t2on_results_df` and `datetime_string` being defined by earlier cells.
neighbors = 4

# Load the dataset once; both stages below start from this raw frame
file_path = "BDTemuco.dta"  # Update with the correct path
panel_raw = pd.read_stata(file_path)

# --- Stage 1: propensity score of T2 on the tper < 2 subsample ---
variables_to_keep = ["ID","T2","Education", "GenderHHead1women", "Age", "DwellingType1single", "NSize", "Anypersonolder60", "Perc_concrete_built"]
df = panel_raw.loc[panel_raw["tper"] < 2, variables_to_keep].copy()

# Impute missing values with the median of each column
columns_to_impute = ["Education", "GenderHHead1women", "Age", "DwellingType1single"]
for col in columns_to_impute:
    df[col] = df[col].fillna(df[col].median())

# Logistic regression to estimate propensity scores
covariates = ["Education", "GenderHHead1women", "Age", "DwellingType1single", "NSize", "Anypersonolder60", "Perc_concrete_built"]
X = sm.add_constant(df[covariates])
y = df["T2"]
logit_model = sm.Logit(y, X).fit()
# NOTE(review): despite the column name, the default `predict` of a fitted
# statsmodels Logit appears to return probabilities, not log-odds - confirm.
# The column is not used anywhere below in this cell.
df["logoddsT2"] = logit_model.predict(X)

# The propensity score actually used for matching comes from scikit-learn,
# fitted on the standardized design matrix
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
logit_model = LogisticRegression(max_iter=500)
df["propensity_score"] = logit_model.fit(X_scaled, y).predict_proba(X_scaled)[:, 1]

# Separate treated and control groups
treated = df[df["T2"] == 1]
control = df[df["T2"] == 0]

# For every treated unit find its nearest controls on the propensity score
# and keep only those within the caliper (0.01)
caliper = 0.01
nn = NearestNeighbors(n_neighbors=neighbors, metric="euclidean")
nn.fit(control[["propensity_score"]])
distances, indices = nn.kneighbors(treated[["propensity_score"]])
matches = []
for i, dists in enumerate(distances):
    # `indices` are positions within `control`; translate them to index labels
    for control_pos in indices[i][dists <= caliper]:
        matches.append((treated.index[i], control.index[control_pos]))

# Generate matched DataFrame (one row per treated/control pair)
matched_pairs = pd.DataFrame(matches, columns=["treated_index", "control_index"])

# Assign weights: every treated unit gets 1.0 (whether or not it found a match
# within the caliper); each matched treated unit spreads a total weight of 1
# evenly over its matched controls
df["_weight"] = 0.0
df.loc[treated.index, "_weight"] = 1.0
for _, controls in matched_pairs.groupby("treated_index", sort=False)["control_index"]:
    df.loc[controls, "_weight"] += 1 / len(controls)

# Replace non-positive weights with a small positive value
df["weight"] = df["_weight"].where(df["_weight"] > 0, 1e-08)
#df["weight"] = df["weight"] / df["weight"].mean()

# Save weights to file.
# NOTE(review): `datetime_string` is fixed when the import cell runs, so every
# specification writes to (and overwrites) the same file names.
weights_path = f"Weights_{datetime_string}.csv"
df[["ID", "weight"]].to_csv(weights_path, index=False)

# --- Stage 2: weighted fixed-effects panel regression on the full panel ---
relevant_vars = ["ID","IDHour","T2ON","T1T2ON", "log_PMo", "log_To", "DayExp", "Cycle","ON", "log_PMi"]
# Keep only the model columns and drop rows missing any model variable
df = panel_raw[relevant_vars].dropna(subset=["log_PMi","ON", "T2ON","T1T2ON", "log_PMo", "log_To", "DayExp", "Cycle"])

# Merge weights into the panel.
# NOTE(review): assumes the weights file holds one row per ID; duplicated IDs
# would duplicate panel rows in this merge - confirm.
weights_df = pd.read_csv(weights_path)
df = pd.merge(df, weights_df, on="ID", how="left")

# Save merged dataset
panel_data_path = f"PanelBDTemuco_{datetime_string}.csv"
df.to_csv(panel_data_path, index=False)

# Prepare for panel regression: (entity, time) MultiIndex
df = df.sort_values(by=["ID", "IDHour"])
df = df.set_index(["ID", "IDHour"])

# Fixed Effects Model with matching weights, standard errors clustered by entity
model = PanelOLS.from_formula(
    f"{dependent_var} ~ { ' + '.join(independent_vars)} + EntityEffects",
    data=df,
    weights=df["weight"],
    drop_absorbed=True
)
results = model.fit(cov_type="clustered", cluster_entity=True)

# Extract T2ON results for the current model; the "n = ..." part of the label
# is taken from `neighbors` so it cannot drift from the value actually used
t2on_result = extract_t2on_results(results, model_name=f"model 2, Nearest Neighbors Matching, n = {neighbors}")
t2on_results_df = pd.concat([t2on_results_df, pd.DataFrame([t2on_result])], ignore_index=True)
print(results.summary)

#### Perform nearest neighbor matching (neighbors = 3) 

In [None]:
# Nearest-neighbour propensity-score matching (3 neighbours, caliper 0.01)
# followed by a weighted entity-fixed-effects regression.
# Relies on `dependent_var`, `independent_vars`, `extract_t2on_results`,
# `t2on_results_df` and `datetime_string` being defined by earlier cells.
neighbors = 3

# Load the dataset once; both stages below start from this raw frame
file_path = "BDTemuco.dta"  # Update with the correct path
panel_raw = pd.read_stata(file_path)

# --- Stage 1: propensity score of T2 on the tper < 2 subsample ---
variables_to_keep = ["ID","T2","Education", "GenderHHead1women", "Age", "DwellingType1single", "NSize", "Anypersonolder60", "Perc_concrete_built"]
df = panel_raw.loc[panel_raw["tper"] < 2, variables_to_keep].copy()

# Impute missing values with the median of each column
columns_to_impute = ["Education", "GenderHHead1women", "Age", "DwellingType1single"]
for col in columns_to_impute:
    df[col] = df[col].fillna(df[col].median())

# Logistic regression to estimate propensity scores
covariates = ["Education", "GenderHHead1women", "Age", "DwellingType1single", "NSize", "Anypersonolder60", "Perc_concrete_built"]
X = sm.add_constant(df[covariates])
y = df["T2"]
logit_model = sm.Logit(y, X).fit()
# NOTE(review): despite the column name, the default `predict` of a fitted
# statsmodels Logit appears to return probabilities, not log-odds - confirm.
# The column is not used anywhere below in this cell.
df["logoddsT2"] = logit_model.predict(X)

# The propensity score actually used for matching comes from scikit-learn,
# fitted on the standardized design matrix
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
logit_model = LogisticRegression(max_iter=500)
df["propensity_score"] = logit_model.fit(X_scaled, y).predict_proba(X_scaled)[:, 1]

# Separate treated and control groups
treated = df[df["T2"] == 1]
control = df[df["T2"] == 0]

# For every treated unit find its nearest controls on the propensity score
# and keep only those within the caliper (0.01)
caliper = 0.01
nn = NearestNeighbors(n_neighbors=neighbors, metric="euclidean")
nn.fit(control[["propensity_score"]])
distances, indices = nn.kneighbors(treated[["propensity_score"]])
matches = []
for i, dists in enumerate(distances):
    # `indices` are positions within `control`; translate them to index labels
    for control_pos in indices[i][dists <= caliper]:
        matches.append((treated.index[i], control.index[control_pos]))

# Generate matched DataFrame (one row per treated/control pair)
matched_pairs = pd.DataFrame(matches, columns=["treated_index", "control_index"])

# Assign weights: every treated unit gets 1.0 (whether or not it found a match
# within the caliper); each matched treated unit spreads a total weight of 1
# evenly over its matched controls
df["_weight"] = 0.0
df.loc[treated.index, "_weight"] = 1.0
for _, controls in matched_pairs.groupby("treated_index", sort=False)["control_index"]:
    df.loc[controls, "_weight"] += 1 / len(controls)

# Replace non-positive weights with a small positive value
df["weight"] = df["_weight"].where(df["_weight"] > 0, 1e-08)
#df["weight"] = df["weight"] / df["weight"].mean()

# Save weights to file.
# NOTE(review): `datetime_string` is fixed when the import cell runs, so every
# specification writes to (and overwrites) the same file names.
weights_path = f"Weights_{datetime_string}.csv"
df[["ID", "weight"]].to_csv(weights_path, index=False)

# --- Stage 2: weighted fixed-effects panel regression on the full panel ---
relevant_vars = ["ID","IDHour","T2ON","T1T2ON", "log_PMo", "log_To", "DayExp", "Cycle","ON", "log_PMi"]
# Keep only the model columns and drop rows missing any model variable
df = panel_raw[relevant_vars].dropna(subset=["log_PMi","ON", "T2ON","T1T2ON", "log_PMo", "log_To", "DayExp", "Cycle"])

# Merge weights into the panel.
# NOTE(review): assumes the weights file holds one row per ID; duplicated IDs
# would duplicate panel rows in this merge - confirm.
weights_df = pd.read_csv(weights_path)
df = pd.merge(df, weights_df, on="ID", how="left")

# Save merged dataset
panel_data_path = f"PanelBDTemuco_{datetime_string}.csv"
df.to_csv(panel_data_path, index=False)

# Prepare for panel regression: (entity, time) MultiIndex
df = df.sort_values(by=["ID", "IDHour"])
df = df.set_index(["ID", "IDHour"])

# Fixed Effects Model with matching weights, standard errors clustered by entity
model = PanelOLS.from_formula(
    f"{dependent_var} ~ { ' + '.join(independent_vars)} + EntityEffects",
    data=df,
    weights=df["weight"],
    drop_absorbed=True
)
results = model.fit(cov_type="clustered", cluster_entity=True)

# Extract T2ON results for the current model; the "n = ..." part of the label
# is taken from `neighbors` so it cannot drift from the value actually used
t2on_result = extract_t2on_results(results, model_name=f"model 3, Nearest Neighbors Matching, n = {neighbors}")
t2on_results_df = pd.concat([t2on_results_df, pd.DataFrame([t2on_result])], ignore_index=True)
print(results.summary)

#### Perform nearest neighbor matching (neighbors = 5) 

In [None]:
# Nearest-neighbour propensity-score matching (5 neighbours, caliper 0.01)
# followed by a weighted entity-fixed-effects regression.
# Relies on `dependent_var`, `independent_vars`, `extract_t2on_results`,
# `t2on_results_df` and `datetime_string` being defined by earlier cells.
neighbors = 5

# Load the dataset once; both stages below start from this raw frame
file_path = "BDTemuco.dta"  # Update with the correct path
panel_raw = pd.read_stata(file_path)

# --- Stage 1: propensity score of T2 on the tper < 2 subsample ---
variables_to_keep = ["ID","T2","Education", "GenderHHead1women", "Age", "DwellingType1single", "NSize", "Anypersonolder60", "Perc_concrete_built"]
df = panel_raw.loc[panel_raw["tper"] < 2, variables_to_keep].copy()

# Impute missing values with the median of each column
columns_to_impute = ["Education", "GenderHHead1women", "Age", "DwellingType1single"]
for col in columns_to_impute:
    df[col] = df[col].fillna(df[col].median())

# Logistic regression to estimate propensity scores
covariates = ["Education", "GenderHHead1women", "Age", "DwellingType1single", "NSize", "Anypersonolder60", "Perc_concrete_built"]
X = sm.add_constant(df[covariates])
y = df["T2"]
logit_model = sm.Logit(y, X).fit()
# NOTE(review): despite the column name, the default `predict` of a fitted
# statsmodels Logit appears to return probabilities, not log-odds - confirm.
# The column is not used anywhere below in this cell.
df["logoddsT2"] = logit_model.predict(X)

# The propensity score actually used for matching comes from scikit-learn,
# fitted on the standardized design matrix
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
logit_model = LogisticRegression(max_iter=500)
df["propensity_score"] = logit_model.fit(X_scaled, y).predict_proba(X_scaled)[:, 1]

# Separate treated and control groups
treated = df[df["T2"] == 1]
control = df[df["T2"] == 0]

# For every treated unit find its nearest controls on the propensity score
# and keep only those within the caliper (0.01)
caliper = 0.01
nn = NearestNeighbors(n_neighbors=neighbors, metric="euclidean")
nn.fit(control[["propensity_score"]])
distances, indices = nn.kneighbors(treated[["propensity_score"]])
matches = []
for i, dists in enumerate(distances):
    # `indices` are positions within `control`; translate them to index labels
    for control_pos in indices[i][dists <= caliper]:
        matches.append((treated.index[i], control.index[control_pos]))

# Generate matched DataFrame (one row per treated/control pair)
matched_pairs = pd.DataFrame(matches, columns=["treated_index", "control_index"])

# Assign weights: every treated unit gets 1.0 (whether or not it found a match
# within the caliper); each matched treated unit spreads a total weight of 1
# evenly over its matched controls
df["_weight"] = 0.0
df.loc[treated.index, "_weight"] = 1.0
for _, controls in matched_pairs.groupby("treated_index", sort=False)["control_index"]:
    df.loc[controls, "_weight"] += 1 / len(controls)

# Replace non-positive weights with a small positive value
df["weight"] = df["_weight"].where(df["_weight"] > 0, 1e-08)
#df["weight"] = df["weight"] / df["weight"].mean()

# Save weights to file.
# NOTE(review): `datetime_string` is fixed when the import cell runs, so every
# specification writes to (and overwrites) the same file names.
weights_path = f"Weights_{datetime_string}.csv"
df[["ID", "weight"]].to_csv(weights_path, index=False)

# --- Stage 2: weighted fixed-effects panel regression on the full panel ---
relevant_vars = ["ID","IDHour","T2ON","T1T2ON", "log_PMo", "log_To", "DayExp", "Cycle","ON", "log_PMi"]
# Keep only the model columns and drop rows missing any model variable
df = panel_raw[relevant_vars].dropna(subset=["log_PMi","ON", "T2ON","T1T2ON", "log_PMo", "log_To", "DayExp", "Cycle"])

# Merge weights into the panel.
# NOTE(review): assumes the weights file holds one row per ID; duplicated IDs
# would duplicate panel rows in this merge - confirm.
weights_df = pd.read_csv(weights_path)
df = pd.merge(df, weights_df, on="ID", how="left")

# Save merged dataset
panel_data_path = f"PanelBDTemuco_{datetime_string}.csv"
df.to_csv(panel_data_path, index=False)

# Prepare for panel regression: (entity, time) MultiIndex
df = df.sort_values(by=["ID", "IDHour"])
df = df.set_index(["ID", "IDHour"])

# Fixed Effects Model with matching weights, standard errors clustered by entity
model = PanelOLS.from_formula(
    f"{dependent_var} ~ { ' + '.join(independent_vars)} + EntityEffects",
    data=df,
    weights=df["weight"],
    drop_absorbed=True
)
results = model.fit(cov_type="clustered", cluster_entity=True)

# Extract T2ON results for the current model; the "n = ..." part of the label
# is taken from `neighbors` so it cannot drift from the value actually used
t2on_result = extract_t2on_results(results, model_name=f"model 4, Nearest Neighbors Matching, n = {neighbors}")
t2on_results_df = pd.concat([t2on_results_df, pd.DataFrame([t2on_result])], ignore_index=True)
print(results.summary)
