<h1>1 Auxiliary Methods</h1>

In [13]:
def ndarray_to_list(obj):
    """Convert a NumPy array or a pandas Series into a plain Python list.

    Any other type is rejected with a ``TypeError`` whose message mirrors the
    one ``json`` emits for unsupported objects (presumably so this can be
    passed as a ``default=`` hook to ``json.dump`` -- confirm against callers).

    Args:
        obj: The object to convert.

    Returns:
        list: The result of ``obj.tolist()``.

    Raises:
        TypeError: If ``obj`` is neither an ``np.ndarray`` nor a ``pd.Series``.
    """
    convertible_types = (np.ndarray, pd.Series)
    if not isinstance(obj, convertible_types):
        raise TypeError(f"Object of type {type(obj)} is not JSON serializable")
    return obj.tolist()

In [14]:
def measure_time(func, *args, **kwargs):
    """Call ``func(*args, **kwargs)`` and measure how long the call takes.

    Args:
        func (callable): The function to execute.
        *args: Positional arguments forwarded to ``func``.
        **kwargs: Keyword arguments forwarded to ``func``.

    Returns:
        tuple: ``(result, elapsed_seconds)`` where ``result`` is whatever
        ``func`` returned and ``elapsed_seconds`` is a float.
    """
    # perf_counter() is a monotonic, high-resolution clock, so the measured
    # interval cannot go negative or jump when the system clock is adjusted
    # (which can happen with time.time()).
    started = time.perf_counter()
    result = func(*args, **kwargs)
    elapsed = time.perf_counter() - started
    return result, elapsed

<h1>2 Data Preparation</h1>

In [15]:
# Build one "leave one column out" copy of the frame per column.
def create_all_Xs(df):
    """Return, for every column of ``df``, a copy of ``df`` without that column.

    Args:
        df (DataFrame): Source frame.

    Returns:
        list: One DataFrame per column of ``df``, in the order of
        ``df.columns``; the i-th entry is ``df`` with the i-th column dropped.
    """
    return [df.drop(column_label, axis=1) for column_label in df.columns]

In [16]:
# Data preparation: builds control matrices that also include the
# environment variables as controls.
def prepare_controls(df_energy, df_env):
    """
    Prepares control variables for each combination of outcome and treatment.

    For every outcome column of ``df_env`` and every treatment column of
    ``df_energy``, the controls are all columns of both frames (environment
    columns first) except the current outcome and the current treatment.

    Args:
        df_energy (DataFrame): DataFrame of treatment variables.
        df_env (DataFrame): DataFrame of outcome variables.

    Returns:
        dict: Nested dictionary ``{outcome_name: {treatment_name: controls}}``
        where ``controls`` is a DataFrame.

    Raises:
        ValueError: If the controls for a pair contain duplicated column names
            (e.g. when both input frames share a column label).
    """

    # Concatenate once; every (outcome, treatment) pair only drops columns from it.
    combined_controls = pd.concat([df_env, df_energy], axis=1)

    all_controls = {}
    for outcome_name in df_env.columns:
        controls_for_outcome = {}
        # Only the treatment labels are needed here. Iterating the column index
        # also avoids DataFrame.iteritems(), which was removed in pandas 2.0.
        for treatment_name in df_energy.columns:
            # Select only the columns that are not the current outcome or treatment
            X = combined_controls.drop([outcome_name, treatment_name], axis=1, errors='ignore')

            # Duplicated labels would make later column selection ambiguous
            if X.columns.duplicated().any():
                raise ValueError(f"Duplicated column names detected when preparing controls for outcome {outcome_name} and treatment {treatment_name}.")

            controls_for_outcome[treatment_name] = X
        all_controls[outcome_name] = controls_for_outcome

    return all_controls

<h1>3 Model Training methods</h1>

In [None]:
def train_model(Y, T, X, est_class, est_kwargs, verbose=False):
    """Instantiate ``est_class`` and fit it on one outcome/treatment pair.

    Args:
        Y: Outcome data passed to ``fit``.
        T: Treatment data passed to ``fit``.
        X: Data passed to ``fit`` as the ``X`` keyword.
        est_class (class): Estimator class to instantiate.
        est_kwargs (dict): Keyword arguments for the estimator constructor.
        verbose (bool, optional): Print the error when fitting fails.

    Returns:
        The fitted estimator, or ``None`` if fitting raised an exception.
        Errors raised by the constructor itself are not caught.
    """
    estimator = est_class(**est_kwargs)
    try:
        # Classes whose string form mentions DMLOrthoForest are fitted without
        # the cache_values flag; every other estimator gets cache_values=True.
        is_ortho_forest = 'DMLOrthoForest' in str(est_class)
        fit_options = {} if is_ortho_forest else {'cache_values': True}
        estimator.fit(Y, T, X=X, **fit_options)
    except Exception as e:
        # Best effort: a failed fit is reported (if requested) and skipped.
        if verbose:
            print(f"Error training model. Error: {e}")
        return None
    return estimator

In [5]:
def find_causality(df_energy, df_env, est_class, est_kwargs=None, alpha=0.05, verbose=False):
    """
    Trains the provided estimator for every (outcome, treatment) pair and
    collects the fitted models.

    For each treatment column of ``df_energy`` the remaining columns of
    ``df_energy`` are passed as ``X``; each column of ``df_env`` is used as
    the outcome in turn.

    Args:
        df_energy (DataFrame): DataFrame of treatment variables.
        df_env (DataFrame): DataFrame of outcome variables.
        est_class (class): The class of the estimator to be instantiated for causal inference.
        est_kwargs (dict, optional): Keyword arguments to be passed to the estimator's constructor.
            Defaults to None, which is treated as an empty dict.
        alpha (float, optional): The significance level for confidence intervals. Defaults to 0.05.
            Currently not used by this function.
        verbose (bool, optional): Whether to print progress updates. Defaults to False.

    Returns:
        dict: Nested dictionary ``{outcome_name: {treatment_name: fitted estimator}}``.
        Treatments whose training failed are omitted from the inner dictionary.
    """

    # Use a fresh dict per call instead of a shared mutable default argument.
    if est_kwargs is None:
        est_kwargs = {}

    # Xs[i] is df_energy without its i-th column, aligned with df_energy.items()
    Xs = create_all_Xs(df_energy)
    environmental_models = {}

    for outcome_name, Y in df_env.items():
        if verbose:
            print("Outcome: " + outcome_name)
        fitted_models = {}

        for X, (treatment_name, T) in zip(Xs, df_energy.items()):
            if verbose:
                print("\tTreatment: " + treatment_name)

            est_instance, training_time = measure_time(train_model, Y, T, X, est_class, est_kwargs, verbose)

            # train_model returns None when fitting failed; skip those pairs
            if est_instance is not None:
                fitted_models[treatment_name] = est_instance
                if verbose:
                    print(f"\tTraining time: {training_time} seconds")

        environmental_models[outcome_name] = fitted_models

    return environmental_models

<h1>4 Methods for persistence</h1>

In [19]:
def persist_estimators(env_models, dataset_dir_name, json_file_name, parent_dir_path='/home/nissatech/Documents/Double Machine Learning/Models'):
    """
    Persist models and their metadata to disk.

    Every ``LinearDML`` instance found in the nested dictionary is dumped with
    joblib to ``<parent>/<dataset>/models/<key>_<subkey>.pkl`` and a JSON file
    mapping ``key -> subkey -> model path`` is written to
    ``<parent>/<dataset>/<json_file_name>``. Values that are not ``LinearDML``
    instances are skipped, so they do not appear in the JSON file.

    Parameters:
    - env_models: Dictionary containing models (``{key: {subkey: model}}``).
    - dataset_dir_name: Name of the dataset-specific directory.
    - json_file_name: Name of the JSON file to save metadata.
    - parent_dir_path: Path to the parent directory. Default is a hardcoded path.

    Returns:
    None
    """

    dataset_dir_path = os.path.join(parent_dir_path, dataset_dir_name)
    json_file_path = os.path.join(dataset_dir_path, json_file_name)
    models_dir_path = os.path.join(dataset_dir_path, "models")

    # The models directory is the deepest one; makedirs creates its parents as
    # well, and exist_ok avoids the check-then-create race of a manual
    # os.path.exists() test.
    os.makedirs(models_dir_path, exist_ok=True)

    # Same nesting as env_models, but holding file paths instead of estimators
    model_paths_dict = {}

    for key, value in env_models.items():
        model_paths_dict[key] = {}
        if not isinstance(value, dict):
            continue
        for subkey, subvalue in value.items():
            if isinstance(subvalue, econml.dml.dml.LinearDML):
                model_path = os.path.join(models_dir_path, f"{key}_{subkey}.pkl")
                joblib.dump(subvalue, model_path)
                model_paths_dict[key][subkey] = model_path

    # Serialize the dictionary to JSON
    with open(json_file_path, 'w') as f:
        json.dump(model_paths_dict, f)

In [None]:
def load_estimators(path):
    """
    Rebuild the nested estimator dictionary from a JSON file of model paths.

    Every string value ending in ``.pkl`` is replaced by the object that
    ``joblib.load`` returns for that path; all other values are left as read
    from the JSON file.

    NOTE: ``joblib.load`` unpickles the referenced files, which can execute
    arbitrary code. Only use this with JSON and model files from a trusted
    source.

    Args:
    - path (str): Path to the JSON file where the dictionary with estimator paths is saved.

    Returns:
    - dict: Dictionary with the same structure as the JSON content, with ``.pkl`` paths replaced by the loaded objects.
    """
    with open(path, 'r') as f:
        env_models = json.load(f)

    for inner_dict in env_models.values():
        for inner_key, value in inner_dict.items():
            points_to_pickle = isinstance(value, str) and value.endswith('.pkl')
            if not points_to_pickle:
                continue
            # Overwriting an existing key while iterating is safe: the set of
            # keys does not change.
            inner_dict[inner_key] = joblib.load(value)

    return env_models

<h1>5 Visualisation methods</h1>

<h2>5.1 Treatment vs Outcome</h2>

In [None]:
def scatter_all_treatments_vs_all_outcomes(df_treatments, df_outcomes):
    """Draw one scatter plot for every (outcome column, treatment column) pair.

    Outcomes form the outer loop, so all treatments are plotted against the
    first outcome before moving on to the next one.
    """
    for y_name, y_series in df_outcomes.items():
        for x_name, x_series in df_treatments.items():
            scatter_treatment_vs_outcome(
                treatment_values=x_series,
                outcome_values=y_series,
                treatment_label=x_name,
                outcome_label=y_name,
            )

In [19]:
def scatter_all_treatments_vs_single_outcome(df_treatments, outcome_values, outcome_label):
    """Draw one scatter plot per treatment column against a single outcome.

    Args:
        df_treatments (DataFrame): One column per treatment variable.
        outcome_values: Values of the outcome, plotted on the y axis.
        outcome_label (str): Name of the outcome, used for labelling.
    """
    for x_name, x_series in df_treatments.items():
        scatter_treatment_vs_outcome(
            treatment_values=x_series,
            outcome_values=outcome_values,
            treatment_label=x_name,
            outcome_label=outcome_label,
        )

In [18]:
def scatter_treatment_vs_outcome(treatment_values, outcome_values, treatment_label, outcome_label):
    """Show a scatter plot of one treatment (x axis) against one outcome (y axis).

    Args:
        treatment_values: Values plotted on the x axis.
        outcome_values: Values plotted on the y axis.
        treatment_label (str): Name of the treatment; used in the title and x label.
        outcome_label (str): Name of the outcome; used in the title and y label.
    """
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.scatter(treatment_values, outcome_values, alpha=0.5)

    heading = "Treatment: " + treatment_label + " vs Outcome: " + outcome_label
    ax.set_title(heading)
    ax.set_xlabel(treatment_label)
    ax.set_ylabel(outcome_label)

    ax.grid(True, which='both', linestyle='--', linewidth=0.5)
    plt.tight_layout()

    plt.show()

<h2>5.2 CATE value</h2>

In [21]:
def visualize_models(environmental_models):
    """Show a histogram of the estimated CATE for every (outcome, treatment) entry.

    Args:
        environmental_models (dict): ``{outcome name: {treatment name: entry}}``.
            Each entry is indexed with ``'pnt_effect'`` to obtain the values
            to plot.
            NOTE(review): this means entries must be subscriptable (e.g. a
            dict of results), not bare estimator objects -- confirm with callers.
    """
    for env_parameter_name, models_for_outcome in environmental_models.items():
        for energy_parameter_name, entry in models_for_outcome.items():
            # CATE = Conditional Average Treatment Effect
            cate_histogram = go.Histogram(x=entry['pnt_effect'], nbinsx=30)
            fig = go.Figure(data=[cate_histogram])

            chart_title = 'Distribution of Estimated CATE for Feature: {}, Outcome: {}'.format(energy_parameter_name, env_parameter_name)
            fig.update_layout(
                title_text=chart_title,
                xaxis_title_text='Estimated CATE',
                yaxis_title_text='Frequency',
            )

            fig.show()

<h2>5.3 Visualize MSE</h2>

In [None]:
def visualize_mse(model_type, models_dict, title_prefix=""):
    """
    Visualizes the Mean Squared Error (MSE) for each model based on their residuals.

    Args:
    - model_type (str): Either 'outcome' (uses ``residuals_[0]``) or 'treatment' (uses ``residuals_[1]``).
    - models_dict (dict): Dictionary where the key is the model name, and the value is the model with a `residuals_` attribute.
    - title_prefix (str): Prefix to be added to the chart title.

    Returns:
    None

    Raises:
    ValueError: If ``model_type`` is neither 'outcome' nor 'treatment'.
    """

    # Pick which element of model.residuals_ to evaluate. Unknown types are
    # rejected up front instead of failing later on an unbound variable.
    if model_type == 'outcome':
        num = 0
    elif model_type == 'treatment':
        num = 1
    else:
        raise ValueError("model_type must be either 'outcome' or 'treatment'")

    # Calculate MSE for each model
    mse_values = [np.mean(model.residuals_[num]**2) for model in models_dict.values()]
    model_names = list(models_dict.keys())

    # Visualization
    plt.figure(figsize=(10, 6))
    plt.bar(model_names, mse_values, color='skyblue')
    plt.ylabel('Mean Squared Error (MSE)')
    plt.title(f'Mean Squared Error for Each Model ({title_prefix})')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

<h2>5.4 True vs Predicted</h2>

In [8]:
def true_vs_predicted_treatment_model(df_energy, env_models, outcome_label):
    """Plot true vs. first-stage-predicted treatment values for every model of one outcome.

    Args:
        df_energy (DataFrame): Treatment variables; indexed by the keys of ``env_models``.
        env_models (dict): ``{treatment name: fitted model}`` for a single outcome.
            Each model must expose ``residuals_``.
        outcome_label (str): Name of the outcome the models were trained for.
    """
    for energy_parameter_name, model in env_models.items():
        T = df_energy[energy_parameter_name]
        # EconML defines the first-stage residual as observed - predicted, so
        # the prediction is recovered as observed - residual.
        # NOTE(review): EconML does not guarantee that residual samples are in
        # input order -- confirm alignment with T.
        predicted_T = T - model.residuals_[1]
        visualize_true_vs_predicted('treatment', T, predicted_T, energy_parameter_name, outcome_label)

In [9]:
def true_vs_predicted_outcome_model(df_env, env_models, outcome_label):
    """Plot true vs. first-stage-predicted outcome values for every model of one outcome.

    Args:
        df_env (DataFrame): Outcome variables; must contain ``outcome_label``.
        env_models (dict): ``{treatment name: fitted model}`` for a single outcome.
            Each model must expose ``residuals_``.
        outcome_label (str): Name of the outcome column to plot.
    """
    Y = df_env[outcome_label]
    for energy_parameter_name, model in env_models.items():
        # EconML defines the first-stage residual as observed - predicted, so
        # the prediction is recovered as observed - residual.
        # NOTE(review): EconML does not guarantee that residual samples are in
        # input order -- confirm alignment with Y.
        predicted_y = Y - model.residuals_[0]
        # The plotting helper only accepts 'outcome' or 'treatment'
        visualize_true_vs_predicted('outcome', Y, predicted_y, energy_parameter_name, outcome_label)

In [None]:
def visualize_true_vs_predicted(model_type, true_values, pred_values, treatment_label, outcome_label):
    """
    Visualize the true values against the predicted values for regression models.

    Draws a scatter plot of ``pred_values`` over ``true_values`` together with
    a dashed 45-degree reference line spanning the range of ``true_values``.

    Parameters:
    - model_type (str): Type of the model, either 'outcome' or 'treatment'.
    - true_values (array-like): The true values of the variable.
    - pred_values (array-like): The predicted values of the variable.
    - treatment_label (str): The label for the treatment variable.
    - outcome_label (str): The label for the outcome variable.

    Raises:
    ValueError: If ``model_type`` is neither 'outcome' nor 'treatment'.
    """

    # Validate before touching pyplot so that an invalid call does not leave a
    # half-configured figure behind.
    if model_type == 'outcome':
        xlabel = outcome_label
    elif model_type == 'treatment':
        xlabel = treatment_label
    else:
        raise ValueError("model_type must be either 'outcome' or 'treatment'")
    ylabel = "Predicted " + xlabel

    plt.title(f"Treatment: {treatment_label}, Outcome: {outcome_label}")
    plt.xlabel(f"True {xlabel}")
    plt.ylabel(ylabel)

    plt.scatter(true_values, pred_values, alpha=0.5)

    # 45-degree line: points on it are predicted exactly
    lowest, highest = min(true_values), max(true_values)
    plt.plot([lowest, highest], [lowest, highest], 'r--')
    plt.show()

<h2>5.5 Actual and Predicted values through Time</h2>

In [14]:
def true_and_predicted_vs_time_treatment_model(df_energy, env_models, outcome_label):
    """Plot true and first-stage-predicted treatment series for every model of one outcome.

    Args:
        df_energy (DataFrame): Treatment variables; indexed by the keys of ``env_models``.
        env_models (dict): ``{treatment name: fitted model}`` for a single outcome.
            Each model must expose ``residuals_``.
        outcome_label (str): Name of the outcome the models were trained for.
    """
    for energy_parameter_name, model in env_models.items():
        T = df_energy[energy_parameter_name]
        # EconML defines the first-stage residual as observed - predicted, so
        # the prediction is recovered as observed - residual.
        # NOTE(review): EconML does not guarantee that residual samples are in
        # input order -- confirm alignment with T.
        predicted_T = T - model.residuals_[1]
        visualize_true_and_pred_vs_time('treatment', T, predicted_T, energy_parameter_name, outcome_label)

In [15]:
def true_and_predicted_vs_time_outcome_model(df_env, env_models, outcome_label):
    """Plot true and first-stage-predicted outcome series for every model of one outcome.

    Args:
        df_env (DataFrame): Outcome variables; must contain ``outcome_label``.
        env_models (dict): ``{treatment name: fitted model}`` for a single outcome.
            Each model must expose ``residuals_``.
        outcome_label (str): Name of the outcome column to plot.
    """
    Y = df_env[outcome_label]
    for energy_parameter_name, model in env_models.items():
        # EconML defines the first-stage residual as observed - predicted, so
        # the prediction is recovered as observed - residual.
        # NOTE(review): EconML does not guarantee that residual samples are in
        # input order -- confirm alignment with Y.
        predicted_y = Y - model.residuals_[0]
        visualize_true_and_pred_vs_time('outcome', Y, predicted_y, energy_parameter_name, outcome_label)

In [16]:
def visualize_true_and_pred_vs_time(model_type, true_values, pred_values, treatment_label, outcome_label):
    """
    Plot the true and the predicted series of one variable on a shared axis.

    Parameters:
    - model_type (str): 'outcome' or 'treatment'; selects the wording of the title. For any other value no title is set.
    - true_values (pandas.Series or list): The true values over time.
    - pred_values (pandas.Series or list): The predicted values over time.
    - treatment_label (str): Name of the treatment variable, used in the title.
    - outcome_label (str): Name of the outcome variable, used in the title.

    Returns:
    - None: This function displays a plot.
    """

    fig, ax = plt.subplots(figsize=(15, 7))

    # (series, line style) pairs in drawing order: predictions first, then truth
    series_to_draw = (
        (pred_values, dict(color='salmon', linestyle='--', label='Predicted values', linewidth=2)),
        (true_values, dict(color='royalblue', label='True Values', linewidth=2)),
    )
    for series, line_style in series_to_draw:
        ax.plot(series, **line_style)

    heading = None
    if model_type == 'outcome':
        heading = "Time Series of True and Predicted " + outcome_label + ", treatment is " + treatment_label
    elif model_type == 'treatment':
        heading = "Time Series of True and Predicted " + treatment_label + ", outcome is " + outcome_label
    if heading is not None:
        ax.set_title(heading)

    ax.set_xlabel("Time")
    ax.set_ylabel("Value")
    ax.grid(True, which='both', linestyle='--', linewidth=0.5)
    ax.legend()

    plt.tight_layout()
    plt.show()

<h2>5.6 Plot Residuals</h2>

In [13]:
def plot_residuals(estimators):
    """Scatter treatment residuals against outcome residuals for every fitted model.

    Args:
        estimators (dict): ``{outcome name: {treatment name: model}}``. Each
            model must expose ``residuals_``, whose element 0 is plotted on
            the y axis (outcome) and element 1 on the x axis (treatment).
    """
    for env_parameter_name, models_for_outcome in estimators.items():
        for energy_parameter_name, fitted in models_for_outcome.items():
            plt.figure(figsize=(10, 6))

            treatment_residuals = fitted.residuals_[1]
            outcome_residuals = fitted.residuals_[0]
            plt.scatter(treatment_residuals, outcome_residuals, alpha=0.5)

            plt.xlabel(f'Residual Treatment for {energy_parameter_name}')
            plt.ylabel(f'Residual Outcome for {env_parameter_name}')
            plt.title(f'Residuals from DML: {env_parameter_name} vs. {energy_parameter_name}')
            plt.show()

<h2>5.7 Second Stage Prediction</h2>

In [22]:
def _as_1d_array(values):
    """Flatten a pandas object (anything exposing ``.values``) or an array-like to a 1-D NumPy array."""
    if hasattr(values, 'values'):
        return values.values.ravel()
    return np.array(values).ravel()


def visualize_predicted_vs_actual(model, X, T, Y, df_energy_3, outcome_name, treatment_label):
    """
    Visualize the predicted vs actual outcome values and report their MAE.

    The "predicted" series is computed as
    ``Y - mean(model.residuals_[0]) + model.effect(X)``.

    Parameters:
    - model: The trained model; must provide ``effect(X)`` and ``residuals_``.
    - X: The features passed to ``model.effect``.
    - T: The treatment variable. Not used by this function; kept so existing callers keep working.
    - Y: The actual outcome values.
    - df_energy_3: DataFrame whose index supplies the x-axis values (labelled "Date").
    - outcome_name: Name of the outcome, used in the plot title.
    - treatment_label: Name of the treatment, used in the plot title.

    Returns:
    - mae: Mean Absolute Error between the predicted and actual values.
    """
    from sklearn.metrics import mean_absolute_error
    import matplotlib.pyplot as plt

    # Estimated causal effect for every row of X
    causal_effects = model.effect(X)

    # NOTE(review): the prediction is built from the observed Y itself, so the
    # MAE below reduces to mean(|effect - mean outcome residual|) -- confirm
    # that this is the intended metric.
    predicted_consumption = _as_1d_array(Y - np.mean(model.residuals_[0]) + causal_effects)
    Y_values = _as_1d_array(Y)

    mae = mean_absolute_error(Y_values, predicted_consumption)
    print(f"Mean Absolute Error on Test Set: {mae:.2f}")

    index_values = _as_1d_array(df_energy_3.index)

    plt.figure(figsize=(12, 6))
    plt.plot(index_values, predicted_consumption, label="Predicted Effect of", color='red', linestyle='--')
    plt.plot(index_values, Y_values, label="Actual Effect", color='blue', linewidth=2)
    plt.title("Predicted Effect of: " + treatment_label + " on: " + outcome_name)
    plt.xlabel("Date")
    plt.ylabel("Effect")
    plt.legend()
    plt.tight_layout()
    plt.show()

    return mae

In [None]:
def visualize_predicted_vs_actual_for_env(outcome_label, env_models, df_energy, df_env):
    """Run ``visualize_predicted_vs_actual`` for every treatment model of one outcome.

    Args:
        outcome_label (str): Name of the outcome column in ``df_env``.
        env_models (dict): ``{treatment name: fitted model}`` for that outcome.
        df_energy (DataFrame): Treatment variables. For each treatment the
            remaining columns are used as features, and the frame's index
            supplies the x axis of the plot.
        df_env (DataFrame): Outcome variables; must contain ``outcome_label``.
    """
    Y = df_env[outcome_label]

    for energy_parameter_name, model in env_models.items():
        X = df_energy.drop(energy_parameter_name, axis=1)
        T = df_energy[energy_parameter_name]
        # Take the time axis from the df_energy argument rather than from a
        # notebook-level global, so the index always matches X and T.
        visualize_predicted_vs_actual(model, X, T, Y, df_energy, outcome_label, energy_parameter_name)