In [1]:
import pandas as pd
import numpy as np

In [2]:
def calculate_correlation_and_covariance(data, selected_columns):
    """
    Compute the correlation and sample-covariance matrices for selected columns.

    Both matrices are labelled with ``selected_columns`` on rows and columns,
    in exactly the order given by the caller.

    Args:
        data (pd.DataFrame): A pandas DataFrame containing the input data.
        selected_columns (list): Column names to include. Each selected column
            must be convertible to float.

    Returns:
        pd.DataFrame, pd.DataFrame: The correlation matrix and the covariance
        matrix. The covariance matrix uses the sample estimator (``ddof=1``)
        for every entry, so its diagonal holds the sample variances and is
        consistent with the off-diagonal covariances. The correlation diagonal
        is always 1. An empty selection yields two empty DataFrames.

    Raises:
        KeyError: If a selected column is not present in ``data``.
        ValueError: If a selected column cannot be converted to float.
    """
    columns = list(selected_columns)

    # Nothing selected: return empty matrices instead of failing while reshaping.
    if not columns:
        return (
            pd.DataFrame(index=columns, columns=columns, dtype=float),
            pd.DataFrame(index=columns, columns=columns, dtype=float),
        )

    # One row per variable, one column per observation. Transposing explicitly
    # (rather than passing rowvar=False) keeps a single-observation input from
    # being reinterpreted as a single variable.
    variables = data[columns].to_numpy(dtype=float).T

    # np.cov defaults to ddof=1 for every entry, diagonal included.
    # ndmin=2 restores the (1, 1) shape numpy squeezes away for one variable.
    covariance = np.array(np.cov(variables), dtype=float, ndmin=2)
    correlation = np.array(np.corrcoef(variables), dtype=float, ndmin=2)

    # A column's correlation with itself is reported as 1, even for a constant
    # column where the computed ratio is undefined (NaN).
    np.fill_diagonal(correlation, 1.0)

    # Label directly from the caller's ordering; no reshaping step that could
    # reorder entries relative to their labels.
    correlation_df = pd.DataFrame(correlation, index=columns, columns=columns)
    covariance_df = pd.DataFrame(covariance, index=columns, columns=columns)

    return correlation_df, covariance_df

In [None]:
# Read the input dataset from its CSV file
data = pd.read_csv('../data/refactoring/dummy_data.csv')

# Columns to include in both matrices
selected_columns = ['Age', 'Income', 'Spending']

# Compute both matrices in a single call
correlation_matrix, covariance_matrix = calculate_correlation_and_covariance(
    data, selected_columns
)

# Report each matrix under its title; the newline separator puts the title on
# its own line above the matrix
print("Correlation Matrix:", correlation_matrix, sep="\n")
print("\nCovariance Matrix:", covariance_matrix, sep="\n")