In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.tsa.stattools import adfuller
from scipy.stats import pearsonr

# Lift every pandas display cap so DataFrames are rendered in full
# (no row, column, line-width or cell-width truncation in notebook output).
pd.options.display.max_rows = None
pd.options.display.max_columns = None
pd.options.display.width = None
pd.options.display.max_colwidth = None

In [2]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.tsa.stattools import adfuller
from scipy.stats import pearsonr

# Ordered column groups of the result tables. Every row is initialised from
# these so that all rows (including failed / degenerate ones) share one schema.
_DESCRIPTIVE_KEYS = ["Mean", "Std", "Min", "Max", "Skewness", "Kurtosis"]
_ADF_KEYS = ["ADF Test Statistic", "ADF p-Value", "ADF Lags", "ADF Test Outcome"]
_OLS_FIT_KEYS = ["OLS Coefficient", "OLS Std Error", "OLS t-Statistic", "OLS p-Value",
                 "OLS R-squared", "OLS Adjusted R-squared", "OLS F-Statistic",
                 "OLS AIC", "OLS BIC", "Durbin-Watson"]
_GLM_FIT_KEYS = ["GLM Coefficient", "GLM Std Error", "GLM z-Value", "GLM p-Value",
                 "GLM AIC", "GLM BIC", "GLM Pseudo R-squared"]


def _result_columns(prefix, fit_keys):
    """Return the ordered column names of one model's result table."""
    return (["Factor"] + list(fit_keys)
            + ["Correlation with Target", "Correlation p-Value",
               f"{prefix} Hypothesis Outcome", f"{prefix} Correlation Test Outcome",
               "Sign Correctness"]
            + _DESCRIPTIVE_KEYS + _ADF_KEYS)


def _significance_verdict(p_value, alpha):
    """Map a p-value to "Confirmed" (p < alpha), "Failed", or "NA" when it is NaN."""
    if np.isnan(p_value):
        return "NA"
    return "Confirmed" if p_value < alpha else "Failed"


def _sign_correctness(coefficient, expected_sign):
    """Compare a coefficient's sign with the expectation ("NA" if none was given)."""
    if expected_sign not in ("positive", "negative"):
        return "NA"
    if np.isnan(coefficient):
        return np.nan
    return (coefficient > 0) if expected_sign == "positive" else (coefficient < 0)


def _describe_factor(series, factor, alpha):
    """Descriptive statistics and ADF stationarity test for one factor column."""
    stats = dict.fromkeys(_DESCRIPTIVE_KEYS + _ADF_KEYS, np.nan)
    clean = series.dropna()

    # Descriptive statistics and the ADF test are guarded separately so that a
    # failing ADF test (e.g. too few observations) does not discard the
    # descriptive statistics that were computed successfully.
    try:
        stats.update({
            "Mean": clean.mean(),
            "Std": clean.std(),
            "Min": clean.min(),
            "Max": clean.max(),
            "Skewness": clean.skew(),
            "Kurtosis": clean.kurtosis(),
        })
    except Exception as e:
        print(f"Error computing common stats for factor {factor}: {e}")

    try:
        adf_result = adfuller(clean)
        adf_stat, adf_p, adf_lags = adf_result[0], adf_result[1], adf_result[2]
    except Exception as e:
        print(f"Error computing common stats for factor {factor}: {e}")
        return stats

    stats["ADF Test Statistic"] = adf_stat
    stats["ADF p-Value"] = adf_p
    stats["ADF Lags"] = adf_lags
    if np.isnan(adf_p):
        stats["ADF Test Outcome"] = "NA"
    else:
        # Null hypothesis: non-stationarity. p < alpha -> stationary.
        stats["ADF Test Outcome"] = ("H0 rejected (Stationary)" if adf_p < alpha
                                     else "H0 not rejected (Non-stationary)")
    return stats


def _fit_ols(pair, factor):
    """Fit y ~ const + x by OLS; every statistic is NaN if the fit fails."""
    stats = dict.fromkeys(_OLS_FIT_KEYS, np.nan)
    try:
        design = sm.add_constant(pair["x"])
        fit = sm.OLS(pair["y"], design).fit()
        fitted = {
            "OLS Coefficient": fit.params["x"],
            "OLS Std Error": fit.bse["x"],
            "OLS t-Statistic": fit.tvalues["x"],
            "OLS p-Value": fit.pvalues["x"],
            "OLS R-squared": fit.rsquared,
            "OLS Adjusted R-squared": fit.rsquared_adj,
            "OLS F-Statistic": getattr(fit, "fvalue", np.nan),
            "OLS AIC": fit.aic,
            "OLS BIC": fit.bic,
            "Durbin-Watson": sm.stats.stattools.durbin_watson(fit.resid),
        }
    except Exception as e:
        print(f"OLS regression failed for factor '{factor}': {e}")
        return stats
    # Only commit once everything succeeded, so a failure never leaves a
    # half-filled row.
    stats.update(fitted)
    return stats


def _fit_glm(pair, factor):
    """Fit a Binomial GLM with logit link; every statistic is NaN if the fit fails."""
    stats = dict.fromkeys(_GLM_FIT_KEYS, np.nan)
    try:
        design = sm.add_constant(pair["x"])
        fit = sm.GLM(pair["y"], design,
                     family=sm.families.Binomial(sm.families.links.Logit())).fit()
        # Intercept-only model, used as the baseline of the pseudo R-squared.
        null_fit = sm.GLM(pair["y"], np.ones((len(pair), 1)),
                          family=sm.families.Binomial(sm.families.links.Logit())).fit()
        fitted = {
            "GLM Coefficient": fit.params["x"],
            "GLM Std Error": fit.bse["x"],
            "GLM z-Value": fit.tvalues["x"],
            "GLM p-Value": fit.pvalues["x"],
            "GLM AIC": fit.aic,
            # Prefer the log-likelihood based BIC when the result exposes it.
            "GLM BIC": fit.bic_llf if hasattr(fit, "bic_llf") else fit.bic,
            "GLM Pseudo R-squared": 1 - (fit.llf / null_fit.llf),
        }
    except Exception as e:
        print(f"GLM regression failed for factor '{factor}': {e}")
        return stats
    stats.update(fitted)
    return stats


def _model_row(prefix, fit_keys, fit_func, x, y, factor, expected_sign, common_stats, alpha):
    """Build one result row (regression, correlation, verdicts) for one factor."""
    row = dict.fromkeys(_result_columns(prefix, fit_keys), np.nan)
    row["Factor"] = factor
    row.update(common_stats)

    pair = pd.DataFrame({"x": x, "y": y}).dropna()
    # Nothing can be estimated without observations or with a constant regressor.
    if pair.empty or pair["x"].nunique() < 2:
        return row

    row.update(fit_func(pair, factor))

    try:
        corr_coef, corr_p = pearsonr(pair["x"], pair["y"])
    except Exception as e:
        print(f"Correlation computation failed for factor '{factor}' in {prefix}: {e}")
        corr_coef, corr_p = np.nan, np.nan
    row["Correlation with Target"] = corr_coef
    row["Correlation p-Value"] = corr_p

    row[f"{prefix} Hypothesis Outcome"] = _significance_verdict(row[f"{prefix} p-Value"], alpha)
    row[f"{prefix} Correlation Test Outcome"] = _significance_verdict(corr_p, alpha)
    row["Sign Correctness"] = _sign_correctness(row[f"{prefix} Coefficient"], expected_sign)
    return row


def _align_on_dates(factor_data, y):
    """Index the factors (and, positionally, the target) by the leading date column.

    The first column is treated as a date column when it has a datetime dtype
    or is literally named "date" (case-insensitive). Otherwise both objects are
    returned untouched and are later aligned on their existing indices.
    """
    if factor_data.shape[1] == 0:
        return factor_data, y

    date_col = factor_data.columns[0]
    dates = factor_data[date_col]
    named_date = isinstance(date_col, str) and date_col.lower() == "date"
    if not (pd.api.types.is_datetime64_any_dtype(dates) or named_date):
        return factor_data, y

    # Convert once and use the SAME index for factors and target; indexing one
    # by raw strings and the other by timestamps would leave nothing in common.
    try:
        date_index = pd.Index(pd.to_datetime(dates), name=date_col)
    except Exception as e:
        print(f"Could not convert first column to datetime: {e}")
        date_index = pd.Index(dates, name=date_col)

    same_length = len(y) == len(factor_data)
    factor_data = factor_data.drop(columns=[date_col])
    factor_data.index = date_index
    if same_length:
        # Same number of rows: the target is taken to be row-aligned with data.
        y.index = date_index
    return factor_data, y


def perform_univariate_analysis(data, target, expected_signs=None,
                                model_types=("ols", "glm"), separate_tables=False,
                                alpha=0.05):
    """
    Perform univariate analysis on macro factors using OLS and/or GLM (with a logit link).

    Each numeric factor is regressed on its own against the target. For GLM analysis the
    target is min-max scaled to the unit interval if it falls outside [0, 1]. For each
    statistical test a verdict field is added indicating whether the test's hypothesis
    was confirmed or failed at the ``alpha`` significance level.

    Parameters
    ----------
    data : pd.DataFrame
        DataFrame whose first column is a date column and subsequent columns are macro
        factors. The first column is used as the index when it has a datetime dtype or
        is named "date"; string dates are parsed with ``pd.to_datetime``. ``data`` itself
        is never modified.
    target : pd.Series or str
        A continuous target variable. If a string is provided then that column is
        extracted from data. A Series with as many rows as ``data`` is aligned
        positionally; otherwise it is aligned on its index.
    expected_signs : dict, optional
        A dictionary mapping factor names to expected sign ('positive' or 'negative').
    model_types : sequence of str, optional
        Which models to run. Options are "ols" and/or "glm" (case-insensitive).
        Default is both.
    separate_tables : bool, optional
        If True, return a dict with separate DataFrames for each model type.
        Otherwise, return a single combined DataFrame (merging OLS and GLM on Factor).
    alpha : float, optional
        Significance threshold used for every verdict (regression, correlation, ADF).
        Default is 0.05.

    Returns
    -------
    results : pd.DataFrame or dict of pd.DataFrame
        If separate_tables is False, a single DataFrame with one row per numeric factor
        (columns shared by both models carry " (OLS)" / " (GLM)" suffixes when both
        models are run). Otherwise, a dictionary with keys "OLS" and "GLM" mapping to
        their respective DataFrames; a model that was not requested yields an empty table.

    Raises
    ------
    ValueError
        If ``target`` is neither a column of ``data`` nor a Series, or if
        ``model_types`` contains neither "ols" nor "glm".

    Notes
    -----
    Failures of an individual regression, correlation or ADF test are reported with
    ``print`` and recorded as NaN, so one problematic factor does not abort the run.
    """
    if expected_signs is None:
        expected_signs = {}

    # Normalise the requested model types (a bare string such as "ols" is accepted).
    if model_types is None:
        model_types = ("ols", "glm")
    if isinstance(model_types, str):
        model_types = model_types.replace(",", " ").split()
    requested = {str(name).lower() for name in model_types}
    run_ols = "ols" in requested
    run_glm = "glm" in requested
    if not (run_ols or run_glm):
        raise ValueError('model_types must include "ols" and/or "glm".')

    # 1. Data preparation: extract the target. Copies are taken because the
    #    indices are reassigned below and the caller's objects must stay intact.
    if isinstance(target, str):
        if target not in data.columns:
            raise ValueError("Target column name not found in data.")
        y = data[target].copy()
        factor_data = data.drop(columns=[target])
    elif isinstance(target, pd.Series):
        y = target.copy()
        factor_data = data.copy()
    else:
        raise ValueError("Target must be either a column name (str) or a Pandas Series.")

    factor_data, y = _align_on_dates(factor_data, y)

    # Keep only observations present in both the factors and the target.
    common_index = factor_data.index.intersection(y.index)
    factor_data = factor_data.loc[common_index]
    y = y.loc[common_index]

    # 2. For GLM: scale target to [0, 1] if needed. A constant out-of-range
    #    target scales to NaN and therefore produces NaN GLM rows.
    if run_glm:
        y_min, y_max = y.min(), y.max()
        if (y_min < 0) or (y_max > 1):
            y_glm = (y - y_min) / (y_max - y_min)
        else:
            y_glm = y.copy()

    # 3. Loop over each factor and run the analyses.
    ols_rows = []
    glm_rows = []
    for factor in factor_data.columns:
        column = factor_data[factor]
        # Skip non-numeric columns. The pandas dtype helpers are used because
        # np.issubdtype raises on extension dtypes (categorical, string, ...).
        if (not pd.api.types.is_numeric_dtype(column)) or pd.api.types.is_bool_dtype(column):
            continue

        common_stats = _describe_factor(column, factor, alpha)
        expected_sign = expected_signs.get(factor, "NA")

        if run_ols:
            ols_rows.append(_model_row("OLS", _OLS_FIT_KEYS, _fit_ols, column, y,
                                       factor, expected_sign, common_stats, alpha))
        if run_glm:
            glm_rows.append(_model_row("GLM", _GLM_FIT_KEYS, _fit_glm, column, y_glm,
                                       factor, expected_sign, common_stats, alpha))

    # 4. Prepare output. A model that was not requested keeps a column-less
    #    empty table; a requested model always gets its full column set, so
    #    the merge below also works when no numeric factor was found.
    ols_table = (pd.DataFrame(ols_rows, columns=_result_columns("OLS", _OLS_FIT_KEYS))
                 if run_ols else pd.DataFrame(ols_rows))
    glm_table = (pd.DataFrame(glm_rows, columns=_result_columns("GLM", _GLM_FIT_KEYS))
                 if run_glm else pd.DataFrame(glm_rows))

    if separate_tables:
        return {"OLS": ols_table, "GLM": glm_table}
    if run_ols and run_glm:
        return pd.merge(ols_table, glm_table, on="Factor", suffixes=(" (OLS)", " (GLM)"))
    return ols_table if run_ols else glm_table

# Example Usage:
if __name__ == "__main__":
    # Fixed seed so the synthetic draws below are repeatable.
    np.random.seed(42)
    dates = pd.date_range(start="2020-01-01", periods=100, freq="MS")

    # A monthly date column followed by three standard-normal factors.
    # The mapping is filled in order, so factor_1 is drawn first, then
    # factor_2, then factor_3.
    dummy_data = pd.DataFrame({
        "date": dates,
        **{f"factor_{k}": np.random.normal(loc=0, scale=1, size=100) for k in (1, 2, 3)},
    })

    # Target depends positively on factor_1, negatively on factor_2, plus
    # unit-variance noise drawn after the factors; factor_3 plays no role.
    target = (
        dummy_data["factor_1"].mul(0.5)
        .sub(dummy_data["factor_2"].mul(0.2))
        .add(np.random.normal(loc=0, scale=1, size=100))
    )

    expected_signs = dict(factor_1="positive", factor_2="negative", factor_3="positive")

    # Run the analysis using both models and return separate tables.
    results = perform_univariate_analysis(
        data=dummy_data,
        target=target,
        expected_signs=expected_signs,
        model_types=["ols", "glm"],
        separate_tables=True,
    )


OLS Analysis:
  Factor  OLS Coefficient  OLS Std Error  OLS t-Statistic  OLS p-Value  OLS R-squared  OLS Adjusted R-squared  OLS F-Statistic    OLS AIC    OLS BIC  Durbin-Watson  Correlation with Target  Correlation p-Value OLS Hypothesis Outcome OLS Correlation Test Outcome  Sign Correctness      Mean      Std       Min      Max  Skewness  Kurtosis  ADF Test Statistic  ADF p-Value  ADF Lags         ADF Test Outcome
factor_1         0.362936       0.100011         3.628940     0.000454       0.118461                0.109466        13.169207 265.520275 270.730615       1.982370                 0.344182             0.000454              Confirmed                    Confirmed              True -0.103847 0.908168 -2.619745 1.852278 -0.177948 -0.100977          -10.084426 1.165504e-17         0 H0 rejected (Stationary)
factor_2        -0.281285       0.097376        -2.888631     0.004763       0.078464                0.069061         8.344191 269.957526 275.167866       1.821212           

In [3]:
# Display the OLS table from the dict that the previous cell stored in `results`.
results['OLS']

Unnamed: 0,Factor,OLS Coefficient,OLS Std Error,OLS t-Statistic,OLS p-Value,OLS R-squared,OLS Adjusted R-squared,OLS F-Statistic,OLS AIC,OLS BIC,Durbin-Watson,Correlation with Target,Correlation p-Value,OLS Hypothesis Outcome,OLS Correlation Test Outcome,Sign Correctness,Mean,Std,Min,Max,Skewness,Kurtosis,ADF Test Statistic,ADF p-Value,ADF Lags,ADF Test Outcome
0,factor_1,0.362936,0.100011,3.62894,0.000454,0.118461,0.109466,13.169207,265.520275,270.730615,1.98237,0.344182,0.000454,Confirmed,Confirmed,True,-0.103847,0.908168,-2.619745,1.852278,-0.177948,-0.100977,-10.084426,1.165504e-17,0,H0 rejected (Stationary)
1,factor_2,-0.281285,0.097376,-2.888631,0.004763,0.078464,0.069061,8.344191,269.957526,275.167866,1.821212,-0.280114,0.004763,Confirmed,Confirmed,True,0.022305,0.953669,-1.918771,2.720169,0.386984,0.030979,-10.875459,1.335231e-19,0,H0 rejected (Stationary)
2,factor_3,0.086154,0.088793,0.970283,0.334294,0.009515,-0.000592,0.941449,277.172792,282.383133,1.969253,0.097546,0.334294,Failed,Failed,True,0.064896,1.084283,-3.241267,3.852731,0.177762,1.125855,-11.100477,3.8766689999999997e-20,0,H0 rejected (Stationary)


In [4]:
# Display the GLM table from the dict stored in `results`.
results['GLM']

Unnamed: 0,Factor,GLM Coefficient,GLM Std Error,GLM z-Value,GLM p-Value,GLM AIC,GLM BIC,GLM Pseudo R-squared,Correlation with Target,Correlation p-Value,GLM Hypothesis Outcome,GLM Correlation Test Outcome,Sign Correctness,Mean,Std,Min,Max,Skewness,Kurtosis,ADF Test Statistic,ADF p-Value,ADF Lags,ADF Test Outcome
0,factor_1,0.319441,0.227764,1.402513,0.160762,100.244595,105.454936,0.02059,0.344182,0.000454,Failed,Confirmed,True,-0.103847,0.908168,-2.619745,1.852278,-0.177948,-0.100977,-10.084426,1.165504e-17,0,H0 rejected (Stationary)
1,factor_2,-0.245925,0.214808,-1.144861,0.252267,100.93236,106.1427,0.013592,-0.280114,0.004763,Failed,Confirmed,True,0.022305,0.953669,-1.918771,2.720169,0.386984,0.030979,-10.875459,1.335231e-19,0,H0 rejected (Stationary)
2,factor_3,0.07462,0.186135,0.400894,0.688498,102.106721,107.317062,0.001641,0.097546,0.334294,Failed,Failed,True,0.064896,1.084283,-3.241267,3.852731,0.177762,1.125855,-11.100477,3.8766689999999997e-20,0,H0 rejected (Stationary)
