# Estimate Gaussian Graphical Models using the adaptive model-averaging procedure
For more details, see: https://skggm.github.io/skggm/tour

Imports

In [None]:
from inverse_covariance import (
    QuicGraphicalLasso,
    QuicGraphicalLassoCV,
    QuicGraphicalLassoEBIC,
    AdaptiveGraphicalLasso,
    ModelAverage,
)

import sys
import numpy as np
import tabulate
import time

from sklearn.model_selection import GridSearchCV
from sklearn.datasets import make_sparse_spd_matrix
from sklearn.covariance import GraphicalLassoCV, ledoit_wolf
import matplotlib.pyplot as plt
import os
import sys
import re
import glob
import ast
import os.path as op
import pickle
import scipy
import numpy as np
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
import scipy as sp

from collections import defaultdict

from copy import deepcopy
import copy

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA, FastICA
from sklearn.utils import resample
from sklearn.covariance import GraphicalLassoCV
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.linear_model import LinearRegression


import warnings
warnings.filterwarnings("ignore")

In [None]:
def adaptive_model_average_sklearn(
    X,
    method,
    penalization='random',
    n_trials=100,
    metric='log_likelihood',
    support_thresh=0.5,
    cv=10,
    lam=None,
    alphas=np.logspace(-5, 1, num=20),
    n_jobs=10,
):
    """Fit an AdaptiveGraphicalLasso whose first stage is a ModelAverage ensemble.

    When ``lam`` is not supplied, a scalar penalty is first selected with
    scikit-learn's ``GraphicalLassoCV`` over ``alphas`` and printed; that
    value is then handed to ``ModelAverage`` as ``lam``.

    NOTE:  Only method = 'binary' really makes sense in this case.

    Parameters
    ----------
    X : array-like
        Data passed unchanged to ``GraphicalLassoCV.fit`` (only when ``lam``
        is None) and to ``AdaptiveGraphicalLasso.fit``.
    method : str
        Forwarded to ``AdaptiveGraphicalLasso(method=...)``.
    penalization : str
        Forwarded to ``ModelAverage(penalization=...)``.
    n_trials : int
        Forwarded to ``ModelAverage(n_trials=...)``.
    metric : str
        Accepted for interface compatibility; not used by this function.
    support_thresh : float
        Forwarded to ``ModelAverage(support_thresh=...)``.
    cv : int
        ``cv`` argument of the ``GraphicalLassoCV`` search (used only when
        ``lam`` is None).
    lam : float or None
        Penalty given to ``ModelAverage``. ``None`` triggers the
        cross-validated search described above.
    alphas : array-like
        Candidate penalties for the ``GraphicalLassoCV`` search.
    n_jobs : int
        Forwarded to ``ModelAverage(n_jobs=...)``. Defaults to 10.

    Returns
    -------
    tuple
        ``(covariance_, precision_, lam_norm, estimator)`` where the first
        two come from ``model.estimator_``, ``lam_norm`` is
        ``np.linalg.norm(model.estimator_.lam_)`` and ``estimator`` is the
        ``ModelAverage`` instance that was passed to the adaptive model
        (``model.estimator``).
        NOTE(review): callers read ``support_`` / ``proportion_`` /
        ``estimators_`` from that instance, which presumes
        ``AdaptiveGraphicalLasso.fit`` fits it in place -- confirm against
        the skggm version in use.
    """
    # Without a user-supplied penalty, pick a scalar one by cross-validation
    # so the model average has a value to work with.
    if lam is None:
        cv_model = GraphicalLassoCV(alphas=alphas, cv=cv)
        cv_model.fit(X)
        lam = cv_model.alpha_
        print("   lam: {}".format(lam))

    model = AdaptiveGraphicalLasso(
        estimator=ModelAverage(
            n_trials=n_trials,
            penalization=penalization,
            lam=lam,
            n_jobs=n_jobs,
            support_thresh=support_thresh,
        ),
        method=method,
    )
    model.fit(X)

    # Summarise the second-stage penalty (model.estimator_.lam_) as one number.
    lam_norm_ = np.linalg.norm(model.estimator_.lam_)
    return model.estimator_.covariance_, model.estimator_.precision_, lam_norm_, model.estimator


def learn_graph_structure_adaptive_average_sklearn(
    df,
    n_trials=100,
    penalization='random',
    score_metric="log_likelihood",
    cv=10,
    lam=None,
    threshold=0.5,
    alphas=np.logspace(-5, 1, num=20),
):
    """Standardise ``df`` and estimate its graph with the adaptive model average.

    Each column of ``df`` is centred and scaled to unit standard deviation,
    the result is passed to ``adaptive_model_average_sklearn`` with
    ``method='binary'``, and the returned matrices are wrapped in DataFrames
    labelled by ``df.columns`` on both axes.

    Parameters
    ----------
    df : pandas.DataFrame
        Observations in rows, variables in columns. It is not modified: the
        standardisation works on a float copy.
    n_trials : int
        Forwarded as ``n_trials``.
    penalization : str
        Forwarded as ``penalization``.
    score_metric : str
        Forwarded as ``metric``.
    cv : int
        Forwarded as ``cv``.
    lam : float or None
        Forwarded as ``lam``.
    threshold : float
        Forwarded as ``support_thresh``.
    alphas : array-like
        Forwarded as ``alphas``.

    Returns
    -------
    tuple
        ``(covariance_matrix_df, precision_matrix_df, support_matrix_df,
        proportion_matrix_df, estimator)``. The support and proportion
        frames are built from ``estimator.support_`` and
        ``estimator.proportion_``; ``estimator`` is the fourth value
        returned by ``adaptive_model_average_sklearn``.
    """
    # standardize the time series: using correlations rather than covariance
    # former is more efficient for structure recovery
    # A float copy is taken because the in-place operations below would
    # otherwise write through to the caller's frame when to_numpy() returns a
    # view, and would fail outright on integer or read-only data.
    X = df.to_numpy(dtype=float, copy=True)
    X -= X.mean(axis=0)
    X /= X.std(axis=0)

    cov_adaptive, prec_adaptive, lam_adaptive, estimator = adaptive_model_average_sklearn(
        X,
        # Honour the caller's choice instead of always using 'random'.
        penalization=penalization,
        method='binary',
        n_trials=n_trials,
        metric=score_metric,
        support_thresh=threshold,
        cv=cv,
        lam=lam,
        alphas=alphas,
    )

    # Re-attach the variable names so rows and columns are self-describing.
    covariance_matrix_df = pd.DataFrame(cov_adaptive, columns=df.columns, index=df.columns)
    precision_matrix_df = pd.DataFrame(prec_adaptive, columns=df.columns, index=df.columns)
    support_matrix_df = pd.DataFrame(estimator.support_, columns=df.columns, index=df.columns)
    proportion_matrix_df = pd.DataFrame(estimator.proportion_, columns=df.columns, index=df.columns)

    print(f"Adaptive lam: {lam_adaptive}")

    return covariance_matrix_df, precision_matrix_df, support_matrix_df, proportion_matrix_df, estimator

## Load data

In [None]:
# Data split this run operates on: 'train' or 'test'.
dataset = 'train'
# True for every split other than 'train'.
test = dataset != 'train'

In [None]:
# Load the two pickled input tables used by the models below.
# NOTE(review): both paths hard-code "train" and do not depend on `dataset`
# -- confirm this is intended when running with dataset = 'test'.
ern_data_df = pd.read_pickle("data/models_pickles_new/ern_crn_models_train_id_clean.pkl")
ern_cov_fal_data_df = pd.read_pickle("data/models_pickles_new/ern_crn_cov_fal_models_train_id_clean.pkl")

# Position matters: index 0 is the ERN/CRN table, index 1 the table with covariates.
datasets = [ern_data_df, ern_cov_fal_data_df]

### Read consistency measures

In [None]:
# Read the four consistency tables for the selected split.
within_consistency_ern_df = pd.read_csv(f'data/consistency/{dataset}/ern_amplitude_within.csv')
internal_consistency_ern_df = pd.read_csv(f'data/consistency/{dataset}/ern_amplitude_consistency.csv')
within_consistency_crn_df = pd.read_csv(f'data/consistency/{dataset}/crn_amplitude_within.csv')
internal_consistency_crn_df = pd.read_csv(f'data/consistency/{dataset}/crn_amplitude_consistency.csv')

# For each component, join the "within" and "internal" tables on the shared
# 'Unnamed: 0' column (presumably the saved row index -- confirm), then keep
# only rows whose 'pipeline_x' value is 'Fz'. The '_x' suffix is what pandas
# gives the left table's copy of a column present in both inputs.
consistency_ern_df = (
    within_consistency_ern_df
    .merge(internal_consistency_ern_df, on='Unnamed: 0')
    .loc[lambda merged: merged['pipeline_x'] == 'Fz']
)

consistency_crn_df = (
    within_consistency_crn_df
    .merge(internal_consistency_crn_df, on='Unnamed: 0')
    .loc[lambda merged: merged['pipeline_x'] == 'Fz']
)

# One row per 'id', with ERN and CRN columns told apart by suffix.
consistency_df = consistency_ern_df.merge(consistency_crn_df, on='id', suffixes=('_ern', '_crn'))
consistency_df.head()

### Filter participants on consistency measures

In [None]:
# Switch for the consistency filter: when True, participants are filtered on
# the internal-variance columns below, and those helper columns (plus 'id')
# are dropped again before model fitting.
exclude_consistency = True

In [None]:
# Rows are kept only when both internal-variance values are strictly above this.
threshold_consistency = 0.6

if exclude_consistency:
    consistency_columns = ['internal_variance_ern', 'internal_variance_crn', 'id']
    for i, data_df in enumerate(datasets):
        # Attach the two consistency columns by participant id. merge() uses an
        # inner join by default, so ids absent from consistency_df are dropped.
        merged_df = data_df.merge(consistency_df[consistency_columns], on='id')
        keep_mask = (
            (merged_df['internal_variance_ern'] > threshold_consistency)
            & (merged_df['internal_variance_crn'] > threshold_consistency)
        )
        # Replace the list entry in place so later cells see the filtered table.
        datasets[i] = merged_df[keep_mask]

## Run Adaptive Model Average

Parameters

In [None]:
# Passed as `score_metric` to learn_graph_structure_adaptive_average_sklearn.
score_metric="log_likelihood"
# Number of trials requested from ModelAverage.
n_trials = 10000
# Candidate penalties for the GraphicalLassoCV search used when lam is None.
alphas = np.linspace(0.01, 0.1, 20)
# `cv` argument of that GraphicalLassoCV search.
cv = 3
# Passed on as ModelAverage's `support_thresh`; distinct from `threshold_consistency`.
threshold = 0.65

#### ERN and CRN model

Prepare datasets: remove the skewed Washing (`WASH`) and Neutralizing (`NEU`) scales

In [None]:
# This model uses only the first table in `datasets`.
no_cov_datasets = [datasets[0]]

# WASH and NEU are always removed. When the consistency filter ran, the id and
# the two helper columns it merged in are removed as well.
# NOTE(review): with exclude_consistency = False the 'id' column is NOT
# dropped here -- confirm it should really stay in the model input.
if exclude_consistency:
    columns_to_drop = ['WASH', 'NEU', 'id', 'internal_variance_ern', 'internal_variance_crn']
else:
    columns_to_drop = ['WASH', 'NEU']

datasets_no_skewed_scales = []
for model in no_cov_datasets:
    # drop() returns a new frame; the entries of `datasets` stay untouched.
    this_dataset = model.drop(columns=columns_to_drop)
    datasets_no_skewed_scales.append(this_dataset)

In [None]:
# One result list per output of the fitting helper, filled in dataset order.
covariance_matrixes, precision_matrixes = [], []
support_matrixes, proportion_matrixes = [], []
average_estimators = []

for model in datasets_no_skewed_scales:
    # lam=None lets the helper pick the penalty by cross-validation over `alphas`.
    fit_outputs = learn_graph_structure_adaptive_average_sklearn(
        model,
        n_trials=n_trials,
        penalization='random',
        score_metric=score_metric,
        cv=cv,
        lam=None,
        threshold=threshold,
        alphas=alphas,
    )
    (
        covariance_matrix_df,
        precision_matrix_df,
        support_matrix_df,
        proportion_matrix_df,
        estimator,
    ) = fit_outputs

    covariance_matrixes.append(covariance_matrix_df)
    precision_matrixes.append(precision_matrix_df)
    support_matrixes.append(support_matrix_df)
    proportion_matrixes.append(proportion_matrix_df)
    average_estimators.append(estimator)

In [None]:
# Show the precision matrix of the first fitted dataset as the cell output.
precision_matrixes[0]

Save the precision and covariance matrices

In [None]:
# Pickle every precision matrix. The file name records the position in the
# list and the run settings (n_trials, cv, threshold).
for index, precision_matrix in enumerate(precision_matrixes):
    precision_path = f"new_data/precision_matrixes/{dataset}/precision_matrix_{index}_ntrials_{n_trials}_sklearn_cv{cv}_th{str(threshold)}_without_skewd_ern_crn.pkl"
    precision_matrix.to_pickle(precision_path)

# Same naming scheme for the covariance matrices.
for index, covariance_matrix in enumerate(covariance_matrixes):
    covariance_path = f"new_data/covariance_matrixes/{dataset}/covariance_matrix_{index}_ntrials_{n_trials}_sklearn_cv{cv}_th{str(threshold)}_without_skewd_ern_crn.pkl"
    covariance_matrix.to_pickle(covariance_path)

Save the support and proportion matrices

In [None]:
# Support and proportion matrices share one output directory and are told
# apart by the file-name prefix.
for index, support_matrix in enumerate(support_matrixes):
    support_path = f"new_data/support_proportion/{dataset}/support_matrix_{index}_ntrials_{n_trials}_sklearn_cv{cv}_th{str(threshold)}_without_skewd_ern_crn.pkl"
    support_matrix.to_pickle(support_path)

for index, proportion_matrix in enumerate(proportion_matrixes):
    proportion_path = f"new_data/support_proportion/{dataset}/proportion_matrix_{index}_ntrials_{n_trials}_sklearn_cv{cv}_th{str(threshold)}_without_skewd_ern_crn.pkl"
    proportion_matrix.to_pickle(proportion_path)

Save the estimators from N trials

In [None]:
# Store the per-trial estimators of each model average as a one-column frame.
# NOTE(review): this file name uses index+1 while the matrix files above use
# index -- confirm the off-by-one naming is intended.
for index, average_estimator in enumerate(average_estimators):
    estimators_path = f"new_data/n_estimators/{dataset}/stability_selection_estimators_{index+1}_ntrials_{n_trials}_sklearn_cv{cv}_th{str(threshold)}_without_skewd_ern_crn.pkl"
    estimators_df = pd.DataFrame({"estimators": average_estimator.estimators_})
    estimators_df.to_pickle(estimators_path)

#### ERN with covariates and CRN with covariates

Prepare datasets: remove the skewed Washing (`WASH`) and Neutralizing (`NEU`) scales, and remove gender (`Sex`) from the analysis

In [None]:
# This model uses only the second table in `datasets` (the variable name is
# carried over from the previous model).
no_cov_datasets = [datasets[1]]

# WASH, NEU and Sex are always removed. When the consistency filter ran, the
# id and the two helper columns it merged in are removed as well.
# NOTE(review): with exclude_consistency = False the 'id' column is NOT
# dropped here -- confirm it should really stay in the model input.
if exclude_consistency:
    columns_to_drop = ['WASH', 'NEU', 'Sex', 'id', 'internal_variance_ern', 'internal_variance_crn']
else:
    columns_to_drop = ['WASH', 'NEU','Sex']

datasets_no_skewed_scales = []
for model in no_cov_datasets:
    # drop() returns a new frame; the entries of `datasets` stay untouched.
    this_dataset = model.drop(columns=columns_to_drop)
    datasets_no_skewed_scales.append(this_dataset)

In [None]:
# Start from empty result lists; this discards the results held from the
# previous model.
covariance_matrixes, precision_matrixes = [], []
support_matrixes, proportion_matrixes = [], []
average_estimators = []

for model in datasets_no_skewed_scales:
    # lam=None lets the helper pick the penalty by cross-validation over `alphas`.
    fit_outputs = learn_graph_structure_adaptive_average_sklearn(
        model,
        n_trials=n_trials,
        penalization='random',
        score_metric=score_metric,
        cv=cv,
        lam=None,
        threshold=threshold,
        alphas=alphas,
    )
    (
        covariance_matrix_df,
        precision_matrix_df,
        support_matrix_df,
        proportion_matrix_df,
        estimator,
    ) = fit_outputs

    covariance_matrixes.append(covariance_matrix_df)
    precision_matrixes.append(precision_matrix_df)
    support_matrixes.append(support_matrix_df)
    proportion_matrixes.append(proportion_matrix_df)
    average_estimators.append(estimator)

In [None]:
# Show the precision matrix of the first fitted dataset as the cell output.
precision_matrixes[0]

Save the precision and covariance matrices

In [None]:
# Pickle every precision matrix. The file name records the position in the
# list and the run settings (n_trials, cv, threshold).
# NOTE(review): the precision file name spells "skewd" while the covariance
# file name spells "skewed" -- left as is because other code may load these
# exact names; confirm before unifying.
for index, precision_matrix in enumerate(precision_matrixes):
    precision_path = f"new_data/precision_matrixes/{dataset}/precision_matrix_{index}_ntrials_{n_trials}_sklearn_cv{cv}_th{str(threshold)}_without_skewd_no_sex_cov_ern_crn.pkl"
    precision_matrix.to_pickle(precision_path)

for index, covariance_matrix in enumerate(covariance_matrixes):
    covariance_path = f"new_data/covariance_matrixes/{dataset}/covariance_matrix_{index}_ntrials_{n_trials}_sklearn_cv{cv}_th{str(threshold)}_without_skewed_no_sex_cov_ern_crn.pkl"
    covariance_matrix.to_pickle(covariance_path)

Save the support and proportion matrices

In [None]:
# Support and proportion matrices share one output directory and are told
# apart by the file-name prefix.
# NOTE(review): the support file name spells "skewd" while the proportion
# file name spells "skewed" -- left as is because other code may load these
# exact names; confirm before unifying.
for index, support_matrix in enumerate(support_matrixes):
    support_path = f"new_data/support_proportion/{dataset}/support_matrix_{index}_ntrials_{n_trials}_sklearn_cv{cv}_th{str(threshold)}_without_skewd_no_sex_cov_ern_crn.pkl"
    support_matrix.to_pickle(support_path)

for index, proportion_matrix in enumerate(proportion_matrixes):
    proportion_path = f"new_data/support_proportion/{dataset}/proportion_matrix_{index}_ntrials_{n_trials}_sklearn_cv{cv}_th{str(threshold)}_without_skewed_no_sex_cov_ern_crn.pkl"
    proportion_matrix.to_pickle(proportion_path)

Save the estimators from N trials

In [None]:
# Store the per-trial estimators of each model average as a one-column frame.
# NOTE(review): this file name uses index+1 while the matrix files above use
# index -- confirm the off-by-one naming is intended.
for index, average_estimator in enumerate(average_estimators):
    estimators_path = f"new_data/n_estimators/{dataset}/stability_selection_estimators_{index+1}_ntrials_{n_trials}_sklearn_cv{cv}_th{str(threshold)}_without_skewd_no_sex_cov_ern_crn.pkl"
    estimators_df = pd.DataFrame({"estimators": average_estimator.estimators_})
    estimators_df.to_pickle(estimators_path)