# L2: stochastic block model and community detection

Here we explore 3 particular topics:
- mixed-membership
- model selection
- adding node attributes

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import numpy as np
import pandas as pd
import networkx as nx
import time

In [None]:
import sys
sys.path.append('../../../src/')
import tools as tl
import plot as viz

In [None]:
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

colormap = plt.cm.tab20
colors = {i: colormap(i) for i in range(20)}

In [None]:
from probinet.input.loader import build_adjacency_from_file
from probinet.input.stats import print_graph_stats
from probinet.models.mtcov import MTCOV
# from probinet.visualization.plot import extract_bridge_properties

In [None]:
outdir_fig = '../figures/'
lecture_id = 2

In [None]:
seed = 10
prng = np.random.RandomState(seed)

# 1. Trade network
Let's consider the trade network of the previous lecture. 

## 1.1 Import data

In [None]:
indir = '../../../data/outdir/wto/'
filename = 'wto_aob.csv'
infile = f"{indir}{filename}"
df = pd.read_csv(infile)
df.head()

In [None]:
source = 'reporter_name'
target = 'partner_name'
weight = 'weight'

In [None]:
undirected = True
force_dense = True
binary = True
# Read the edge list from the CSV and build the network container.
# NOTE(review): `undirected`, `force_dense` and `binary` are interpreted by probinet's loader;
# presumably they symmetrise the edges, return a dense array and drop the weights — confirm in its docs.
data = build_adjacency_from_file(
    f"{indir}{filename}",
    ego=source,
    alter=target,
    sep=",",
    undirected=undirected,
    force_dense=force_dense,
    binary=binary,
    header=0,
)
# Print the field names of the namedtuple `data`
print(data._fields)

# Two-way lookup between a node label and its position in `data.nodes`
nodeLabel2Id = {k:i for i,k in enumerate(data.nodes)}
nodeId2Label = {i:k for i,k in enumerate(data.nodes)}

Y = data.adjacency_tensor

plt.figure(figsize=(2,2))

nmax = 500
# Sort nodes by decreasing row sum of the first layer and show at most `nmax` of them
node_order = np.argsort(-Y[0].sum(axis=1))
viz.plot_matrix(Y,node_order=node_order[:nmax],title=f"Y")

plt.tight_layout()

Set up variables for plotting

In [None]:
ms = 10
# node_size = [np.log(graph.degree[i]) * ms + 100 for i in data.nodes]
# position = nx.spring_layout(data.graph_list[0], iterations=100, seed = seed)

# Marker size grows with the log-degree; the list follows the order of `data.nodes`.
# NOTE(review): np.log(0) is -inf, so a degree-0 node would get an invalid size — confirm none exist.
node_size = [np.log(data.graph_list[0].degree[i]) * ms + 20 for i in data.nodes]
# Node coordinates come from a project helper (implementation not visible in this notebook)
position = tl.get_custom_node_positions(data.graph_list[0])

## 1.2 Run a mixed-membership model

We use [`MTCOV`](https://doi.org/10.1038/s41598-020-72626-y), contained in `probinet`. It can also take node attributes as input, but we ignore them for the moment.

   - Contisciani M., Power E.A. and De Bacco C. _Community detection with node attributes in multilayer networks_. Scientific reports, 10(1):15736, 2020.

In [None]:
u = {} # to store the results

### 1.2.1 Run another algorithm to use it as a bias

In [None]:
algo = 'louvain'

# Rebuild the first layer as a graph whose edge values are stored under the `weight` attribute
G = nx.from_numpy_array(data.adjacency_tensor[0],edge_attr=weight)
# A bare expression in the middle of a cell is never displayed, so print the sizes explicitly
print(f"nodes: {G.number_of_nodes()}, edges: {G.number_of_edges()}")

seed = 10
resolution = 1.2 # the higher, the more and smaller the communities
louvain = nx.community.louvain_communities(G, seed=seed,weight=weight,resolution=resolution)

# Convert the list of node sets into a membership matrix (project helper)
u[algo] = tl.from_louvain_to_u(louvain)
print(u[algo].shape)

### 1.2.2 Run MTCOV

In [None]:
config_dict = {
    "assortative": True,
    "end_file": "_mtcov",
    "out_folder": '../../../data/outdir/wto/',
    "out_inference": True,
    "undirected": True,
    "rseed": 10
}
num_realizations = 20
plot_loglik = False

In [None]:
gammas = [0.0,0.3,0.5,0.7,0.9, 0.99] # possible values for hyper-parameter `gamma`

In [None]:
model = MTCOV(num_realizations=num_realizations, plot_loglik=plot_loglik)

X = np.copy(u['louvain']) # we can choose what dummy covariate to give in input. Here we use the result of another algorithm, pick the one you like most
# X = np.zeros((len(data.nodes), 4)) # uncomment this if you want to give dummy data
# `data` is a namedtuple: _replace returns a new tuple, which is re-bound to the same name
data = data._replace(design_matrix=X)

K = 6
params = {}
# One fit per gamma, each started from the same seed so runs differ only in gamma
for gamma in gammas:
    params[gamma] = model.fit(data, K=K, gamma=gamma, rng=np.random.default_rng(config_dict["rseed"]), **config_dict)

    # The first element of the returned parameters is stored as the membership matrix
    algo = f'mtcov_{gamma}'
    u[algo] = params[gamma][0]

## 1.3 Analyze results

In [None]:
u.keys()

### 1.3.1 Communities

In [None]:
algo = 'mtcov_0.0'

plot_labels = False
filename0 = f'WTO_network_{algo}_{plot_labels}'

# Label the hubs (degree > 4) and every node with more than one non-zero membership
node_labels = {}
for n,d in list(data.graph_list[0].degree()):
    if d > 4 or np.count_nonzero(u[algo][nodeLabel2Id[n]]) > 1:
        node_labels[n] = n

plt.figure(figsize=(16,10))

nx.draw_networkx_labels(data.graph_list[0],position, font_size=8, alpha=0.8, labels=node_labels)
nx.draw_networkx_edges(data.graph_list[0],pos=position,width=0.1)
# plt.title(algo)
plt.axis('off')

if algo.startswith('mtcov'):
    ax = plt.gca()
    for n in data.graph_list[0].nodes():
        # `u[algo]` and `node_size` follow the order of `data.nodes`, so resolve the row
        # through the label lookup instead of relying on the graph's iteration order.
        j = nodeLabel2Id[n]
        wedge_sizes, wedge_colors = viz.extract_bridge_properties(j, colors, u[algo])
        if len(wedge_sizes) > 0:
            # One pie per node: wedges are the node's memberships, radius scales with node size
            _ = plt.pie(
                wedge_sizes,
                center=position[n],
                colors=wedge_colors,
                radius=(node_size[j]) * 0.001
            )
            ax.axis("equal")

plt.tight_layout()


filename = tl.get_filename(filename0,lecture_id=lecture_id)
outdir = "../figures/"
tl.savefig(plt,outfile = filename,outdir = outdir)

In [None]:
# Only label the hubs (degree > 4) to keep the panels readable
node_labels = {}
for n,d in list(data.graph_list[0].degree()):
    if d > 4: node_labels[n] = n
        
plt.figure(figsize=(16,10))
# One panel per stored result, laid out on a two-column grid
L = len(u.keys())
n_cols = 2
n_rows = int(np.ceil(L / n_cols))

for i, p in enumerate(u.keys()):
    plt.subplot(n_rows,n_cols,i+1)
    # NOTE(review): `node_size` and the colour list are passed positionally; this assumes the graph
    # iterates its nodes in the same order as `data.nodes` — confirm.
    nx.draw_networkx_nodes(data.graph_list[0],position, node_size=node_size, node_color=tl.get_node_colors(colors, u[p]))
    nx.draw_networkx_labels(data.graph_list[0],position, font_size=8, alpha=0.8, labels=node_labels)
    nx.draw_networkx_edges(data.graph_list[0],pos=position,width=0.1)
    plt.title(p)
    plt.axis('off')

plt.show()

### 1.3.2 Adjacency matrices

In [None]:
# One panel per stored result.
# NOTE(review): with a single entry in `u`, plt.subplots returns a bare Axes and `axarr[i]` would fail.
f, axarr = plt.subplots(1, len(u.keys()),figsize=(18,6))

for i,algo in enumerate(u.keys()):
    # Reorder rows/columns according to the partition (project helper)
    node_order = tl.extract_node_order(u[algo])
    viz.plot_matrix(Y,node_order=node_order,ax=axarr[i],title=f"{algo}",vmax = 1e-3,vmin=0)

plt.tight_layout()

### 1.3.3. Focus on a specific partition and zoom in

In [None]:
# Marker size per node label, used by the zoomed-in community plots below.
# NOTE(review): np.log(0) is -inf for a degree-0 node — confirm none exist.
nodeLabel2size = {i:np.log(data.graph_list[0].degree[i]) * ms +300 for i in list(data.graph_list[0].nodes())}

Play with the algorithm and check:
- What are the **mixed-membership** nodes in the various results?
- How do they change with the algorithm?

In [None]:
u.keys()

In [None]:
algo = 'mtcov_0.5'
# Hard assignment: the community with the largest membership for each node
communities = np.argmax(u[algo],axis=1)


plt.figure(figsize=(14,8))
K = u[algo].shape[-1]
n_cols = 3
n_rows = int(np.ceil(K / n_cols))
for i, k in enumerate(np.arange(u[algo].shape[-1])):
    community = np.where(communities==k)[0]
    H = data.graph_list[0].subgraph([nodeId2Label[n] for n in community])
    c = colors[i]
    # p = nx.spring_layout(H, iterations=100,k=0.1)
    p = nx.circular_layout(H)
    ns = [nodeLabel2size[n] for n in H.nodes()]
    plt.subplot(n_rows,n_cols,i+1)
    nx.draw_networkx_edges(H,pos=p, width=0.1)
    nx.draw_networkx_labels(H,pos=p, font_size=8, alpha=0.8)
    if algo.startswith('mtcov'):
        ax = plt.gca()
        for j, n in enumerate(H.nodes()):
            # A subgraph need not iterate its nodes in ascending node-id order, so the
            # enumerate position cannot index the filtered membership rows. Resolve the
            # row through the label lookup on the full membership matrix instead.
            wedge_sizes, wedge_colors = viz.extract_bridge_properties(nodeLabel2Id[n], colors, u[algo])
            if len(wedge_sizes) > 0:
                # `ns` was built from the same H.nodes() iteration, so position j is valid here
                _ = plt.pie(
                    wedge_sizes,
                    center=p[n],
                    colors=wedge_colors,
                    radius=(ns[j]) * 0.0003
                )
                ax.axis("equal")
    else:
        nx.draw_networkx_nodes(H,pos=p, node_size=ns, node_color=c)
        
    
    plt.title(k)
    plt.axis('off')
plt.tight_layout()

## 2. Model selection

What is the best among these results?  
To find out, we need to apply a **model selection** criterion.  
Here we focus on **cross-validation** (CV). For this, we need to:
1. **hide** part of the dataset, splitting into train and test sets.
2. **learn** model parameters by fitting on the **training** set
3. **measure performance** metric on the **test** set


### 2.1 Hide part of the data

In [None]:
def shuffle_indices_symmetric(shape, seed: int = 10):
    '''
    Draw, for every layer, a random permutation of the node-pair indices.

    Pairs are the entries of the strict upper triangle of an N x N matrix,
    so each index stands for both A_ij and A_ji.

    Parameters
    ----------
    shape : tuple
        Shape of the adjacency tensor; the first entry is the number of
        layers and the last one the number of nodes.
    seed : int
        Seed of the random number generator.

    Returns
    -------
    list of np.ndarray
        One shuffled index array per layer.
    '''
    n_layers, n_nodes = shape[0], shape[-1]
    n_pairs = int(n_nodes * (n_nodes - 1) * 0.5) # upper triangle excluding diagonal
    generator = np.random.RandomState(seed)
    shuffled = []
    for _ in range(n_layers):
        # Layers are shuffled one after the other from the same generator
        layer_indices = np.arange(n_pairs)
        generator.shuffle(layer_indices)
        shuffled.append(layer_indices)
    return shuffled
    
def extract_mask_symmetric_kfold(indices, N, NFold: int = 5):
    '''
    Build one symmetric boolean test mask per fold (for undirected networks).

    Each fold holds out a contiguous slice of the shuffled pair indices of
    every layer and marks both (i,j) and (j,i). Slices of different folds do
    not overlap. Each slice has len(indices[l]) // NFold pairs, so pairs left
    over by the integer division are not held out in any fold.

    Parameters
    ----------
    indices : list of np.ndarray
        Shuffled upper-triangle pair indices, one array per layer.
    N : int
        Number of nodes.
    NFold : int
        Number of folds.

    Returns
    -------
    dict
        Maps the fold number to a boolean array of shape (L, N, N).
    '''
    n_layers = len(indices)
    upper = np.triu_indices(N, k = 1)
    mask = {}
    for fold in range(NFold):
        fold_mask = np.zeros((n_layers,N,N),dtype=bool)
        for l, layer_indices in enumerate(indices):
            n_pairs = len(layer_indices)
            fold_size = n_pairs // NFold
            held_out = layer_indices[fold * fold_size:(fold + 1) * fold_size]
            # Flag the held-out pairs in upper-triangle order ...
            flat = np.zeros(n_pairs,dtype=bool)
            flat[held_out] = True
            fold_mask[l][upper] = flat
            # ... then mirror them below the diagonal
            fold_mask[l] = np.logical_or(fold_mask[l], fold_mask[l].T)
        mask[fold] = fold_mask
    return mask
    
def extract_mask(shape, out_mask = False, outfolder: str = '../../../data/output/tests/cv/', outfile = None,
                seed: int = 10, NFold: int = 5):
    '''
    Build the symmetric K-fold test masks for an adjacency tensor of the given shape.

    Parameters
    ----------
    shape : tuple
        Shape of the adjacency tensor; the last entry is the number of nodes.
    out_mask : bool
        If True, also save every fold mask to a compressed .npz file.
    outfolder : str
        Prefix of the output path; it is concatenated with `outfile` as-is.
    outfile : str or None
        Base name of the output files. Defaults to 'mask' when saving is requested.
    seed : int
        Seed used to shuffle the node pairs.
    NFold : int
        Number of folds.

    Returns
    -------
    dict
        Maps the fold number to a boolean mask of the test entries.
    '''
    import os

    indices = shuffle_indices_symmetric(shape, seed=seed)
    mask = extract_mask_symmetric_kfold(indices, shape[-1],NFold=NFold)

    if out_mask:  # output the masks into files
        if outfile is None:
            # The default used to reach the string concatenation below and raise a TypeError
            outfile = 'mask'
        target_dir = os.path.dirname(outfolder + outfile)
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)
        for fold in mask.keys():
            outmask = outfolder + outfile + '_' + str(fold)
            # Stored as the coordinates of the test entries rather than the dense mask
            np.savez_compressed(outmask + '.npz', mask = np.where(mask[fold] > 0))
            # To load: mask = np.load('mask_f0.npz'), e.g. print(np.array_equal(maskG, mask['maskG']))
            print('Masks saved in:', outmask)

    return mask

def extract_train_test_sparse(Y, maskG, fold: int = 0):
    '''
    Split the non-zero entries of a sparse tensor into train and test parts.

    The original version returned two names that were never assigned, so every
    call raised a NameError. The split is now returned as plain
    (coordinates, values) pairs, which needs no sparse-tensor library.

    Parameters
    ----------
    Y : object
        Sparse tensor exposing `subs` (a triple of coordinate arrays) and
        `vals` (the matching values).
    maskG : dict
        Maps the fold number to the test selection over the non-zero entries,
        given either as integer positions or as a boolean array.
    fold : int
        Fold to extract.

    Returns
    -------
    tuple
        ((subs_train, vals_train), (subs_test, vals_test)), where each `subs`
        is a triple of coordinate arrays.

    Raises
    ------
    ValueError
        If the mask entries are neither integers nor booleans.
    '''
    test_sel = np.asarray(maskG[fold])
    number_nnz = Y.subs[0].shape[0]

    if test_sel.dtype == bool:
        mask_train = np.logical_not(test_sel)
    elif np.issubdtype(test_sel.dtype, np.integer):
        # Boolean complement of the test positions: linear time, and safe when nothing is left for training
        mask_train = np.ones(number_nnz, dtype=bool)
        mask_train[test_sel] = False
    else:
        raise ValueError(f"Type of mask entries should be int or bool. It is {test_sel.dtype}!")

    subs_test = (Y.subs[0][test_sel], Y.subs[1][test_sel], Y.subs[2][test_sel])
    vals_test = Y.vals[test_sel]
    subs_train = (Y.subs[0][mask_train], Y.subs[1][mask_train], Y.subs[2][mask_train])
    vals_train = Y.vals[mask_train]

    return (subs_train, vals_train), (subs_test, vals_test)



In [None]:
seed = 10
cv_mask = extract_mask(data.adjacency_tensor.shape, seed = seed, )
cv_mask.keys(), cv_mask[0].shape

Check that we are correctly splitting the folds

In [None]:
# Every fold holds out (n_pairs // n_folds) node pairs per layer, each marked as (i,j) and (j,i).
# Deriving the expectation from the masks themselves avoids hard-coding 5 folds and stays
# exact when the number of pairs is not a multiple of the number of folds.
n_folds = len(cv_mask)
n_pairs = Y.shape[1] * (Y.shape[1] - 1) // 2
expected_entries = Y.shape[0] * 2 * (n_pairs // n_folds)
for k,v in cv_mask.items():
    assert np.count_nonzero(v) == expected_entries, f"fold {k}: {np.count_nonzero(v)} masked entries, expected {expected_entries}"

Extract edges in the test set.  
First, let's fix a `fold`


In [None]:
fold = 0

In [None]:
nodeLabel2Id = {n:i for i,n in enumerate(data.nodes)}
nodeId2Label = {i:n for i,n in enumerate(data.nodes)}

Build test dataframe

In [None]:
# Coordinates of the held-out entries; the leading (layer) axis is dropped by the [1:] slice
subs = np.where(cv_mask[fold] > 0)[1:]
nodes1 = [nodeId2Label[i] for i in subs[0]]
nodes2 = [nodeId2Label[i] for i in subs[1]]
# Boolean indexing visits the entries in the same row-major order as np.where above
ws = data.adjacency_tensor[cv_mask[fold]].astype(int)

# Contains every held-out pair, including those with weight 0 (non-edges)
df_test = pd.DataFrame({source: nodes1, target: nodes2, weight: ws})

Check that it makes sense (symmetry)

In [None]:
df_test[df_test[weight]>0].head(n=10)    

Build training dataframe

In [None]:
# Anti-join: keep the rows of `df` that do not appear among the held-out edges.
# NOTE(review): no `on=` is given, so the merge matches on every shared column, `weight` included.
# `df_test` weights come from the loaded adjacency tensor cast to int, so a match presumably
# requires `df[weight]` to hold the same values — confirm against the CSV.
df_train = df.merge(df_test[df_test[weight]>0], how='left', indicator=True).query('_merge == "left_only"').drop('_merge', axis=1)
# Held-out non-edges are appended with weight 0
df_train = pd.concat([df_train, df_test[df_test[weight]==0]],axis=0)
# Every original edge must end up in exactly one of the two sets.
# NOTE(review): `df_test` lists both (i,j) and (j,i); this presumably relies on `df` doing the same — confirm.
assert len(df) == len(df_train[df_train[weight]>0]) + len(df_test[df_test[weight]>0])

Save into file

In [None]:
outdir = '../../../data/outdir/wto/cv/'
filename = f'wto_aob_fold{fold}_train.csv'
tl.save_df_to_file(df_train,filename =filename, outdir=outdir)
filename = f'wto_aob_fold{fold}_test.csv'
tl.save_df_to_file(df_test,filename =filename, outdir=outdir)

### 2.2 Run inference on training 
We repeat the same pipeline we had before with the full dataset

In [None]:
outdir = '../../../data/outdir/wto/cv/'
filename = f'wto_aob_fold{fold}_train.csv'

undirected = True
force_dense = True
binary = True
# Same loading call as for the full dataset, now pointed at the training split of this fold
data_cv = build_adjacency_from_file(
    f"{outdir}{filename}",
    ego=source,
    alter=target,
    sep=",",
    undirected=undirected,
    force_dense=force_dense,
    binary=binary,
    header=0,
)
# Print the field names of the namedtuple `data_cv`
print(data_cv._fields)

Y_cv = data_cv.adjacency_tensor
print(Y_cv.shape)
plt.figure(figsize=(2,2))

nmax = 500
# Sort nodes by decreasing row sum of the first layer and show at most `nmax` of them
node_order = np.argsort(-Y_cv[0].sum(axis=1))
viz.plot_matrix(Y_cv,node_order=node_order[:nmax],title=f"Y")

plt.tight_layout()

In [None]:
ms = 10

node_size_cv = [np.log(data_cv.graph_list[0].degree[i]) * ms + 20 for i in data_cv.nodes]
position_cv = tl.get_custom_node_positions(data_cv.graph_list[0])

#### 2.2.1 First, let's learn with a deterministic algorithm (to use it as covariate for MTCOV)

In [None]:
u_cv = {}

In [None]:
algo = 'louvain'

# Rebuild the first training layer as a graph whose edge values are stored under the `weight` attribute
G = nx.from_numpy_array(data_cv.adjacency_tensor[0],edge_attr=weight)
# A bare expression in the middle of a cell is never displayed, so print the sizes explicitly
print(f"nodes: {G.number_of_nodes()}, edges: {G.number_of_edges()}")

seed = 10
resolution = 1.2 # the higher, the more and smaller the communities
louvain = nx.community.louvain_communities(G, seed=seed,weight=weight,resolution=resolution)

# Convert the list of node sets into a membership matrix (project helper)
u_cv[algo] = tl.from_louvain_to_u(louvain)
print(u_cv[algo].shape)

#### 2.2.2 Run MTCOV 

In [None]:
gammas = [0.0,0.3,0.5,0.7,0.9,0.99]

In [None]:
model = MTCOV(num_realizations=num_realizations, plot_loglik=plot_loglik)

X = np.copy(u_cv['louvain']) # we can choose what dummy covariate to give in input. Here we use the result of another algorithm, pick the one you like most
# X = np.zeros((len(data_cv.nodes), 4)) # uncomment this if you want to give dummy data
data_cv = data_cv._replace(design_matrix=X)

# NOTE(review): K is 10 here but 6 in the full-data fit of section 1 — confirm this is intended.
K = 10
params_cv = {}
# One fit per gamma, each started from the same seed so runs differ only in gamma
for gamma in gammas:
    params_cv[gamma] = model.fit(data_cv, K=K, gamma=gamma, rng=np.random.default_rng(config_dict["rseed"]), **config_dict)

    # The first element of the returned parameters is stored as the membership matrix
    algo = f'mtcov_{gamma}'
    u_cv[algo] = params_cv[gamma][0]

#### 2.3 Analyze results

In [None]:

nodeLabel2Id_cv = {k:i for i,k in enumerate(data_cv.nodes)}
nodeId2Label_cv = {i:k for i,k in enumerate(data_cv.nodes)}

Check that the number of nodes is the same

In [None]:
assert len(data_cv.nodes)==len(data.nodes)

In [None]:
# Only label the hubs (degree > 4 in the training graph)
node_labels_cv = {}
for n,d in list(data_cv.graph_list[0].degree()):
    if d > 4: node_labels_cv[n] = n
        
plt.figure(figsize=(16,10))
# One panel per stored result, laid out on a two-column grid
L = len(u_cv.keys())
n_cols = 2
n_rows = int(np.ceil(L / n_cols))

for i, p in enumerate(u_cv.keys()):
    plt.subplot(n_rows,n_cols,i+1)
    # NOTE(review): the layout used is `position` (computed on the full graph), not the `position_cv`
    # built earlier for the training graph; presumably to keep the panels comparable with section 1 —
    # confirm, since it requires every training node to have an entry in `position`.
    nx.draw_networkx_nodes(data_cv.graph_list[0],position, node_size=node_size_cv, node_color=tl.get_node_colors(colors, u_cv[p]))
    nx.draw_networkx_labels(data_cv.graph_list[0],position, font_size=8, alpha=0.8, labels=node_labels_cv)
    nx.draw_networkx_edges(data_cv.graph_list[0],pos=position,width=0.1)
    plt.title(p)
    plt.axis('off')


plt.show()

#### What do you observe?

#### Adjacency matrix

In [None]:
# One panel per stored result: the training adjacency matrix reordered by that partition
f, axarr = plt.subplots(1, len(u_cv.keys()),figsize=(18,6))

for i,algo in enumerate(u_cv.keys()):
    node_order = tl.extract_node_order(u_cv[algo])
    viz.plot_matrix(Y_cv,node_order=node_order,ax=axarr[i],title=f"{algo}",vmax = 1e-3,vmin=0)

plt.tight_layout()

### 3. Measure performance metric on the test set
We use the AUC as the metric and compute it with `probinet`.


In [None]:

from probinet.evaluation.link_prediction import compute_link_prediction_AUC
from probinet.evaluation.expectation_computation import compute_mean_lambda0
from probinet.evaluation.likelihood import loglikelihood_network

#### 3.1 Compute the predicted adjacency tensor
Each model has its own way to compute the predicted `Y`.  
**Question**: what is the predicted Y of Louvain?

In [None]:
# Expected adjacency tensor for each fitted model, built from the first three returned parameters.
# The keys are the gamma values of `params_cv` (not the 'mtcov_<gamma>' strings used in `u_cv`).
Y_pred = {algo: compute_mean_lambda0(v[0],v[1],v[2]) for algo, v in params_cv.items()}

In [None]:
# Top row: training data; bottom row: model prediction; one column per gamma
f, axarr = plt.subplots(2, len(params_cv.keys()),figsize=(18,6))

for i,algo in enumerate(params_cv.keys()):
    # `algo` is a gamma value here, hence the explicit 'mtcov_' prefix to address `u_cv`
    node_order = tl.extract_node_order(u_cv[f"mtcov_{algo}"])
    viz.plot_matrix(Y_cv,node_order=node_order,ax=axarr[0,i],title=f"True: {algo}",vmax = 1e-3,vmin=0)
    viz.plot_matrix(Y_pred[algo],node_order=node_order,ax=axarr[1,i],title=f"Pred: {algo}",vmin=0)

    

plt.tight_layout()

In [None]:

# Score each model on the entries that were NOT held out in this fold (the training entries)
train_mask = np.logical_not(cv_mask[fold])
auc = [
    np.round(compute_link_prediction_AUC(data.adjacency_tensor, y_pred, mask=train_mask), 3)
    for y_pred in Y_pred.values()
]
df_auc = pd.DataFrame({'algo': list(Y_pred.keys()), 'auc': auc})
df_auc


#### What do you observe?

#### 3.2 Use the test set!

In [None]:
outdir = '../../../data/outdir/wto/cv/'
filename = f'wto_aob_fold{fold}_test.csv'

undirected = True
force_dense = True
binary = True
# Same loading call as before, now pointed at the held-out split of this fold
data_test = build_adjacency_from_file(
    f"{outdir}{filename}",
    ego=source,
    alter=target,
    sep=",",
    undirected=undirected,
    force_dense=force_dense,
    binary=binary,
    header=0,
)

Y_test = data_test.adjacency_tensor

plt.figure(figsize=(2,2))

nmax = 500
# Sort nodes by decreasing row sum of the first layer and show at most `nmax` of them
node_order = np.argsort(-Y_test[0].sum(axis=1))
viz.plot_matrix(Y_test,node_order=node_order[:nmax],title=f"Y test")

plt.tight_layout()

In [None]:
assert len(data_test.nodes) ==  len(data_cv.nodes)

In [None]:
assert data_test.nodes == data_cv.nodes

In [None]:

# Score each model on the held-out entries of this fold
test_mask = cv_mask[fold]
auc = [
    np.round(compute_link_prediction_AUC(data.adjacency_tensor, y_pred, mask=test_mask), 3)
    for y_pred in Y_pred.values()
]
df_auc_test = pd.DataFrame({'algo': list(Y_pred.keys()), 'auc_test': auc})
# Put train and test scores side by side; the earlier 'auc' column is the training score
df_auc = df_auc.merge(df_auc_test, on='algo').rename(columns={'auc': 'auc_train'})
df_auc

In [None]:
# Top row: held-out data; bottom row: model prediction; one column per gamma
f, axarr = plt.subplots(2, len(params_cv.keys()),figsize=(18,6))

for i,algo in enumerate(params_cv.keys()):
    # `algo` is a gamma value here, hence the explicit 'mtcov_' prefix to address `u_cv`
    node_order = tl.extract_node_order(u_cv[f"mtcov_{algo}"])
    viz.plot_matrix(Y_test,node_order=node_order,ax=axarr[0,i],title=f"True: {algo}",vmax = 1e-3,vmin=0)
    viz.plot_matrix(Y_pred[algo],node_order=node_order,ax=axarr[1,i],title=f"Pred: {algo}",vmin=0)

plt.tight_layout()

#### 3.3 Alternative prediction metrics

Besides AUC, we can use other metrics. For instance:
- heldout loglikelihood, the log-likelihood on the test set
- binary cross-entropy or [log-loss](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.log_loss.html)

In [None]:
from scipy.special import factorial
from sklearn.metrics import log_loss

In [None]:
def get_loglikelihood(
    B: np.ndarray,
    u: np.ndarray,
    v: np.ndarray,
    w: np.ndarray,
    mask: np.ndarray = None,
) -> float:
    """
    Compute the Poisson log-likelihood of the network structure.

    The value is sum(B * log M) - sum(M) - sum(log B!), where M is the expected
    adjacency tensor given by the model parameters. Entries with M == 0
    contribute 0 to the first term.

    Parameters
    ----------
    B : np.ndarray
        Graph adjacency tensor.
    u : np.ndarray
        Membership matrix (out-degree).
    v : np.ndarray
        Membership matrix (in-degree).
    w : np.ndarray
        Affinity tensor.
    mask : Optional[np.ndarray]
        Mask for selecting a subset in the adjacency tensor; entries with
        mask > 0 are kept. If None, all entries are used.

    Returns
    -------
    float
        Log-likelihood value for the network structure.
    """
    # Local import so the function does not depend on a notebook-level import cell
    from scipy.special import gammaln

    # Expected adjacency tensor under the fitted parameters
    M = compute_mean_lambda0(u,v,w)
    if mask is not None:
        # Restrict both the data and the expectation to the selected entries
        selected = mask > 0
        B = B[selected]
        M = M[selected]

    logM = np.zeros(M.shape)
    positive = M > 0
    logM[positive] = np.log(M[positive])
    # log(B!) == gammaln(B + 1); unlike log(factorial(B)) it does not overflow for large counts
    log_factorial = gammaln(B.astype(int) + 1)
    return (B * logM).sum() - M.sum() - log_factorial.sum()


In [None]:
# Log-likelihood restricted to the held-out entries (test) and to the remaining entries (train)
logL_test = [np.round(get_loglikelihood(data.adjacency_tensor,v[0],v[1],v[2],mask = cv_mask[fold]),3) for a,v in params_cv.items()]
logL_train = [np.round(get_loglikelihood(data.adjacency_tensor,v[0],v[1],v[2],mask = np.logical_not(cv_mask[fold])),3) for a,v in params_cv.items()]
# Binary cross-entropy between the observed entries and the predicted values.
# NOTE(review): `y_pred` is the expected adjacency from the model, which is not guaranteed to lie
# in [0, 1]; log_loss expects probabilities — confirm how the installed scikit-learn treats values above 1.
bce_test = [np.round(log_loss(data.adjacency_tensor[cv_mask[fold]],y_pred[cv_mask[fold]]),3) for a,y_pred in Y_pred.items()]
bce_train = [np.round(log_loss(data.adjacency_tensor[np.logical_not(cv_mask[fold])],y_pred[np.logical_not(cv_mask[fold])]),3) for a,y_pred in Y_pred.items()]

df_holl = pd.DataFrame({'algo':[a for a in params_cv.keys()],'logL_train':logL_train,'logL_test':logL_test,
                        'bce_train':bce_train,'bce_test':bce_test
                       })

# NOTE: `df_auc` is re-bound to the merged frame, so running this cell twice adds suffixed duplicate columns
df_auc = df_auc.merge(df_holl,on='algo')
# df_auc.rename(columns={'auc':'auc_train'},inplace=True)
df_auc

#### How can we make results better?

# 4. Adding node attributes
We can add extra information (if available) and see if this makes results better.  
For instance, we can use country-level information, such as which countries belong to a given agreement, e.g. the OECD.  
**Idea**: if this extra information is correlated with community structure, it can help the algorithm find a better partition.  
This should be particularly helpful in the presence of sparse data, where there is not much information to start with. 

Here we use the python library [`country_converter`](https://github.com/IndEcol/country_converter), which allows retrieving various country-level information easily. 

In [None]:
import country_converter as coconv

## 4.1 Attribute processing

First let's check some example grouping based on official agreements.

And extract the list of countries in a grouping

In [None]:
cc = coconv.CountryConverter()

In [None]:
cc.valid_class#[:20]

In [None]:
ref_group = 'OECD'
# ref_group = 'EU28'
assert ref_group in cc.valid_class, f"{ref_group} not found in {cc.valid_class}!"

# Look the classification up by attribute name; no need to evaluate a string as code
gt_groups = list(getattr(cc, ref_group).name_short.unique())
# The dataset also has a 'European Union' node, which is counted as part of the group
gt_groups.append('European Union')
gt_groups

We need to convert the names in the dataframe to the same naming as in this extra information, using `name_short`

In [None]:
# Standardised short names, in the same order as `data.nodes`
names_short = coconv.convert(names=data.nodes, to='name_short',not_found=None)
# Raw label used in the dataset -> short name used by country_converter
nameRaw2Short = dict(zip(data.nodes, names_short))

We are ready to build a **node attribute** stating what countries are in the reference agreement 

In [None]:
C = 2 # if 2: binary
# One-hot attribute: column 0 = member of the reference group, column 1 = not a member
X = np.zeros((len(data.nodes),C)).astype(int)
for i,n in enumerate(data.nodes):
    column = 0 if nameRaw2Short[n] in gt_groups else 1
    X[i,column] = 1

assert np.all(np.sum(X,axis=1) == 1)

# Attach the attribute matrix to the training data
data_cv = data_cv._replace(design_matrix=X)

## 4.2 Run MTCOV with valid attribute

This time we do a cycle over all folds and then take the mean. We use the utils functions inside `cv_tools.py`

In [None]:
import cv_tools as cvtl

In [None]:
gammas = [0.0,0.3,0.5,0.7,0.9,0.99]

In [None]:
plot_loglik = False
num_realizations = 20
max_iter = 500
decision = 1
convergence_tol = 1e-3
data = data._replace(design_matrix=X)

model = MTCOV(num_realizations=num_realizations, plot_loglik=plot_loglik,max_iter=max_iter,decision=decision,convergence_tol=convergence_tol)

In [None]:
params_cv =  { f: {} for f in cv_mask.keys()}

In [None]:
X_meta = np.copy(X)
attrib_label = 'OECD'
# X_meta = np.copy(X_UN)
# attrib_label = 'UNRegion'

K = 6
for fold in cv_mask.keys():
    # The training split depends only on the fold, so build it once per fold
    # instead of rebuilding it for every gamma.
    data_cv = cvtl.get_df_train_test(df,data,cv_mask,fold=fold)
    data_cv = data_cv._replace(design_matrix=X_meta)
    for gamma in gammas:
        # Same seed for every fit, so runs differ only in the fold and in gamma
        params_cv[fold][gamma] = model.fit(data_cv, K=K, gamma=gamma, rng=np.random.default_rng(config_dict["rseed"]), **config_dict)


In [None]:
params_cv.keys()

## 4.3 Evaluate performance
We follow the same steps as before. Now we use functions inside `cv_tools.py` to extract these quickly

In [None]:
df_pred = pd.concat([cvtl.get_prediction_results(data, params_cv[fold], cv_mask,fold=fold) for fold in cv_mask.keys()])
df_pred.head(n=10)

In [None]:
# Mean and standard deviation of every metric across folds, one row per model
by_algo = df_pred.groupby(by='algo')
df_pred_mean = by_algo.mean().drop(columns=['fold']).reset_index()
df_pred_std = by_algo.std().drop(columns=['fold']).reset_index()

metrics = ['auc_test', 'logL_test', 'bce_test']
df_pred_mean.style.background_gradient(subset=metrics,cmap=plt.cm.RdYlGn)

In [None]:
df_pred_std

Colors are fine, but we can find a more intuitive visualization to highlight the best performing model

In [None]:
c = viz.default_colors[6]
L = len(metrics)

# One tick per gamma.
# NOTE(review): points are plotted by row position, so this assumes the rows of `df_pred_mean`
# appear in the same order as `gammas` — confirm.
xticks = np.arange(len(gammas))

# One panel per metric: mean across folds with the standard deviation as error bar
fig, axs = plt.subplots(1,L,figsize=(12,4),sharex=True)
for i in range(L):
    m = metrics[i]
    axs[i].scatter(xticks,df_pred_mean[m],s=200,color=c, edgecolor='black')
    axs[i].errorbar(xticks,df_pred_mean[m],yerr=df_pred_std[m], linewidth=1, capsize=4, capthick=1, color=c)
    axs[i].set_xlabel('Model')
    axs[i].set_ylabel(m)
    axs[i].set_xticks(xticks,gammas)
plt.tight_layout()


# filename = tl.get_filename(f'wto_cv_example', lecture_id=lecture_id)
# NOTE(review): `filename` is None here; what tl.savefig does with that is defined in the project helper
filename = None
tl.savefig(plt, outfile=filename, outdir=outdir_fig)


If error bars are too high, an alternative visualization is to compare on a fold-by-fold basis

In [None]:
df_pred.algo.unique()

In [None]:
c = viz.default_colors[6]

m = 'auc_test'
method1 = 0.7
# Sorted so the panels always come out in the same order; a bare set has no stable order
methods = sorted(set(df_pred.algo.unique()).difference({method1}))
L = len(methods)

xlim = (df_pred[m].min() * 0.9,df_pred[m].max() * 1.05)
mask1 = df_pred.algo == method1
# Rows are re-indexed 0..n-1 so that reference and comparison scores line up by position.
# This assumes both models list their folds in the same order.
y_ref = df_pred[mask1].reset_index()

# squeeze=False keeps `axs` indexable even when there is a single panel
fig, axs = plt.subplots(1,L,figsize=(15,3),sharex=True,squeeze=False)
axs = axs.ravel()
for i in range(L):
    mask2 = df_pred.algo == methods[i]
    y_comp = df_pred[mask2].reset_index()

    # Blue: the reference model is at least as good on that fold; red: the other model wins.
    # An empty selection simply draws nothing, so no special case is needed.
    mask_c = y_ref[m] >= y_comp[m]
    axs[i].scatter(y_ref[m][mask_c],y_comp[m][mask_c],s=100,c='b', edgecolor='black')
    axs[i].scatter(y_ref[m][~mask_c],y_comp[m][~mask_c],s=100,c='r', edgecolor='black')
    axs[i].set_xlabel(f"{m} {method1}")
    axs[i].set_ylabel(f"{m} {methods[i]}")

    axs[i].set_xlim(xlim)
    axs[i].set_ylim(xlim)

    # Diagonal: both models score the same
    xs = np.linspace(xlim[0],xlim[1])
    axs[i].plot(xs,xs,ls='--',alpha=0.8, color='darkgrey',lw=1)

plt.tight_layout()

# filename = tl.get_filename(f'wto_cv_example_fold_by_fold', lecture_id=lecture_id)
filename = None
tl.savefig(plt, outfile=filename, outdir=outdir_fig)

In [None]:
len(params_cv[0].keys()),params_cv[0].keys()

We can now train the model on the full data and visualize a particular partition.

We select the model that performs the best. 

In [None]:
u_cv.keys()

In [None]:
config_dict = {
    "assortative": True,
    "end_file": "_mtcov",
    "out_folder": '../../../data/outdir/wto/',
    "out_inference": True,
    "undirected": True,
    "rseed": 10
}

In [None]:
plot_loglik = True
num_realizations = 100
max_iter = 1000
decision = 2
convergence_tol = 1e-4
data = data._replace(design_matrix=X_meta)

# Fit on the full data without attributes (gamma = 0.0) and with the selected gamma (0.7).
# Note: this overwrites the entries for these two gammas stored in `params` in section 1.
for gamma in [0.0,0.7]:
    model = MTCOV(num_realizations=num_realizations, plot_loglik=plot_loglik,max_iter=max_iter,decision=decision,convergence_tol=convergence_tol)
    params[gamma] = model.fit(data, K=K, gamma=gamma, rng=np.random.default_rng(config_dict["rseed"]), **config_dict)
    
    # The first element of the returned parameters is stored as the membership matrix
    algo = f'mtcov_{gamma}_{ref_group}'
    u[algo] = params[gamma][0]

In [None]:
u[algo].shape, u.keys()

In [None]:
# filename0 = f'WTO_network_{algo}_{plot_labels}'

In [None]:
# NOTE: `figsize` is not used below; the figure size is set directly in plt.subplots
figsize= (16,10)

fig, axs = plt.subplots(1,3, figsize=(16,6))

# Left: the node attribute; centre: partition inferred with attributes; right: without attributes.
# The keys are hard-coded, so they must match the `ref_group` used when the results were stored.
viz.plot_network(data,X_meta,ax=axs[0], title=f'Attribute {attrib_label}')
viz.plot_network(data,u['mtcov_0.7_OECD'],ax=axs[1], title = r'$\gamma=0.7$')
viz.plot_network(data,u['mtcov_0.0_OECD'],ax=axs[2], title = 'No attributes')


# The computed file name is immediately replaced by None on the next line
filename = tl.get_filename(f'wto_attribute_{attrib_label}', lecture_id=lecture_id)
filename = None
tl.savefig(plt, outfile=filename, outdir=outdir_fig)

## 4.4 Other attributes

Try with some other attribute!  

For instance, `cc.UNregion` assigns one geographic macro area to each country.

In [None]:
cc.UNregion.head()

In [None]:
macro_area = cc.UNregion['UNregion'].unique()
macro_area

In [None]:
nameShort2UNregion = dict(zip(cc.UNregion['name_short'],cc.UNregion['UNregion']))
nameShort2UNregion['European Union'] = 'Western Europe'

In [None]:

# One column per macro area plus a last column for countries without a known region
C = len(macro_area) + 1 # if 2: binary
X_UN = np.zeros((len(data.nodes),C)).astype(int)

for i,n in enumerate(data.nodes):
    if nameRaw2Short[n] in nameShort2UNregion:
        r = nameShort2UNregion[nameRaw2Short[n]]
        # Column of the region; if `r` is not among `macro_area` nothing is set
        # and the assertion below fails for that row
        idx = np.where(macro_area ==r)[0]
        X_UN[i,idx] = 1
    else:
        # Unmatched names are printed and assigned to the last column
        print(n)
        X_UN[i,-1] = 1
        
# Every node must belong to exactly one category
assert np.all(np.sum(X_UN,axis=1) == 1)

Now go back in the previous cells and use `X_UN` as node attribute. What do you observe?

# Appendix

Build other attributes from this dataset

In [None]:
macro_area = cc.continent['continent'].unique()
macro_area

In [None]:
nameShort2region = dict(zip(cc.continent['name_short'],cc.continent['continent']))
nameShort2region['European Union'] = 'Europe'

In [None]:

# One column per continent plus a last column for countries without a known continent
C = len(macro_area) + 1 # if 2: binary
X_reg = np.zeros((len(data.nodes),C)).astype(int)

for i,n in enumerate(data.nodes):
    if nameRaw2Short[n] in nameShort2region:
        r = nameShort2region[nameRaw2Short[n]]
        # Column of the continent; if `r` is not among `macro_area` nothing is set
        # and the assertion below fails for that row
        idx = np.where(macro_area ==r)[0]
        X_reg[i,idx] = 1
    else:
        # Unmatched names are printed and assigned to the last column
        print(n)
        X_reg[i,-1] = 1
        
# Every node must belong to exactly one category; on failure the offending rows are reported
assert np.all(np.sum(X_reg,axis=1) == 1), np.where(np.sum(X_reg,axis=1) != 1)
X_reg.shape

In [None]:
plot_labels = False
filename = f'wto_x_attributes_{plot_labels}'

In [None]:
# NOTE: `figsize` is not used below; the figure size is set directly in plt.subplots
figsize= (16,10)


fig, axs = plt.subplots(1,3, figsize=(12,4))

# One panel per attribute; only the last call receives the file-name arguments
# plot_network(data,u[algo],ax=axs[0])
viz.plot_network(data,X,ax=axs[0], title='OECD',plot_labels=plot_labels)
viz.plot_network(data,X_UN,ax=axs[1],title='UNregion',plot_labels=plot_labels)
viz.plot_network(data,X_reg,ax=axs[2],title='Continent', filename=filename, lecture_id=lecture_id,outdir=outdir_fig,plot_labels=plot_labels)