In [None]:
import copy
from pathlib import Path

import networkx as nx
import pandas as pd
import ydata_profiling

## Select paths and load data

In [None]:
# Which edge list to analyse, and which snapshot of it.
network_type, snapshot = "tl123_edges", 1

# Input CSVs: the node attribute table of the snapshot and the edge list of
# the selected network type for the same snapshot.
nodes_path = Path(f"networks/{snapshot}_nodes.csv")
net_path = Path(f"networks/{network_type}/{snapshot}_{network_type}.csv")

In [None]:
# Output directory for this network type; created (with any missing parents)
# on first run and left untouched if it already exists.
reports_path = Path(f"analysis/{network_type}")
reports_path.mkdir(parents=True, exist_ok=True)

In [None]:
# Node attribute table; the first CSV column becomes the DataFrame index.
nodes_df = pd.read_csv(nodes_path, sep=",", index_col=0)
nodes_df.head()

In [None]:
# Edge list of the selected network; the first CSV column becomes the index.
_edges = pd.read_csv(net_path, index_col=0)
# For coursefiltered_edges, uncomment the rename below so the edge weight
# column is called "weight":
# _edges = _edges.rename(columns={"lang_usage": "weight"})
_edges.head()

In [None]:
# Directed graph built from the edge list; only the "weight" column is carried
# over as an edge attribute. "source"/"target" are the networkx default column
# names, spelled out here for readability.
net = nx.from_pandas_edgelist(
    _edges,
    source="source",
    target="target",
    edge_attr="weight",
    create_using=nx.DiGraph,
)

In [None]:
# Quick sanity check of the graph size: (number of edges, number of nodes).
net.number_of_edges(), net.number_of_nodes()

In [None]:
# Spring layout starts from random positions; pin the seed so that the figure
# is identical on every Restart & Run All.
nx.draw_networkx(net, pos=nx.spring_layout(net, seed=42))

## Centrality metric computation

In [None]:
# Each entry is a one-item dict {column name: {node: value}}; the entries are
# flattened into a single table in a later cell.
lst_metrics = []

# Copy of the graph with every edge attribute removed, used wherever a metric
# must ignore the edge weights.
_unweighted_net = copy.deepcopy(net)
for _src, _dst, edge_data in _unweighted_net.edges(data=True):
    edge_data.clear()

# Degree-based metrics, each with and without edge weights.
lst_metrics.append({"in_degree_weighted": dict(net.in_degree(weight="weight"))})
lst_metrics.append({"in_degree": dict(net.in_degree())})

lst_metrics.append({"out_degree_weighted": dict(net.out_degree(weight="weight"))})
lst_metrics.append({"out_degree": dict(net.out_degree())})

lst_metrics.append({"degree_weighted": dict(net.degree(weight="weight"))})
lst_metrics.append({"degree": dict(net.degree())})

# NOTE(review): networkx treats `weight` as a distance when computing shortest
# paths for betweenness. If the edge weights encode interaction strength,
# confirm that using them directly is intended.
lst_metrics.append({"betweenness_weighted": nx.betweenness_centrality(net, weight="weight")})
lst_metrics.append({"betweenness": nx.betweenness_centrality(net)})

lst_metrics.append({"closeness": nx.closeness_centrality(net)})

# Column name fixed: it was previously misspelled "pagerank_weighhted".
lst_metrics.append({"pagerank_weighted": nx.pagerank(net, weight="weight")})

lst_metrics.append({"pagerank": nx.pagerank(_unweighted_net)})

# voterank returns an ordered list of nodes; the stored value is the node's
# position in that list (0 = first). Nodes absent from the list get no entry
# and therefore end up as NaN once the metrics are put into a DataFrame.
lst_metrics.append({"voterank": {n: idx for idx, n in enumerate(nx.voterank(net))}})

# nx.clustering already returns a {node: coefficient} mapping for the whole
# graph, so it is stored directly.
lst_metrics.append({"clustering_coefficient": nx.clustering(net)})

In [None]:
# Flatten the list of one-entry dicts into a single
# {metric name: {node: value}} mapping (later entries win on a name clash).
dict_metrics = {}
for metric_entry in lst_metrics:
    dict_metrics.update(metric_entry)

# Outer keys become columns, inner keys (nodes) become the index.
df_metrics = pd.DataFrame(dict_metrics)

df_metrics.head()

## Selection of node attributes taken into account

In [None]:
# Columns of nodes_df to keep, mapped to the constant each one is divided by
# (metric name -> standardisation factor).
# Every column whose name contains "improv" is divided by 5.
improv_cols = {col: 5 for col in nodes_df.columns if "improv" in col}
metric_cols = {
    "metric_postsojournOPI": 11,
    "metric_presojournOPI": 11,
    "metric_level gained": 11,
    "metric_FLcumulativecompetence": 1,
    "metric_general_cumulativecompet": 1,
    # presumably minutes per day (24 * 60 = 1440) -- TODO confirm
    "context_learningoutofclassminday": 1440,
    "psycho_motivationdegree": 5,
    "psycho_proficiencyingroup_BAL1": 3,
    "interact_classmates": 100,
    "interaction_groupintegration": 5,
    "202_final": 100,
    # dummy variables to check if features that should be constant through all snapshots are in fact constant
    # ("metric_general_cumulativecompet" belongs to this group as well; it is
    # already listed above, so the duplicate dict key was removed here)
    "living_sum_flatmates": 1,
    "metric_youngersiblings": 1,
}
analysed_cols = {**metric_cols, **improv_cols}

# New metric: share of the two summed "minday" columns that comes from
# psycho_TLuseoutofclassminday. Rows where both columns are 0 yield NaN.
investigated_node_attrs = pd.DataFrame(
    {
        "metric_TLuseoutofclass": nodes_df["psycho_TLuseoutofclassminday"]
        / (nodes_df["psycho_TLuseoutofclassminday"] + nodes_df["psycho_otherlgsminday"])
    }
)
investigated_node_attrs = investigated_node_attrs.join(nodes_df["metric_Gender"])

# Normalise every analysed column by its factor. The scaled columns are built
# in one go and joined once instead of growing the frame column by column.
normalised_attrs = pd.DataFrame(
    {col: nodes_df[col].div(norm_coef) for col, norm_coef in analysed_cols.items()}
)
investigated_node_attrs = investigated_node_attrs.join(normalised_attrs)

# Two more metrics: difference between each normalised OPI score and the
# normalised "202_final" column.
investigated_node_attrs["metric_presojournOPI_delta"] = (
    investigated_node_attrs["metric_presojournOPI"] - investigated_node_attrs["202_final"]
)
investigated_node_attrs["metric_postsojournOPI_delta"] = (
    investigated_node_attrs["metric_postsojournOPI"] - investigated_node_attrs["202_final"]
)

investigated_node_attrs.head()

In [None]:
# Show the dtype of every selected/derived column to spot anything that was
# not read as numeric.
investigated_node_attrs.dtypes

## Merge node attributes and centrality metrics

In [None]:
# Join node attributes and centrality metrics on their indexes. merge defaults
# to an inner join, so only labels present in both frames are kept.
merged_df = investigated_node_attrs.merge(df_metrics, left_index=True, right_index=True)

# Persist the merged table next to the other reports of this network type.
eda_csv_path = reports_path / f"{snapshot}_eda.csv"
merged_df.to_csv(eda_csv_path)
merged_df.head()

## Compute correlations

In [None]:
# Switch on each correlation measure that should appear in the report.
correlation_settings = {
    measure: {"calculate": True} for measure in ("auto", "pearson", "spearman")
}

# Build the profiling report for the merged table and write it as HTML into
# the reports directory.
report = ydata_profiling.ProfileReport(
    merged_df,
    title=f"EDA of snapshot {snapshot}",
    infer_dtypes=False,
    explorative=True,
    correlations=correlation_settings,
)
eda_html_path = reports_path / f"{snapshot}_eda.html"
report.to_file(eda_html_path)