In [None]:
from pathlib import Path

import pandas as pd
import seaborn as sns

In [None]:
# Notebook configuration: which questionnaire snapshot to export and where
# the input table and the generated network files live.
snapshot = 3  # <- change this to obtain a particular snapshot
df_path = "data/whitby_03_07_Peer Interaction in SA_Jordan_22_Questionnaire 1_2_3_total data set.csv"

out_path = Path("networks")
out_path.mkdir(parents=True, exist_ok=True)

In [None]:
# Read only the header row to discover the column names. Letting pandas parse
# it (instead of slicing and splitting the raw first line by hand) means the
# names are exactly the ones the full read below will use, regardless of how
# the header is quoted or terminated.
_cols = list(pd.read_csv(df_path, nrows=0).columns)

# Keep every "net@..." column that is not a "choice" column as raw text.
# NOTE(review): presumably so the edge codes decoded further down keep their
# exact characters instead of being parsed as numbers - confirm.
dtype_mapper = {c: str for c in _cols if ("net@" in c and "choice" not in c)}

df = pd.read_csv(df_path, converters=dtype_mapper)
# keep only the rows that belong to the selected snapshot
df = df[df["group_questionnaire_order"] == snapshot]
df = df.set_index("metric_id")

df.head()

In [None]:
# Partition the columns into three disjoint groups:
#   * edge_ego_cols    - names containing both "net" and "choice"
#   * edge_course_cols - the remaining names containing "net"
#   * node_cols        - everything else
all_cols = set(df.columns)
net_cols = {col for col in all_cols if "net" in col}

edge_ego_cols = {col for col in net_cols if "choice" in col}
edge_course_cols = net_cols - edge_ego_cols
node_cols = all_cols - net_cols

# every column has to land in exactly one group
assert len(edge_ego_cols) + len(edge_course_cols) + len(node_cols) == len(df.columns)

In [None]:
# Split `df` into one frame per column group. The groups are sets, whose
# iteration order for strings can change between kernel sessions; selecting
# in `df.columns` order keeps the column order (and therefore the order of
# the exported edges and node attributes) identical from run to run.
edge_ego_df = df[[col for col in df.columns if col in edge_ego_cols]]
edge_course_df = df[[col for col in df.columns if col in edge_course_cols]]
node_df = df[[col for col in df.columns if col in node_cols]]

## Obtain weighted adjacency matrix for 'ego' network

In [None]:
# Build the ego edge list: one edge per (respondent, choice column) pair whose
# cell holds a usable target id. Pairs that cannot be converted are printed
# and skipped.
ego_edge_list = []

for node, row in edge_ego_df.iterrows():
    for choice_level, choice_node in row.items():
        try:
            weight = int(choice_level[4])  # e.g. net@2ndchoice -> 2
            target = int(choice_node)
        except (IndexError, TypeError, ValueError, OverflowError):
            # Only conversion problems (missing / non-numeric cell, unexpected
            # column name) are treated as "no edge"; anything else, including
            # KeyboardInterrupt, propagates instead of being silently swallowed.
            print(node, choice_node, choice_level)
            continue
        ego_edge_list.append({"source": node, "target": target, "weight": weight})

# Explicit columns keep the frame well-formed even when no edge was found.
ego_edge_df = pd.DataFrame(ego_edge_list, columns=["source", "target", "weight"])

In [None]:
# Turn the choice rank into a normalised edge weight (first choice -> 1.0).
# The ranks are taken from `ego_edge_list` rather than from the "weight"
# column this cell overwrites, so running the cell again gives the same
# result instead of inverting already-normalised weights a second time.
choice_rank = pd.Series([edge["weight"] for edge in ego_edge_list], index=ego_edge_df.index)
inverted_rank = choice_rank.max() - choice_rank + 1
ego_edge_df["weight"] = inverted_rank / inverted_rank.max()
ego_edge_df["weight"].unique()
# example with five choice levels:
# rank:      1, 2, 3, 4, 5
# inverted:  5, 4, 3, 2, 1
# weight:    1, 0.8, 0.6, 0.4, 0.2

In [None]:
# export the ego edge list for the selected snapshot
ego_edge_df.to_csv(out_path / f"{snapshot}_ego_edges.csv")

## Obtain weighted edge list for course network

In [None]:
def decode_edge(code: str) -> dict:
    """Decode a three-digit course-network edge code.

    The first digit selects the direction ("1" -> "out", "2" -> "in",
    "3" -> "mutual"), the second digit is the intensity and the third digit
    is the language usage. Intensity and language usage are returned as raw
    integers, i.e. not yet normalised.

    Args:
        code: the edge code as text.

    Returns:
        A dict with the keys "direction", "intensity" and "lang_usage", or an
        empty dict when `code` is not exactly three digits or its first digit
        is not a known direction. Rejected codes are printed for inspection.
    """
    directions = {"1": "out", "2": "in", "3": "mutual"}

    # `isdecimal` accepts exactly the characters `int()` can convert one by
    # one, so codes like "1_2" or "12 " (which `int(code)` would accept as a
    # whole) are rejected here instead of raising on the per-digit conversion.
    if len(code) != 3 or not code.isdecimal() or code[0] not in directions:
        print(code)
        return {}

    return {
        "direction": directions[code[0]],
        "intensity": int(code[1]),  # not yet normalised!
        "lang_usage": int(code[2]),  # not yet normalised!
    }

In [None]:
# Build the course edge list: one edge per (respondent, alter column) pair
# whose cell decodes to a valid edge.
course_edge_list = []

# Take the wide table straight from `df`. This cell rebinds `edge_course_df`
# to the long edge list at the end, so reading `edge_course_df` here would
# make a second run of the cell iterate over the wrong table.
course_wide_df = df[[col for col in df.columns if col in edge_course_cols]]

for source_node, row in course_wide_df.iterrows():
    for target_node, code in row.items():
        decoded_edge = decode_edge(str(code))
        if not decoded_edge:
            continue
        # the target id is whatever follows the first four characters of the
        # column name (assumes names of the form "net@<id>" - TODO confirm)
        course_edge_list.append({"source": source_node, "target": int(target_node[4:]), **decoded_edge})

# Explicit columns keep the frame well-formed even when no edge was decoded.
edge_course_df = pd.DataFrame(
    course_edge_list,
    columns=["source", "target", "direction", "intensity", "lang_usage"],
)

In [None]:
# debugging - sanity check whether the decoded edges are in the assumed ranges
sns.histplot(edge_course_df.loc[:, "intensity"])  # should be [0, 5]
sns.mpl.pyplot.show()  # flush the first figure so the second one gets its own axes
sns.histplot(edge_course_df.loc[:, "lang_usage"])  # should be [0, 9]


In [None]:
# normalise intensity and lang_usage by clipping them to their assumed ranges;
# `Series.clip` does this in one vectorised call and leaves missing values
# missing instead of mapping them onto the upper bound
edge_course_df["intensity"] = edge_course_df["intensity"].clip(lower=0, upper=5)
sns.histplot(edge_course_df["intensity"])
sns.mpl.pyplot.show()

edge_course_df["lang_usage"] = edge_course_df["lang_usage"].clip(lower=0, upper=9)
sns.histplot(edge_course_df["lang_usage"])

In [None]:
# export the course edge list for the selected snapshot
edge_course_df.to_csv(out_path / f"{snapshot}_course_edges.csv")

## Obtain node list - doesn't work yet!

In [None]:
# remove columns that don't add any information, i.e. each node has the same value for these columns
garbage_attrs = sorted(col for col in node_df.columns if len(node_df[col].unique()) == 1)

# save the names of the removed columns, with their single value, for a further sanity check
ga_path = out_path / "trash"
ga_path.mkdir(exist_ok=True, parents=True)
# (a descriptive name instead of `_`, which IPython uses for the last cell output)
removed_values = {ga: str(node_df[ga].unique()[0]) for ga in garbage_attrs}
pd.DataFrame({snapshot: removed_values}).to_csv(ga_path / f"{snapshot}_removed_columns.csv")

print(f"removing {len(garbage_attrs)} columns")
# NOTE(review): `node_df` is rebound below, so running this cell a second time
# finds no constant columns and overwrites the CSV above with an empty table.
node_df = node_df.drop(garbage_attrs, axis=1)

In [None]:
# turn the index into a regular "node_id" column and export the node list
node_df = node_df.reset_index().rename(columns={"metric_id": "node_id"})
node_df.to_csv(out_path / f"{snapshot}_nodes.csv")

## Analyse if sets of removed columns are the same

Run this section only after all three snapshots have been processed.

In [None]:
# load every "removed columns" file; each frame is indexed by column name
trashed_cols_dfs = [pd.read_csv(csv_file, index_col=0) for csv_file in ga_path.glob("*.csv")]
trashed_cols_names = [set(frame.index) for frame in trashed_cols_dfs]

# columns that were removed in at least one file
all_trashed_cols = set().union(*trashed_cols_names)

# columns that are missing from the removed set of at least one file
diff_trashed_cols = {
    col
    for col in all_trashed_cols
    if not all(col in names for names in trashed_cols_names)
}
print(f"Columns: {diff_trashed_cols} are not removed from all snapshots")

In [None]:
# merge all dataframes and check values from removed columns
# (the transposed frames are stacked in a single concat instead of growing a
# frame inside a loop; an empty input still yields an empty frame)
merged_trashed_cols = (
    pd.concat([tcd.T for tcd in trashed_cols_dfs], ignore_index=True)
    if trashed_cols_dfs
    else pd.DataFrame()
)

# sanity check
assert len(merged_trashed_cols.columns) == len(all_trashed_cols)

In [None]:
# removed columns whose recorded value is not the same in every merged row
columns_dropped_unnecessarly = [
    col
    for col in merged_trashed_cols.columns
    if len(merged_trashed_cols[col].unique()) != 1
]

columns_dropped_unnecessarly