# Preprocessing pipeline for CSV data
The aim of this notebook is to parse a CSV file to produce a two-layer ("ego" and "course") network that
consists of three snapshots (1, 2, 3).

In [None]:
from pathlib import Path

import pandas as pd
import seaborn as sns

This is how we import the improved raw dataset after the rebuttal.

In [None]:
# Folder that receives every file produced by this notebook; created on demand.
out_path = Path("networks")
out_path.mkdir(parents=True, exist_ok=True)
df_path = "data/Amman_data+202+FLcumulativew-oAR.csv"

# Read only the header line to learn the column names. The slice drops the
# first and the last character of that line (presumably a leading BOM/quote
# and the trailing newline -- TODO confirm against the file).
with open(df_path) as header_file:
    _header = header_file.readline()
_cols = _header[1:-1].split(";")

# Columns whose name contains "net@" but not "choice" go through the `str`
# converter, so pandas hands their cells over as text instead of inferring a
# numeric dtype.
dtype_mapper = dict.fromkeys(
    (name for name in _cols if "net@" in name and "choice" not in name),
    str,
)

df = pd.read_csv(df_path, sep=";", converters=dtype_mapper)

# Keep only the rows that belong to questionnaire rounds 1, 2 or 3.
_valid_round = df["group_questionnaire_order"].isin([1, 2, 3])
df = df[_valid_round].set_index("metric_id")

df.head()

And this is how we used to import the original dataset for the first version of the manuscript.

In [None]:
# Output folder for the generated network files; created if it does not exist.
out_path = Path("networks")
out_path.mkdir(parents=True, exist_ok=True)

# From here on df_path refers to the dataset of the first manuscript version.
df_path = "data/whitby_03_07_Peer Interaction in SA_Jordan_22_Questionnaire 1_2_3_total data set.csv"

In [None]:
# Read only the header line to learn the column names. The slice drops the
# first character and the last two characters before splitting on '","'
# (presumably the opening quote and the closing quote + newline of a fully
# quoted header -- TODO confirm against the file).
with open(df_path) as header_file:
    _header = header_file.readline()
_cols = _header[1:-2].split('","')

# Columns whose name contains "net@" but not "choice" go through the `str`
# converter, so pandas hands their cells over as text.
dtype_mapper = dict.fromkeys(
    (name for name in _cols if "net@" in name and "choice" not in name),
    str,
)

df = pd.read_csv(df_path, converters=dtype_mapper)

# Keep only the rows that belong to questionnaire rounds 1, 2 or 3.
_valid_round = df["group_questionnaire_order"].isin([1, 2, 3])
df = df[_valid_round].set_index("metric_id")

df.head()

Following columns are the same

In [None]:
# Split the columns of `df` into three disjoint groups:
#   * ego-layer edges    -> name contains both "net" and "choice"
#   * course-layer edges -> name contains "net" but not "choice"
#   * node attributes    -> everything else
_all_cols = list(df.columns)
edge_ego_cols = {name for name in _all_cols if "net" in name and "choice" in name}
_net_cols = {name for name in _all_cols if "net" in name}
edge_course_cols = _net_cols - edge_ego_cols
node_cols = set(_all_cols) - edge_course_cols - edge_ego_cols

# Every column has to land in exactly one group.
assert sum(map(len, (edge_ego_cols, edge_course_cols, node_cols))) == len(df.columns)

In [None]:
# Slice `df` into one frame per column group. The edge frames keep
# "group_questionnaire_order" so they can be filtered by snapshot later.
#
# The column groups are sets of strings, whose iteration order can change from
# one interpreter run to the next (string hash randomisation). The edge lists
# built from these frames follow the column order, so the names are sorted to
# make the exported files reproducible across runs.
edge_ego_df = df[[*sorted(edge_ego_cols), "group_questionnaire_order"]]
edge_course_df = df[[*sorted(edge_course_cols), "group_questionnaire_order"]]
node_df = df[sorted(node_cols)]

## Obtain weighted edge list for 'ego' network

In [None]:
# Preview the first rows of the ego-layer columns (all snapshots together).
edge_ego_df.head()

In [None]:
ego_snapshot = 3  # <- change this to obtain particular snapshot!

# Rows of the chosen snapshot only; the snapshot column itself is not an edge
# column, so it is removed before the edges are extracted.
_in_ego_snapshot = edge_ego_df["group_questionnaire_order"] == ego_snapshot
_edge_ego_df = edge_ego_df[_in_ego_snapshot].drop(columns="group_questionnaire_order")

In [None]:
# One record per (respondent, choice column) pair that can be parsed.
ego_edge_list = []

for respondent, choices in _edge_ego_df.iterrows():
    for choice_col, chosen in choices.items():
        try:
            # The 5th character of the column name is the choice rank,
            # e.g. net@2ndchoice -> 2
            rank = int(choice_col[4])
            target = int(chosen)
        except Exception as e:
            # Cells that cannot be parsed (e.g. missing answers) are reported
            # and skipped instead of aborting the whole run.
            print(respondent, chosen, choice_col, e)
        else:
            ego_edge_list.append({"source": respondent, "target": target, "weight": rank})

ego_edge_df = pd.DataFrame(ego_edge_list)
ego_edge_df.head()

In [None]:
# Turn choice ranks into weights in (0, 1]: the rank is inverted so that the
# first choice is the strongest tie, then scaled by the largest rank.
#   rank:      1,   2,   3,   4,   5
#   inverted:  5,   4,   3,   2,   1
#   weight:    1, 0.8, 0.6, 0.4, 0.2
max_weight = ego_edge_df["weight"].max()
ego_edge_df["weight"] = (max_weight - ego_edge_df["weight"] + 1) / max_weight
ego_edge_df["weight"].unique()

In [None]:
# Write the ego edge list of the selected snapshot to
# networks/ego_edges/<snapshot>_ego_edges.csv.
_out_path = out_path / "ego_edges"
# Only `out_path` itself is created earlier in the notebook and `to_csv` does
# not create missing parent directories, so make sure the sub-folder exists.
_out_path.mkdir(parents=True, exist_ok=True)
ego_edge_df.to_csv(_out_path.joinpath(f"{ego_snapshot}_ego_edges.csv"))

## Obtain weighted edge list for course network

In [None]:
# First digit of a course-edge code -> direction of the reported tie.
_EDGE_DIRECTIONS = {"1": "out", "2": "in", "3": "mutual"}


def decode_edge(code: str) -> dict:
    """Decode a three-digit course-edge code into its components.

    The first digit selects the direction (1 -> "out", 2 -> "in",
    3 -> "mutual"), the second digit is the intensity and the third one the
    language usage. Intensity and language usage are returned as raw,
    not yet normalised integers.

    Args:
        code: Cell value of a course-edge column, already converted to ``str``.

    Returns:
        A dict with the keys ``"direction"``, ``"intensity"`` and
        ``"lang_usage"``, or an empty dict when ``code`` is not a valid code.
        Rejected codes are printed so they can be inspected.
    """
    # Every character has to be a decimal digit. Validating with int(code)
    # alone is too lenient: it accepts e.g. "1_2" or "12 ", which then blow up
    # when the individual characters are converted below.
    if len(code) != 3 or not code.isdecimal():
        print(code)
        return {}

    direction = _EDGE_DIRECTIONS.get(code[0])
    if direction is None:
        # Leading digit outside 1-3: unknown direction.
        print(code)
        return {}

    return {
        "direction": direction,
        "intensity": int(code[1]),  # not yet normalised!
        "lang_usage": int(code[2]),  # not yet normalised!
    }

In [None]:
course_snapshot = 3  # <- change this to obtain particular snapshot!

# Rows of the chosen snapshot only; the snapshot column itself is not an edge
# column, so it is removed before the edges are decoded.
_in_course_snapshot = edge_course_df["group_questionnaire_order"] == course_snapshot
_edge_course_df = edge_course_df[_in_course_snapshot].drop(columns="group_questionnaire_order")

In [None]:
# One record per cell that decodes into a valid edge.
course_edge_list = []

for source_node, answers in _edge_course_df.iterrows():
    for target_col, raw_code in answers.items():
        decoded_edge = decode_edge(str(raw_code))
        if decoded_edge:
            # The target id is whatever follows the first four characters of
            # the column name.
            course_edge_list.append(
                dict(source=source_node, target=int(target_col[4:]), **decoded_edge)
            )

course_edge_df = pd.DataFrame(course_edge_list)

In [None]:
# debugging - sanity check whether the decoded edges are in the assumed ranges
sns.histplot(course_edge_df["intensity"])  # should be [0, 5]
sns.mpl.pyplot.show()  # show the intensity histogram before the next one is drawn
sns.histplot(course_edge_df["lang_usage"])  # should be [0, 9]

In [None]:
# Clamp both attributes to their assumed ranges: intensity to [0, 5] and
# lang_usage to [0, 9]. Values outside a range are moved to the nearest bound.
course_edge_df["intensity"] = course_edge_df["intensity"].clip(lower=0, upper=5)
sns.histplot(course_edge_df["intensity"])
sns.mpl.pyplot.show()

course_edge_df["lang_usage"] = course_edge_df["lang_usage"].clip(lower=0, upper=9)
sns.histplot(course_edge_df["lang_usage"])

In [None]:
# Write the course edge list of the selected snapshot to
# networks/course_edges/<snapshot>_course_edges.csv.
_out_path = out_path / "course_edges"
# Only `out_path` itself is created earlier in the notebook and `to_csv` does
# not create missing parent directories, so make sure the sub-folder exists.
_out_path.mkdir(parents=True, exist_ok=True)
course_edge_df.to_csv(_out_path.joinpath(f"{course_snapshot}_course_edges.csv"))

## Obtain node list

Here we process all snapshots at the same time.

In [None]:
# List the distinct group_questionnaire_order values present in the node table.
node_df["group_questionnaire_order"].unique()

In [None]:
# Columns with a single distinct value across all rows carry no information
# about individual nodes, so they are dropped from the node table.
garbage_attrs = sorted(
    col for col in node_df.columns if len(node_df[col].unique()) == 1
)

# Keep a record of the removed columns together with their (only) value, for
# a later sanity check.
_removed_values = {ga: str(node_df[ga].unique()[0]) for ga in garbage_attrs}
pd.DataFrame({"unique_val": _removed_values}).to_csv(out_path / "removed_columns.csv")

print(f"removing {len(garbage_attrs)} columns")
node_df = node_df.drop(columns=garbage_attrs)

In [None]:
# Node attributes as recorded in snapshot 1.
_is_snapshot_1 = node_df["group_questionnaire_order"] == 1
node_df_1 = node_df[_is_snapshot_1]
node_df_1.head()

In [None]:
# Node attributes as recorded in snapshot 2.
_is_snapshot_2 = node_df["group_questionnaire_order"] == 2
node_df_2 = node_df[_is_snapshot_2]
node_df_2.head()

In [None]:
# Node attributes as recorded in snapshot 3.
_is_snapshot_3 = node_df["group_questionnaire_order"] == 3
node_df_3 = node_df[_is_snapshot_3]
node_df_3.head()

In [None]:
from typing import List


def obtain_epmty_columns(df: pd.DataFrame) -> List[str]:
    """Return the names of the columns of ``df`` that hold only missing values.

    A column qualifies when its values collapse to exactly one distinct entry
    and that entry is missing according to ``pd.isna``. A column without any
    rows has no distinct entry at all and is therefore not reported.

    Args:
        df: Frame whose columns are inspected.

    Returns:
        The qualifying column names, in the order they appear in ``df``.
    """

    def _only_missing(column_name) -> bool:
        distinct = df[column_name].unique()
        return len(distinct) == 1 and pd.isna(distinct[0])

    return [name for name in df.columns if _only_missing(name)]


def replace_nans(
    node_df_x: pd.DataFrame, node_df_reference: pd.DataFrame, idx_col: str
) -> pd.DataFrame:
    """Fill the entirely-missing columns of one frame from a reference frame.

    Columns of ``node_df_x`` that contain only missing values are removed and
    replaced by the columns of the same name taken from ``node_df_reference``.
    Rows are matched with a left join, so every row of ``node_df_x`` is kept.

    Args:
        node_df_x: Frame with the columns to be filled.
        node_df_reference: Frame that provides the replacement columns; the
            join matches against its index.
        idx_col: Column or index level of ``node_df_x`` used as the join key.

    Returns:
        A new frame; neither input is modified.
    """
    empty_cols = obtain_epmty_columns(node_df_x)
    kept = node_df_x.drop(columns=empty_cols)
    donor = node_df_reference[empty_cols]
    return kept.join(donor, on=idx_col, how="left")

In [None]:
# Columns that are entirely missing in snapshot 2 are taken over from
# snapshot 1; afterwards the columns are ordered alphabetically and the
# "metric_id" index becomes a regular "node_id" column.
node_df_2 = (
    replace_nans(node_df_2, node_df_1, "metric_id")
    .sort_index(axis=1)
    .reset_index()
    .rename(columns={"metric_id": "node_id"})
)
node_df_2.head()

In [None]:
# Columns that are entirely missing in snapshot 3 are taken over from
# snapshot 1; afterwards the columns are ordered alphabetically and the
# "metric_id" index becomes a regular "node_id" column.
node_df_3 = (
    replace_nans(node_df_3, node_df_1, "metric_id")
    .sort_index(axis=1)
    .reset_index()
    .rename(columns={"metric_id": "node_id"})
)
node_df_3.head()

In [None]:
# Snapshot 1 is reshaped last: it serves as the reference for the other
# snapshots and has to keep its "metric_id" index until they are processed.
node_df_1 = (
    node_df_1
    .sort_index(axis=1)
    .reset_index()
    .rename(columns={"metric_id": "node_id"})
)
node_df_1.head()

In [None]:
# One node list per snapshot, written directly into the output folder.
node_df_1.to_csv(out_path / "1_nodes.csv")
node_df_2.to_csv(out_path / "2_nodes.csv")
node_df_3.to_csv(out_path / "3_nodes.csv")

## Enhance node list by additional data (housing and their pre-exam results)

This is done for each snapshot separately (not obligatory for the raw data after the rebuttal).

In [None]:
# The score sheet is semicolon-separated and uses a decimal comma.
_scores_path = "data/2022 Issues Classes and Final Exam scores(80).csv"
final_scores = pd.read_csv(_scores_path, sep=";", decimal=",")

# Number of rows in the score sheet, printed as a quick check.
print(final_scores.shape[0])
final_scores.head()

In [None]:
# (output file name, node table) pairs, one per snapshot.
node_dfs = list(
    zip(
        ("1_nodes.csv", "2_nodes.csv", "3_nodes.csv"),
        (node_df_1, node_df_2, node_df_3),
    )
)
out_name, node_df_x = node_dfs[2]  # <- select this manually

In [None]:
# Attach the exam scores to the selected node table by matching the node's
# "metric_Name" against the score sheet's "student_name". The left join keeps
# every node, including those without a score entry.
merged_nodes = pd.merge(
    node_df_x,
    final_scores,
    how="left",
    left_on="metric_Name",
    right_on="student_name",
)
# The join key from the score sheet duplicates "metric_Name"; drop it.
merged_nodes = merged_nodes.drop(columns="student_name")

# Row count after the merge, printed as a quick check.
print(merged_nodes.shape[0])
merged_nodes.head()

# merged_nodes.to_csv(out_path.joinpath(out_name))