In [30]:
import pandas as pd

In [31]:
df_path = "data/whitby_01_00_byu_Jordan_23.csv"

# Read only the header (nrows=0) so the column names come from the same CSV
# parser that loads the data below. Slicing and splitting the raw first line
# by hand breaks when the header has no trailing newline, is not quoted
# everywhere, or contains escaped quotes.
_cols = list(pd.read_csv(df_path, nrows=0).columns)

# Columns whose name contains "net@" but not "choice" are passed through str(),
# so pandas does not infer a numeric dtype for them.
# NOTE(review): presumably this keeps codes such as "030" intact - confirm.
dtype_mapper = {c: str for c in _cols if ("net@" in c and "choice" not in c)}

df = pd.read_csv(df_path, converters=dtype_mapper)

# Keep only the rows whose group_questionnaire_order equals 1
# (rows with a missing value in that column are discarded as well).
df = df.loc[df["group_questionnaire_order"] == 1]
df = df.set_index("metric_id")

In [32]:
# Partition the column names of df into three disjoint groups:
#   * edge_ego_cols    - names containing both "net" and "choice"
#   * edge_course_cols - names containing "net" but not "choice"
#   * node_cols        - every remaining column
_net_cols = {name for name in df.columns if "net" in name}

edge_ego_cols = {name for name in _net_cols if "choice" in name}
edge_course_cols = _net_cols - edge_ego_cols
node_cols = set(df.columns) - _net_cols

In [33]:
# Split df into one frame per column group.
# Columns are picked by walking df.columns rather than calling list() on the
# sets: the iteration order of a set of strings can change from one interpreter
# run to the next (string hash randomisation), which would make the column
# order of these frames - and of everything exported from them - differ
# between runs. Walking df.columns keeps the original file order.
edge_ego_df = df[[col for col in df.columns if col in edge_ego_cols]]
edge_course_df = df[[col for col in df.columns if col in edge_course_cols]]
node_df = df[[col for col in df.columns if col in node_cols]]

## Obtain weighted edge list for ego network

In [34]:
# Build one {"source", "target", "weight"} record per usable cell of edge_ego_df.
# The row label is the source node, the cell value is the target node, and the
# weight is the digit found at position 4 of the column name.
ego_edge_list = []

for node, row in edge_ego_df.iterrows():
    for choice_level, choice_node in row.items():
        try:
            weight = int(choice_level[4])  # e.g. net@2ndchoice -> 2
            choice_node = int(choice_node)
        except (ValueError, TypeError, IndexError, OverflowError):
            # Only conversion failures are skipped (e.g. a NaN cell or a column
            # name without a digit at position 4). A bare `except:` would also
            # hide KeyboardInterrupt and genuine programming errors.
            print(node, choice_node, choice_level)
            continue
        ego_edge_list.append({"source": node, "target": choice_node, "weight": weight})

pd.DataFrame(ego_edge_list).to_csv("ego_edge_list.csv")  # TODO: RENAME!

3 nan net@3rdchoice
3 nan net@2ndchoice
3 nan net@5thchoice
3 nan net@1stchoice
3 nan net@4thchoice
8 nan net@5thchoice
10 nan net@5thchoice
16 nan net@5thchoice
16 nan net@4thchoice
20 nan net@5thchoice
20 nan net@4thchoice
21 nan net@5thchoice
21 nan net@4thchoice
23 nan net@5thchoice
25 nan net@5thchoice
26 nan net@5thchoice
30 nan net@5thchoice
30 nan net@4thchoice
31 nan net@5thchoice
36 nan net@5thchoice
38 nan net@5thchoice
38 nan net@4thchoice
41 nan net@5thchoice
41 nan net@4thchoice


## Obtain weighted edge list for course network

In [35]:
def decode_edge(code: str) -> dict:
    """Decode a three-digit edge code into its attributes.

    The code is read digit by digit:
      * digit 0 - direction: "1" -> "out", "2" -> "in", "3" -> "mutual"
      * digit 1 - intensity, divided by 5
      * digit 2 - language usage, divided by 9

    Parameters
    ----------
    code : str
        The raw cell value as text.

    Returns
    -------
    dict
        ``{"direction": str, "intensity": float, "lang_usage": float}`` for a
        valid code. For anything else (wrong length, non-digit characters,
        unknown direction digit) the code is printed and an empty dict is
        returned.
    """
    # Every character must be a decimal digit. Checking with int(code) is not
    # enough: int() also accepts strings such as "1_2" or "12 ", which would
    # then raise ValueError below when the individual characters are converted.
    if len(code) != 3 or not code.isdecimal():
        print(code)
        return {}

    directions = {"1": "out", "2": "in", "3": "mutual"}
    direction = directions.get(code[0])
    if direction is None:
        print(code)
        return {}

    intensity = int(code[1]) / 5  # normalised
    lang_usage = int(code[2]) / 9  # normalised

    return {"direction": direction, "intensity": intensity, "lang_usage": lang_usage}

In [36]:
# Collect one record per decodable cell of edge_course_df: the row label is the
# source node and the column name supplies the target node.
course_edge_list = []

for source_node, row in edge_course_df.iterrows():
    for target_node, code in row.items():
        decoded_edge = decode_edge(str(code))
        if decoded_edge:
            # The target id is the integer that follows the first four
            # characters of the column name.
            target_id = int(target_node[4:])
            course_edge_list.append(dict(source=source_node, target=target_id, **decoded_edge))

pd.DataFrame(course_edge_list).to_csv("course_edge_list.csv")  # TODO: RENAME!

x?
self
?
x2x
xxx
xxx
xxx
xxx
self
xxx
xxx
xxx
xxx
self
xxx
xxx
xxx
self
xxx
xxx
xxx
xxx
xxx
xxx
xxx
xxx
xxx
xxx
xxx
xxx
xxx
xxx
xxx
xxx
xxx
self
xxx
xxx
x20
xxx
xxx
xxx
self
xxx
xxx
xxx
23x
self
13x
xxx
x10
self
xxx
xxx
xxx
x10
.320
x10
x30
x10
x10
xxx
xxx
xxx
self
x52
self
xxx
xxx
xxx
xxx
xxx
xxx
xxx
xxx
xxx
xxx
030
xxx
xxx
xxx
xxx
self
xxx
xxx
xxx
self
xxx
xxx
xxx
xxx
xxx
xxx
self
xxx
xxx
xxx
xxx
xxx
xxx
xxx
xxx
xxx
x27
xxx
xxx
self
xxx
xxx
xxx
xxx
xxx
xxx
xxx
xxx
xxx
xxx
xxx
x23
xxx
xxx
xxx
self
xxx
xxx
xxx
self
xxx
self
xxx
xxx
xxx
xxx
xxx
xxx
xxx
xxx
xxx
self
xxx
xxx
self
xxx
x20
x10
xxx
xxx
self
x10
x10
x33
xxx
x10
x22
xxx
xxx
x10
xxx
xxx
x10
x10
x10
xxx
x10
xxx
xxx
x10
xxx
x22
x10
xxx
self
self
x20
030
x52
xxx
x10
x10
selt
xxx
x43
x20
x21
022
xxx
x10
xxx
self
xxx
xxx
xxx
xxx
x10
xxx
xxx
xxx
xxx
x30
self
xxx
xxx
xxx
xxx
x20
xxx
x10
xxx
x20
xxx
xxx
xxx
xxx
self
x52
xxx
xxx
xxx
xxx
xxx
x52
xxx
xxx
x52
x10
xxx
xxx
xxx
xxx
x37
xxx
x37
x37
xxx
xxx
xxx
xxx
x37
xxx
x10
xxx
x32
self
xxx

## Obtain node list

In [37]:
# Drop attributes that carry no information, i.e. columns in which every node
# has exactly the same value.
garbage_attrs = [
    name for name in node_df.columns
    if len(node_df[name].unique()) == 1
]

print(f"removing {len(garbage_attrs)} columns")
node_df = node_df.drop(columns=garbage_attrs)

removing 36 columns


In [38]:
# Move the "metric_id" index back into an ordinary column, expose it under the
# name "node_id", and write the node table to disk.
node_df = node_df.reset_index()
node_df = node_df.rename(columns={"metric_id": "node_id"})
node_df.to_csv("node_list.csv")