# Wilcoxon test and raw stats extraction from tl123 data

In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd

## Load data

In [None]:
network_type = "tl123_edges"

# Per-snapshot EDA tables, keyed by the snapshot number as a string
# ("1", "2", "3").
edas = {}
for i in range(1, 4):
    eda_path = Path(f"analysis/{network_type}/{i}_eda.csv")
    edas[f"{i}"] = pd.read_csv(eda_path, index_col=0)

# Build the pooled table with a single concat. Growing a frame inside the loop
# re-copies the accumulated rows on every iteration and starts from an empty
# DataFrame, which pandas warns about when concatenating.
df_merged = pd.concat(list(edas.values()), ignore_index=True)

# The pooled table is analysed alongside the individual snapshots.
edas["concatenated"] = df_merged


## Select columns to be used in the test

In [None]:
# Columns kept for the test; every table in `edas` is reduced to exactly these,
# in this order.
investigated_cols = [
    "metric_level gained",
    "interact_classmates",
    "metric_Gender",
    "interaction_groupintegration",
    "psycho_motivationdegree",
    "psycho_proficiencyingroup_BAL1",
    "betweenness_weighted",
    "pagerank_weighhted",
    "voterank",
]

# Rebind `edas` to a new dict holding only the investigated columns per table.
edas = dict(
    (snapshot_name, frame[investigated_cols])
    for snapshot_name, frame in edas.items()
)

## Clean & normalise data

In [None]:
# For every table: report the columns that contain missing values (together
# with their distinct values) and replace those missing values with 0.
for snapshot_name, frame in edas.items():
    print(f"{snapshot_name}\n\n\n")
    # Columns of this table holding at least one NaN, in column order.
    nan_columns = [
        column for column in frame.columns if frame[column].isnull().values.any()
    ]
    for column in nan_columns:
        print(column, frame[column].unique())
    # Only existing keys are reassigned, so updating while iterating is safe.
    edas[snapshot_name] = frame.fillna(dict.fromkeys(nan_columns, 0))

In [None]:
# Show the column dtypes of every table (snapshots and the pooled one).
for snapshot_name in edas:
    print(edas[snapshot_name].dtypes)

In [None]:
# Min-max scale every column except "metric_Gender" to [0, 1], separately for
# each table. The gender column is put back as the first column.
for n, e in edas.items():
    gender = e["metric_Gender"]
    features = e.loc[:, e.columns != "metric_Gender"]

    col_min = features.min()
    col_range = features.max() - col_min
    # A constant column has zero range; dividing by it would yield NaN (0/0)
    # and poison the later test and statistics. Dividing by 1 instead maps
    # such a column to all zeros.
    col_range = col_range.replace(0, 1)

    # Only existing keys are reassigned, so updating while iterating is safe.
    edas[n] = pd.concat([gender, (features - col_min) / col_range], axis=1)

In [None]:
# Draw one grid of per-column histograms for each table, preceded by its name.
for snapshot_name in edas:
    print(snapshot_name)
    edas[snapshot_name].hist()

As the histograms show, the features are clearly not normally distributed.

## Split by gender

In [None]:
def split_by_gender(df):
    """Split *df* into female and male rows, dropping the gender column.

    Args:
        df: DataFrame with a "metric_Gender" column whose values are
            "female" or "male".

    Returns:
        A dict ``{"female": ..., "male": ...}`` of DataFrames holding the
        matching rows of *df* without the "metric_Gender" column.

    Raises:
        ValueError: If any row has a gender value other than "female" or
            "male" (including missing values); such rows would otherwise be
            silently left out of both groups.
    """
    gender = df["metric_Gender"]
    is_female = gender == "female"
    is_male = gender == "male"

    # Explicit check instead of `assert`, which is stripped under `python -O`.
    unmatched = ~(is_female | is_male)
    if unmatched.any():
        unexpected = gender[unmatched].unique().tolist()
        raise ValueError(
            f"{int(unmatched.sum())} row(s) have an unexpected 'metric_Gender' "
            f"value: {unexpected}"
        )

    df_female = df.loc[is_female].drop("metric_Gender", axis=1)
    df_male = df.loc[is_male].drop("metric_Gender", axis=1)
    return {"female": df_female, "male": df_male}

# Per table name: the {"female": ..., "male": ...} split of that table.
edas_split = dict(zip(edas.keys(), map(split_by_gender, edas.values())))

## Perform Wilcoxon test

In [None]:
import numpy as np

from scipy.stats import ranksums

# Correct if the population S.D. is expected to be equal for the two groups.
def cohen_d(x, y):
    """Return Cohen's d for samples *x* and *y* using a pooled variance.

    The difference of the sample means (x minus y) is divided by the square
    root of the pooled variance, where each sample variance is computed with
    ``ddof=1`` and weighted by its size minus one.
    """
    size_x, size_y = len(x), len(y)
    mean_gap = np.mean(x) - np.mean(y)

    # Sum of squared deviations of each sample, expressed via its sample S.D.
    weighted_var_x = (size_x - 1) * np.std(x, ddof=1) ** 2
    weighted_var_y = (size_y - 1) * np.std(y, ddof=1) ** 2

    pooled_var = (weighted_var_x + weighted_var_y) / (size_x + size_y - 2)
    return mean_gap / np.sqrt(pooled_var)

def test_samples(x: np.ndarray, y: np.ndarray) -> dict:
    """Run a two-sided Wilcoxon rank-sum test on samples *x* and *y*.

    Returns:
        A dict with the test's ``"p_value"`` and ``"statistic"``, in that
        order. Cohen's d is not part of the result.
    """
    outcome = ranksums(x=x, y=y, alternative="two-sided")
    summary = {}
    summary["p_value"] = outcome.pvalue
    summary["statistic"] = outcome.statistic
    return summary

def compute_stats(x: np.ndarray) -> dict:
    """Summarise *x* by its mean, standard deviation and number of elements.

    The standard deviation is whatever ``x.std()`` returns with its default
    arguments (for a NumPy array that is the population S.D., ``ddof=0``).

    Returns:
        A dict with keys ``"mean"``, ``"std"`` and ``"size"``, in that order.
    """
    return dict(mean=x.mean(), std=x.std(), size=len(x))

In [None]:
# One row per (table, feature): the rank-sum test comparing the female values
# of that feature against the male values.
wilcoxon_rank_sum_test = [
    {
        "snapshot": snapshot,
        "feature": feature,
        **test_samples(
            groups["female"][feature].to_numpy(),
            groups["male"][feature].to_numpy(),
        ),
    }
    for snapshot, groups in edas_split.items()
    for feature in groups["female"].columns
]

# Persist the results next to the input tables.
wilcoxon_df = pd.DataFrame(wilcoxon_rank_sum_test)
wilcoxon_df.to_csv(f"analysis/{network_type}/wilcoxon_rank_sum_test.csv")

In [None]:
# One row per (table, gender, feature) with the descriptive statistics of that
# feature within the gender group.
raw_stats = []

for snapshot, groups in edas_split.items():
    for gender in ("female", "male"):
        group = groups[gender]
        raw_stats.extend(
            {
                "snapshot": snapshot,
                "feature": feature,
                "gender": gender,
                **compute_stats(group[feature].to_numpy()),
            }
            for feature in group.columns
        )

# Persist the results next to the input tables.
raw_stats_df = pd.DataFrame(raw_stats)
raw_stats_df.to_csv(f"analysis/{network_type}/raw_stats.csv")