# Preliminaries & Setup


In [None]:
import ast
import os
import random
import tempfile

from nbmetalog import nbmetalog as nbm
import numpy as np
import pandas as pd
from teeplot import teeplot as tp
from tqdm import tqdm
from scipy import stats as scipy_stats
import seaborn as sns


In [None]:
# Seed the standard-library and NumPy global generators with the same fixed
# value so that randomized steps are repeatable from run to run.
for seed_rng in (random.seed, np.random.seed):
    seed_rng(1)


In [None]:
# Emit notebook metadata through nbmetalog's print_metadata helper.
nbm.print_metadata()


# Fetch Data


In [None]:
# Load the gzip-compressed CSV served at the OSF download URL into `df`.
df = pd.read_csv("https://osf.io/45b6h/download", compression="gzip")
# Bare expression: shows the frame as the cell's output when run in a notebook.
df


In [None]:
# Fingerprint the loaded data: hash every row, XOR-fold the row hashes into a
# single value, and print that value in hexadecimal.
row_hashes = pd.util.hash_pandas_object(df)
dfdigest = np.bitwise_xor.reduce(row_hashes)
print(f"{dfdigest:x}")


In [None]:
# Keep exploratory analyses tractable: restrict to update 20000 at a 0.5
# down-sample rate, then retain a random 20% of the remaining rows.
keep_rows = (df["update"] == 20000) & (df["TEST_DOWN_SAMPLE_RATE"] == 0.5)
df = df[keep_rows].sample(frac=0.2)


# Convert Data Vectors to Long-Form

i.e., each trait of each individual is a single row


In [None]:
# Names of the columns that hold one value per trait for each row. These are
# the columns parsed with ast.literal_eval, length-checked against one
# another, and exploded to long form in the cells that follow.
vector_columns = (
    # "genome" is intentionally left out of the set (entry kept commented out).
    #     "genome",
    "phenotype",
    "traits_attempted_estimations",
    "traits_estimation_dist",
    "traits_estimated_scores",
    "traits_estimation_source_ids",
    "traits_evaluated",
    "traits_successful_estimations",
)


In [None]:
# Each vector column arrives as the text of a Python literal; parse every cell
# into the corresponding Python object, reporting progress per column.
for column in tqdm(vector_columns):
    print(column)
    parsed_values = df[column].map(ast.literal_eval)
    df[column] = parsed_values


In [None]:
# Record the per-row vector length (taken from "traits_evaluated") and verify
# that every vector column has exactly that many entries in every row.
df["vector len"] = df["traits_evaluated"].map(len)
for column in tqdm(vector_columns):
    print(column)
    lengths_agree = df[column].map(len).eq(df["vector len"])
    assert lengths_agree.all()


In [None]:
# Position labels 0..len-1 for each row's vectors, so that every exploded row
# keeps track of which vector slot it came from.
df["vector index"] = df["vector len"].map(lambda length: list(range(length)))


In [None]:
# Per-row totals obtained by summing each flag vector.
count_sources = {
    "num traits evaluated": "traits_evaluated",
    "num attempted trait estimations": "traits_attempted_estimations",
    "num successful trait estimations": "traits_successful_estimations",
}
for count_column, flag_column in count_sources.items():
    df[count_column] = df[flag_column].apply(sum)


In [None]:
# Does every row have as many successful estimations as attempted ones?
df["num successful trait estimations"].eq(df["num attempted trait estimations"]).all()


In [None]:
# Explode in fixed-size row chunks rather than all at once, to prevent running
# out of memory.
chunk_size = 32768
num_chunks = (len(df) + chunk_size - 1) // chunk_size

# All of these columns are exploded together so their entries stay aligned.
explode_columns = ["vector index", *vector_columns]
exploded_chunks = [
    df.iloc[offset : offset + chunk_size].explode(explode_columns)
    for offset in tqdm(range(0, num_chunks * chunk_size, chunk_size))
]


In [None]:
# Concatenate the exploded chunks into a single CSV on disk, release the
# in-memory chunk list, then reload the combined table from that CSV.
#
# The CSV lives at a path inside a TemporaryDirectory rather than behind an
# open NamedTemporaryFile: reopening a still-open NamedTemporaryFile by name is
# platform-dependent (it is not possible on Windows), and pandas writes through
# its own handle, so flushing the temporary file object had no effect.
with tempfile.TemporaryDirectory() as tmpdir:
    csv_path = os.path.join(tmpdir, "exploded.csv")
    for i, exploded_chunk in enumerate(tqdm(exploded_chunks)):
        exploded_chunk.to_csv(
            csv_path,
            index=False,
            chunksize=4096,
            # the first chunk creates the file and writes the header row;
            # later chunks are appended without repeating the header
            mode="a" if i else "w",
            header=not i,
        )

    # free the chunk list before the full table is loaded back into memory
    del exploded_chunks
    exploded_df = pd.read_csv(csv_path)

exploded_df


# Setup New Columns needed for Analyses


In [None]:
# Signed error is phenotype minus estimated score; the absolute error is its
# magnitude.
signed_error = exploded_df["phenotype"].sub(exploded_df["traits_estimated_scores"])
exploded_df["trait estimation error"] = signed_error
exploded_df["trait estimation abs error"] = signed_error.abs()


In [None]:
# Boolean flag for rows whose "traits_evaluated" entry equals 1, followed by
# the number of such rows.
exploded_df["is evaluated"] = exploded_df["traits_evaluated"].eq(1)
exploded_df["is evaluated"].sum()


In [None]:
# Boolean flag for rows whose "traits_attempted_estimations" entry equals 1,
# followed by the number of such rows.
attempted_flag = exploded_df["traits_attempted_estimations"].eq(1)
exploded_df["is attempted estimation"] = attempted_flag
exploded_df["is attempted estimation"].sum()


In [None]:
# Boolean flag for rows whose "traits_successful_estimations" entry equals 1,
# followed by the number of such rows.
successful_flag = exploded_df["traits_successful_estimations"].eq(1)
exploded_df["is successful estimation"] = successful_flag
exploded_df["is successful estimation"].sum()


In [None]:
# A failed estimation is one that was attempted but did not succeed. Testing
# only `traits_successful_estimations == 0` also matches rows where no
# estimation was attempted (presumably the success flag is 0 there as well),
# which would label those rows "failed" and leave the "na" estimation outcome
# unreachable.
exploded_df["is failed estimation"] = (
    exploded_df["traits_attempted_estimations"] == 1
) & (exploded_df["traits_successful_estimations"] == 0)
exploded_df["is failed estimation"].sum()


In [None]:
# Sanity check: no row may be flagged as both a successful and a failed
# estimation.
both_flags_set = (
    exploded_df["is successful estimation"] & exploded_df["is failed estimation"]
)
assert not both_flags_set.any()


In [None]:
# Collapse the two boolean flags into one categorical label per row. Rows
# matching neither flag keep the default "na"; the labels are applied in order,
# so "successful" is written last.
exploded_df["estimation outcome"] = "na"
for flag_column, outcome_label in (
    ("is failed estimation", "failed"),
    ("is successful estimation", "successful"),
):
    exploded_df.loc[exploded_df[flag_column], "estimation outcome"] = outcome_label


# Plot: estimation mode vs estimation failure


In [None]:
# Count estimation outcomes in a grid with one panel per
# (diagnostic, estimation mode) pair.
#
# The bar order is pinned explicitly. FacetGrid.map calls countplot once per
# facet on that facet's subset of the data; without `order`, each facet infers
# its own category order, and because the x axis is shared the tick labels can
# end up attached to the wrong bars.
outcome_order = sorted(exploded_df["estimation outcome"].dropna().unique())
g = sns.FacetGrid(
    exploded_df,
    col="EVAL_FIT_EST_MODE",
    row="DIAGNOSTIC",
    margin_titles=True,
)
g.map(
    sns.countplot,
    "estimation outcome",
    order=outcome_order,
)


# Plot: estimated score vs phenotype

by diagnostic and estimation mode


In [None]:
# Scatter estimated score against phenotype, using only rows flagged as
# successful estimations; one panel per (estimation mode, diagnostic) pair.
successful_rows = exploded_df[exploded_df["is successful estimation"]]
g = sns.FacetGrid(
    data=successful_rows,
    row="EVAL_FIT_EST_MODE",
    col="DIAGNOSTIC",
    margin_titles=True,
)
g.map(sns.scatterplot, "phenotype", "traits_estimated_scores")


# Plot: trait estimation error vs phylogenetic distance

for each diagnostic and estimation mode


In [None]:
# Regress absolute estimation error on estimation distance, using successful
# estimations with a strictly positive distance. Each panel gets its own x and
# y axes.
has_positive_dist = exploded_df["traits_estimation_dist"] > 0
regression_rows = exploded_df[
    exploded_df["is successful estimation"] & has_positive_dist
]
regplot_options = {
    "n_boot": 10,
    "scatter_kws": {"color": "red", "alpha": 0.1},
}
g = sns.FacetGrid(
    data=regression_rows,
    row="EVAL_FIT_EST_MODE",
    col="DIAGNOSTIC",
    margin_titles=True,
    sharex=False,
    sharey=False,
)
g.map(
    sns.regplot,
    "traits_estimation_dist",
    "trait estimation abs error",
    **regplot_options,
)


# Plot: mean estimation error by diagnostic/estimation mode


In [None]:
# Bar plot of absolute estimation error per estimation mode, one row of the
# grid per diagnostic, using successful estimations with a strictly positive
# estimation distance.
estimated_rows = exploded_df[
    exploded_df["is successful estimation"]
    & (exploded_df["traits_estimation_dist"] > 0)
]
# Pin the bar order: FacetGrid.map runs barplot separately on each facet's
# subset, so without `order` the estimation modes may be laid out differently
# from one facet to the next.
mode_order = sorted(estimated_rows["EVAL_FIT_EST_MODE"].dropna().unique())
g = sns.FacetGrid(
    estimated_rows,
    row="DIAGNOSTIC",
    margin_titles=True,
    sharey="row",
    sharex=False,
)
g.map(
    sns.barplot,
    "EVAL_FIT_EST_MODE",
    "trait estimation abs error",
    order=mode_order,
)


# Plot: phylogenetic estimation distance by diagnostic/estimation mode


In [None]:
# Bar plot of estimation distance per estimation mode, one row of the grid per
# diagnostic, using successful estimations with a strictly positive estimation
# distance.
estimated_rows = exploded_df[
    exploded_df["is successful estimation"]
    & (exploded_df["traits_estimation_dist"] > 0)
]
# Pin the bar order: FacetGrid.map runs barplot separately on each facet's
# subset, so without `order` the estimation modes may be laid out differently
# from one facet to the next.
mode_order = sorted(estimated_rows["EVAL_FIT_EST_MODE"].dropna().unique())
g = sns.FacetGrid(
    estimated_rows,
    row="DIAGNOSTIC",
    margin_titles=True,
    sharey="row",
    sharex=False,
)
g.map(
    sns.barplot,
    "EVAL_FIT_EST_MODE",
    "traits_estimation_dist",
    order=mode_order,
)


In [None]:
# Bar plot of the "is successful estimation" flag per estimation mode, over
# rows where an estimation was attempted; one row of the grid per diagnostic.
attempted_rows = exploded_df[exploded_df["is attempted estimation"]]
# Pin the bar order: FacetGrid.map runs barplot separately on each facet's
# subset, so without `order` the estimation modes may be laid out differently
# from one facet to the next.
mode_order = sorted(attempted_rows["EVAL_FIT_EST_MODE"].dropna().unique())
g = sns.FacetGrid(
    attempted_rows,
    row="DIAGNOSTIC",
    margin_titles=True,
    sharey="row",
    sharex=False,
)
g.map(
    sns.barplot,
    "EVAL_FIT_EST_MODE",
    "is successful estimation",
    order=mode_order,
)

# Number of rows that were both attempted and successful. The vectorized
# Series.sum() replaces the builtin sum(), which iterates element by element.
print(
    (
        exploded_df["is attempted estimation"] & exploded_df["is successful estimation"]
    ).sum()
)


# Plot: error distributions

by diagnostic/estimation mode


In [None]:
# Violin plot of absolute estimation error (x) for each estimation mode (y),
# using rows flagged as successful estimations; one column of the grid per
# diagnostic.
successful_rows = exploded_df[exploded_df["is successful estimation"]]
# Pin the category order: FacetGrid.map runs violinplot separately on each
# facet's subset, and the y axis is shared between facets, so without `order`
# a facet could place its violins against the wrong estimation-mode labels.
mode_order = sorted(successful_rows["EVAL_FIT_EST_MODE"].dropna().unique())
g = sns.FacetGrid(
    successful_rows,
    col="DIAGNOSTIC",
    margin_titles=True,
    sharex=False,
)
g.map(
    sns.violinplot,
    "trait estimation abs error",
    "EVAL_FIT_EST_MODE",
    order=mode_order,
)


# Statistics (TODO)


In [None]:
# Walk every combination of the grouping columns and print the combination as
# a column -> value mapping. The per-group frame is not used yet.
groupby_columns = [
    "EVAL_MODE",
    "TEST_DOWN_SAMPLE_RATE",
    "DIAGNOSTIC",
    "EVAL_FIT_EST_MODE",
    "update",
]
for group, group_df in exploded_df.groupby(groupby_columns):
    group_attrs = {
        column_name: value for column_name, value in zip(groupby_columns, group)
    }
    print(group_attrs)
