In [None]:
# Load IPython's autoreload extension.
# NOTE(review): no `%autoreload` mode is selected in the cells visible here —
# confirm whether one (e.g. `%autoreload 2`) was intended.
%load_ext autoreload

In [None]:
import os as _os

# Switch the working directory to the project root so that the relative
# "raw/..." and "meta/..." paths used by later cells resolve.
# Raises KeyError when the PROJECT_ROOT environment variable is not set.
_os.chdir(_os.environ["PROJECT_ROOT"])
# Last expression of the cell: shows the resolved working directory.
_os.path.realpath(_os.path.curdir)

In [None]:
import pandas as pd

In [None]:
# NOTE(review): hard-coded, user-specific absolute path; no other cell visible
# in this notebook reads `input_path` — confirm whether it is still needed.
input_path = "/Users/bsmith/Downloads/"

In [None]:
# Explicit overrides applied after zero-padding: keys are the normalized
# (three-digit) sample names, values are the names to emit instead.
rename_mgen_map = {
    "CF_094": "CF_94",
    "CF_095": "CF_95",
    "CF_096": "CF_96",
    "CF_097": "CF_97",
    "CF_098": "CF_98",
    "CF_099": "CF_99",
    "CF_001": "CF_1",
    "CF_011": "CF_11",
    "CF_015": "CF_15",
    "CF_089": "CF_89",
}


def rename_mgen(curr_name):
    """Return the canonical sample name for ``curr_name``.

    ``curr_name`` must have the form ``<prefix>_<integer>``. The integer part
    is zero-padded to at least three digits; if the padded name has an entry
    in ``rename_mgen_map`` that entry is returned, otherwise the padded name
    itself is returned.

    Raises:
        ValueError: if ``curr_name`` does not contain exactly one ``_`` or the
            part after it cannot be parsed as an integer.
    """
    prefix, number = curr_name.split('_')
    padded_name = f'{prefix}_{int(number):03d}'
    # Fall back to the padded name when no explicit override exists.
    return rename_mgen_map.get(padded_name, padded_name)

In [None]:
# Subject-level metadata (sheet 2 of the shared spreadsheet), one row per
# subject, indexed by `subject_id`.
#
# An earlier revision replaced '.' separators with '/' in `relapse_start_date`
# and `een_start_date` before parsing (one value of each had been written with
# '.'); that step is not applied here. The "datesfixed" input file name
# suggests the dates were corrected upstream — TODO confirm.
#
# NOTE(review): `pd.to_datetime` is called without `format`/`dayfirst`, so the
# parsing relies on pandas' format inference — confirm against the raw dates.
subject_meta = (
    pd.read_csv(
        "raw/een-mgen/2023-07-28_Kolja.Siebert@med.uni-muenchen.de/1.0_Metadata_sharedwithByron_Shotgunmetagenomics_CRC1371Juli2023_datesfixed_sheet2.csv",
        parse_dates=False,  # dates are converted explicitly further down
    )
    .rename(
        columns={
            "Patient_ID": "subject_id",
            "Start_date_EEN": "een_start_date",
            "End_date_EEN": "een_end_date",
            "Relapse1year": "endpoint_patient",
            "Date_Relapse_Start": "relapse_start_date",
        }
    )
    .set_index("subject_id")
    .assign(
        een_start_date=lambda df: pd.to_datetime(df.een_start_date),
        een_end_date=lambda df: pd.to_datetime(df.een_end_date),
        relapse_start_date=lambda df: pd.to_datetime(df.relapse_start_date),
        # Missing endpoint values are recorded as "no".
        endpoint_patient=lambda df: df.endpoint_patient.fillna("no"),
    )
    # Express the start and relapse dates as whole-day offsets from the EEN
    # end date (negative when the date precedes the end date).
    .assign(
        een_start_date_relative_een_end=lambda df: df.een_start_date.sub(
            df.een_end_date
        ).dt.days,
        relapse_start_date_relative_een_end=lambda df: df.relapse_start_date.sub(
            df.een_end_date
        ).dt.days,
    )
)

# Each subject must appear exactly once.
assert subject_meta.index.is_unique

subject_meta

In [None]:
# Microcosm sample table: read from the same sheet-1 file as the stool table,
# skipping its first 99 lines (presumably the stool table's header plus its 98
# rows — see the `nrows=98` read of the same file) and keeping the first six
# columns. Columns whose header cell is blank arrive as "Unnamed: N" and are
# given real names here.
microcosm_data = (
    pd.read_csv(
        "raw/een-mgen/2023-07-28_Kolja.Siebert@med.uni-muenchen.de/1.0_Metadata_sharedwithByron_Shotgunmetagenomics_CRC1371Juli2023_datesfixed_sheet1.csv",
        skiprows=99,
        usecols=range(6),
        parse_dates=False,
    )
    .rename(
        columns={
            "Unnamed: 0": "mgen_id",
            "Unnamed: 1": "inoculum_subject_id",
            "used inocculum": "inoculum_mgen_id",
            "Unnamed: 3": "comments",
            "diet": "subject_diet",
            "Vessel": "replicate",
        }
    )
    .set_index("mgen_id")
    # Empty comment cells become empty strings instead of NaN.
    .assign(comments=lambda df: df.comments.fillna(""))
    # Normalize the index labels (sample names) with rename_mgen.
    .rename(index=rename_mgen)
)

# NOTE(review): unlike the other tables, no index-uniqueness assertion is made
# here — confirm whether that is intentional.
microcosm_data

In [None]:
# Stool sample table: the first 98 data rows and first eight columns of the
# sheet-1 file, indexed by sample name and annotated with each sample's
# collection date relative to the subject's EEN end date.
stool_data = (
    pd.read_csv(
        "raw/een-mgen/2023-07-28_Kolja.Siebert@med.uni-muenchen.de/1.0_Metadata_sharedwithByron_Shotgunmetagenomics_CRC1371Juli2023_datesfixed_sheet1.csv",
        nrows=98,
        parse_dates=False,  # collection_date is converted explicitly below
        usecols=range(8),
    )
    .rename(
        columns={
            "Seq-Name": "mgen_id",
            "Patient_ID": "subject_id",
            "sampleDate": "collection_date",
            "Group": "sample_group",
            "Diet_PreEEN_EEN_PostEEN": "sample_type",
            "pga": "patient_status_specific",
            "active_inactive": "patient_status",
            "HealingGroup": "endpoint_patient",
        }
    )
    # Rows without a subject cannot be linked to subject_meta; discard them.
    .dropna(subset=["subject_id"])
    .set_index("mgen_id")
    .assign(collection_date=lambda df: pd.to_datetime(df.collection_date))
    # Left join: samples whose subject is absent from subject_meta keep a
    # missing een_end_date (and therefore a missing relative date).
    .join(subject_meta.een_end_date, on="subject_id", how="left")
    .assign(
        collection_date_relative_een_end=lambda df: df.collection_date.sub(
            df.een_end_date
        ).dt.days
    )
    # Normalize the index labels (sample names) with rename_mgen.
    .rename(index=rename_mgen)
)

# rename_mgen can map distinct raw names onto the same output, so verify that
# the sample names are still unique after renaming.
assert stool_data.index.is_unique

stool_data

In [None]:
# Write the parsed per-subject dates to a separate CSV (named "tocheck",
# presumably for manual verification of the date parsing).
subject_meta[
    ["een_start_date", "een_end_date", "relapse_start_date"]
].to_csv("meta/een-mgen/subject_tocheck.csv")

In [None]:
# Write the parsed per-sample dates to a separate CSV (named "tocheck",
# presumably for manual verification of the date parsing).
stool_data[
    ["collection_date", "een_end_date"]
].to_csv("meta/een-mgen/sample_tocheck.csv")

In [None]:
# Export the stool sample metadata as TSV without the absolute date columns;
# only the relative day offset is kept.
stool_data.drop(
    columns=["collection_date", "een_end_date"]
).to_csv("meta/een-mgen/stool.tsv", sep="\t")

In [None]:
# Export the subject metadata as TSV without the absolute date columns;
# only the relative day offsets are kept.
subject_meta.drop(
    columns=["een_start_date", "een_end_date", "relapse_start_date"]
).to_csv("meta/een-mgen/subject.tsv", sep="\t")

In [None]:
# Export the microcosm sample metadata as TSV.
microcosm_data.to_csv(
    "meta/een-mgen/microcosm.tsv",
    sep="\t",
)