In [None]:
%load_ext autoreload

In [None]:
import os as _os

# Work from the project root so that the relative "raw/..." data paths used
# further down resolve. A KeyError here means PROJECT_ROOT is not set.
_os.chdir(_os.environ["PROJECT_ROOT"])
# Last expression of the cell: shows the resolved working directory.
_os.path.realpath(_os.curdir)

In [None]:
import pandas as pd

In [None]:
def _parse_mdy(column):
    """Return an ``assign`` callable that parses ``column`` as a %m/%d/%y date."""
    return lambda table: pd.to_datetime(table[column], format="%m/%d/%y")


def _days_from_een_end(column):
    """Return an ``assign`` callable: whole days of ``column`` minus ``een_end_date``."""
    return lambda table: (table[column] - table["een_end_date"]).dt.days


# Per-subject metadata, indexed by subject_id: EEN start/end dates, relapse
# information, and the start/relapse dates expressed in days relative to the
# end of EEN (negative when the date precedes the EEN end date).
#
# NOTE(review): an earlier workaround replaced "." with "/" in een_start_date
# and relapse_start_date (one value each had been written with "." separators).
# It is no longer applied; presumably the "datesfixed" export made it
# unnecessary -- confirm.
subject_meta = (
    pd.read_csv(
        "raw/een-mgen/2023-07-28_Kolja.Siebert@med.uni-muenchen.de/1.0_Metadata_sharedwithByron_Shotgunmetagenomics_CRC1371Juli2023_datesfixed_sheet2.csv",
        parse_dates=False,  # dates are parsed explicitly below with a fixed format
    )
    # Map the raw column headers onto the names used in the rest of the notebook.
    .rename(
        columns={
            "Patient_ID": "subject_id",
            "Start_date_EEN": "een_start_date",
            "End_date_EEN": "een_end_date",
            "Relapse1year": "endpoint_patient",
            "Date_Relapse_Start": "relapse_start_date",
        }
    )
    .set_index("subject_id")
    .assign(
        een_start_date=_parse_mdy("een_start_date"),
        een_end_date=_parse_mdy("een_end_date"),
        relapse_start_date=_parse_mdy("relapse_start_date"),
        # Missing relapse endpoint is recorded as "no".
        endpoint_patient_relapse=lambda table: table["endpoint_patient"].fillna("no"),
    )
    .assign(
        een_start_date_relative_een_end=_days_from_een_end("een_start_date"),
        relapse_start_date_relative_een_end=_days_from_een_end("relapse_start_date"),
    )
)

# Later joins look subjects up by this index, so every subject_id must occur once.
assert subject_meta.index.is_unique

subject_meta

In [None]:
# Sample-level mapping file, enriched with the per-subject metadata and with
# column names harmonised to the notebook's naming scheme.
new_mgen_data = (
    pd.read_csv(
        "raw/een-mgen/2023-08-30_deborah.haecker@tum-create.edu.sg/mapping_file_Byron.csv"
    )
    # Rows without an ID are dropped before ID is cast to an integer column.
    .dropna(subset=["ID"])
    .astype({"ID": int})
    .assign(
        collection_date=lambda samples: pd.to_datetime(
            samples["sampleDate"], format="%d.%m.%Y"
        )
    )
    # Patient X seems highly atypical and data is not consistent.
    .loc[lambda samples: samples["Patient_ID"] != "X"]
    # Attach the subject-level columns (EEN dates, relapse info) by patient ID.
    .join(subject_meta, on="Patient_ID")
    .assign(
        collection_date_relative_een_end=lambda samples: (
            samples["collection_date"] - samples["een_end_date"]
        ).dt.days,
        # The misspelling is part of the column name being looked up; do not correct it.
        has_metabolome=lambda samples: samples["matched metabolome avaliable?"] == "yes",
        # The medication flags are computed on human samples only; assign() aligns
        # the result on the index, so all other rows end up with missing values.
        status_medication_antibiotics=lambda samples: (
            samples.loc[samples["organism"] == "human", "Antibiotics"]
            .fillna("")
            .str.lower()
            .str.startswith("yes")
        ),
        # One flag per keyword searched for in which_medication (same order as listed).
        **{
            flag: (
                lambda samples, keyword=keyword: samples.loc[
                    samples["organism"] == "human", "which_medication"
                ]
                .fillna("")
                .str.contains(keyword)
            )
            for flag, keyword in (
                ("status_medication_mtx", "MTX"),
                ("status_medication_antitnf", "AntiTNF"),
                ("status_medication_other", "OtherMedication"),
            )
        },
        sample_comments="",
    )
    # which_medication is left under its raw name. Watch out: the antibiotics
    # information lives in "Antibiotics", not in an "on_antibiotics" column.
    .rename(
        columns={
            "Seq-Name": "sample_id",
            "Patient_ID": "subject_id",
            "organism": "sample_type",
            "active_inactive": "status_disease_activity",
            "Therapy_weeks": "timepoint",
            "Therapy_cluster_more": "timepoint_type",
            "HealingGroup": "patient_outcome",
            "Transfer_used_Inoculum": "source_samples",
            "mouse_Histoscoring_Colon": "status_mouse_colon_histoscore",
            "mouse_inflamed_not_inflamed": "status_mouse_inflamed",
            "human_transfer_Diet_Media": "diet_or_media",
        }
    )
)

new_mgen_data

In [None]:
# Curated per-sample metadata table keyed by sample_id; the column order below
# is the order of the resulting table.
d = new_mgen_data.set_index("sample_id").loc[
    :,
    [
        # Subject, timing relative to the end of EEN, and sample context
        "subject_id",
        "collection_date_relative_een_end",
        "diet_or_media",
        "sample_type",
        # Disease activity and medication flags
        "status_disease_activity",
        "status_medication_mtx",
        "status_medication_antitnf",
        "status_medication_antibiotics",
        "status_medication_other",
        # Time point along the therapy
        "timepoint",
        "timepoint_type",
        # Transfer-experiment (mouse) columns
        "source_samples",
        "mouse_genotype",
        "status_mouse_colon_histoscore",
        "status_mouse_inflamed",
        # Bookkeeping
        "has_metabolome",
        "sample_comments",
    ],
]

# Export is currently disabled:
# d.to_csv('meta/een-mgen/sample.tsv', sep='\t')
d

In [None]:
# Subject-level table: EEN start and relapse onset in days relative to the end
# of EEN, plus the relapse endpoint with missing values filled as "no".
d = subject_meta.loc[
    :,
    [
        "een_start_date_relative_een_end",
        "endpoint_patient_relapse",
        "relapse_start_date_relative_een_end",
    ],
]
# Export is currently disabled:
# d.to_csv("meta/een-mgen/subject.tsv", sep="\t")
d

In [None]:
# Sample-level mapping file ("final2" export), enriched with the per-subject
# metadata and with column names harmonised to the notebook's naming scheme.
new_new_mgen_data = (
    pd.read_csv(
        "raw/een-mgen/2023-09-25_deborah.haecker@tum-create.edu.sg/mapping_file_Byron_final2.csv"
    )
    # Rows without an ID are dropped before ID is cast to an integer column.
    .dropna(subset=["ID"])
    .astype({"ID": int})
    .assign(
        collection_date=lambda samples: pd.to_datetime(
            samples["sampleDate"], format="%d.%m.%Y"
        )
    )
    # The `Patient_ID != "X"` exclusion (patient X seemed highly atypical, with
    # inconsistent data) is disabled here, so patient X rows are kept.
    # Attach the subject-level columns (EEN dates, relapse info) by patient ID.
    .join(subject_meta, on="Patient_ID")
    .assign(
        collection_date_relative_een_end=lambda samples: (
            samples["collection_date"] - samples["een_end_date"]
        ).dt.days,
        # The misspelling is part of the column name being looked up; do not correct it.
        has_metabolome=lambda samples: samples["matched metabolome avaliable?"] == "yes",
        # The medication flags are computed on human samples only; assign() aligns
        # the result on the index, so all other rows end up with missing values.
        status_medication_antibiotics=lambda samples: (
            samples.loc[samples["organism"] == "human", "Antibiotics"]
            .fillna("")
            .str.lower()
            .str.startswith("yes")
        ),
        # One flag per keyword searched for in which_medication (same order as listed).
        **{
            flag: (
                lambda samples, keyword=keyword: samples.loc[
                    samples["organism"] == "human", "which_medication"
                ]
                .fillna("")
                .str.contains(keyword)
            )
            for flag, keyword in (
                ("status_medication_mtx", "MTX"),
                ("status_medication_antitnf", "AntiTNF"),
                ("status_medication_other", "OtherMedication"),
            )
        },
        sample_comments="",
    )
    # which_medication is left under its raw name. Watch out: the antibiotics
    # information lives in "Antibiotics", not in an "on_antibiotics" column.
    .rename(
        columns={
            "Seq-Name": "sample_id",
            "Patient_ID": "subject_id",
            "organism": "sample_type",
            "active_inactive": "status_disease_activity",
            "Therapy_weeks": "timepoint",
            "Therapy_cluster_more": "timepoint_type",
            "HealingGroup": "patient_outcome",
            "Transfer_used_Inoculum": "source_samples",
            "mouse_Histoscoring_Colon": "status_mouse_colon_histoscore",
            "mouse_inflamed_not_inflamed": "status_mouse_inflamed",
            "human_transfer_Diet_Media": "diet_or_media",
        }
    )
)

new_new_mgen_data

In [None]:
# Curated per-sample metadata table keyed by sample_id and sorted by it; the
# column order below is the order of the resulting table.
d = (
    new_new_mgen_data.set_index("sample_id")
    .loc[
        :,
        [
            # Subject, timing relative to the end of EEN, and sample context
            "subject_id",
            "collection_date_relative_een_end",
            "diet_or_media",
            "sample_type",
            # Disease activity and medication flags
            "status_disease_activity",
            "status_medication_mtx",
            "status_medication_antitnf",
            "status_medication_antibiotics",
            "status_medication_other",
            # Time point along the therapy
            "timepoint",
            "timepoint_type",
            # Transfer-experiment (mouse) columns
            "source_samples",
            "mouse_genotype",
            "status_mouse_colon_histoscore",
            "status_mouse_inflamed",
            # Bookkeeping
            "has_metabolome",
            "sample_comments",
        ],
    ]
    .sort_index()
)

# Export is currently disabled:
# d.to_csv('meta/een-mgen/sample.tsv', sep='\t')
d