# Simple notebook for analyzing midway cis-eQTL effect on longevity

In [199]:
import pandas as pd

# Read the input data into a Pandas DataFrame
data = pd.read_csv("../data/longevity_luisa.csv")

In [200]:
output = {}
indiv_id = 0

for col in data.columns[1::]:
    output[col] = {}
    for idx, row in data[["Day", col]].iterrows():
        if idx == 0:
            prev_day = data["Day"][idx]
            split_mf = row[1].split("/")
            prev_m = int(split_mf[0])
            prev_f = int(split_mf[1])
            continue

        split_mf = row[1].split("/")
        m = int(split_mf[0])
        f = int(split_mf[1])

        new_m = m
        new_f = f

        while m < prev_m:
            output[indiv_id] = {
                "genotype": col.split(" ")[0],
                "source": col,
                "day": prev_day,
                "sex": "m",
                "lifespan": prev_day,
                "annotator": "luisa",
            }
            indiv_id += 1
            m += 1

        while f < prev_f:
            output[indiv_id] = {
                "genotype": col.split(" ")[0],
                "source": col,
                "day": prev_day,
                "sex": "f",
                "lifespan": prev_day,
                "annotator": "luisa",
            }
            indiv_id += 1
            f += 1

        prev_f = new_f
        prev_m = new_m
        prev_day = row[0]

In [201]:
luisa_df = pd.DataFrame.from_dict(output, orient="index")

In [202]:
data = pd.read_csv("../data/longevity_amanda.csv")

In [203]:
output = {}
indiv_id = 0

for col in data.columns[1::]:
    output[col] = {}
    for idx, row in data[["Day", col]].iterrows():
        if idx == 0:
            prev_day = data["Day"][idx]
            split_mf = row[1].split("/")
            prev_m = int(split_mf[0])
            prev_f = int(split_mf[1])
            continue

        split_mf = row[1].split("/")
        m = int(split_mf[0])
        f = int(split_mf[1])

        day = row[0]
        new_m = m
        new_f = f

        while m < prev_m:
            output[indiv_id] = {
                "genotype": col.split(" ")[0],
                "source": col,
                "day": prev_day,
                "sex": "m",
                "lifespan": prev_day,
                "annotator": "amanda",
            }
            indiv_id += 1
            m += 1

        while f < prev_f:
            output[indiv_id] = {
                "genotype": col.split(" ")[0],
                "source": col,
                "day": prev_day,
                "sex": "f",
                "lifespan": prev_day,
                "annotator": "amanda",
            }
            indiv_id += 1
            f += 1

        prev_f = new_f
        prev_m = new_m
        prev_day = row[0]

In [204]:
amanda_df = pd.DataFrame.from_dict(output, orient="index")

In [205]:
merged_df = pd.concat([luisa_df, amanda_df])
merged_df.reset_index(inplace=True)

# Group as needed
merged_df["group"] = merged_df["genotype"].apply(
    lambda x: "control" if x != "Midway" else "midway"
)
merged_df["censored"] = False
merged_df["death"] = True

# Append two rows for individuals where we didn't observe their death
new_row = [299, "R181", "R181 1-C", 80, "f", 80, "amanda", "control", True, False]
merged_df = merged_df.append(
    pd.Series(new_row, index=merged_df.columns[: len(new_row)]), ignore_index=True
)
new_row = [300, "R181", "R181 1-C", 80, "f", 80, "amanda", "control", True, False]
merged_df = merged_df.append(
    pd.Series(new_row, index=merged_df.columns[: len(new_row)]), ignore_index=True
)

  merged_df = merged_df.append(pd.Series(new_row, index=merged_df.columns[:len(new_row)]), ignore_index=True)
  merged_df = merged_df.append(pd.Series(new_row, index=merged_df.columns[:len(new_row)]), ignore_index=True)


In [206]:
# Update the row where an individual escapes
merged_df.loc[
    merged_df[
        (merged_df["lifespan"] == 54)
        & (merged_df["sex"] == "m")
        & (merged_df["source"] == "R181 2-C")
        & (merged_df["annotator"] == "luisa")
    ].index[0],
    "censored",
] = True
merged_df.loc[
    merged_df[
        (merged_df["lifespan"] == 54)
        & (merged_df["sex"] == "m")
        & (merged_df["source"] == "R181 2-C")
        & (merged_df["annotator"] == "luisa")
    ].index[0],
    "death",
] = False

In [207]:
# Make sure we are censoring the right rows!
merged_df[merged_df["censored"] == True]

Unnamed: 0,index,genotype,source,day,sex,lifespan,annotator,group,censored,death
47,47,R181,R181 2-C,54,m,54,luisa,control,True,False
298,299,R181,R181 1-C,80,f,80,amanda,control,True,False
299,300,R181,R181 1-C,80,f,80,amanda,control,True,False


In [208]:
import statsmodels.api as sm
from statsmodels.duration.hazard_regression import PHReg

In [209]:
surv_df = merged_df

In [210]:
midway_00 = surv_df[surv_df["group"] == "control"]
midway_11 = surv_df[surv_df["group"] == "midway"]

In [211]:
midway_00.sort_values(by=["lifespan"], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  midway_00.sort_values(by=['lifespan'], inplace=True)


In [212]:
midway_00.iloc[119]

index             72
genotype          CS
source        CS 2-C
day               47
sex                m
lifespan          47
annotator     amanda
group        control
censored       False
death           True
Name: 222, dtype: object

In [213]:
sf_00 = sm.SurvfuncRight(midway_00["lifespan"], midway_00["death"])
sf_11 = sm.SurvfuncRight(midway_11["lifespan"], midway_11["death"])

In [218]:
results = PHReg.from_formula(
    "lifespan ~ C(sex) + C(group)", data=surv_df, status="death"
).fit()
results.summary()

0,1,2,3
Model:,PH Reg,Sample size:,300.0
Dependent variable:,lifespan,Num. events:,297.0
Ties:,Breslow,,

0,1,2,3,4,5,6,7
,log HR,log HR SE,HR,t,P>|t|,[0.025,0.975]
C(sex)[T.m],0.0399,0.1182,1.0407,0.3374,0.7358,0.8255,1.3119
