# Dead vs Dead

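Control comparison for false-discovery purposes: only patients with vital status "Dead" are kept, and they are then split into two arbitrarily labelled groups ("A" and "D").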

In [None]:
from pathlib import Path

import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff

In [None]:
# Load the input table from a CSV path relative to the working directory.
df = pd.read_csv('2-select.csv')

In [None]:
# Keep only the rows whose vital status is recorded as 'Dead'.
df = df.loc[df['vital_status'].eq('Dead')]

In [None]:
# (rows, columns) of the cohort after the 'Dead' filter.
df.shape

---

In [None]:
# Short aliases for the AJCC pathologic staging columns.
t = 'ajcc_pathologic_t'
n = 'ajcc_pathologic_n'
m = 'ajcc_pathologic_m'

In [None]:
# Group "a": M is not M1, N is none of N1/N2/N3, and the T value contains 'T2'.
df_a = df.loc[
    (df[m].ne('M1') & ~df[n].isin(['N1', 'N2', 'N3']))
    & df[t].str.contains('T2')
]

In [None]:
# Group "b": M is not M1, N is none of N1/N2/N3, and the T value is
# 'T4a', 'T4', or contains 'T3'.
df_b = df.loc[
    (df[m].ne('M1') & ~df[n].isin(['N1', 'N2', 'N3']))
    & (df[t].isin(['T4a', 'T4']) | df[t].str.contains('T3'))
]

In [None]:
# Group "c": M is M1, or N is one of N1/N2/N3, or T is exactly 'T4b'.
df_c = df.loc[
    df[m].eq('M1')
    | df[n].isin(['N1', 'N2', 'N3'])
    | df[t].eq('T4b')
]

In [None]:
%%capture
# Tag each subset with its stage-group code. `assign` returns a new frame, so
# nothing is written into a filtered slice of `df` (the pattern that makes
# pandas emit SettingWithCopyWarning).
df_a = df_a.assign(stage_mibc='a')
df_b = df_b.assign(stage_mibc='b')
df_c = df_c.assign(stage_mibc='c')

In [None]:
# Stack the three stage-labelled subsets back into a single cohort frame.
stage_groups = [df_a, df_b, df_c]
df = pd.concat(stage_groups)

In [None]:
# (rows, columns) after recombining the three stage groups.
df.shape

---

In [None]:
# Short aliases for the columns used in the comparisons below.
s = 'stage_mibc'
g = 'gender'
r = 'race'
a = 'age_at_index'
pm = 'prior_malignancy'
pt = 'prior_treatment'
trt = 'treatment_or_therapy'

In [None]:
# Narrow the cohort to the stage-group and treatment columns.
df_inspect = df.filter(items=[s, trt])

In [None]:
# Count each (stage group, treatment) combination and show the table ordered
# by stage group, then treatment.
stage_treatment_counts = df_inspect.value_counts().to_frame()
stage_treatment_counts.sort_values(by=[s, trt])

In [None]:
l = 'Label'
all_match = False  # leftover flag from an abandoned re-sampling loop; nothing in this cell reads it

# Fixed seed so the arbitrary A/D split (and everything derived from it,
# including the exported CSV) is identical on every Restart & Run All.
SAMPLE_SEED = 0
N_LABEL_A = 60

# Assign arbitrary labels: a random draw of N_LABEL_A rows becomes 'A' and
# every case not drawn becomes 'D'. `assign` builds new frames, so no value is
# written into a filtered slice of `df`.
dfA = df.sample(N_LABEL_A, random_state=SAMPLE_SEED).assign(**{l: 'A'})
dfA_cases = dfA['case_id'].tolist()
dfD = df[~df['case_id'].isin(dfA_cases)].assign(**{l: 'D'})
df_match = pd.concat([dfA, dfD])

In [None]:
# (rows, columns) of the labelled cohort.
df_match.shape

In [None]:
# Display names substituted for the stage-group codes and treatment values.
display_names = {
    'a': 'stageA',
    'b': 'stageB',
    'c': 'stageC',
    'no': '_noChemo',
    'yes': '_yesChemo',
    'not reported': '_notReport',
}
df_matched = df_match.filter(items=[s, trt, l]).replace(display_names)

In [None]:
# Rows carrying the 'A' label.
df_matched_alive = df_matched.loc[df_matched[l].eq('A')]

In [None]:
# Collapse the 'A' rows to one row per (stage group, treatment) pair with the
# number of rows in a 'count' column.
df_matched_alive = (
    df_matched_alive
    .groupby([s, trt])
    .size()
    .reset_index(name='count')
)

In [None]:
# Re-attach the label, which the groupby above did not carry over.
df_matched_alive = df_matched_alive.assign(**{l: 'A'})

In [None]:
# Rows carrying the 'D' label.
df_matched_dead = df_matched.loc[df_matched[l].eq('D')]

In [None]:
# Collapse the 'D' rows to one row per (stage group, treatment) pair with the
# number of rows in a 'count' column.
df_matched_dead = (
    df_matched_dead
    .groupby([s, trt])
    .size()
    .reset_index(name='count')
)

In [None]:
# Re-attach the label, which the groupby above did not carry over.
df_matched_dead = df_matched_dead.assign(**{l: 'D'})

In [None]:
# Recombine the two per-label count tables into one long table for plotting.
label_count_tables = [df_matched_alive, df_matched_dead]
df_matched = pd.concat(label_count_tables)

In [None]:
# Join the stage-group and treatment strings into a single category label.
status_col = 'Status: MIBC_Stage_&_Chemotherapy'
df_matched[status_col] = df_matched[s].add(df_matched[trt])

In [None]:
# The two source columns are no longer needed once the combined label exists.
df_matched = df_matched.drop([s, trt], axis='columns')

In [None]:
# Grouped bars of the per-label counts for each stage/treatment category.
stage_chemo_fig = px.histogram(
    df_matched,
    x="Status: MIBC_Stage_&_Chemotherapy",
    y='count',
    color=l,
    barmode='group',
    opacity=0.5,
    title="Matching by Muscle-Invasive Bladder Cancer (MIBC) Stage & Adjuvant Chemotherapy Distributions",
)
# update_layout returns the figure, so it is still the cell's displayed output.
stage_chemo_fig.update_layout(yaxis={'dtick': 2})

In [None]:
# Prior-treatment column alongside the A/D label.
df_pt = df_match.filter(items=[pt, l])

In [None]:
# Count each (prior treatment, label) pair. The count column is named through
# `reset_index(name=...)` because the name value_counts gives its result
# differs between pandas versions (unnamed -> column 0 before 2.0, 'count'
# from 2.0 on); renaming column 0 therefore silently does nothing on newer
# pandas and leaves no 'counts' column for the plot.
pt_counts = df_pt.value_counts().reset_index(name='counts')

In [None]:
# Grouped bars of prior-treatment counts per label.
px.histogram(
    pt_counts, title="Prior Treatment Distributions", 
    x=pt, y='counts', color=l, 
    opacity=0.5, barmode='group'
).update_layout(
    yaxis = dict(title='Count', dtick=5),
    # The x-axis shows the prior-treatment column, so label it accordingly
    # (it was previously titled 'Gender').
    xaxis = dict(title='Prior Treatment')
)

In [None]:
# Prior-malignancy column alongside the A/D label.
df_pm = df_match.filter(items=[pm, l])

In [None]:
# Count each (prior malignancy, label) pair. The count column is named through
# `reset_index(name=...)` because the name value_counts gives its result
# differs between pandas versions (unnamed -> column 0 before 2.0, 'count'
# from 2.0 on); renaming column 0 therefore silently does nothing on newer
# pandas and leaves no 'counts' column for the plot.
pm_counts = df_pm.value_counts().reset_index(name='counts')

In [None]:
# Grouped bars of prior-malignancy counts per label.
px.histogram(
    pm_counts, title="Prior Malignancy Distributions", 
    x=pm, y='counts', color=l, 
    opacity=0.5, barmode='group'
).update_layout(
    yaxis = dict(title='Count', dtick=5),
    # The x-axis shows the prior-malignancy column, so label it accordingly
    # (it was previously titled 'Gender').
    xaxis = dict(title='Prior Malignancy')
)

In [None]:
# Gender column alongside the A/D label.
df_genders = df_match.filter(items=[g, l])

In [None]:
# Count each (gender, label) pair. The count column is named through
# `reset_index(name=...)` because the name value_counts gives its result
# differs between pandas versions (unnamed -> column 0 before 2.0, 'count'
# from 2.0 on); renaming column 0 therefore silently does nothing on newer
# pandas and leaves no 'counts' column for the plot.
gender_counts = df_genders.value_counts().reset_index(name='counts')

In [None]:
# Grouped bars of gender counts per label.
gender_fig = px.histogram(
    gender_counts,
    x=g,
    y='counts',
    color=l,
    barmode='group',
    opacity=0.5,
    title="Gender Distributions",
)
# update_layout returns the figure, so it is still the cell's displayed output.
gender_fig.update_layout(
    xaxis={'title': 'Gender'},
    yaxis={'title': 'Count', 'dtick': 5},
)

The only race that I could get to match was "not reported"

In [None]:
# Race column alongside the A/D label.
df_race = df_match.filter(items=[r, l])

In [None]:
# Count each (race, label) pair. The count column is named through
# `reset_index(name=...)` because the name value_counts gives its result
# differs between pandas versions (unnamed -> column 0 before 2.0, 'count'
# from 2.0 on); renaming column 0 therefore silently does nothing on newer
# pandas and leaves no 'counts' column for the plot.
race_counts = df_race.value_counts().reset_index(name='counts')

In [None]:
# Grouped bars of race counts per label.
race_fig = px.histogram(
    race_counts,
    x=r,
    y='counts',
    color=l,
    barmode='group',
    opacity=0.5,
    title="Race Distributions",
)
# update_layout returns the figure, so it is still the cell's displayed output.
race_fig.update_layout(
    xaxis={'title': 'Race'},
    yaxis={'title': 'Count', 'dtick': 5},
)

In [None]:
# Age column alongside the A/D label.
df_age = df_match.filter(items=[a, l])

In [None]:
# Ages as plain lists, one per label, for the overlaid histogram traces.
x0 = df_age.loc[df_age[l].eq('A'), a].tolist()
x1 = df_age.loc[df_age[l].eq('D'), a].tolist()

In [None]:
# One histogram trace per label, built together with the figure.
fig = go.Figure(data=[
    go.Histogram(x=x0, nbinsx=5, name='A'),
    go.Histogram(x=x1, nbinsx=5, name='D'),
])

# Semi-transparent traces so both distributions stay visible when overlaid.
fig.update_traces(opacity=0.35)

# Draw the traces on top of each other and label the axes.
fig.update_layout(
    title="Age Distributions (Overlaid)",
    barmode='overlay',
    xaxis={'title': 'Age', 'dtick': 10},
    yaxis={'title': 'Count', 'dtick': 3},
)
fig.show()

This cell writes the critical output file `4-deadMatch.csv`. Take care not to overwrite an existing copy.

In [None]:
# Write the labelled cohort only when the output file does not exist yet, so
# re-running the notebook cannot silently overwrite a previously exported file.
# Delete or rename the existing file to export again.
output_path = Path("4-deadMatch.csv")
if output_path.exists():
    print(f"{output_path} already exists; not overwriting it.")
else:
    df_match.to_csv(output_path, index=False)