# Propensity Matching

In [None]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff

In [None]:
# Load the input table from '2-select.csv' (path is relative to the working directory).
df = pd.read_csv('2-select.csv')

In [None]:
# Show (rows, columns) of the freshly loaded table.
df.shape

---

In [None]:
# Short aliases for the AJCC pathologic T, N and M staging columns.
t = 'ajcc_pathologic_t'
n = 'ajcc_pathologic_n'
m = 'ajcc_pathologic_m'

In [None]:
# Stage group "a": M is not 'M1', N is none of 'N1'/'N2'/'N3', and the T value
# contains 'T2'.
# - na=False makes rows with a missing T value explicitly non-matching instead of
#   propagating NaN into the boolean mask.
# - .copy() detaches the result from `df`, so adding columns to df_a later does
#   not write to a slice of the original frame (no SettingWithCopyWarning).
df_a = df[
    (df[m] != 'M1')
    & ~df[n].isin(['N1', 'N2', 'N3'])
    & df[t].str.contains('T2', na=False)
].copy()

In [None]:
# Stage group "b": M is not 'M1', N is none of 'N1'/'N2'/'N3', and the T value is
# exactly 'T4' or 'T4a' or contains 'T3'.
# - na=False makes rows with a missing T value explicitly non-matching instead of
#   propagating NaN into the boolean mask.
# - .copy() detaches the result from `df`, so adding columns to df_b later does
#   not write to a slice of the original frame (no SettingWithCopyWarning).
df_b = df[
    (df[m] != 'M1')
    & ~df[n].isin(['N1', 'N2', 'N3'])
    & (df[t].isin(['T4', 'T4a']) | df[t].str.contains('T3', na=False))
].copy()

In [None]:
# Stage group "c": any row with M == 'M1', N in 'N1'/'N2'/'N3', or T == 'T4b'.
# .copy() detaches the result from `df`, so adding columns to df_c later does not
# write to a slice of the original frame (no SettingWithCopyWarning).
df_c = df[
    (df[m] == 'M1')
    | df[n].isin(['N1', 'N2', 'N3'])
    | (df[t] == 'T4b')
].copy()

In [None]:
# Tag every row of each group with its stage label in a 'stage_mibc' column.
# `assign` returns a new frame instead of writing into a filtered slice of `df`,
# so pandas raises no SettingWithCopyWarning and the cell needs no `%%capture`
# magic to stay quiet.
df_a = df_a.assign(stage_mibc='a')
df_b = df_b.assign(stage_mibc='b')
df_c = df_c.assign(stage_mibc='c')

In [None]:
# Stack the three stage groups back into one table. Rows of the previous `df`
# that fell into none of the groups are no longer present after this step.
df = pd.concat([df_a, df_b, df_c])

In [None]:
# Show (rows, columns) after regrouping, for comparison with the loaded table.
df.shape

---

In [None]:
# Short aliases for the column names used throughout the rest of the notebook.
v = 'vital_status'
s = 'stage_mibc'
g = 'gender'
r = 'race'
a = 'age_at_index'
pm = 'prior_malignancy'
pt = 'prior_treatment'
trt = 'treatment_or_therapy'

In [None]:
# Keep only vital status, stage group and treatment flag for a quick tabulation.
df_inspect = df.filter(items=[v, s, trt])

In [None]:
# Count rows per (vital status, stage, treatment) combination and list the
# combinations in sorted order.
df_inspect.value_counts().to_frame().sort_values(by=[v, s, trt])

In [None]:
# Split the table into the alive and the dead cohort.
dfA = df.loc[df[v].eq('Alive')]
dfD = df.loc[df[v].eq('Dead')]

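Downsample the dead patients, stratum by stratum (stage × treatment), and repeat the random draw until their counts for prior malignancy, male gender, and race "not reported" match those of the alive patients.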

In [None]:
# Randomly downsample the dead cohort (dfD) until it contains exactly as many
# patients as the alive cohort (dfA) with prior malignancy 'yes', gender 'male'
# and race 'not reported'. The accepted sample is stacked under dfA as df_match.

# Covariate column -> level whose head-count must agree between both cohorts.
# Checked in this order; all() below stops at the first mismatch.
match_levels = {pm: 'yes', g: 'male', r: 'not reported'}

# The alive cohort never changes, so its target counts are computed only once.
alive_counts = {
    column: (dfA[column] == level).sum()
    for column, level in match_levels.items()
}

# Strata that are taken whole on every attempt, selected once up front.
dfD_aNo = dfD[(dfD[s]=='a') & (dfD[trt]=='no')] #2 short
dfD_aYes = dfD[(dfD[s]=='a') & (dfD[trt]=='yes')] #1 short
dfD_bYes = dfD[(dfD[s]=='b') & (dfD[trt]=='yes')] # 3 short

# Strata that are randomly downsampled; only the draw itself is repeated.
# NOTE(review): the sample sizes 13/3/7/1 used below are presumably the alive
# counts in the same strata -- confirm against the value_counts table above.
pool_bNo = dfD[(dfD[s]=='b') & (dfD[trt]=='no')]
pool_cNo = dfD[(dfD[s]=='c') & (dfD[trt]=='no')]
pool_cYes = dfD[(dfD[s]=='c') & (dfD[trt]=='yes')]
pool_cNR = dfD[(dfD[s]=='c') & (dfD[trt]=='not reported')]

# Upper bound on random draws, so an unattainable match raises instead of
# spinning the kernel forever.
max_attempts = 100_000

all_match = False
for attempt in range(max_attempts):
    dfD_bNo = pool_bNo.sample(13) # extra 6 here
    dfD_cNo = pool_cNo.sample(3)
    dfD_cYes = pool_cYes.sample(7)
    dfD_cNR = pool_cNR.sample(1)

    dfD_sampled = pd.concat([
        dfD_aNo, dfD_aYes, dfD_bNo, dfD_bYes, dfD_cNo, dfD_cYes, dfD_cNR
    ])

    # --- Break out condition: every covariate count equals the alive count ---
    all_match = all(
        (dfD_sampled[column] == level).sum() == alive_counts[column]
        for column, level in match_levels.items()
    )
    if all_match:
        break
else:
    raise RuntimeError(
        f"No dead-cohort sample matched the alive cohort on {list(match_levels)} "
        f"within {max_attempts} attempts"
    )

df_match = pd.concat([dfA,dfD_sampled])

In [None]:
# Show (rows, columns) of the matched alive + dead table.
df_match.shape

In [None]:
# df_match = pd.read_csv('3-match.csv')

In [None]:
# Human-readable labels for the stage and treatment codes; they are later
# concatenated into a single axis label.
display_labels = {
    'a': 'stageA',
    'b': 'stageB',
    'c': 'stageC',
    'no': '_noChemo',
    'yes': '_yesChemo',
    'not reported': '_notReport',
}
# Keep only stage, treatment and vital status, then swap codes for labels.
df_matched = df_match.filter(items=[s, trt, v]).replace(display_labels)

In [None]:
# Rows of the relabelled table that belong to the alive cohort.
df_matched_alive = df_matched.loc[df_matched[v].eq('Alive')]

In [None]:
# Number of alive patients per (stage, treatment) pair, as a flat table with a
# 'count' column.
alive_group_sizes = df_matched_alive.groupby([s, trt]).size()
df_matched_alive = alive_group_sizes.reset_index(name='count')

In [None]:
# Re-attach the vital status, which the group-by aggregation dropped.
df_matched_alive = df_matched_alive.assign(**{v: 'Alive'})

In [None]:
# Rows of the relabelled table that belong to the dead cohort.
df_matched_dead = df_matched.loc[df_matched[v].eq('Dead')]

In [None]:
# Number of dead patients per (stage, treatment) pair, as a flat table with a
# 'count' column.
dead_group_sizes = df_matched_dead.groupby([s, trt]).size()
df_matched_dead = dead_group_sizes.reset_index(name='count')

In [None]:
# Re-attach the vital status, which the group-by aggregation dropped.
df_matched_dead = df_matched_dead.assign(**{v: 'Dead'})

In [None]:
# Stack the per-cohort count tables (alive first, then dead). The row index is
# not reset, so index labels repeat across the two halves.
df_matched = pd.concat([df_matched_alive, df_matched_dead])

In [None]:
# Fuse the stage and treatment labels into one categorical axis label,
# e.g. 'stageA' + '_noChemo'.
status_column = 'Status: MIBC_Stage_&_Chemotherapy'
stage_labels = df_matched[s]
treatment_labels = df_matched[trt]
df_matched[status_column] = stage_labels + treatment_labels

In [None]:
# The separate stage and treatment columns are redundant once fused; drop them.
df_matched = df_matched.drop([s, trt], axis='columns')

In [None]:
# Grouped bars: per stage/chemotherapy label, the summed 'count' of alive vs.
# dead patients.
stage_chemo_fig = px.histogram(
    df_matched,
    x="Status: MIBC_Stage_&_Chemotherapy",
    y='count',
    color=v,
    barmode='group',
    opacity=0.5,
    title="Propensity Matching by Pathological Stage & Adjuvant Chemotherapy Distributions",
)
stage_chemo_fig.update_layout(yaxis={'dtick': 2})
# Bare expression so the notebook renders the figure.
stage_chemo_fig

In [None]:
# Prior-treatment flag alongside vital status, for the distribution plot.
df_pt = df_match.filter(items=[pt, v])

In [None]:
# Rows per (prior treatment, vital status) combination, as a flat table.
# reset_index(name=...) names the count column explicitly: value_counts() yields
# an unnamed result (column 0) on pandas < 2.0 but one named 'count' on >= 2.0,
# so renaming column 0 would silently leave no 'counts' column on newer pandas.
pt_counts = df_pt.value_counts().reset_index(name='counts')

In [None]:
# Grouped bars: prior-treatment level vs. summed counts, coloured by vital status.
pt_fig = px.histogram(
    pt_counts,
    x=pt,
    y='counts',
    color=v,
    barmode='group',
    opacity=0.5,
    title="Prior Treatment Distributions",
)
pt_fig.update_layout(
    xaxis={'title': pt},
    yaxis={'title': 'Count', 'dtick': 5},
)
# Bare expression so the notebook renders the figure.
pt_fig

In [None]:
# Prior-malignancy flag alongside vital status, for the distribution plot.
df_pm = df_match.filter(items=[pm, v])

In [None]:
# Rows per (prior malignancy, vital status) combination, as a flat table.
# reset_index(name=...) names the count column explicitly: value_counts() yields
# an unnamed result (column 0) on pandas < 2.0 but one named 'count' on >= 2.0,
# so renaming column 0 would silently leave no 'counts' column on newer pandas.
pm_counts = df_pm.value_counts().reset_index(name='counts')

In [None]:
# Grouped bars: prior-malignancy level vs. summed counts, coloured by vital status.
pm_fig = px.histogram(
    pm_counts,
    x=pm,
    y='counts',
    color=v,
    barmode='group',
    opacity=0.5,
    title="Prior Malignancy Distributions",
)
pm_fig.update_layout(
    xaxis={'title': pm},
    yaxis={'title': 'Count', 'dtick': 5},
)
# Bare expression so the notebook renders the figure.
pm_fig

In [None]:
# Gender alongside vital status, for the distribution plot.
df_genders = df_match.filter(items=[g, v])

In [None]:
# Rows per (gender, vital status) combination, as a flat table.
# reset_index(name=...) names the count column explicitly: value_counts() yields
# an unnamed result (column 0) on pandas < 2.0 but one named 'count' on >= 2.0,
# so renaming column 0 would silently leave no 'counts' column on newer pandas.
gender_counts = df_genders.value_counts().reset_index(name='counts')

In [None]:
# Grouped bars: gender vs. summed counts, coloured by vital status.
gender_fig = px.histogram(
    gender_counts,
    x=g,
    y='counts',
    color=v,
    barmode='group',
    opacity=0.5,
    title="Gender Distributions",
)
gender_fig.update_layout(
    xaxis={'title': g},
    yaxis={'title': 'Count', 'dtick': 5},
)
# Bare expression so the notebook renders the figure.
gender_fig

The only race that I could get to match was "not reported"

In [None]:
# Race alongside vital status, for the distribution plot.
df_race = df_match.filter(items=[r, v])

In [None]:
# Rows per (race, vital status) combination, as a flat table.
# reset_index(name=...) names the count column explicitly: value_counts() yields
# an unnamed result (column 0) on pandas < 2.0 but one named 'count' on >= 2.0,
# so renaming column 0 would silently leave no 'counts' column on newer pandas.
race_counts = df_race.value_counts().reset_index(name='counts')

In [None]:
# Grouped bars: race vs. summed counts, coloured by vital status.
race_fig = px.histogram(
    race_counts,
    x=r,
    y='counts',
    color=v,
    barmode='group',
    opacity=0.5,
    title="Race Distributions",
)
race_fig.update_layout(
    xaxis={'title': r},
    yaxis={'title': 'Count', 'dtick': 5},
)
# Bare expression so the notebook renders the figure.
race_fig

In [None]:
# Age alongside vital status, for the overlaid histograms.
df_age = df_match.filter(items=[a, v])

In [None]:
# Plain lists of ages per cohort: x0 = alive, x1 = dead.
x0 = df_age.loc[df_age[v] == 'Alive', a].tolist()
x1 = df_age.loc[df_age[v] == 'Dead', a].tolist()

In [None]:
# One semi-transparent histogram per cohort; the low opacity keeps both visible
# where they overlap.
fig = go.Figure(
    data=[
        go.Histogram(x=x0, nbinsx=5, name='Alive', opacity=0.35),
        go.Histogram(x=x1, nbinsx=5, name='Dead', opacity=0.35),
    ]
)

# Draw the two histograms on top of each other rather than side by side.
fig.update_layout(
    title="Age Distributions (Overlaid)",
    barmode='overlay',
    xaxis={'title': a, 'dtick': 10},
    yaxis={'title': 'Count', 'dtick': 3},
)
fig.show()

The export below is commented out to prevent accidentally overwriting the critical file `3-match.csv`.

In [None]:
# df_match.to_csv("3-match.csv",index=False)