# Data Exploration and Preprocessing
TCGA Reannotated Ovarian Cancer Clinical Data 

In [108]:
import numpy as np
import pandas as pd
from functools import reduce
import itertools

Import Data

In [109]:
# Villalobos 2018 reannotated TCGA data (https://ascopubs.org/doi/suppl/10.1200/CCI.17.00096)
tcga_ov_1 = pd.read_excel('https://github.com/bmurphy1993/Cancer_Reinforcement_Learning/raw/main/Data/Villalobos_TCGA/ds_CCI.17.00096-1.xlsx',
                          sheet_name='Master clinical dataset'
                          )

tcga_ov_2 = pd.read_excel('https://github.com/bmurphy1993/Cancer_Reinforcement_Learning/raw/main/Data/Villalobos_TCGA/ds_CCI.17.00096-2.xlsx',
                          sheet_name='Months'
                          )

tcga_ov_3 = pd.read_excel('https://github.com/bmurphy1993/Cancer_Reinforcement_Learning/raw/main/Data/Villalobos_TCGA/ds_CCI.17.00096-3.xlsx',
                          sheet_name='clinical_drug_all_OV.txt'
                          )

# TCGA Drug standardization (https://gdisc.bme.gatech.edu/cgi-bin/gdisc/tap5.cgi#)
drugs = pd.read_csv('https://raw.githubusercontent.com/bmurphy1993/Cancer_Reinforcement_Learning/main/Data/DrugCorrection1.csv')
drugs['Correction'] = drugs['Correction'].str.strip()

## Clean and Reorganize

TCGA 3: Clean

In [110]:
# Drop columns with all missing values
print(tcga_ov_3.shape)
print(tcga_ov_3.isnull().sum(), '\n')
tcga_ov_3_drop = tcga_ov_3.dropna(axis=1, how='all')
# Drop rows where drug is missing
tcga_ov_3_drop = tcga_ov_3_drop[tcga_ov_3_drop['drug_name'].notna()]
# Drop rows where start AND end are missing (right now it's actuall OR, see below)
tcga_ov_3_drop = tcga_ov_3_drop.dropna(how='any', subset=['days_to_drug_therapy_end', 'days_to_drug_therapy_start']) #change to how='all' when figure out what to do with start OR end missing

# Standardize drug names
    # Merge using values from TCGA drug standardization (https://gdisc.bme.gatech.edu/cgi-bin/gdisc/tap5.cgi)
tcga_ov_3_clean = tcga_ov_3_drop.merge(drugs, how='left', left_on='drug_name', right_on='OldName').drop(['OldName'], axis=1)
    # Additional replacement rules
tcga_ov_3_clean.loc[tcga_ov_3_clean['drug_name'] == 'Doxoribicin', 'Correction'] = 'Doxorubicin'
tcga_ov_3_clean.loc[tcga_ov_3_clean['drug_name'] == 'gemcitabin', 'Correction'] = 'Gemcitabine'
tcga_ov_3_clean.loc[tcga_ov_3_clean['drug_name'] == 'Hexlalen', 'Correction'] = 'Altretamine'
tcga_ov_3_clean.loc[tcga_ov_3_clean['drug_name'] == 'Cisplatin/Gemzar', 'Correction'] = 'Cisplatin' # This applies to only one line, which has another sample just for Gemzar (Gemcitabine)
tcga_ov_3_clean.loc[tcga_ov_3_clean['drug_name'] == 'Ilex', 'Correction'] = 'Ilex'
tcga_ov_3_clean.loc[tcga_ov_3_clean['drug_name'] == 'ILIZ', 'Correction'] = 'ILIZ'
tcga_ov_3_clean.loc[tcga_ov_3_clean['drug_name'] == 'Lily', 'Correction'] = 'Lily'

    # Print replacement rules and replace 'drug_name'
drug_name_old = tcga_ov_3_clean['drug_name']
drug_name_new = tcga_ov_3_clean['Correction']
rules = pd.DataFrame({'drug_name_old': drug_name_old, 'drug_name_new': drug_name_new}).drop_duplicates().sort_values(by=['drug_name_old']).reset_index().drop('index', axis=1)

pd.set_option('display.max_rows', None)
print('Replacement Rules:\n', rules, '\n')
pd.reset_option('max_rows')

tcga_ov_3_clean['drug_name'] = tcga_ov_3_clean['Correction']
tcga_ov_3_clean = tcga_ov_3_clean.drop('Correction', axis=1)

    # List of drugs in dataset
drug_list = [x for x in list(tcga_ov_3_clean['drug_name'].drop_duplicates()) if str(x) != 'nan']
drug_list.sort()
print('Unique Drugs:', len(drug_list), '\n', drug_list, '\n')

# Drop where therapy start = therapy end
tcga_ov_3_clean = tcga_ov_3_clean[tcga_ov_3_clean['days_to_drug_therapy_end'] != tcga_ov_3_clean['days_to_drug_therapy_start']] 

(2463, 19)
bcr_patient_barcode                 0
bcr_drug_barcode                    0
days_to_drug_therapy_end          339
days_to_drug_therapy_start        145
days_to_drug_treatment_end       2463
days_to_drug_treatment_start     2463
dosage_units                     2463
drug_category                       0
drug_dosage                      2463
drug_name                          11
initial_course                   2463
number_cycles                     391
regimen_indication                  2
regimen_indication_notes         2328
route_of_administration           243
route_of_administration_notes    2126
therapy_ongoing                   167
total_dose                        746
total_dose_units                  715
dtype: int64 

Replacement Rules:
                         drug_name_old            drug_name_new
0                      5F4 Leucovorin  Fluorouracil+Leucovorin
1                         90Y-HU3S193                  Hu3S193
2                             AMG 706      

TCGA 3: Fix/standardize time variables and fix order of therapy lines

In [111]:
# Fix values where start and end are switched
tcga_ov_3_clean.loc[tcga_ov_3_clean['days_to_drug_therapy_start'] > tcga_ov_3_clean['days_to_drug_therapy_end'], ['days_to_drug_therapy_start', 'days_to_drug_therapy_end']] = tcga_ov_3_clean.loc[tcga_ov_3_clean['days_to_drug_therapy_start'] > tcga_ov_3_clean['days_to_drug_therapy_end'], ['days_to_drug_therapy_end', 'days_to_drug_therapy_start']].values

# Set earliest drug therapy start to zero and subtract everything else by min days
ther_start = tcga_ov_3_clean.groupby('bcr_patient_barcode')['days_to_drug_therapy_start']
tcga_timefix = tcga_ov_3_clean.assign(start_day=ther_start.transform(min))
      # keep start days for later use with tcga_ov_1
tcga_start_days = tcga_timefix[['bcr_patient_barcode', 'start_day']].drop_duplicates()

tcga_timefix['therapy_start'] = tcga_timefix['days_to_drug_therapy_start'] - tcga_timefix['start_day']
tcga_timefix['therapy_end'] = tcga_timefix['days_to_drug_therapy_end'] - tcga_timefix['start_day']
tcga_timefix = tcga_timefix.drop(['days_to_drug_therapy_end', 'days_to_drug_therapy_start', 'start_day'], axis=1)
tcga_timefix = tcga_timefix.sort_values(by=['bcr_patient_barcode', 'therapy_start', 'therapy_end'])

# Set up state list for each patient: barcode, timing, drug combo
tcga_drug_lines = []
for barcode in tcga_timefix['bcr_patient_barcode'].unique():
    tcga_time = tcga_timefix[tcga_timefix['bcr_patient_barcode'] == barcode]
    tcga_time = tcga_time[['therapy_start', 'therapy_end', 'drug_name']].drop_duplicates(keep='first').values.tolist() # Drop duplicate drugs that have different dosages or administration but same timing

    points = [] # list of (offset, plus/minus, drug) tuples
    for start,stop,drug in tcga_time:
        points.append((start,'+',drug))
        points.append((stop,'-',drug))
    points.sort()

    ranges = [] # output list of (start, stop, drug_set) tuples
    current_set = []
    last_start = None
    for offset,pm,drug in points:
        if pm == '+':
            if last_start is not None:
                ranges.append([last_start,offset,list(set(current_set.copy()))])
            current_set.append(drug)
            last_start = offset
        elif pm == '-':
            ranges.append([last_start,offset,list(set(current_set.copy()))])
            current_set.remove(drug)
            last_start = offset

    # Finish off
    if last_start is not None:
        ranges.append([last_start,offset,list(set(current_set.copy()))])

    # Remove the ranges where start = stop
    range_drug = []
    for i in range(len(ranges)):
        if ranges[i][0] != ranges[i][1]: # add condition:  <& (ranges[i][2] != [])> if drop no-drug periods
            range_drug.append(ranges[i])

    # Remove overlapping/back-to-back duplicate lines. Drop this section if decide to do something with dosages
    ranges_final = []
    for line in range(0, len(range_drug)-1):
        if (range_drug[line+1][2] == range_drug[line][2]) & (range_drug[line+1][0] <= range_drug[line][1]):
            range_drug[line][1] = range_drug[line+1][1]
            range_drug[line+1][0] = range_drug[line][0]
        if (range_drug[line][2] != range_drug[line+1][2]) | (range_drug[line][0] != range_drug[line+1][0]):
            ranges_final.append(range_drug[line])
    if (range_drug[len(range_drug)-1][2] != range_drug[len(range_drug)-2][2]) | (range_drug[len(range_drug)-1][0] != range_drug[len(range_drug)-2][0]):
        ranges_final.append(range_drug[len(range_drug)-1])
    
    # Add the number of previous lines of therapy
    for line in range(len(ranges_final)):
        if line == 0:
            ranges_final[line].append(0) 
        elif ranges_final[line-1][2] == []:
            ranges_final[line].append(ranges_final[line-1][3])
        else:
            ranges_final[line].append(ranges_final[line-1][3] + 1)

    for line in range(len(ranges_final)):
        ranges_final[line].insert(0, barcode)

    tcga_drug_lines.extend(ranges_final)

tcga_drug_lines[3] = ['TCGA-04-1332', 0.0, 151.0, ['Carboplatin', 'Paclitaxel', 'Topotecan'], 0] # Special case to fix. Make sure to check this if make changes above

# Back to df
lines_df = pd.DataFrame(tcga_drug_lines, columns=['bcr_patient_barcode', 'start', 'end', 'therapy', 'previous_lines'])

# List of patient barcodes
tcga_barcodes = list(lines_df['bcr_patient_barcode'].unique())

# Notes
    # One thing to be aware of is that this code drops all values where therapy start and therapy end are equal

TCGA 1: Clean

In [113]:
# Keep subset of variables 
print('NaNs: ', tcga_ov_1.isnull().sum())
tcga_ov_1_keep = tcga_ov_1[['bcr_patient_barcode',
                            'total_days_overall_survival',
                            'outcome_overall_survival_censoring',
                            # 'vital_status',
                            # 'days_to_tumor_progression',
                            # 'days_to_death',
                            # 'days_to_last_followup',
                            # 'days_to_tumor_recurrence',
                            # 'time_to_failure',  
                            # 'Cycles_of_adjuvant_therapy',
                            # 'Adjuvant_chemotherapy_dose_intensity',
                            'age_at_initial_pathologic_diagnosis',
                            # 'anatomic_organ_subdivision',
                            # 'days_to_birth',
                            # 'initial_pathologic_diagnosis_method',
                            # 'person_neoplasm_cancer_status',
                            # 'pretreatment_history',
                            # 'primary_therapy_outcome_success', # ask EKO about this
                            'race',
                            # 'residual_tumor',
                            # 'site_of_tumor_first_recurrence',
                            # 'tissue_source_site',
                            'tumor_grade',
                            # 'tumor_residual_disease',
                            'tumor_stage',
                            # 'tumor_tissue_site'
                            # 'year_of_initial_pathologic_diagnosis',
                            # 'Days off platinum prior to recurrence 1st line',
                            # 'Last day of platinum 1st line',
                            # 'Chemotherapy number of lines of therapy'
                            ]]

# Drop cases that don't have a survival metric
tcga_ov_1_keep = tcga_ov_1_keep.dropna(subset=['total_days_overall_survival'])
tcga_ov_1_keep.drop(tcga_ov_1_keep[tcga_ov_1_keep['total_days_overall_survival'] == 'cannot assess'].index, inplace=True)

# Only keep samples that are in the cleaned 'lines' data
tcga_ov_1_keep = tcga_ov_1_keep[tcga_ov_1_keep['bcr_patient_barcode'].isin(tcga_barcodes)].sort_values(by=['bcr_patient_barcode']).reset_index().drop('index', axis=1)

# Adjust final survival by start of therapy day
tcga_start_days = tcga_start_days[tcga_start_days['bcr_patient_barcode'].isin(tcga_barcodes)].sort_values(by=['bcr_patient_barcode']).reset_index().drop('index', axis=1)
tcga_ov_1_keep['total_days_overall_survival'] = tcga_ov_1_keep['total_days_overall_survival'] - tcga_start_days['start_day'] # this needs work, getting some negative values

tcga_ov_1_keep

NaNs:  bcr_patient_barcode                                 4
total_days_overall_survival                        28
outcome_overall_survival_censoring                 16
vital_status                                       34
days_to_tumor_progression                         570
                                                 ... 
5th_chemo_regimen_days_outcome                    524
6th_chemo_regimen_days_outcome                    552
Days off platinum prior to recurrence 1st line     99
Last day of platinum 1st line                     108
Chemotherapy number of lines of therapy            27
Length: 72, dtype: int64


Unnamed: 0,bcr_patient_barcode,total_days_overall_survival,outcome_overall_survival_censoring,age_at_initial_pathologic_diagnosis,race,tumor_grade,tumor_stage
0,TCGA-04-1331,1300,1,79.0,WHITE,G3,IIIC
1,TCGA-04-1332,1217,1,70.0,WHITE,G3,IIIC
2,TCGA-04-1336,1445,0,55.0,WHITE,G3,IIIB
3,TCGA-04-1338,1418,0,78.0,WHITE,G3,IIIC
4,TCGA-04-1342,531,1,80.0,WHITE,G2,IV
...,...,...,...,...,...,...,...
273,TCGA-61-2109,615,1,40.0,WHITE,G3,IIIC
274,TCGA-61-2110,1334,1,56.0,WHITE,,IIIC
275,TCGA-61-2111,3795,0,61.0,WHITE,G3,IV
276,TCGA-61-2113,627,1,54.0,WHITE,G3,IIC


Add death event to drug lines data

In [115]:
# Merge in final death event for each patient
lines_df_2 = lines_df.merge(tcga_ov_1_keep[['bcr_patient_barcode', 'outcome_overall_survival_censoring']], on='bcr_patient_barcode').rename(columns={'bcr_patient_barcode': 'patient', 'outcome_overall_survival_censoring': 'death'})

# Only keep death event on last line
lines_df_2.loc[lines_df_2['patient'] == lines_df_2['patient'].shift(-1), 'death'] = 0

# Function to get indices of a therapy
def get_index_pos(my_list, val):
    return [i for i, x in enumerate(my_list) if x == val]

## Create MDP objects

Action set

In [112]:
combos = []
for i in range(len(tcga_drug_lines)):
    combos.append(tcga_drug_lines[i][3])

combos.sort()
combos = list(combos for combos,_ in itertools.groupby(combos))
print(len(combos))
combos

128


[[],
 ['Abagovomab'],
 ['Aldesleukin'],
 ['Altretamine'],
 ['Aminocamptothecin'],
 ['Anastrozole'],
 ['Bevacizumab'],
 ['CBP501'],
 ['CEP -11981'],
 ['Capecitabine'],
 ['Capecitabine', 'Docetaxel'],
 ['Carboplatin'],
 ['Carboplatin', 'Bevacizumab'],
 ['Carboplatin', 'Cisplatin', 'Gemcitabine'],
 ['Carboplatin', 'Cyclophosphamide'],
 ['Carboplatin', 'Cyclophosphamide', 'Topotecan'],
 ['Carboplatin', 'Docetaxel'],
 ['Carboplatin', 'Docetaxel', 'Bevacizumab'],
 ['Carboplatin', 'Docetaxel', 'Leuprolide'],
 ['Carboplatin', 'Doxorubicin'],
 ['Carboplatin', 'Etoposide'],
 ['Carboplatin', 'Gemcitabine'],
 ['Carboplatin', 'Gemcitabine', 'Bevacizumab'],
 ['Carboplatin', 'Ifosfamide'],
 ['Carboplatin', 'Paclitaxel', 'Topotecan'],
 ['Carboplatin', 'Tamoxifen'],
 ['Carboplatin', 'Tamoxifen', 'Docetaxel'],
 ['Carboplatin', 'Tamoxifen', 'Gemcitabine'],
 ['Carboplatin', 'Tamoxifen', 'Paclitaxel'],
 ['Catumaxomab'],
 ['Cediranib'],
 ['Cetuximab'],
 ['Chlorambucil'],
 ['Cisplatin'],
 ['Cisplatin', 'Cycl

Intermittant reward matrix for each patient

In [116]:
nactions = len(combos) # action set is list of unique drug combos
nstates = 2 # two state probabilities: Death and Survive

# Initialize transition matrix
transitions = np.zeros((nactions,nstates))

# Calculate probabilities
for i in range(nactions):
    inds = get_index_pos(list(lines_df_2['therapy']), combos[i])

    d_prob = lines_df_2[['death']].iloc[inds, :].sum().values[0] / len(inds)

    transitions[i][0] = d_prob
    transitions[i][1] = 1 - d_prob

transitions

array([[0.        , 1.        ],
       [0.        , 1.        ],
       [0.        , 1.        ],
       [0.2       , 0.8       ],
       [0.        , 1.        ],
       [0.        , 1.        ],
       [0.        , 1.        ],
       [0.        , 1.        ],
       [0.        , 1.        ],
       [0.33333333, 0.66666667],
       [0.        , 1.        ],
       [0.08641975, 0.91358025],
       [0.        , 1.        ],
       [0.        , 1.        ],
       [0.25      , 0.75      ],
       [1.        , 0.        ],
       [0.13513514, 0.86486486],
       [0.        , 1.        ],
       [0.        , 1.        ],
       [0.        , 1.        ],
       [0.        , 1.        ],
       [0.15384615, 0.84615385],
       [0.        , 1.        ],
       [0.        , 1.        ],
       [0.        , 1.        ],
       [0.        , 1.        ],
       [0.        , 1.        ],
       [0.        , 1.        ],
       [0.        , 1.        ],
       [1.        , 0.        ],
       [0.

Reward matrix

## Scratch/notes

Notes
* Add in "remission" as a state (i.e. no treatment)
  - Calculate similar to probability of death: probability that a no-treatment period follows a treatment
  - Then need to calculate probability each prob following "remission":
    - Death
    - Survive
    - More treatment i.e. "progression/recurrence" 
  - If do this, no-treatment ("[]") will not be in the action set

* "Grid world" verion of action space
  - 56 dimensional object with up to 5 drugs activated at once
  - \> 4mm possible combinations, only 127 drug combos in data - is this necessary? 

* Rewards
  - I think in the final version this should be a probability distribution returning a number of days of survival/until treatment failure after each action
  - For starters, just try to get the average days survived when therapy doesn't result in death.
  - Also need to think of something for when patients don't die
    - Maybe just probability of death and a final non-death stop state?
    - Death vs. no death in the end doesn't matter unless the no deaths get a final reward

In [118]:
# fix because including no drug periods and using df

# tcga_rewards = {}
# for barcode in tcga_lines_keys:
#     rewards = []
#     for line in range(len(tcga_drug_lines[barcode])):
#         try:
#             rewards.append(tcga_drug_lines[barcode][line+1][0] - tcga_drug_lines[barcode][line][0])
#         except:
#             rewards.append(tcga_ov_1_keep.loc[tcga_ov_1_keep['bcr_patient_barcode'] == barcode, 'total_days_overall_survival'].iloc[0] - tcga_drug_lines[barcode][line][0])

#     tcga_rewards[barcode] = rewards

# tcga_rewards

In [119]:
# negs = []
# for barcode in tcga_rewards:
#     if tcga_rewards[barcode][-1] < 0:
#         negs.append(barcode)

# negs

In [120]:
# Add in final state for each patient: [time, 'death']
# Figure out what to do with patients where overall survival is < the end of the last therapy line

In [121]:
# define transition matrix
# def transition(state, action): 
#     if state,action = (living, drug A): 
#         then return 1 if np.random() < 0.8 …. 
        
#     if state, action = (living, drug B), return 1 
    
#     if ..
