In [None]:
import pandas as pd
import re
import numpy as np
from sklearn.model_selection import train_test_split
import pickle

import seaborn as sns
import matplotlib.pyplot as plt

# Create tabular datasets using sliding windows

* Use sliding window method to create temporally aware datasets.
* Use a 10-year observation window followed immediately by a 10-year prediction window; slide both windows forward 1 year at a time.
* Censor patients at: ovarian cancer diagnosis, death, bilateral oophorectomy or bilateral salpingo-oophorectomy, or 1 January 2022.


In [None]:
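# endo_demo = starting demographic data (DataFrame keyed by 'Masked_PersonID'; the cells below also read 'BDate' and 'ovarian_ca_dx_date')
# temporal_events = starting temporal data (DataFrame with 'Masked_PersonID', 'event', 'value' and 'Event_Date' columns)
# NOTE: both must already be defined before the cells below are run; they are not created in the cells shown here.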

### Get start and censor dates for both cohorts

In [None]:
# Load the per-patient start dates. This table is inner-merged onto endo_demo
# by 'Masked_PersonID' further down and is expected to supply the 'start_date'
# column that seeds the first observation window.
# NOTE(review): presumably one row per Masked_PersonID -- confirm; duplicate IDs
# would duplicate patients in the merge.
# Pickle files can execute arbitrary code on load: only read files produced by
# this project.
start_dates = pd.read_pickle('start_dates.pkl')
# Quick visual check (disabled): start_dates.head()

In [None]:
# Load the per-patient censor dates. This table is inner-merged onto endo_demo
# by 'Masked_PersonID' further down and is expected to supply the 'end_date'
# column that decides whether a patient still contributes a given window.
# NOTE(review): presumably one row per Masked_PersonID -- confirm; duplicate IDs
# would duplicate patients in the merge.
# Pickle files can execute arbitrary code on load: only read files produced by
# this project.
end_dates = pd.read_pickle('end_dates.pkl')
# Quick visual check (disabled): end_dates.head()

### Create sliding window datasets

In [None]:
# Observation Window = 10 years
# Prediction Window = 10 years
# Both windows slide forward one year per step; every (patient, window) pair
# that is still under follow-up at the end of the observation window becomes
# one row of `outcomes`.
OBS_YEARS = 10
PRED_YEARS = 10
WINDOW_STEP = pd.offsets.DateOffset(years=1)
WINDOW_COLS = ['obs_start', 'obs_end', 'pred_start', 'pred_end']

# Length of one year used to express age: 365.2425 days = 31,556,952 seconds,
# i.e. the same duration numpy uses for np.timedelta64(1, 'Y').  Dividing by
# np.timedelta64(1, 'Y') directly is rejected by pandas >= 2.0 because 'Y' is
# an ambiguous unit, so the duration is spelled out explicitly.
ONE_YEAR = pd.Timedelta(seconds=31556952)


def _build_window_rows(demo, window_number):
    """Return the outcome rows for the window currently stored in `demo`.

    Parameters
    ----------
    demo : DataFrame
        Demographics with 'Masked_PersonID', 'BDate', 'ovarian_ca_dx_date',
        'end_date' and the four window columns ('obs_start', 'obs_end',
        'pred_start', 'pred_end') set to the window being built.
    window_number : int
        1-based index of the window; appended to the patient ID to form
        'new_ID'.

    Returns
    -------
    DataFrame
        One row per patient whose 'end_date' is on or after 'obs_end', with
        'age' (years at 'obs_end'), 'ovarian_ca' and 'new_ID' added and
        'BDate' / 'ovarian_ca_dx_date' removed.  May be empty.
    """
    # Keep only patients still under follow-up when the observation window ends.
    # NOTE(review): eligibility only checks obs_end, so a patient censored part
    # way through the prediction window still yields a row -- confirm intended.
    rows = demo.loc[demo['end_date'] >= demo['obs_end']].reset_index(drop=True).copy()

    rows['age'] = (rows['obs_end'] - rows['BDate']) / ONE_YEAR

    # Positive label: diagnosis date falls in [pred_start, pred_end).
    diagnosed_in_window = ((rows['pred_start'] <= rows['ovarian_ca_dx_date']) &
                           (rows['ovarian_ca_dx_date'] < rows['pred_end']))
    # Create the column up front so the masked assignment also works on a
    # zero-row frame (creating it through .loc with a scalar fails there).
    # NOTE(review): rows that are not positive keep their previous value (NaN
    # when the column is new) -- confirm downstream code treats NaN as negative.
    if 'ovarian_ca' not in rows.columns:
        rows['ovarian_ca'] = np.nan
    rows.loc[diagnosed_in_window, 'ovarian_ca'] = 1

    rows = rows.drop(columns=['BDate', 'ovarian_ca_dx_date'])
    rows['new_ID'] = rows['Masked_PersonID'].astype(str) + '_' + str(window_number)
    return rows


# Attach start / censor dates.  The guards make the cell safe to re-run:
# merging a second time would create 'start_date_x' / 'start_date_y' style
# columns and break the lookups below.
# NOTE(review): assumes 'start_date' / 'end_date' only ever come from these
# two merges -- confirm.
if 'start_date' not in endo_demo.columns:
    endo_demo = endo_demo.merge(start_dates, how='inner', on='Masked_PersonID')
if 'end_date' not in endo_demo.columns:
    endo_demo = endo_demo.merge(end_dates, how='inner', on='Masked_PersonID')

# Initialize the first window at each patient's start date.
endo_demo['obs_start'] = endo_demo['start_date']
endo_demo['obs_end'] = endo_demo['obs_start'] + pd.offsets.DateOffset(years=OBS_YEARS)
endo_demo['pred_start'] = endo_demo['obs_end']
endo_demo['pred_end'] = endo_demo['pred_start'] + pd.offsets.DateOffset(years=PRED_YEARS)

i = 1
# Collect the per-window frames and concatenate once at the end instead of
# re-copying the growing result on every iteration.
window_frames = [_build_window_rows(endo_demo, i)]

while True:
    # Increment: move every window boundary forward by one step.
    i = i + 1
    for window_col in WINDOW_COLS:
        endo_demo[window_col] = endo_demo[window_col] + WINDOW_STEP

    temp_outcomes = _build_window_rows(endo_demo, i)

    # Stop once no patient is still under follow-up at the end of the
    # observation window; eligibility can only shrink as the window advances.
    if temp_outcomes.empty:
        break

    window_frames.append(temp_outcomes)

outcomes = pd.concat(window_frames)

In [None]:
# Events that may be used as features: every event type except the three
# listed here.
is_feature_event = ~temporal_events['event'].isin(['ovarian_ca', 'Unknown', 'bilateral_removal'])

# Pair every window row with that patient's feature events, then keep only
# the events dated inside the observation window [obs_start, obs_end).
# The left join is filtered immediately, so windows without any in-window
# event drop out here and are restored by the left merge at the bottom.
observations = (
    outcomes
    .merge(temporal_events.loc[is_feature_event], how='left', on='Masked_PersonID')
    .loc[lambda pairs: (pairs['Event_Date'] >= pairs['obs_start']) &
                       (pairs['Event_Date'] < pairs['obs_end'])]
    .reset_index(drop=True)
)

# One row per window, one column per event type, holding the maximum value
# recorded for that event inside the window.
obs_pivot = pd.pivot_table(
    observations[['new_ID', 'value', 'event']],
    values='value',
    index='new_ID',
    columns='event',
    aggfunc='max',
).reset_index()

# Attach the features to every window row and drop the date columns that
# were only needed to build the windows.
window_date_cols = ['start_date', 'end_date', 'obs_start', 'obs_end', 'pred_start', 'pred_end']
final_combined = (
    outcomes
    .merge(obs_pivot, how='left', on='new_ID')
    .drop(columns=window_date_cols)
)

In [None]:
# Persist the window-level table (one row per new_ID: outcome label, age and
# the pivoted event features) for the 10-year observation / 10-year
# prediction configuration.
final_combined.to_pickle('observation_10_prediction_10_endo.pkl')

In [None]:
# Number of window rows labelled positive (NaN labels are skipped by sum),
# followed by the notebook's truncated rendering of the table itself.
print(final_combined.loc[:, 'ovarian_ca'].sum())
final_combined

In [None]:
# Observation Window = 10 years
# Prediction Window = 10 years
# 1,047,367 rows
# 1,413 ovarian cancer cases