In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
# Show every column when a DataFrame is displayed (None disables column truncation).
pd.set_option('display.max_columns', None)
# Colab-only: mount Google Drive so the /content/drive/... paths used below resolve.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
# Read the diabetes cohort; its BENE_IDs decide which enrollment rows are kept.
diabetes = pd.read_csv('/content/drive/MyDrive/Data/Output/diabetes_cohort.csv')

# Read the enrollment table and keep, for cohort members only, the
# demographic / enrollment-year columns needed downstream.
enrollment = pd.read_csv('/content/drive/MyDrive/Data/Output/MemberYear.csv')
enrollment_diab = enrollment.loc[
    enrollment['BENE_ID'].isin(diabetes['BENE_ID']),
    ['BENE_ID', 'SEX_IDENT_CD', 'BENE_RACE_CD', 'BENE_ENROLLMT_REF_YR', 'AGE_CALC'],
]

# Display only: the sorted view is shown but not stored.
enrollment_diab.sort_values(by=['BENE_ID', 'BENE_ENROLLMT_REF_YR'])

Unnamed: 0,BENE_ID,SEX_IDENT_CD,BENE_RACE_CD,BENE_ENROLLMT_REF_YR,AGE_CALC
5974,-10000010288010,1,5,2015,54
12262,-10000010288010,1,5,2016,55
18875,-10000010288010,1,5,2017,56
25877,-10000010288010,1,5,2018,57
33323,-10000010288010,1,5,2019,58
...,...,...,...,...,...
41167,-10000010254671,2,2,2021,80
49413,-10000010254671,2,2,2022,81
58085,-10000010254671,2,2,2023,82
67264,-10000010254671,2,2,2024,83


There are multiple rows per patient, one per enrollment year, and AGE_CALC is the patient's age at the end of that enrollment year. We keep only the row for each patient's initial enrollment year.

In [8]:
# Keep exactly one row per patient: the row for their earliest enrollment year.
#
# GroupBy.first() is deliberately not used here: it returns the first non-null
# value of each column independently, so a missing value in the earliest-year
# row would be silently filled from a later year and the result would no longer
# correspond to a single real enrollment row.
enrollment_diab = (
    enrollment_diab
    .dropna(subset=['BENE_ID'])  # groupby ignored rows with a missing key; keep doing so
    .sort_values(['BENE_ID', 'BENE_ENROLLMT_REF_YR'])
    .drop_duplicates(subset='BENE_ID', keep='first')
    .reset_index(drop=True)
)
enrollment_diab

Unnamed: 0,BENE_ID,SEX_IDENT_CD,BENE_RACE_CD,BENE_ENROLLMT_REF_YR,AGE_CALC
0,-10000010288010,1,5,2015,54
1,-10000010288007,2,1,2015,53
2,-10000010287992,1,1,2015,76
3,-10000010287975,2,6,2015,53
4,-10000010287949,1,5,2015,65
...,...,...,...,...,...
3015,-10000010254731,1,2,2015,68
3016,-10000010254721,2,3,2015,71
3017,-10000010254682,1,1,2015,73
3018,-10000010254676,1,1,2015,71


In [9]:
# Attach each patient's enrollment reference year to every diagnosis row
# (left join keeps all diagnosis rows), then remove the claim-source flags.
diabetes_full = (
    diabetes
    .merge(enrollment_diab[['BENE_ID', 'BENE_ENROLLMT_REF_YR']], how='left', on='BENE_ID')
    .drop(columns=['IP_FLAG', 'OP_FLAG', 'PR_FLAG'])
)
diabetes_full

Unnamed: 0,BENE_ID,CCSR_CATEGORY_DESCRIPTION,YR,MONTH,BENE_ENROLLMT_REF_YR
0,-10000010288010,Obesity,2015,5,2015
1,-10000010288010,Socioeconomic/psychosocial factors,2015,10,2015
2,-10000010288010,Prediabetes,2015,11,2015
3,-10000010288010,Lifestyle/life management factors,2016,9,2015
4,-10000010288007,Hypertension with complications and secondary ...,2015,2,2015
...,...,...,...,...,...
18867,-10000010254671,Coronary atherosclerosis and other heart disease,2015,4,2015
18868,-10000010254671,Obesity,2015,6,2015
18869,-10000010254671,Prediabetes,2015,8,2015
18870,-10000010254671,Lifestyle/life management factors,2015,11,2015


In [13]:
# Express the diagnosis date and the enrollment year on a common month scale
# (year * 12 [+ month]) and keep only their difference, i.e. the number of
# months between the start of the enrollment reference year and the diagnosis.
# The source year/month columns are removed once the difference is computed.
diabetes_full = (
    diabetes_full
    .assign(
        TIME_SINCE_ENROLLMT=lambda frame: (
            (frame['YR'] * 12 + frame['MONTH']) - frame['BENE_ENROLLMT_REF_YR'] * 12
        )
    )
    .drop(columns=['YR', 'MONTH', 'BENE_ENROLLMT_REF_YR'])
)
diabetes_full.head()

Unnamed: 0,BENE_ID,CCSR_CATEGORY_DESCRIPTION,TIME_SINCE_ENROLLMT
0,-10000010288010,Obesity,5
1,-10000010288010,Socioeconomic/psychosocial factors,10
2,-10000010288010,Prediabetes,11
3,-10000010288010,Lifestyle/life management factors,21
4,-10000010288007,Hypertension with complications and secondary ...,2


In [14]:
# Sanity check: how many patients have two or more diabetes-stage diagnoses
# ('Prediabetes', 'Diabetes without complications', 'Diabetes with complications')
# recorded at the same TIME_SINCE_ENROLLMT?
diabetes_stages = ['Prediabetes', 'Diabetes without complications', 'Diabetes with complications']
stage_rows = diabetes_full[diabetes_full['CCSR_CATEGORY_DESCRIPTION'].isin(diabetes_stages)]

# A row is flagged when an earlier row has the same (BENE_ID, TIME_SINCE_ENROLLMT)
# pair, so a patient has a clash exactly when at least one of their rows is flagged.
same_time = stage_rows.duplicated(subset=['BENE_ID', 'TIME_SINCE_ENROLLMT'])
count = stage_rows.loc[same_time, 'BENE_ID'].nunique()

print(f"Number of BENE_IDs that satisfy the condition: {count}")

Number of BENE_IDs that satisfy the condition: 0


In [15]:
# Write both derived tables (diabetes_full and enrollment_diab) to the Drive
# output folder as CSV, without the DataFrame index.
output_targets = {
    '/content/drive/MyDrive/Data/Output/diabetes_full.csv': diabetes_full,
    '/content/drive/MyDrive/Data/Output/enrollment_diab.csv': enrollment_diab,
}
for csv_path, frame in output_targets.items():
    frame.to_csv(csv_path, index=False)