In [1]:
%matplotlib inline
import re
import pandas as pd

In [2]:
# Load the sample Blue Button export. Iterating the open handle yields each
# line with its trailing newline intact, so the lines can later be re-joined
# without losing any characters.
with open('../data/VA_My_HealtheVet_Blue_Button_Sample_Version_12_10.txt') as f:
    content = list(f)

In [3]:
# Glue the list of lines back into one string so that regexes can match
# across line boundaries.
full_doc = "".join(content)

In [4]:
# Pattern for one immunization entry. Written as a raw string so that `\s`
# reaches the regex engine untouched instead of being an invalid Python
# string escape (which triggers a warning at compile time).
#   group 1: text after "Immunization:" up to the first whitespace, i.e. only
#            the first token of a multi-word vaccine name is captured
#   group 2: text after "Date Received:" up to the end of that line
# Meant to be used with re.DOTALL | re.MULTILINE: `.*?` has to cross line
# breaks to reach "Date Received:", and `$` has to anchor at a line end.
search_string = r"Immunization:\s+(.*?)\s+.*?Date Received:\s+(.*?)$"

In [5]:
# Pull every (vaccine, date-received) pair out of the document. DOTALL lets
# the lazy `.*?` run across lines; MULTILINE makes `$` stop at each line end.
results = re.compile(search_string, re.MULTILINE | re.DOTALL).findall(full_doc)

In [6]:
# Inspect the raw (vaccine, date string) tuples returned by the regex.
results

[('INFLUENZA-H1N1-09,', '07 Dec 2012 @ 1155'),
 ('INFLUENZA-H1N1-09,', '01 Oct 2012 @ 1200'),
 ('PNEUMOCOCCAL', '06 Mar 2011 @ 0900'),
 ('PNEUMOVAX', '06 Mar 2011 @ 0900'),
 ('TETANUS', '07 Dec 2012 @ 1155'),
 ('Tetanus', '07 Jan 2013'),
 ('Tetanus', '18 Jul 2010')]

In [7]:
# One row per regex match: first capture group -> `vaccine`,
# second capture group (still a raw string) -> `vax_date`.
vax_df = pd.DataFrame(
    data=results,
    columns=['vaccine', 'vax_date'],
)

In [8]:
# Display the immunization frame built from the regex matches.
vax_df

Unnamed: 0,vaccine,vax_date
0,"INFLUENZA-H1N1-09,",07 Dec 2012 @ 1155
1,"INFLUENZA-H1N1-09,",01 Oct 2012 @ 1200
2,PNEUMOCOCCAL,06 Mar 2011 @ 0900
3,PNEUMOVAX,06 Mar 2011 @ 0900
4,TETANUS,07 Dec 2012 @ 1155
5,Tetanus,07 Jan 2013
6,Tetanus,18 Jul 2010


In [9]:
def strip_extra(s):
    """Return the part of ``s`` that precedes the first ``'@'``.

    Strings without an ``'@'`` are returned unchanged. Nothing else is
    trimmed, so whitespace sitting directly before the ``'@'`` is kept
    (``"07 Dec 2012 @ 1155"`` -> ``"07 Dec 2012 "``).
    """
    head, _, _ = s.partition('@')
    return head

In [10]:
# Parse each raw date string into a Timestamp after cutting off anything from
# the first '@' onward (see strip_extra), and store it in a new `date` column.
vax_df['date'] = vax_df['vax_date'].map(
    lambda raw_date: pd.Timestamp(strip_extra(raw_date))
)

In [11]:
# Check the parsed `date` column next to the raw `vax_date` strings.
vax_df

Unnamed: 0,vaccine,vax_date,date
0,"INFLUENZA-H1N1-09,",07 Dec 2012 @ 1155,2012-12-07
1,"INFLUENZA-H1N1-09,",01 Oct 2012 @ 1200,2012-10-01
2,PNEUMOCOCCAL,06 Mar 2011 @ 0900,2011-03-06
3,PNEUMOVAX,06 Mar 2011 @ 0900,2011-03-06
4,TETANUS,07 Dec 2012 @ 1155,2012-12-07
5,Tetanus,07 Jan 2013,2013-01-07
6,Tetanus,18 Jul 2010,2010-07-18


In [12]:
# Flag rows whose raw date string contains an '@' (in the data shown these are
# the entries that carry a time-of-day suffix). Uses the vectorised string
# accessor with a literal (non-regex) match instead of a per-row Python lambda.
# NOTE(review): the column name implies '@' marks clinician-entered records;
# confirm against the Blue Button format description.
vax_df['clinically_documented'] = vax_df['vax_date'].str.contains('@', regex=False)

In [13]:
# Order the immunizations chronologically. Rebinding the name (rather than
# inplace=True) keeps the step chainable, and a stable sort keeps rows that
# share a date in their original relative order on every run.
vax_df = vax_df.sort_values('date', kind='mergesort')

In [14]:
# Inspect the frame after adding `clinically_documented` and sorting by date.
vax_df

Unnamed: 0,vaccine,vax_date,date,clinically_documented
6,Tetanus,18 Jul 2010,2010-07-18,False
2,PNEUMOCOCCAL,06 Mar 2011 @ 0900,2011-03-06,True
3,PNEUMOVAX,06 Mar 2011 @ 0900,2011-03-06,True
1,"INFLUENZA-H1N1-09,",01 Oct 2012 @ 1200,2012-10-01,True
0,"INFLUENZA-H1N1-09,",07 Dec 2012 @ 1155,2012-12-07,True
4,TETANUS,07 Dec 2012 @ 1155,2012-12-07,True
5,Tetanus,07 Jan 2013,2013-01-07,False


In [15]:
# Coarse grouping key: the first six characters of the vaccine name,
# upper-cased, so that spellings such as "Tetanus" / "TETANUS" or
# "PNEUMOCOCCAL" / "PNEUMOVAX" fall into the same bucket. The slice is taken
# before upper-casing, matching the original order of operations.
vax_df['vaccine_matching'] = vax_df['vaccine'].str[:6].str.upper()

In [16]:
# Preview the frame keyed by the matching prefix. This is display-only:
# set_index returns a new frame and the result is not assigned, so vax_df
# itself still has `vaccine_matching` as a regular column.
vax_df.set_index('vaccine_matching')

Unnamed: 0_level_0,vaccine,vax_date,date,clinically_documented
vaccine_matching,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
TETANU,Tetanus,18 Jul 2010,2010-07-18,False
PNEUMO,PNEUMOCOCCAL,06 Mar 2011 @ 0900,2011-03-06,True
PNEUMO,PNEUMOVAX,06 Mar 2011 @ 0900,2011-03-06,True
INFLUE,"INFLUENZA-H1N1-09,",01 Oct 2012 @ 1200,2012-10-01,True
INFLUE,"INFLUENZA-H1N1-09,",07 Dec 2012 @ 1155,2012-12-07,True
TETANU,TETANUS,07 Dec 2012 @ 1155,2012-12-07,True
TETANU,Tetanus,07 Jan 2013,2013-01-07,False


In [17]:
# For every vaccine bucket, list the whole-day gaps between consecutive
# entries. The first difference in each group has no predecessor and is
# dropped. Row order within a group is whatever order vax_df currently has,
# so this relies on the frame having been sorted by `date` beforehand.
vax_df.groupby('vaccine_matching')['date'].apply(
    lambda dates: dates.diff().iloc[1:].dt.days.tolist()
)

vaccine_matching
INFLUE         [67]
PNEUMO          [0]
TETANU    [873, 31]
Name: date, dtype: object

## Medication reminders

In [18]:
# Pattern for one medication record, one captured field per line below.
# Raw strings keep `\s` / `\n` as regex escapes instead of relying on Python
# passing unknown string escapes through (which emits a warning).
# Used WITHOUT re.DOTALL, so each `.*?` stays within a single line; the first
# three fields tolerate blank lines after them (`\n+`), while the last five
# must sit on directly consecutive lines.
search_string = (
    r"Medication:\s+(.*?)\n+"
    r".*?Instructions:\s+(.*?)\n+"
    r".*?Status:\s+(.*?)\n+"
    r".*?Refills Remaining:\s+(.*?)\n"
    r"Last Filled On:\s+(.*?)\n"
    r"Initially Ordered On:\s+(.*?)\n"
    r"Quantity:\s+(.*?)\n"
    r"Days Supply:\s+(.*?)$"
)
# Extract one 8-tuple per medication record. Only MULTILINE is set, so `$`
# anchors at line ends while `.` does not cross lines.
# Note: this re-binds `results`, replacing the immunization matches.
results = re.compile(search_string, re.MULTILINE).findall(full_doc)

In [20]:
# One row per medication record; column order mirrors the order of the
# capture groups in the medication regex.
medications = pd.DataFrame(
    data=results,
    columns=[
        'medication',
        'directions',
        'status',
        'refills_remaining',
        'last_filled_on',
        'initially_ordered_on',
        'quantity',
        'days_supply',
    ],
)

In [21]:
# Normalise case so later comparisons (e.g. status == 'ACTIVE') do not depend
# on how the export capitalised a value. Every column is cast to str first,
# so numeric-looking fields such as quantity remain strings.
medications = medications.astype(str).apply(lambda column: column.str.upper())

In [22]:
# Preview the first rows of the upper-cased medication table.
medications.head()

Unnamed: 0,medication,directions,status,refills_remaining,last_filled_on,initially_ordered_on,quantity,days_supply
0,ASPIRIN 81MG EC TAB,TAKE ONE TABLET BY MOUTH EVERY DAY,ACTIVE,3,14 OCT 2014,14 OCT 2014,30,30
1,ASPIRIN 325MG EC TAB,TAKE ONE TABLET BY MOUTH EVERY DAY,ACTIVE,11,24 SEP 2014,25 AUG 2014,1,1
2,AMLODIPINE BESYLATE 5MG TAB,TAKE ONE-HALF TABLET BY MOUTH EVERY DAY FOR BL...,EXPIRED,2,01 MAR 2013,10 DEC 2012,45,90
3,DONEPEZIL HCL 5MG TAB,TAKE ONE TABLET BY MOUTH EVERY MORNING,EXPIRED,10,16 FEB 2013,10 DEC 2012,30,30
4,HCTZ 25/TRIAMTERENE 37.5MG TAB,TAKE ONE-HALF TABLET (12.5/18.75 MG) BY MOUTH ...,EXPIRED,3,11 DEC 2012,10 DEC 2012,45,90


In [23]:
# Show only the prescriptions whose status is exactly 'ACTIVE'.
medications.loc[medications['status'].eq('ACTIVE')]

Unnamed: 0,medication,directions,status,refills_remaining,last_filled_on,initially_ordered_on,quantity,days_supply
0,ASPIRIN 81MG EC TAB,TAKE ONE TABLET BY MOUTH EVERY DAY,ACTIVE,3,14 OCT 2014,14 OCT 2014,30,30
1,ASPIRIN 325MG EC TAB,TAKE ONE TABLET BY MOUTH EVERY DAY,ACTIVE,11,24 SEP 2014,25 AUG 2014,1,1


In [24]:
# Scratch check of the schedule regex on one literal instruction: capture
# whatever follows "EVERY" plus one whitespace character. Raw string so `\s`
# is a regex escape rather than an invalid Python string escape.
re.findall(r"EVERY\s(.*)", "TAKE ONE TABLET BY MOUTH EVERY DAY")[0]

'DAY'

In [25]:
# Capture the text that follows "EVERY " in each instruction string. The raw
# string keeps `\s` as a regex escape instead of an invalid Python escape.
# NOTE: re.findall returns a list, so each cell holds a list of strings
# (empty when the directions contain no "EVERY"), and `.*` runs to the end of
# the string, so any trailing whitespace in the directions is kept.
medications['time_delta'] = medications['directions'].apply(
    lambda text: re.findall(r"EVERY\s(.*)", text)
)

In [26]:
# Directions and extracted schedule text for the ACTIVE prescriptions only,
# selected with a single .loc (row mask, column list) lookup.
medications.loc[medications['status'].eq('ACTIVE'), ['directions', 'time_delta']]

Unnamed: 0,directions,time_delta
0,TAKE ONE TABLET BY MOUTH EVERY DAY,[DAY ]
1,TAKE ONE TABLET BY MOUTH EVERY DAY,[DAY ]


In [29]:
# Demo of a reminder schedule: ten timestamps spaced 12 hours apart, starting
# at the moment the cell runs (so the output differs on every execution).
# Uses the lowercase "h" hourly alias; the uppercase "H" spelling is
# deprecated in recent pandas releases.
pd.date_range('now', periods=10, freq="12h")

DatetimeIndex(['2021-12-12 20:00:34.296197', '2021-12-13 08:00:34.296197',
               '2021-12-13 20:00:34.296197', '2021-12-14 08:00:34.296197',
               '2021-12-14 20:00:34.296197', '2021-12-15 08:00:34.296197',
               '2021-12-15 20:00:34.296197', '2021-12-16 08:00:34.296197',
               '2021-12-16 20:00:34.296197', '2021-12-17 08:00:34.296197'],
              dtype='datetime64[ns]', freq='12H')