# Preparing Health Records

This step imports, cleans, and transforms a decade of health records into a single clean dataframe for later analysis.

In [1]:
import pandas as pd
import datetime as dt

Import reports and store in a dictionary of dataframes

In [2]:
# initialize dictionary
years = list(range(2013, 2020))
report_dict = dict.fromkeys(years, 0)
print(report_dict)

{2013: 0, 2014: 0, 2015: 0, 2016: 0, 2017: 0, 2018: 0, 2019: 0}


In [3]:
# list columns to import
columns = ['personID', 'treatmentDate', 'ailment', 'notification', 'treatment', 'temperature', 'isCamper', 'SeasonTotal']

# load dataframes into dictionary
for key in report_dict:
    filename = f'C:\\Users\\avery\\OneDrive\\health_database_docs\\raw_report_{key}.csv'
    df = pd.read_csv(filename, usecols=columns)
    report_dict[key] = df

In [4]:
print(report_dict[2015].head(1))

   personID    treatmentDate                         ailment  notification  \
0   5788673  7/23/2015 18:55  repeat lice check per schedule  Not Notified   

                                       treatment temperature  isCamper  \
0  Ibuprofen, no lice found, no treatment needed         NaN         1   

   SeasonTotal  
0            1  


Combine all dataframes in the dictionary together and check the result shape.

In [5]:
combined_df = pd.concat(report_dict.values(), axis=0, join='outer')
print(combined_df.shape)

(3798, 8)


Transform columns for more efficient use later:
1. replace "isCamper" with boolean "is_camper"
2. replace "notification" with boolean "home_notified"
3. change "treatmentDate" to "report_date" and "report_time"
4. rename "personID" to "person_id", "SeasonTotal" to "season_total"

In [6]:
combined_df['is_camper'] = combined_df['isCamper'].astype('bool')
print(combined_df.dtypes)
print(combined_df.head(1))

personID           int64
treatmentDate     object
ailment           object
notification      object
treatment         object
temperature       object
isCamper         float64
SeasonTotal        int64
is_camper           bool
dtype: object
   personID    treatmentDate   ailment  notification  \
0   3955392  6/11/2013 21:22  Headache  Not Notified   

                                           treatment temperature  isCamper  \
0  Ibuprofen, 400mg ibuprofen given that parents ...         NaN       1.0   

   SeasonTotal  is_camper  
0            1       True  


In [7]:
print(combined_df['notification'].value_counts())

Not Notified      3431
Notified           285
Not Applicable      72
Left Voicemail       7
Unavailable          1
Name: notification, dtype: int64


In [8]:
notified_map = {'Notified': True, 
                'Left Voicemail': True,
                'Unavailable': True,
                'Not Notified': False, 
                'Not Applicable': False}

# remap categories to True / False and store in new column
combined_df['home_notified'] = combined_df['notification'].map(notified_map)

# change data type to boolean 
combined_df['home_notified'] = combined_df['home_notified'].astype('bool')

print(combined_df.dtypes)

personID           int64
treatmentDate     object
ailment           object
notification      object
treatment         object
temperature       object
isCamper         float64
SeasonTotal        int64
is_camper           bool
home_notified       bool
dtype: object


Rename a few other columns.

In [9]:
new_names = {'personID':'person_id',
            'temperature': 'temp',
            'SeasonTotal': 'season_visits',
            'ailment': 'ailment_text',
            'treatment': 'treatment_text'}

combined_df.rename(new_names, axis=1, inplace=True)

print(combined_df.columns)

Index(['person_id', 'treatmentDate', 'ailment_text', 'notification',
       'treatment_text', 'temp', 'isCamper', 'season_visits', 'is_camper',
       'home_notified'],
      dtype='object')


Change "treatmentDate" to datetime, then split into separate date and time columns.

In [10]:
combined_df['timestamp'] = pd.to_datetime(combined_df['treatmentDate'])
combined_df['report_date'] = combined_df['timestamp'].dt.date
combined_df['report_time'] = combined_df['timestamp'].dt.time

print(combined_df[['report_date', 'report_time']])

    report_date report_time
0    2013-06-11    21:22:00
1    2013-06-19    08:12:00
2    2013-06-23    10:45:00
3    2013-06-26    21:25:00
4    2013-08-01    17:54:00
..          ...         ...
653  2019-06-19    13:45:00
654  2019-06-19    19:45:00
655  2019-06-20    21:00:00
656  2019-06-23    14:25:00
657  2019-06-28    09:04:00

[3798 rows x 2 columns]


Check output and export.

In [11]:
print(combined_df.columns)

Index(['person_id', 'treatmentDate', 'ailment_text', 'notification',
       'treatment_text', 'temp', 'isCamper', 'season_visits', 'is_camper',
       'home_notified', 'timestamp', 'report_date', 'report_time'],
      dtype='object')


In [13]:
final_df = combined_df[['person_id', 'report_date', 'report_time', 'timestamp', 'ailment_text', 'treatment_text', 'temp', 'season_visits', 'is_camper', 'home_notified']]
final_df.to_csv('C:\\Users\\avery\\OneDrive\\health_database_docs\\combined_health_records.csv')