In [1]:
import pandas as pd

In [2]:
# Load the raw medical-claims extract (2016-2018 per the file name); every later cell derives from this frame.
# NOTE(review): the captured output under this cell looks like the tail of a pandas warning raised by
# this read. If it is a DtypeWarning (mixed-type columns), consider passing an explicit `dtype=` — TODO confirm.
med_data = pd.read_csv('../../data/claims_med_interns_2016-2018_20190304.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# Small sample: the first 300 rows of the raw claims.
# In this notebook `med` is only read by `test` further down; the reshape itself runs on `med_data`.
med = med_data[0:300]

In [4]:
# Lookup table: upper-case county name -> 'urban' or 'rural' classification.
urban_rural = {
    'ADAMS': 'rural',
    'ASOTIN': 'rural',
    'BENTON': 'urban',
    'CHELAN': 'rural',
    'CLALLAM': 'rural',
    'CLARK': 'urban',
    'COLUMBIA': 'rural',
    'COWLITZ': 'rural',
    'DOUGLAS': 'rural',
    'FERRY': 'rural',
    'FRANKLIN': 'rural',
    'GARFIELD': 'rural',
    'GRANT': 'rural',
    'GRAYS HARBOR': 'rural',
    'ISLAND': 'rural',
    'JEFFERSON': 'rural',
    'KING': 'urban',
    'KITSAP': 'urban',
    'KITTITAS': 'rural',
    'KLICKITAT': 'rural',
    'LEWIS': 'rural',
    'LINCOLN': 'rural',
    'MASON': 'rural',
    'OKANOGAN': 'rural',
    'PACIFIC': 'rural',
    'PEND OREILLE': 'rural',
    'PIERCE': 'urban',
    'SAN JUAN': 'rural',
    'SKAGIT': 'rural',
    'SKAMANIA': 'rural',
    'SNOHOMISH': 'urban',
    'SPOKANE': 'urban',
    'STEVENS': 'rural',
    'THURSTON': 'urban',
    'WAHKIAKUM': 'rural',
    'WALLA WALLA': 'rural',
    'WHATCOM': 'rural',
    'WHITMAN': 'rural',
    'YAKIMA': 'rural',
}

In [5]:
# Ordinal prefixes of the ten diagnosis slots carried by every claim line.
_DIAGNOSIS_SLOTS = ('Primary', '2nd', '3rd', '4th', '5th', '6th', '7th', '8th', '9th', '10th')

# (condition suffix, which values are searched, substring that marks the condition).
# The order here is the order in which the binary_* / claims_* columns are created.
_CONDITION_RULES = (
    ('asthma', 'rollup', 'Asthma'),
    ('bp', 'code', 'I10'),
    ('cancer', 'rollup', 'Cancer'),
    ('cardiovascular', 'rollup', 'heart'),
    ('copd', 'rollup', 'Chronic obstructive pulmonary disease'),
    ('diabetes', 'rollup', 'Diabetes'),
    ('kidney', 'code', 'N18'),
    ('obesity', 'code', 'E66'),
    ('musculoskeletal', 'rollup', 'musculoskeletal'),
    ('cholesterol', 'code', 'E78'),
)

# Rollups that count as a mental disorder (matched exactly, not as substrings).
# Doesn't contain: Bipolar disorders, Eating disorders, Psychogenic disorders,
# Other miscellaneous mental conditions, Codes related to mental health disorders,
# Dissociative disorders, Somatoform disorders.
_MENTAL_HEALTH_ROLLUPS = frozenset([
    'Anxiety disorders',
    'Adjustment disorders',
    'Mood disorders',
    'Schizophrenia and other psychotic disorders',
    'Suicide and intentional self-inflicted injury',
    'Impulse control disorders not elsewhere classified',
    'Screening and history of mental health and substance abuse codes',
    'Miscellaneous mental disorders',
])


def _diagnosis_values(rows, suffix):
    """Return every non-missing value of the ten '<slot> <suffix>' columns as a flat list of strings."""
    columns = ['{} {}'.format(slot, suffix) for slot in _DIAGNOSIS_SLOTS]
    return [str(value) for value in rows[columns].values.ravel() if pd.notna(value)]


def reshape(group):
    """Collapse all claim lines of one member-year group into a single summary row.

    Parameters
    ----------
    group : pandas.DataFrame
        The claim lines of one group; must contain the demographic columns, 'Claim ID'
        and the ten '<slot> ICD Diagnosis Code' / '<slot> ICD Rollup' columns.

    Returns
    -------
    pandas.DataFrame
        One row, indexed by the label of the group's first row, holding the demographics
        of that first row, a binary_* flag and a claims_* counter per condition,
        total_claims, total_conditions, flag_nonused and flag_comorbidity.

    Notes
    -----
    The diagnosis slots are flattened into real lists. Adding the ten columns with ``+``
    concatenates the strings row by row, so an exact rollup match (mental health) never
    succeeds, substrings can match across two neighbouring codes, and a single missing
    slot turns the whole row into NaN.

    Relies on the module-level ``urban_rural`` mapping and on ``claim_reshape``.
    """
    first_row = group.head(1)
    # Start from the first row's index so scalar assignments broadcast to exactly one row.
    df = pd.DataFrame(index=first_row.index)

    # Demographic info (taken from the first row of the group)
    df['member_id'] = first_row['Member ID Encrypted']
    df['year'] = first_row['Incurred Year']
    df['gender'] = first_row['Member Gender']
    df['age'] = first_row['Age']
    df['postal_code'] = first_row['Zip (5-digit)']
    county = first_row['County'].iloc[0]
    df['county'] = county
    # Always create the column (NaN for counties missing from the lookup) so every group
    # returns the same columns and the frames concatenate without re-sorting.
    df['urban_rural'] = urban_rural.get(county, float('nan'))

    # All ICD codes and rollups that appear anywhere in this group
    icd_codes = _diagnosis_values(group, 'ICD Diagnosis Code')
    icd_rollups = _diagnosis_values(group, 'ICD Rollup')
    searched = {'code': icd_codes, 'rollup': icd_rollups}

    # Binary flag per condition: 1 if the condition appears at least once, 0 otherwise
    for condition, field, needle in _CONDITION_RULES:
        df['binary_' + condition] = int(any(needle in value for value in searched[field]))
    df['binary_mental_disorder'] = int(
        any(rollup in _MENTAL_HEALTH_ROLLUPS for rollup in icd_rollups)
        or any('G4700' in code for code in icd_codes)
    )

    # Initialize count columns; claim_reshape increments them in place
    for condition, _, _ in _CONDITION_RULES:
        df['claims_' + condition] = 0
    df['claims_mental_disorder'] = 0
    df['total_claims'] = 0

    # One pass per distinct Claim ID so a claim spanning several lines is counted once
    for _, claim_group in group.groupby('Claim ID'):
        claim_reshape(claim_group, df)

    # Total number of conditions
    # NOTE(review): binary_asthma is not part of this sum although it has its own flag —
    # confirm whether asthma is meant to be excluded.
    counted_flags = ['binary_kidney', 'binary_mental_disorder', 'binary_bp', 'binary_cardiovascular',
                     'binary_diabetes', 'binary_obesity', 'binary_cancer', 'binary_musculoskeletal',
                     'binary_copd', 'binary_cholesterol']
    total_conditions = df[counted_flags].sum(axis=1)
    df['total_conditions'] = total_conditions

    # flag_nonused: 1 when no claim at all was counted for the group.
    # NOTE(review): the name suggests "no claims for the tracked conditions"; as written it
    # only looks at total_claims — confirm the intended definition.
    df['flag_nonused'] = (df['total_claims'] == 0).astype(int)
    # flag_comorbidity: 1 when more than one of the counted conditions is present
    df['flag_comorbidity'] = (total_conditions > 1).astype(int)

    return df

In [6]:
def claim_reshape(claim_group, df):
    """Update the per-condition claim counters in ``df`` for one claim.

    Parameters
    ----------
    claim_group : pandas.DataFrame
        All lines that share one Claim ID; must contain the ten
        '<slot> ICD Diagnosis Code' and '<slot> ICD Rollup' columns.
    df : pandas.DataFrame
        Frame holding the claims_* counters and total_claims; it is modified in place.

    Returns
    -------
    None

    Notes
    -----
    Each counter is incremented at most once per call, however many lines or slots
    mention the condition; total_claims is always incremented by one.

    The diagnosis slots are flattened into real lists of individual values. Adding the
    ten columns with ``+`` concatenates the strings row by row, which makes the exact
    mental-health rollup match fail, lets substrings match across neighbouring codes,
    and raises on missing slots.
    """
    slots = ('Primary', '2nd', '3rd', '4th', '5th', '6th', '7th', '8th', '9th', '10th')

    def collect(suffix):
        # Every non-missing value of the ten '<slot> <suffix>' columns, as strings.
        columns = ['{} {}'.format(slot, suffix) for slot in slots]
        return [str(value) for value in claim_group[columns].values.ravel() if pd.notna(value)]

    def mentions(values, needle):
        # True when `needle` occurs inside at least one individual value.
        return any(needle in value for value in values)

    # Lists of all codes and rollups on this claim
    icd_codes = collect('ICD Diagnosis Code')
    icd_rollups = collect('ICD Rollup')

    # Rollups that count as a mental disorder (matched exactly)
    mental_health = {
        'Anxiety disorders',
        'Adjustment disorders',
        'Mood disorders',
        'Schizophrenia and other psychotic disorders',
        'Suicide and intentional self-inflicted injury',
        'Impulse control disorders not elsewhere classified',
        'Screening and history of mental health and substance abuse codes',
        'Miscellaneous mental disorders',
    }
    has_mental_disorder = (
        any(rollup in mental_health for rollup in icd_rollups)
        or mentions(icd_codes, 'G4700')
    )

    condition_hits = (
        ('claims_asthma', mentions(icd_rollups, 'Asthma')),
        ('claims_bp', mentions(icd_codes, 'I10')),
        ('claims_cancer', mentions(icd_rollups, 'Cancer')),
        ('claims_cardiovascular', mentions(icd_rollups, 'heart')),
        ('claims_copd', mentions(icd_rollups, 'Chronic obstructive pulmonary disease')),
        ('claims_diabetes', mentions(icd_rollups, 'Diabetes')),
        ('claims_kidney', mentions(icd_codes, 'N18')),
        ('claims_obesity', mentions(icd_codes, 'E66')),
        ('claims_cholesterol', mentions(icd_codes, 'E78')),
        ('claims_musculoskeletal', mentions(icd_rollups, 'musculoskeletal')),
        ('claims_mental_disorder', has_mental_disorder),
    )
    for column, hit in condition_hits:
        if hit:
            df[column] += 1

    df['total_claims'] += 1

In [7]:
# Reshape data by grouping by member and year
# Each (member, year) group of `med_data` is handed to `reshape`; `apply` then stacks the
# per-group frames that `reshape` returns into one table.
reshaped_med_data = med_data.groupby(['Member ID Encrypted', 'Incurred Year'], as_index=False).apply(reshape)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  result = concat(values, axis=self.axis, keys=keys)


In [10]:
# Output column order: demographics, binary condition flags, per-condition claim counts, summary fields.
columns = [
    # demographics
    'member_id', 'year', 'age', 'gender', 'postal_code', 'county', 'urban_rural',
    # binary condition flags
    'binary_asthma', 'binary_bp', 'binary_cancer', 'binary_cardiovascular',
    'binary_cholesterol', 'binary_copd', 'binary_diabetes', 'binary_kidney',
    'binary_mental_disorder', 'binary_musculoskeletal', 'binary_obesity',
    # per-condition claim counts
    'claims_asthma', 'claims_bp', 'claims_cancer', 'claims_cardiovascular',
    'claims_cholesterol', 'claims_copd', 'claims_diabetes', 'claims_kidney',
    'claims_mental_disorder', 'claims_musculoskeletal', 'claims_obesity',
    # summary fields
    'flag_comorbidity', 'flag_nonused', 'total_claims', 'total_conditions',
]

In [11]:
# Keep only the columns named in `columns`, in that order.
reshaped_med_data = reshaped_med_data[columns]

In [12]:
# Save the reshaped table as CSV; index=False leaves the frame's index out of the file.
reshaped_med_data.to_csv('../../data/reshaped_med.csv', index=False)

In [13]:
# Show the first 16 columns of the reshaped table (all rows).
reshaped_med_data.iloc[:,0:16]

Unnamed: 0,Unnamed: 1,member_id,year,age,gender,postal_code,county,urban_rural,binary_asthma,binary_bp,binary_cancer,binary_cardiovascular,binary_cholesterol,binary_copd,binary_diabetes,binary_kidney,binary_mental_disorder
0,0,710087KT90EY0DZUV,2018,47,F,98335,PIERCE,urban,1,0,0,0,0,1,0,0,0
1,13,71008ADZ2VBBDM14C,2016,54,F,98032,KING,urban,0,0,0,0,0,0,0,0,0
2,14,7100UNBYH5SYEX19X,2016,63,F,98926,KITTITAS,rural,0,0,0,0,0,0,0,0,0
3,21,7100UNBYH5SYEX19X,2018,65,F,98926,KITTITAS,rural,0,0,0,0,0,0,0,0,0
4,22,71069K607Y1CQ9BV5,2016,70,M,98059,KING,urban,0,0,0,0,0,0,0,0,0
5,27,71069K607Y1CQ9BV5,2017,71,M,98059,KING,urban,0,1,0,1,0,1,1,0,0
6,88,71069K607Y1CQ9BV5,2018,72,M,98059,KING,urban,0,1,0,1,1,1,1,0,0
7,246,7107GY7V3ZWGK7FKP,2016,57,F,98520,GRAYS HARBOR,rural,0,1,0,0,1,0,0,0,0
8,310,7107GY7V3ZWGK7FKP,2017,58,F,98520,GRAYS HARBOR,rural,0,0,0,0,0,0,0,0,0
9,311,7107NGE7PLEN7UMDY,2016,26,F,98030,KING,urban,0,0,0,0,0,0,0,0,0


In [14]:
# Read the saved CSV back to inspect what was written.
pd.read_csv('../../data/reshaped_med.csv')

Unnamed: 0,member_id,year,age,gender,postal_code,county,urban_rural,binary_asthma,binary_bp,binary_cancer,...,claims_copd,claims_diabetes,claims_kidney,claims_mental_disorder,claims_musculoskeletal,claims_obesity,flag_comorbidity,flag_nonused,total_claims,total_conditions
0,710087KT90EY0DZUV,2018,47,F,98335,PIERCE,urban,1,0,0,...,1,0,0,0,0,1,1,0,1,2
1,71008ADZ2VBBDM14C,2016,54,F,98032,KING,urban,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,7100UNBYH5SYEX19X,2016,63,F,98926,KITTITAS,rural,0,0,0,...,0,0,0,0,0,0,0,0,3,0
3,7100UNBYH5SYEX19X,2018,65,F,98926,KITTITAS,rural,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,71069K607Y1CQ9BV5,2016,70,M,98059,KING,urban,0,0,0,...,0,0,0,0,0,0,0,0,2,0
5,71069K607Y1CQ9BV5,2017,71,M,98059,KING,urban,0,1,0,...,2,4,0,0,0,0,1,0,4,4
6,71069K607Y1CQ9BV5,2018,72,M,98059,KING,urban,0,1,0,...,3,4,0,0,0,0,1,0,20,5
7,7107GY7V3ZWGK7FKP,2016,57,F,98520,GRAYS HARBOR,rural,0,1,0,...,0,0,0,0,0,0,1,0,6,2
8,7107GY7V3ZWGK7FKP,2017,58,F,98520,GRAYS HARBOR,rural,0,0,0,...,0,0,0,0,0,0,0,0,1,0
9,7107NGE7PLEN7UMDY,2016,26,F,98030,KING,urban,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [9]:
def test(memberid, year):
    """Print the reshaped and the raw distinct-claim counts for one member-year, then whether they agree.

    Reads the notebook-level frames `reshaped_med_data` (reshaped table) and `med`
    (raw claim lines). Nothing is returned; the three results are printed.
    """
    # Claim count recorded in the reshaped table for this member and year
    reshaped_rows = (reshaped_med_data['member_id'] == memberid) & (reshaped_med_data['year'] == year)
    reshaped_claims = reshaped_med_data.loc[reshaped_rows, 'total_claims'].item()
    print('Reshaped number of claims: ', reshaped_claims)

    # Distinct Claim IDs among the raw claim lines of the same member and year
    raw_rows = (med['Member ID Encrypted'] == memberid) & (med['Incurred Year'] == year)
    actual_claims = len(med.loc[raw_rows, 'Claim ID'].unique())
    print('Actual number of claims: ', actual_claims)

    print(reshaped_claims == actual_claims)

In [10]:
# Compare reshaped vs. raw claim counts for this member in 2016.
test('71069K607Y1CQ9BV5', 2016)

Reshaped number of claims:  2
Actual number of claims:  2
True


In [11]:
# Compare reshaped vs. raw claim counts for this member in 2017.
test('71069K607Y1CQ9BV5', 2017)

Reshaped number of claims:  4
Actual number of claims:  4
True


In [12]:
# Compare reshaped vs. raw claim counts for this member in 2018.
test('71069K607Y1CQ9BV5', 2018)

Reshaped number of claims:  20
Actual number of claims:  20
True
