In [3]:
import pandas as pd

In [513]:
med_data = pd.read_csv('../../data/claims_med_interns_2016-2018_20190304.csv')
#med_data.columns

  interactivity=interactivity, compiler=compiler, result=result)


In [5]:
med = med_data[0:300]

In [10]:
columns = ['member_id', 'year', 'gender', 'age', 'postal_code', 'county', 'urban_rural', 'binary_asthma']

In [584]:
def reshape(group):
    df = pd.DataFrame()
    
    # Demographic info
    df['member_id'] = group.head(1)['Member ID Encrypted']
    df['year'] = group.head(1)['Incurred Year']
    df['gender'] = group.head(1)['Member Gender']
    df['age'] = group.head(1)['Age']
    df['postal_code'] = group.head(1)['Zip (5-digit)']
    df['county'] = group.head(1)['County']
    
    # Initialize count columns
    df['claims_hypertension'] = 0
    df['claims_cardiovascular'] = 0
    df['claims_diabetes'] = 0
    df['claims_obesity'] = 0
    df['claims_cancer'] = 0
    df['claims_musculoskeletal'] = 0
    df['claims_copd'] = 0
    df['claims_mental_disorder'] = 0
    df['claims_cholesterol'] = 0
    df['total_claims'] = 0
    
    # Get all ICD codes and Rollups for this group
    icd_codes = group['Primary ICD Diagnosis Code'] + group['2nd ICD Diagnosis Code'] + group['3rd ICD Diagnosis Code'] + group['4th ICD Diagnosis Code'] + group['5th ICD Diagnosis Code'] + group['6th ICD Diagnosis Code'] + group['7th ICD Diagnosis Code'] + group['8th ICD Diagnosis Code'] + group['9th ICD Diagnosis Code'] + group['10th ICD Diagnosis Code']
    icd_rollups = group['Primary ICD Rollup'] + group['2nd ICD Rollup'] + group['3rd ICD Rollup'] + group['4th ICD Rollup'] + group['5th ICD Rollup'] + group['6th ICD Rollup'] + group['7th ICD Rollup'] + group['8th ICD Rollup'] + group['9th ICD Rollup'] + group['10th ICD Rollup']
    
    # Set binary for conditions: 1 if condition exists, 0 otherwise
    df['binary_hypertension'] = 1 if any('I10' in s for s in icd_codes) else 0
    df['binary_cardiovascular'] = 1 if any('heart' in s for s in icd_rollups) else 0
    df['binary_diabetes'] = 1 if any('Diabetes' in s for s in icd_rollups) else 0
    df['binary_obesity'] = 1 if any('E66' in s for s in icd_codes) else 0
    df['binary_cancer'] = 1 if any('Cancer' in s for s in icd_rollups) else 0
    df['binary_musculoskeletal'] = 1 if any('musculoskeletal' in s for s in icd_rollups) else 0
    df['binary_copd'] = 1 if any('Chronic obstructive pulmonary disease' in s for s in icd_rollups) else 0
    #doesn't contain Bipolar disorders, Eating disorders, Psychogenic disorders, Other miscellaneous mental conditions, 
                                    #Codes related to mental health disorders, Dissociative diorders, Somatoform disorders
    mental_health = ['Anxiety disorders', 'Adjustment disorders', 'Mood disorders', 
                     'Schizophrenia and other psychotic disorders', 
                     'Suicide and intentional self-inflicted injury', 
                     'Impulse control disorders not elsewhere classified', 
                     'Screening and history of mental health and substance abuse codes', 
                    'Miscellaneous mental disorders']
    df['binary_mental_disorder'] = 1 if any(x in mental_health for x in icd_rollups) else 0
    df['binary_cholesterol'] = 1 if any('E78' in s for s in icd_codes) else 0
    
    # Group by Claim ID to get unique claim counts
    group.groupby(['Claim ID'], as_index=False).apply(claim_reshape, df=df)

    # Total number of conditions
    total_conditions = df['binary_mental_disorder'] + df['binary_hypertension'] + df['binary_cardiovascular'] + df['binary_diabetes'] + df['binary_obesity'] + df['binary_cancer'] + df['binary_musculoskeletal'] + df['binary_copd'] + df['binary_cholesterol']
    df['total_conditions'] = total_conditions
    
    # Flags for nonused (claims were not filed for our conditions) and comorbidity (has more than one condition out of the ones we are interested in)
    df['flag_nonused'] = df['total_claims'].apply(lambda x: 1 if x == 0 else 0)
    df['flag_comorbidity'] = total_conditions.apply(lambda x: 1 if x > 1 else 0)

    return df

In [585]:
def claim_reshape(claim_group, df): 
    # Lists of all codes and rollups
    icd_codes = claim_group['Primary ICD Diagnosis Code'] + claim_group['2nd ICD Diagnosis Code'] + claim_group['3rd ICD Diagnosis Code'] + claim_group['4th ICD Diagnosis Code'] + claim_group['5th ICD Diagnosis Code'] + claim_group['6th ICD Diagnosis Code'] + claim_group['7th ICD Diagnosis Code'] + claim_group['8th ICD Diagnosis Code'] + claim_group['9th ICD Diagnosis Code'] + claim_group['10th ICD Diagnosis Code']
    icd_rollups = claim_group['Primary ICD Rollup'] + claim_group['2nd ICD Rollup'] + claim_group['3rd ICD Rollup'] + claim_group['4th ICD Rollup'] + claim_group['5th ICD Rollup'] + claim_group['6th ICD Rollup'] + claim_group['7th ICD Rollup'] + claim_group['8th ICD Rollup'] + claim_group['9th ICD Rollup'] + claim_group['10th ICD Rollup']
        
    print('Claim ID: ', claim_group.head(1)['Claim ID'])    
    print('total claims: ', df['total_claims'])
          
    if any('I10' in s for s in icd_codes):
        df['claims_hypertension'] += 1
    if any('heart' in s for s in icd_rollups):
        df['claims_cardiovascular'] += 1
    if any('Diabetes' in s for s in icd_rollups):
        df['claims_diabetes'] += 1
    if any('E66' in s for s in icd_codes):
        df['claims_obesity'] += 1
    if any('Cancer' in s for s in icd_rollups):
        df['claims_cancer'] += 1
    if any('musculoskeletal' in s for s in icd_rollups):
        df['claims_musculoskeletal'] += 1
    if any('Chronic obstructive pulmonary disease' in s for s in icd_rollups):
        df['claims_copd'] += 1
    if any('E78' in s for s in icd_codes):
        df['claims_cholesterol'] += 1
    mental_health = ['Anxiety disorders', 'Adjustment disorders', 'Mood disorders', 
                     'Schizophrenia and other psychotic disorders', 
                     'Suicide and intentional self-inflicted injury', 
                     'Impulse control disorders not elsewhere classified', 
                     'Screening and history of mental health and substance abuse codes', 
                    'Miscellaneous mental disorders']
    if any(x in mental_health for x in icd_rollups):
        df['claims_mental_disorder'] += 1

    df['total_claims'] += 1

In [586]:
# Reshape data by grouping by member and year
reshaped_med_data = med.groupby(['Member ID Encrypted', 'Incurred Year'], as_index=False).apply(reshape)

Claim ID:  0    180952D2AB000
Name: Claim ID, dtype: object
total claims:  0    0
Name: total_claims, dtype: int64
Claim ID:  0    180952D2AB000
Name: Claim ID, dtype: object
total claims:  0    1
Name: total_claims, dtype: int64
Claim ID:  0    180952D2AB000
Name: Claim ID, dtype: object
total claims:  0    0
Name: total_claims, dtype: int64
Claim ID:  0    180952D2AB000
Name: Claim ID, dtype: object
total claims:  0    1
Name: total_claims, dtype: int64
Claim ID:  13    16335#C2BB000
Name: Claim ID, dtype: object
total claims:  13    0
Name: total_claims, dtype: int64
Claim ID:  13    16335#C2BB000
Name: Claim ID, dtype: object
total claims:  13    1
Name: total_claims, dtype: int64
Claim ID:  14    1621824B33000
Name: Claim ID, dtype: object
total claims:  14    0
Name: total_claims, dtype: int64
Claim ID:  14    1621824B33000
Name: Claim ID, dtype: object
total claims:  14    1
Name: total_claims, dtype: int64
Claim ID:  15    162242#C53000
Name: Claim ID, dtype: object
total claim

In [587]:
reshaped_med_data.iloc[:,0:16]

Unnamed: 0,Unnamed: 1,member_id,year,gender,age,postal_code,county,claims_hypertension,claims_cardiovascular,claims_diabetes,claims_obesity,claims_cancer,claims_musculoskeletal,claims_copd,claims_mental_disorder,claims_cholesterol,total_claims
0,0,710087KT90EY0DZUV,2018,F,47,98335,PIERCE,0,0,0,2,0,0,2,0,0,2
1,13,71008ADZ2VBBDM14C,2016,F,54,98032,KING,0,0,0,0,0,0,0,0,0,2
2,14,7100UNBYH5SYEX19X,2016,F,63,98926,KITTITAS,0,0,0,0,0,0,0,0,0,4
3,21,7100UNBYH5SYEX19X,2018,F,65,98926,KITTITAS,0,0,0,0,0,0,0,0,0,2
4,22,71069K607Y1CQ9BV5,2016,M,70,98059,KING,0,0,0,0,0,0,0,0,0,3
5,27,71069K607Y1CQ9BV5,2017,M,71,98059,KING,2,5,5,0,0,0,2,0,0,5
6,88,71069K607Y1CQ9BV5,2018,M,72,98059,KING,5,6,5,0,0,0,4,0,6,21
7,246,7107GY7V3ZWGK7FKP,2016,F,57,98520,GRAYS HARBOR,1,0,0,0,0,0,0,0,1,6


In [588]:
# Tests if the number of claims for both reshaped and original(actual) are the same
def test(memberid, year):
    reshaped_claims = reshaped_med_data.loc[(reshaped_med_data['member_id'] == memberid) & 
                      (reshaped_med_data['year'] == year),].total_claims.item()
    print('Reshaped number of claims: ', reshaped_claims)
    
    actual = med.loc[(med['Member ID Encrypted'] == memberid) & (med['Incurred Year'] == year),]
    print('Actual number of claims: ', len(actual['Claim ID'].unique()))
    
    print(reshaped_claims == len(actual['Claim ID'].unique()))

In [571]:
test('71069K607Y1CQ9BV5', 2016)

Reshaped number of claims:  3
Actual number of claims:  2
False


In [572]:
test('71069K607Y1CQ9BV5', 2017)

Reshaped number of claims:  5
Actual number of claims:  4
False


In [573]:
test('71069K607Y1CQ9BV5', 2018)

Reshaped number of claims:  21
Actual number of claims:  20
False
