# Data Cleaning: MEPS Medical Conditions Data (2014-2019)

## Import packages and read data

In [92]:
# Import statements
import pandas as pd
import numpy as np

# To show all lines of output
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [93]:
# Reading in the data
data_raw = pd.read_excel('h199.xlsx')

## View the data

In [94]:
# Describing the data
data_raw.describe()
data_raw.columns

# Filter out only columns we need
df = data_raw[['DUPERSID', 'CONDN', 'INJURY']]

Unnamed: 0,DUID,PID,DUPERSID,CONDN,CONDIDX,PANEL,CONDRN,AGEDIAG,CRND1,CRND2,...,ACCDNWRK,HHNUM,IPNUM,OPNUM,OBNUM,ERNUM,RXNUM,PERWT17F,VARSTR,VARPSU
count,112630.0,112630.0,112630.0,112630.0,112630.0,112630.0,112630.0,112630.0,112630.0,112630.0,...,112630.0,112630.0,112630.0,112630.0,112630.0,112630.0,112630.0,112630.0,112630.0,112630.0
mean,54395.458599,103.138471,54395560.0,64.149925,543955600000.0,21.494442,2.122587,7.899689,-0.266465,-0.216346,...,-0.841108,0.111675,0.031484,0.149383,1.687765,0.064796,1.285918,10850.060774,1565.930827,1.626503
std,40149.73144,13.468809,40149730.0,55.829979,401497300000.0,0.499971,1.223972,20.46945,0.820764,0.864854,...,0.73724,0.945474,0.208545,1.476622,5.801767,0.30386,1.889144,8009.562354,487.947178,0.641028
min,10001.0,101.0,10001100.0,11.0,100011000000.0,21.0,1.0,-9.0,-1.0,-1.0,...,-9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1001.0,1.0
25%,14667.0,101.0,14667100.0,21.0,146671000000.0,21.0,1.0,-1.0,-1.0,-1.0,...,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,5602.93077,1086.0,1.0
50%,19569.0,101.0,19569100.0,51.0,195691000000.0,21.0,2.0,-1.0,-1.0,-1.0,...,-1.0,0.0,0.0,0.0,0.0,0.0,1.0,9112.861179,1163.0,2.0
75%,94866.0,102.0,94866100.0,91.0,948661000000.0,22.0,3.0,-1.0,0.0,1.0,...,-1.0,0.0,0.0,0.0,1.0,0.0,2.0,13760.992781,2056.0,2.0
max,99692.0,503.0,99692100.0,581.0,996921000000.0,22.0,5.0,85.0,1.0,1.0,...,3.0,43.0,9.0,140.0,305.0,11.0,40.0,98538.075709,2117.0,3.0


Index(['DUID', 'PID', 'DUPERSID', 'CONDN', 'CONDIDX', 'PANEL', 'CONDRN',
       'AGEDIAG', 'CRND1', 'CRND2', 'CRND3', 'CRND4', 'CRND5', 'INJURY',
       'ACCDNWRK', 'ICD10CDX', 'HHNUM', 'IPNUM', 'OPNUM', 'OBNUM', 'ERNUM',
       'RXNUM', 'PERWT17F', 'VARSTR', 'VARPSU'],
      dtype='object')

In [96]:
# List of column names
list(df.columns)

# Number of unique IDs
df['DUPERSID'].nunique()

# Length of original dataset
len(df)

# Check value counts of injury
df['INJURY'].value_counts(dropna=False)

['DUPERSID', 'CONDN', 'INJURY']

23936

112630

2    105570
1      7060
Name: INJURY, dtype: int64

## Data cleaning

#### Recoding Injury from 1/2 to 1/0

In [126]:
# Change INJURY from 1/2 to 1/0
df.loc[df['INJURY'] == 2] = 0

# Check recoded value counts of injury - good
df['INJURY'].value_counts(dropna=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[df['INJURY'] == 2] = 0


0    105570
1      7060
Name: INJURY, dtype: int64

#### Aggregating data into a new dataframe

In [168]:
# Aggregate data by DUPERSID
df_agg = df.groupby(by="DUPERSID", as_index=False).agg(
    # Create new vars
    NUM_CONDITIONS = ('CONDN', 'count'),
    INJURY2 = ('INJURY', 'max')) # HERE IS THE PROBLEM

#### Checking data cleaning

In [169]:
# Check new column names
list(df_agg.columns)

# Check length of agg df = # of unique ID's
len(df_agg)
df_agg['DUPERSID'].nunique()

# Check that # unique IDs stayed the same in agg df
df['DUPERSID'].nunique() == df_agg['DUPERSID'].nunique()

# Check that sum of # conditions = number of rows of original df
sum(df_agg["NUM_CONDITIONS"]) == len(df)

# See frequencies of "INJURY"
df_agg['INJURY2'].value_counts()

['DUPERSID', 'NUM_CONDITIONS', 'INJURY2']

5031

5031

True

True

1    5030
0       1
Name: INJURY2, dtype: int64

In [164]:
# Checking against original injury var
# Num of 1's should be 3804

test = df.loc[df['INJURY'] == 1]
test.drop_duplicates(subset ="DUPERSID",
                     keep = False, inplace = True)
len(test)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test.drop_duplicates(subset ="DUPERSID",


3804

## Next To Do

In [None]:
# ADD DUPERSID YEAR CODE
# DO FOR ALL YEARS

# CHECK NA's FOR ALL YEAR DATSETS FOR OUR COLUMNS