In [3]:
import polars as pl
from polars import col
pl.__version__

'0.13.43'

### Cleaning the hospitals dataset -- motivation

The hospitals dataset at DoltHub contains 300M rows of prices. 

Each price is given a couple of codes (a primary `code` and an `internal_revenue_code`) which refer to how the procedure is billed.

Participants typically put the most generic code in the `code` column. If there was a second code, it went in the `internal_revenue_code` column. A third code would have gone in the `code_disambiguator` column, but this turned out not to be necessary.

However, because the codes are mixed -- some contain pharmacy codes, random codes, etc. -- it is a bit difficult to work with the data. Plus, it's not clear how many of our rows even have valid codes.

So I wrote a short cleaning pipeline to figure out the coding for each row and to see what fraction of rows were coded in some machine-readable way.

In [885]:
# Textual markers that identify which coding system a value belongs to.
# Every entry is a regular-expression fragment (e.g. 'HCPCS?' matches both
# 'HCPC' and 'HCPCS'), not a plain string.
cpt_marks   = ['CPT', 'HCPCS?/CPT', 'CPT/HCPCS?', 'HCPCS?']
drg_marks   = ['DRG', 'MS-DRG', 'MSD', 'MS']
ndc_marks   = ['NDC']
# Raw string so the '\d' reaches the regex engine without depending on Python
# passing an unrecognised escape sequence through unchanged.
other_marks = ['CDM', r'PH\d+']

def list_to_regexp(items, b = True):
    '''Build one alternation pattern that matches any entry of `items`.

    b = True:  each entry is wrapped in word-boundary anchors
    b = False: entries are joined exactly as given
    '''
    alternatives = [fr'\b{item}\b' for item in items] if b else items
    return '|'.join(alternatives)

def in_column(colname, items, b = True):
    '''Boolean expression: does `colname` match at least one of `items`?

    `b` is forwarded to list_to_regexp and switches the word-boundary
    wrapping of each item on or off.
    '''
    pattern = list_to_regexp(items, b)
    return col(colname).str.contains(pattern)

def extract_from(colname, items):
    '''Expression collecting every match of any pattern in `items` from `colname`.

    The patterns are joined with '|' as given; no word boundaries are added.
    '''
    pattern = '|'.join(items)
    return col(f'{colname}').str.extract_all(pattern)

def extract_cpt(df):
    '''
    Extract CPT singlet codes (i.e. not ranges) from the 'code' column.

    Returns a new frame with an added 'cpts_extracted' column. Rows where a
    DRG / NDC / other marker is found in the range-stripped code are left null.
    The temporary helper columns are dropped before returning.
    '''

    # regexp to capture procedural terminology codes from 00100-99499
    # https://3widgets.com/: used this to generate the numeric aspect of the code

    # We make a column 'code_' where we'll put the partially cleaned data
    # The lack of positive/negative lookaheads in the rust regex parser
    # forces us to work in two passes.

    cpt_i_codes  = ['0010[0-9]','001[1-9][0-9]','00[2-9][0-9]{2}',
                    '0[1-9][0-9]{3}','[1-8][0-9]{4}','9[0-8][0-9]{3}',
                    '99[0-4][0-9]{2}']

    # Raw strings: these are regex fragments, so '\d' and '\s' must reach the
    # regex engine untouched.
    cpt_ii_codes = [r'[ACEGJKLMQTV]\d{4}', r'\d{4}[ACEGJKLMQTV]']

    code_ranges = [r'([ACEGJKLMQTV]|\d)\d{3}([ACEGJKLMQTV]|\d)\s?-\s?([ACEGJKLMQTV]|\d)\d{3}([ACEGJKLMQTV]|\d)']

    cpt_found = in_column('code_', cpt_i_codes + cpt_ii_codes)
    other_marks_found = in_column('code_', drg_marks + ndc_marks + other_marks)

    df_ = df.with_columns([

        # Erase CPT code ranges (like 00100-00111, which confuse the regex) and create new column

        col('code').str.replace_all(list_to_regexp(code_ranges), '').alias('code_')

    ]).with_columns([

        # Extract CPT codes from column with ranges erased
        # This extracts any code like 00100 or A1921
        # NOTE(review): detection above uses word boundaries, while
        # extract_from joins the patterns without them -- confirm this is intended.

        pl.when(cpt_found & ~other_marks_found).then(
            extract_from('code_', cpt_i_codes + cpt_ii_codes)
        ).otherwise(None).alias('cpts_extracted'),

        # Sometimes CPT/HCPCS is marked in the column.
        # In this case, extract the code that follows the marker
        # This extracts codes like CPT1234 --> 1234
        # The single match is wrapped in a list so it has the same shape as
        # the extract_all result it is used to fill.

        pl.when(in_column('code_', cpt_marks, b = False) & ~other_marks_found).then(
            col('code').str.extract(r'\b(?:CPT|HCPCS?/CPT|CPT/HCPCS?|HCPCS?)\s?([A-Z]?\d{3,5}[A-Z]?)\b', 1)
        ).otherwise(None).apply(lambda x: [x]).alias('cpts_extracted_')

    ])

    # This is a workaround for what (I believe) is a polars bug:
    # only fill when the marker-based column has at least one value.
    if any(df_['cpts_extracted_'].is_not_null()):
        df_ = df_.with_column(
            col('cpts_extracted').fill_null(col('cpts_extracted_'))
        )

    return df_.drop(['code_', 'cpts_extracted_'])
        
def extract_drg(df):
    '''
    Extract DRG singlet codes (i.e. not ranges) into a 'drg_extracted' column.

    Markers are searched in 'code' and 'internal_revenue_code' (joined into a
    temporary 'glob' column); the code itself is always read from 'code'.
    Returns a new frame with the temporary columns dropped.
    '''

    newcolname = 'glob'
    other_marks_found = in_column(newcolname, cpt_marks + ndc_marks + other_marks)

    df_ = df.with_column(

        # Join both code columns with '~', then blank out 'FY', standalone
        # numbers of 4+ digits, 'V' followed by two digits, and parentheses.
        # Raw string so the regex escapes are passed through verbatim.

        pl
        .concat_str(['code', 'internal_revenue_code'], '~')
        .str.replace_all(r'\bFY\b|\b\d{4,}\b|\bV\d{2}\b|\(|\)', '')
        .alias(newcolname)

    ).with_columns([

        # Extract DRG codes from code column when the
        # code column simply contains a 1-3 digit number
        # (optionally followed by a '-' and a 1-2 digit suffix, which is discarded)

        pl.when(
            col('code').str.contains(r'^\d{1,3}(-\d{1,2})?$')
        ).then(
            col('code').str.extract(r'^(\d{1,3})(-\d{1,2})?$', 1)
        ).otherwise(None).alias('drg_extracted'),

        # Sometimes DRG is marked in the column.
        # In this case, extract the code that follows the marker
        # NOTE(review): the marker is looked up in the joined column but the
        # number is taken from 'code' only -- confirm this is intended.

        pl.when(
            in_column(newcolname, drg_marks, b = False) & ~other_marks_found
        ).then(
            col('code').str.extract(r'\b(?:MS|MSD)?(?:-|\s)?(?:DRG)?(\d{1,3})\b', 1)
        ).otherwise(None).alias('drg_extracted_')

    ]).with_column(
        # Prefer the bare-number match; fall back to the marker-based match.
        col('drg_extracted').fill_null(col('drg_extracted_'))
    ).drop(['drg_extracted_', newcolname])

    return df_

def extract_ndc(df):
    '''
    Extract NDC singlet codes (i.e. not ranges) into an 'ndc_extracted' column.

    First looks for a well-formed NDC string in 'code'. If none is found and
    'NDC' appears anywhere in 'code', 'internal_revenue_code' or 'description',
    falls back to the first 12-character run of digits and hyphens in those
    columns. Returns a new frame with the temporary columns dropped.

    See: https://en.wikipedia.org/wiki/National_drug_code
    '''

    # Hyphenated 4-4-2, 5-3-2 and 5-4-1 layouts, each with an optional
    # leading/trailing 0. Raw strings so '\d' is passed to the regex verbatim.
    ndc_codes = [r'0?\d{4}-\d{4}-\d{2}0?', r'0?\d{5}-\d{3}-\d{2}0?', r'0?\d{5}-\d{4}-\d{1}0?']

    df_ = df.with_column(

        pl
        .concat_str(['code', 'internal_revenue_code', 'description'], '~')
        .alias('glob')

    ).with_columns([

        # Extract NDC codes from code column when the
        # code column contains an NDC string

        pl.when(
            in_column('code', ndc_codes)
        ).then(
            # The combined pattern has no capture groups, so ask for group 0
            # (the whole match); the default group 1 does not exist here.
            col('code').str.extract(list_to_regexp(ndc_codes), 0)
        ).otherwise(None).alias('ndc_extracted'),

        # Sometimes NDC is marked in the column.
        # In this case, extract the code that follows the marker
        # Can use more generous searching

        pl.when(
            in_column('glob', ['NDC'], b = False)
        ).then(
            col('glob').str.extract(r'([0-9-]{12})', 1)
        ).otherwise(None).alias('ndc_extracted_')

    ]).with_column(
        # Prefer the strict match; fall back to the marker-based match.
        col('ndc_extracted').fill_null(col('ndc_extracted_'))
    ).drop(['ndc_extracted_', 'glob'])

    return df_
    
def extract_cpt_range(df):
    '''
    Extract CPT ranges from the 'code' column.

    Returns a new frame with a 'cpt_ranges_extracted' column holding every
    range found in 'code' (two 5-character codes separated by a hyphen, with
    optional whitespace around it), or null when there is none.
    '''

    # Raw string: '\d' and '\s' are regex escapes, not Python escapes.
    code_ranges = [r'([ACEGJKLMQTV]|\d)\d{3}([ACEGJKLMQTV]|\d)\s?-\s?([ACEGJKLMQTV]|\d)\d{3}([ACEGJKLMQTV]|\d)']

    df_ = df.with_columns([

        pl.when(
            in_column('code', code_ranges)
        ).then(
            col('code').str.extract_all(list_to_regexp(code_ranges))
        ).otherwise(None).alias('cpt_ranges_extracted'),

    ])

    return df_

In [902]:
# Lazily scan the prices CSV. Only the first 20M rows are requested, so every
# figure computed below describes that subset, not the full dataset.
# infer_schema_length = 0 skips dtype inference; the preview below shows every
# column as str.
lf = pl.scan_csv('../prices.csv', n_rows = 20_000_000, infer_schema_length = 0, encoding = 'utf8-lossy')

In [908]:
# Preview two rows to check the column names and dtypes before collecting.
lf.fetch(2)

cms_certification_num,payer,code,internal_revenue_code,units,description,inpatient_outpatient,price,code_disambiguator
str,str,str,str,str,str,str,str,str
"""010001""","""BLUE ADVANTAGE...","""HCPCS 82441""","""3018244101""",,"""HC TEST FOR CH...","""UNSPECIFIED""","""279.02""","""NONE"""
"""010001""","""BLUE CROSS OF ...","""HCPCS 82441""","""3018244101""",,"""HC TEST FOR CH...","""UNSPECIFIED""","""279.02""","""NONE"""


In [889]:
# Materialise the scanned rows and run each extractor in turn; every step adds
# one '*_extracted' column. The unused 'units' and 'code_disambiguator' columns
# are dropped and the two low-cardinality columns are stored as categoricals.
df = (lf
      .collect()
      .pipe(extract_ndc)
      .pipe(extract_cpt)
      .pipe(extract_drg)
      .pipe(extract_cpt_range)
      .drop(['units', 'code_disambiguator'])
      .with_columns(
          pl.col(['cms_certification_num', 'inpatient_outpatient']).cast(pl.Categorical)
      )
     )

In [916]:
# Spot-check five random rows of the cleaned frame.
df.sample(5)

cms_certification_num,payer,code,internal_revenue_code,description,inpatient_outpatient,price,ndc_extracted,cpts_extracted,drg_extracted,cpt_ranges_extracted
cat,str,str,str,str,cat,str,str,list[str],str,list[str]
"""033302""","""HUMANA""","""72428376""","""0276""","""LENS AL MA30AC...","""UNSPECIFIED""","""122.26""",,,,
"""030038""","""HUMANA GOLD PL...","""NONE""","""260""","""IV THERAPY - G...","""UNSPECIFIED""","""411.00""",,,,
"""040119""","""HUMANACHOICEPP...","""82104""","""NONE""","""ALPHA-1-ANTITR...","""OUTPATIENT""","""0.00""",,"[""82104""]",,
"""010083""","""UHC""","""72081""","""NONE""","""SP-ENTIRE SPNE...","""UNSPECIFIED""","""402.20""",,"[""72081""]",,
"""033302""","""AETNA""","""72414757""","""0278""","""SCRW SHNZ 294....","""UNSPECIFIED""","""261.69""",,,,


In [891]:
# Rows for which none of the four extractors produced anything.
g = df.filter(
    col('ndc_extracted').is_null()  & 
    col('cpts_extracted').is_null() &
    col('drg_extracted').is_null()  &
    col('cpt_ranges_extracted').is_null()
)

In [918]:
# Spot-check five random rows that have no extracted code.
g.sample(5)

cms_certification_num,payer,code,internal_revenue_code,description,inpatient_outpatient,price,ndc_extracted,cpts_extracted,drg_extracted,cpt_ranges_extracted
cat,str,str,str,str,cat,str,str,list[str],str,list[str]
"""010011""","""CASH CHARGE""","""CDM70642941""","""302""","""FEE LCM AB IGM...","""UNSPECIFIED""","""13.44""",,,,
"""033302""","""AFMC""","""70143695""","""0278""","""SCREW CANCELLO...","""UNSPECIFIED""","""419.25""",,,,
"""033302""","""BLUE CROSS""","""72413877""","""0278""","""PLTE TUB 1 3 3...","""UNSPECIFIED""","""274.40""",,,,
"""040018""","""SIHO""","""NONE""","""360""","""HC SKIN SUB HE...","""INPATIENT""","""321.64""",,,,
"""013033""","""CAREWORKS (CRW...","""NONE""","""270""","""ESSENTIALCARE ...","""INPATIENT""","""9.60""",,,,


In [892]:
# Fraction of the loaded rows with no extracted code of any kind.
len(g)/len(df)

0.47415745