In [3]:
import polars as pl
from polars import col
pl.__version__

'0.13.43'

### Cleaning the hospitals dataset

In [849]:
# Marker strings that reveal which coding system a raw value belongs to.
# Each entry is a regex fragment; in_column() joins them into an alternation.
cpt_marks   = ['CPT', 'HCPCS?/CPT', 'CPT/HCPCS?', 'HCPCS?']
drg_marks   = ['DRG', 'MS-DRG', 'MSD', 'MS']
ndc_marks   = ['NDC']
# Raw string: '\d' is not a valid Python string escape, so a plain literal
# triggers an invalid-escape warning even though the value is the same.
other_marks = ['CDM', r'PH\d+']

def list_to_regexp(items, b = True):
    '''Build a single alternation pattern that matches any entry of `items`.

    b = True: every entry is wrapped in word boundaries
    b = False: entries are joined unchanged
    '''
    alternatives = [fr'\b{entry}\b' for entry in items] if b else items
    return '|'.join(alternatives)

def in_column(colname, items, b = True):
    '''Boolean expression: does `colname` match at least one of `items`?

    `b` is passed through to list_to_regexp (word boundaries on/off).
    '''
    column = col(colname)
    pattern = list_to_regexp(items, b)
    return column.str.contains(pattern)

def extract_from(colname, items):
    '''Expression that collects every match of any pattern in `items` from `colname`.

    The patterns are joined as-is; no word boundaries are added.
    '''
    source = col(f'{colname}')
    alternation = '|'.join(items)
    return source.str.extract_all(alternation)

def extract_cpt(df):
    '''
    Extract CPT singlet codes (i.e. not ranges) from the 'code' column.

    Returns `df` with one extra column, 'cpt_extracted'. The helper columns
    'code_' and 'cpt_extracted_' are created along the way and dropped
    before returning.
    '''

    # regexp to capture procedural terminology codes from 00100-99499
    # https://3widgets.com/: used this to generate the numeric aspect of the code
    cpt_i_codes  = ['0010[0-9]','001[1-9][0-9]','00[2-9][0-9]{2}',
                    '0[1-9][0-9]{3}','[1-8][0-9]{4}','9[0-8][0-9]{3}',
                    '99[0-4][0-9]{2}']

    cpt_ii_codes = [r'[ACEGJKLMQTV]\d{4}', r'\d{4}[ACEGJKLMQTV]']

    # Sometimes codes come in ranges (like 00100-00111). They are erased
    # before the singlet search so that their endpoints are not picked up.
    code_ranges = [r'[A-Z]?\d{3,5}[A-Z]?-[A-Z]?\d{3,5}[A-Z]?']

    # The lack of positive/negative lookaheads in the regex parser
    # forces us to work in two passes over a partially cleaned column.
    cpt_found = in_column('code_', cpt_i_codes + cpt_ii_codes)
    other_marks_found = in_column('code_', drg_marks + ndc_marks + other_marks)

    df_ = df.with_columns([

        # Pass 1: erase code ranges and store the result in a new column.
        # (Previously referenced an undefined name, `cpt_ranges`.)
        col('code').str.replace_all(list_to_regexp(code_ranges), '').alias('code_')

    ]).with_columns([

        # Pass 2a: extract CPT codes from the column with ranges erased.
        # This extracts any code like 00100 or A1921.
        # (Previously referenced an undefined name, `cpt_codes`.)
        pl.when(cpt_found & ~other_marks_found).then(
            extract_from('code_', cpt_i_codes + cpt_ii_codes)
        ).otherwise(None).alias('cpt_extracted'),

        # Pass 2b: sometimes CPT/HCPCS is marked in the column.
        # In this case, extract the code that follows the marker.
        # This extracts codes like CPT1234 --> 1234.
        # The single match is wrapped in a list so that it has the same
        # shape as the extract_all result above.
        pl.when(in_column('code_', cpt_marks, b = False) & ~other_marks_found).then(
            col('code').str.extract(r'\b(?:CPT|HCPCS?/CPT|CPT/HCPCS?|HCPCS?)\s?([A-Z]?\d{3,5}[A-Z]?)\b', 1)
        ).otherwise(None).apply(lambda x: [x]).alias('cpt_extracted_')

    ])

    # This is a workaround for what (I believe) is a polars bug:
    # only merge the marker-based matches when there is at least one.
    if any(df_['cpt_extracted_'].is_not_null()):
        df_ = df_.with_column(
            col('cpt_extracted').fill_null(col('cpt_extracted_'))
        )

    return df_.drop(['code_', 'cpt_extracted_'])
        
def extract_drg(df):
    '''
    Extract DRG singlet codes (i.e. not ranges) from multiple columns.
    Searches principally 'code' column, followed by 'internal_revenue_code'

    Returns `df` with one extra column, 'drg_extracted'. The helper columns
    'glob' and 'drg_extracted_' are dropped before returning.
    '''

    newcolname = 'glob'
    other_marks_found = in_column(newcolname, cpt_marks + ndc_marks + other_marks)

    df_ = df.with_column(

        # Join the searched columns, then strip tokens that would confuse
        # the DRG search: the word FY, numbers of 4+ digits, V followed by
        # two digits, and parentheses.
        pl
        .concat_str(['code', 'internal_revenue_code'], '~')
        .str.replace_all(r'\bFY\b|\b\d{4,}\b|\bV\d{2}\b|\(|\)', '')
        .alias(newcolname)

    ).with_columns([

        # Extract DRG codes from the code column when the code column
        # consists solely of a 1-3 digit number, optionally followed by
        # a dash and 1-2 more digits (the suffix is discarded).

        pl.when(
            col('code').str.contains(r'^\d{1,3}(-\d{1,2})?$')
        ).then(
            col('code').str.extract(r'^(\d{1,3})(-\d{1,2})?$', 1)
        ).otherwise(None).alias('drg_extracted'),

        # Sometimes DRG is marked in the column.
        # In this case, extract the code that follows the marker.
        # NOTE(review): the marker is looked up in the joined column but
        # the number is taken from 'code' only -- confirm this is intended.

        pl.when(
            in_column(newcolname, drg_marks, b = False) & ~other_marks_found
        ).then(
            col('code').str.extract(r'\b(?:MS|MSD)?(?:-|\s)?(?:DRG)?(\d{1,3})\b', 1)
        ).otherwise(None).alias('drg_extracted_')

    ]).with_column(
        # Prefer the bare-number match; fall back to the marker-based one.
        col('drg_extracted').fill_null(col('drg_extracted_'))
    ).drop(['drg_extracted_', newcolname])

    return df_

def extract_ndc(df):
    '''
    Extract NDC singlet codes (i.e. not ranges) from multiple columns.
    Searches principally 'code' column, followed by 'internal_revenue_code'
    and 'description'

    Returns `df` with one extra column, 'ndc_extracted'. The helper columns
    'glob' and 'ndc_extracted_' are dropped before returning.

    See: https://en.wikipedia.org/wiki/National_drug_code
    '''

    newcolname = 'glob'
    # Dashed layouts 4-4-2, 5-3-2 and 5-4-1, each with an optional
    # leading and trailing zero.
    ndc_codes = [r'0?\d{4}-\d{4}-\d{2}0?', r'0?\d{5}-\d{3}-\d{2}0?', r'0?\d{5}-\d{4}-\d{1}0?']
    ndc_pattern = list_to_regexp(ndc_codes)

    df_ = df.with_column(

        pl
        .concat_str(['code', 'internal_revenue_code', 'description'], '~')
        .alias(newcolname)

    ).with_columns([

        # Extract NDC codes from code column when the
        # code column contains an NDC string.
        # str.extract returns a capture group, so the alternation is
        # wrapped in one; without a group there is nothing to return and
        # every row comes back null.

        pl.when(
            in_column('code', ndc_codes)
        ).then(
            col('code').str.extract(f'({ndc_pattern})', 1)
        ).otherwise(None).alias('ndc_extracted'),

        # Sometimes NDC is marked in the column.
        # In this case, extract the code that follows the marker
        # Can use more generous searching: any run of 12 digits/dashes.

        pl.when(
            in_column(newcolname, ['NDC'], b = False)
        ).then(
            col(newcolname).str.extract(r'([0-9-]{12})', 1)
        ).otherwise(None).alias('ndc_extracted_')

    ]).with_column(
        # Prefer the strict match; fall back to the marker-based one.
        col('ndc_extracted').fill_null(col('ndc_extracted_'))
    ).drop(['ndc_extracted_', newcolname])

    return df_
    

In [850]:
# Run the three extractors in sequence: NDC first, then CPT, then DRG.
s = df.pipe(extract_ndc).pipe(extract_cpt).pipe(extract_drg)

In [851]:
# Rows for which none of the three extractors produced a code.
g = (
    s
    .filter(col('ndc_extracted').is_null())
    .filter(col('cpt_extracted').is_null())
    .filter(col('drg_extracted').is_null())
)

In [338]:
# Lazily scan at most the first 10M rows of the price file.
# infer_schema_length = 0 skips type inference, so every column is read as a string.
lf = pl.scan_csv('../prices.csv', n_rows = 10_000_000, infer_schema_length = 0, encoding = 'utf8-lossy')

In [339]:
# Materialize the lazy scan into an in-memory DataFrame.
df = lf.collect()

In [8]:
# len(df.filter(~col('drg')).filter(~col('cpt')).filter(~col('ndc')))/len(df)

In [None]:
def classify(df):
    '''
    Label each row with the coding system it most likely belongs to.

    Adds two columns to `df`:
      - 'glob': 'code', 'description' and 'internal_revenue_code' joined by '~'
      - 'classification': 'ndc', 'cpt', 'unit', 'msdrg', or null when no
        rule matches. Rules are tried in that order; the first match wins.
    '''

    def in_col(colname, strs):
        '''Does the column match any pattern in `strs` (each wrapped in word boundaries)?'''
        strs_pat = '|'.join([r'\b' + s + r'\b' for s in strs])
        return col(colname).str.contains(strs_pat)

    def in_glob(strs):
        '''Same as in_col, applied to the joined 'glob' column.'''
        return in_col('glob', strs)

    code_cpt_num  = ['0010[0-9]','001[1-9][0-9]','00[2-9][0-9]{2}','0[1-9][0-9]{3}','[1-8][0-9]{4}','9[0-8][0-9]{3}','99[0-4][0-9]{2}']
    code_cpt_alph = [r'[^-][A-Z]\d{4}', r'[^-]\d{4}[A-Z]']
    # The second entry used to read 'DRG\{3}', which matches a literal "{3}"
    # rather than three digits.
    ms_drg_strs   = [r'MS\d{3}', r'DRG\d{3}', 'MS-DRG', 'DRG']

    has_cpt_code  = in_glob(code_cpt_num)
    has_ndc_code  = in_glob([r'\d{4}-\d{4}-\d{2}', r'\d{5}-\d{3}-\d{2}', r'\d{5}-\d{4}-\d{1}'])
    has_ndc_str   = in_glob(['NDC'])
    has_cpt_str   = in_glob(['HCPCS', 'HCPC', 'CPT'])
    has_unit      = in_glob(['VIAL', 'SOLN', 'SOLUTION', 'TBCR', 'TABS', r'\d{1,5}\s?/?M?C?[GL]', r'\d{1,5}[0]{1,5}U', r'\d{4}V'])
    has_msdrg_str = in_glob(ms_drg_strs)

    # An explicit NDC marker wins; an NDC-shaped number counts only when
    # no CPT/HCPCS marker is present.
    ndc_cond = has_ndc_str | (has_ndc_code & (~has_cpt_str))
    cpt_cond = has_cpt_str | (has_cpt_code & (~ndc_cond)) | in_col('code', code_cpt_alph)

    # The draft of this cascade also called in_col('code', ) twice without
    # a pattern list and left a parenthesis open, so it could not run.
    # Those calls are omitted because their intended patterns are unknown.
    # pl.lit() makes explicit that the labels are string values.
    return df.with_columns([
        pl.concat_str([col('code'), col('description'), col('internal_revenue_code')], '~').alias('glob')
    ]).with_columns([
        pl.when(ndc_cond).then(pl.lit('ndc')).otherwise(
            pl.when(cpt_cond).then(pl.lit('cpt')).otherwise(
                pl.when(has_unit).then(pl.lit('unit')).otherwise(
                    pl.when(has_msdrg_str).then(pl.lit('msdrg')).otherwise(
                        None
                    )
                )
            )
        ).alias('classification')
    ])
    

In [None]:
def extract_codes(df):
    '''
    Extract every CPT code, CPT code range and NDC code found in each row.

    Adds three list columns to `df`:
      - 'cpt_code': matches of `proc_codes` in 'code'
      - 'cpt_code_range': matches of `proc_code_ranges` in 'code'
      - 'ndc_code': matches of `drug_codes` in 'code', 'description' and
        'internal_revenue_code' joined by '~'

    NOTE(review): proc_codes, proc_code_ranges and drug_codes are read from
    the enclosing scope and are not defined in the cells shown here --
    confirm they exist before running.
    '''

    glob = pl.concat_str([col('code'), col('description'), col('internal_revenue_code')], '~')

    return df.with_columns([
        col('code').str.extract_all(list_to_regexp(proc_codes)).alias('cpt_code'),
        col('code').str.extract_all(list_to_regexp(proc_code_ranges)).alias('cpt_code_range'),
        # Previously called the misspelled name `ist_to_regexp`.
        glob.str.extract_all(list_to_regexp(drug_codes)).alias('ndc_code')
    ])
# .explode(['cpt_i', 'ndc'])
    
# Lazily scan at most the first 1M rows of the price file.
# infer_schema_length = 0 skips type inference, so every column is read as a string.
lf = pl.scan_csv('../prices.csv', n_rows = 1_000_000, infer_schema_length = 0, encoding = 'utf8-lossy')

In [None]:
# Digit patterns that together cover the numeric codes 00100-99499,
# listed in ascending order.
cpt_code_i_pats = [
    '0010[0-9]',
    '001[1-9][0-9]',
    '00[2-9][0-9]{2}',
    '0[1-9][0-9]{3}',
    '[1-8][0-9]{4}',
    '9[0-8][0-9]{3}',
    '99[0-4][0-9]{2}',
]

# Every "low-high" pairing of two different patterns, where the low
# pattern comes earlier in the list than the high one.
arr = [
    f'{low}-{high}'
    for position, low in enumerate(cpt_code_i_pats)
    for high in cpt_code_i_pats[position + 1:]
]

In [109]:
# Apply extract_cpt_codes to the lazy frame, then materialize the result.
# NOTE(review): extract_cpt_codes is not defined in the cells shown here -- confirm it still exists.
df = lf.pipe(extract_cpt_codes).collect()

In [112]:
# Show only the rows that have a non-null 'cpt_i_range' value.
df.filter(col('cpt_i_range').is_not_null())

cms_certification_num,payer,code,internal_revenue_code,units,description,inpatient_outpatient,price,code_disambiguator,cpt_i,cpt_i_range,ndc
str,str,str,str,str,str,str,str,str,list[str],list[str],list[str]
"""050014""","""AETNA COMMERCIAL OUT OF NETWORK""","""NDC 00574-06915""","""250""",,"""GLUCOSE 40% GEL 37.5 ML TUBE""","""INPATIENT""","""1.84""","""IP_RX-81000000-ERX50005630""","[""00574"", ""06915""]","[""00574-06915""]",
"""050014""","""AETNA HMO/PPO""","""NDC 00574-06915""","""250""",,"""GLUCOSE 40% GEL 37.5 ML TUBE""","""INPATIENT""","""1.62""","""IP_RX-81000000-ERX50005630""","[""00574"", ""06915""]","[""00574-06915""]",
"""050014""","""ANTHEM COMMERCIAL OUT OF NETWORK""","""NDC 00574-06915""","""250""",,"""GLUCOSE 40% GEL 37.5 ML TUBE""","""INPATIENT""","""1.60""","""IP_RX-81000000-ERX50005630""","[""00574"", ""06915""]","[""00574-06915""]",
"""050014""","""ANTHEM HMO/PPO""","""NDC 00574-06915""","""250""",,"""GLUCOSE 40% GEL 37.5 ML TUBE""","""INPATIENT""","""1.60""","""IP_RX-81000000-ERX50005630""","[""00574"", ""06915""]","[""00574-06915""]",
"""050014""","""BLUE SHIELD COMMERCIAL OUT OF NETWORK""","""NDC 00574-06915""","""250""",,"""GLUCOSE 40% GEL 37.5 ML TUBE""","""INPATIENT""","""1.90""","""IP_RX-81000000-ERX50005630""","[""00574"", ""06915""]","[""00574-06915""]",
"""050014""","""BLUE SHIELD HMO/PPO""","""NDC 00574-06915""","""250""",,"""GLUCOSE 40% GEL 37.5 ML TUBE""","""INPATIENT""","""1.60""","""IP_RX-81000000-ERX50005630""","[""00574"", ""06915""]","[""00574-06915""]",
"""050014""","""BLUE SHIELD INDIVIDUAL""","""NDC 00574-06915""","""250""",,"""GLUCOSE 40% GEL 37.5 ML TUBE""","""INPATIENT""","""1.60""","""IP_RX-81000000-ERX50005630""","[""00574"", ""06915""]","[""00574-06915""]",
"""050014""","""CIGNA COMMERCIAL OUT OF NETWORK""","""NDC 00574-06915""","""250""",,"""GLUCOSE 40% GEL 37.5 ML TUBE""","""INPATIENT""","""1.90""","""IP_RX-81000000-ERX50005630""","[""00574"", ""06915""]","[""00574-06915""]",
"""050014""","""CIGNA HMO/PPO""","""NDC 00574-06915""","""250""",,"""GLUCOSE 40% GEL 37.5 ML TUBE""","""INPATIENT""","""1.90""","""IP_RX-81000000-ERX50005630""","[""00574"", ""06915""]","[""00574-06915""]",
"""050014""","""GROSS CHARGE""","""NDC 00574-06915""","""250""",,"""GLUCOSE 40% GEL 37.5 ML TUBE""","""INPATIENT""","""2.00""","""IP_RX-81000000-ERX50005630""","[""00574"", ""06915""]","[""00574-06915""]",


In [99]:
# Row count once every element of the 'cpt_i' list column gets its own row.
len(df.explode(col('cpt_i')))

10790782

In [94]:
import sys
# sys.getsizeof is shallow: it measures only the Python wrapper object,
# not the data the frame holds, so this is not the frame's memory footprint.
sys.getsizeof(df)

48

In [125]:
# Joined text of the three descriptive columns, and whether it mentions NDC.
glob_text = pl.concat_str([col('code'), col('description'), col('internal_revenue_code')], '~')
mentions_ndc = glob_text.str.contains('\\bNDC\\b')

d = df.filter(~mentions_ndc).select(glob_text)  # rows without an NDC marker
s = df.filter(mentions_ndc).select(glob_text)   # rows with an NDC marker



In [None]:
# Display the joined text of the rows that do not mention NDC.
d

In [168]:
# Length check on a sample all-digit code (presumably an undashed NDC -- confirm).
len('00002418230')

11

In [260]:
def classify(df):
    '''
    Label each row with the coding system it most likely belongs to.

    Adds two columns to `df`:
      - 'glob': 'code', 'description' and 'internal_revenue_code' joined by '~'
      - 'classification': 'ndc', 'cpt', 'unit', 'msdrg', or null when no
        rule matches. Rules are tried in that order; the first match wins.
    '''

    def in_col(colname, strs):
        '''Does the column match any pattern in `strs` (each wrapped in word boundaries)?'''
        strs_pat = '|'.join([r'\b' + s + r'\b' for s in strs])
        return col(colname).str.contains(strs_pat)

    def in_glob(strs):
        '''Same as in_col, applied to the joined 'glob' column.'''
        return in_col('glob', strs)

    code_cpt_num  = ['0010[0-9]','001[1-9][0-9]','00[2-9][0-9]{2}','0[1-9][0-9]{3}','[1-8][0-9]{4}','9[0-8][0-9]{3}','99[0-4][0-9]{2}']
    code_cpt_alph = [r'[^-][A-Z]\d{4}', r'[^-]\d{4}[A-Z]']
    # The second entry used to read 'DRG\{3}', which matches a literal "{3}"
    # rather than three digits.
    ms_drg_strs   = [r'MS\d{3}', r'DRG\d{3}', 'MS-DRG', 'DRG']

    has_cpt_code  = in_glob(code_cpt_num)
    has_ndc_code  = in_glob([r'\d{4}-\d{4}-\d{2}', r'\d{5}-\d{3}-\d{2}', r'\d{5}-\d{4}-\d{1}'])
    has_ndc_str   = in_glob(['NDC'])
    has_cpt_str   = in_glob(['HCPCS', 'HCPC', 'CPT'])
    has_unit      = in_glob(['VIAL', 'SOLN', 'SOLUTION', 'TBCR', 'TABS', r'\d{1,5}\s?/?M?C?[GL]', r'\d{1,5}[0]{1,5}U', r'\d{4}V'])
    has_msdrg_str = in_glob(ms_drg_strs)

    # An explicit NDC marker wins; an NDC-shaped number counts only when
    # no CPT/HCPCS marker is present.
    ndc_cond = has_ndc_str | (has_ndc_code & (~has_cpt_str))
    cpt_cond = has_cpt_str | (has_cpt_code & (~ndc_cond)) | in_col('code', code_cpt_alph)

    # The draft of this cascade also called in_col('code', ) twice without
    # a pattern list and left a parenthesis open, so it could not run.
    # Those calls are omitted because their intended patterns are unknown.
    # pl.lit() makes explicit that the labels are string values.
    return df.with_columns([
        pl.concat_str([col('code'), col('description'), col('internal_revenue_code')], '~').alias('glob')
    ]).with_columns([
        pl.when(ndc_cond).then(pl.lit('ndc')).otherwise(
            pl.when(cpt_cond).then(pl.lit('cpt')).otherwise(
                pl.when(has_unit).then(pl.lit('unit')).otherwise(
                    pl.when(has_msdrg_str).then(pl.lit('msdrg')).otherwise(
                        None
                    )
                )
            )
        ).alias('classification')
    ])
    

In [261]:
# Classify the first 1000 rows whose code does not contain 'CDM', then
# show a random sample of 40 rows that were left unclassified.
(
    df
    .filter(~col('code').str.contains('CDM'))[:1000]
    .pipe(classify)
    .filter(col('classification').is_null())
    .sample(40)
)

cms_certification_num,payer,code,internal_revenue_code,units,description,inpatient_outpatient,price,code_disambiguator,cpt_i,cpt_i_range,ndc,glob,classification
str,str,str,str,str,str,str,str,str,list[str],list[str],list[str],str,str
"""010001""","""AMERIGROUP MEDICAID [350002]""","""1100000002""","""1100000002""",,"""HC FBC PRIVATE ROOM DAILY""","""UNSPECIFIED""","""1736.00""","""NONE""",,,,"""1100000002~HC FBC PRIVATE ROOM DAILY~1100000002""",
"""010001""","""CARESOURCE [100121]""","""1100000002""","""1100000002""",,"""HC FBC PRIVATE ROOM DAILY""","""UNSPECIFIED""","""1736.00""","""NONE""",,,,"""1100000002~HC FBC PRIVATE ROOM DAILY~1100000002""",
"""010001""","""GEHA [100039]""","""1100000001""","""1100000001""",,"""HC ACUTE CARE PRIVATE ROOM DAILY""","""UNSPECIFIED""","""1736.00""","""NONE""",,,,"""1100000001~HC ACUTE CARE PRIVATE ROOM DAILY~1100000001""",
"""010001""","""CARESOURCE [100121]""","""27200000S1""","""ITM921139""",,"""KIT ARTHROSCOPIC FIXATION 2.4MM DRILL SPEAR CANNULATED OBTURATOR 2.4MM MINI HIP PUSHLOCK DISPOSABLE""","""UNSPECIFIED""","""1100.00""","""NONE""",,,,"""27200000S1~KIT ARTHROSCOPIC FIXATION 2.4MM DRILL SPEAR CANNULATED OBTURATOR 2.4MM MINI HIP PUSHLOCK DISPOSABLE~ITM921139""",
"""010001""","""UNITED HEALTHCARE [100060]""","""1100000001""","""1100000001""",,"""HC ACUTE CARE PRIVATE ROOM DAILY""","""UNSPECIFIED""","""1736.00""","""NONE""",,,,"""1100000001~HC ACUTE CARE PRIVATE ROOM DAILY~1100000001""",
"""010001""","""GEHA [100039]""","""27200000S1""","""ITM921135""",,"""KIT VITRECTOMY VISCOUS FLUID EXTRACTION DISPOSABLE STELLARIS PC STERILE LATEX FREE""","""UNSPECIFIED""","""285.00""","""NONE""",,,,"""27200000S1~KIT VITRECTOMY VISCOUS FLUID EXTRACTION DISPOSABLE STELLARIS PC STERILE LATEX FREE~ITM921135""",
"""010001""","""WELLCARE MEDICARE [450023]""","""27200000S1""","""ITM921070""",,"""PACK VITRECTOMY 25GA BAUSCH + LOMB ELITE POSTERIOR WIDE FIELD STERILE LATEX FREE DISPOSABLE""","""UNSPECIFIED""","""1400.00""","""NONE""",,,,"""27200000S1~PACK VITRECTOMY 25GA BAUSCH + LOMB ELITE POSTERIOR WIDE FIELD STERILE LATEX FREE DISPOSABLE~ITM921070""",
"""010001""","""NAPHCARE [308024]""","""1100000001""","""1100000001""",,"""HC ACUTE CARE PRIVATE ROOM DAILY""","""UNSPECIFIED""","""1736.00""","""NONE""",,,,"""1100000001~HC ACUTE CARE PRIVATE ROOM DAILY~1100000001""",
"""010001""","""BLUE ADVANTAGE [308003]""","""27200000S1""","""ITM921110""",,"""SET BONE BIOPSY 15CM 30D OSTEO-SITE NEEDLE SIDE BEVEL DIAMOND TIP STYLET RADIOLUCENT HANDLE STAINLESS STEEL VERTEBROPLASTY 11GA STERILE DISPOSABLE""","""UNSPECIFIED""","""284.29""","""NONE""",,,,"""27200000S1~SET BONE BIOPSY 15CM 30D OSTEO-SITE NEEDLE SIDE BEVEL DIAMOND TIP STYLET RADIOLUCENT HANDLE STAINLESS STEEL VERTEBROPLASTY 11GA STERILE DISPOSABLE~ITM921110""",
"""010001""","""WELLCARE MEDICARE [450023]""","""27200000S1""","""ITM921135""",,"""KIT VITRECTOMY VISCOUS FLUID EXTRACTION DISPOSABLE STELLARIS PC STERILE LATEX FREE""","""UNSPECIFIED""","""285.00""","""NONE""",,,,"""27200000S1~KIT VITRECTOMY VISCOUS FLUID EXTRACTION DISPOSABLE STELLARIS PC STERILE LATEX FREE~ITM921135""",


In [None]:
def extract_class(df):
    '''
    Add a 'cpt_codes' list column holding the CPT codes found in 'code'.

    A row gets codes only when 'code' loosely looks like CPT (a
    `proc_codes` match or a CPT/HCPCS marker) and does not contain a CPT
    code range; every other row gets null.

    NOTE(review): proc_codes and proc_code_ranges are read from the
    enclosing scope and are not defined in the cells shown here.
    '''

    def in_column(colnames, items):
        '''Does the column (or the space-joined columns) contain any of the items?'''
        # A bare string is a single column name. Taking len() of it would
        # count characters and send 'code' down the multi-column branch.
        if isinstance(colnames, str):
            colnames = [colnames]
        if not colnames:
            raise ValueError('in_column needs at least one column name')
        if len(colnames) == 1:
            return col(colnames[0]).str.contains(list_to_regexp(items))
        return pl.concat_str(list(colnames), ' ').str.contains(list_to_regexp(items))

    def is_column(colname, items):
        '''Is the column exactly equal to any of the items?'''
        # Not called yet; kept for the exact-matching draft below.
        return col(colname).str.strip().str.contains('|'.join(f'^{item}$' for item in items))

    # Loose matching
    contains_cpt = in_column('code', proc_codes + ['HCPCS', 'HCPC', 'CPT', r'CPT\w{5}', r'HCPCS?\w{5}'])
    contains_cpt_range = in_column('code', proc_code_ranges) & (~in_column('code', ['NDC']))

    # `cpt_cond` was used below while its only definition was commented
    # out. This is that definition minus the NDC term.
    # NOTE(review): restore `& (~contains_ndc)` once contains_ndc is enabled.
    cpt_cond = contains_cpt & (~contains_cpt_range)

    # Draft conditions, not enabled yet:
#     contains_ndc = in_column(['code', 'description', 'internal_revenue_code'], ndc_codes + ['NDC'])
#     contains_drg = in_column(['code', 'internal_revenue_code', 'description'], drg_codes)
#     contains_cdm = in_column('code', ['CDM\d+'])

#     # Exact matching
#     # If the column `code` is exactly equal to a 5-digit alphanumeric, then it's also a CPT
#     equals_cpt = is_column(['code', 'internal_revenue_code'], proc_codes + ['[A-Z]\d{4}', '\d{4}[A-Z]'])
#     equals_drg = is_column('code', ['\d{1,3}']) # iffy
#     equals_ndc = is_column('code', ['(0+)?\d{10}']) # iffy

#     cpt_cond = contains_cpt & (~contains_cpt_range) & (~contains_ndc)
#     cpt_ndc_cond = contains_cpt & (~contains_cpt_range) & contains_ndc
#     ndc_cond = contains_ndc
#     cpt_range_cond = contains_cpt_range
#     drg_cond = contains_drg

    df_ = df.with_columns([
        # list_to_regexp adds the word boundaries the local to_extract helper used to add.
        pl.when(cpt_cond).then(col('code').str.extract_all(list_to_regexp(proc_codes))).otherwise(None).alias('cpt_codes')
    ])

    return df_