This notebook uses the EPILEPSY_FEATURES conda environment  
  
`conda env create -f epilepsy_features.yml`  
  
`conda activate epilepsy_features`  

In [1]:
import polars as pl

In [2]:
# Load the four input tables (diagnosis codes, medication orders, clinical notes,
# demographics) from their parquet files, in that order.
icds, meds, notes, demo = [
    pl.read_parquet(path)
    for path in (
        'data/icds.parquet',
        'data/meds.parquet',
        'data/notes.parquet',
        'data/demo.parquet',
    )
]

## generate features

### preprocess


#### icd

In [None]:
# Parse the contact timestamp string into a calendar date (the time-of-day part is dropped).
icds = icds.with_columns(
    pl.col('ShiftedContactDTS').str.strptime(pl.Date, format='%Y-%m-%d %H:%M:%S%.f')
)

# Regexes over the ICD code, evaluated in the order used below (first match wins).
ers = r'^G40|^345|^Q04\.?3|^R56\.?9|^742\.?4'
cs = r'^R56'
syn = r'^R55|^780\.?2'

icd_code = pl.col('ICDCD')
icd_group = (
    pl.when(icd_code.str.contains(ers))
    .then(pl.lit("epilepsy and recurrent seizures"))
    .when(icd_code.str.contains(cs))
    .then(pl.lit("convulsions/seizures"))
    .when(icd_code.str.contains(syn))
    .then(pl.lit('syncope'))
    # This label contains both group names, so the substring checks applied when the
    # ICD flags are built count it towards both groups.
    .when(icd_code.str.contains(r"^780\.?39"))
    .then(pl.lit("epilepsy and recurrent seizures + convulsions/seizures"))
    .otherwise(pl.lit("other"))
    .alias("ICD_GROUP")
)

icds = icds.select(
    icd_group,
    icd_code,
    pl.col('BDSPPatientID'),
    pl.col('ShiftedContactDTS'),
)

# One row per diagnosis with a +/- 1 day window around the contact date; notes are later
# matched to diagnoses whose window contains the note date.
contact_date = pl.col('ShiftedContactDTS')
one_day = pl.duration(days=1)

icd = icds.select(
    pl.col('BDSPPatientID').cast(pl.Int64).alias('bdsp_patient_id'),
    (contact_date - one_day).alias('date_lower'),
    (contact_date + one_day).alias('date_upper'),
    pl.col('ICD_GROUP'),
)

#### meds

In [4]:
# Discard orders whose status is 'Canceled'.
meds = meds.filter(pl.col('OrderStatusDSC') != 'Canceled')

# NOTE(review): the Order*DTS columns are not parsed here (an earlier str.strptime step for
# OrderStartDTS / OrderEndDTS / OrderDTS / OrderDiscontinuedDTS was disabled). The later
# comparison against `date_note` presumably relies on these columns already being
# date-typed in the parquet file - confirm.

# Anti-seizure medication names (generic and brand) used as substring patterns.
asms = ['levetiracetam', 'keppra', 'lacosamide', 'vimpat', 'phenobarbital', 'phenytoin', 'dilantin', 'valproic acid', 'valproate', 'depakote', 'depakene', 'zonisamide', 'zonegran', 'perampanel', 'fycompa', 'clobazam', 'onfi', 'clonazepam', 'klonopin', 'diazepam', 'valium', 'lorazepam', 'ativan', 'oxcarbazepine', 'trileptal', 'oxtellar', 'carbamazepine', 'tegretol', 'eslicarbazepine', 'aptiom', 'gabapentin', 'neurontin', 'pregabalin', 'lyrica', 'brivaracetam', 'briviact', 'cannabidiol', 'epidiolex', 'cenobamate', 'xcopri', 'lamotrigine', 'lamictal', 'fosphenytoin', 'propofol', 'midazolam', 'nayzilam', 'ketamine', 'pentobarbital', 'acetazolamide', 'acth', 'epitol', 'equetro', 'carbatrol', 'frisium', 'sympazan', 'epitril', 'rivotril', 'clorazepate', 'tranxene', 'gen xene', 'diamox', 'diastat', 'divalproex sodium', 'ethosuximide', 'zarontin', 'ethotoin', 'ezogabine', 'potiga', 'felbamate', 'felbatol', 'gralise', 'horizant', 'roweepra', 'spritam', 'elepsia xr', 'methsuximide', 'methosuximide', 'celontin', 'luminol', 'luminal', 'epanutin', 'phenytek', 'primidone', 'mysoline', 'rufinamide', 'banzel', 'inovelon', 'stiripentol', 'diacomit', 'tiagabine', 'gabitril', 'topiramate', 'topamax', 'qudexy xr', 'trokendi xr', 'convulex', 'depacon', 'orfiril', 'valporal', 'valprosid', 'vigabatrin', 'sabril', 'vigadrone']

# Lowercase the description, then keep only orders whose description contains at least one
# of the names above (the names are joined into a single alternation regex).
medication = pl.col('MedicationDSC')
asm_pattern = '|'.join(asms)
meds = (
    meds
    .with_columns(medication.str.to_lowercase())
    .filter(medication.str.contains(asm_pattern))
)

def group_meds(meds: pl.DataFrame, col: str) -> pl.DataFrame:
    """Replace free-text medication descriptions in ``col`` with canonical generic names.

    The column is lowercased and every value is tested against the regexes in
    ``med_mappings`` *in insertion order*; the first pattern that matches decides the
    replacement. Values that match no pattern are kept (lowercased) as they are, and
    nulls stay null.

    All patterns are evaluated against the original lowercased text in a single
    ``when/then`` chain. Applying the rules one after another to the already rewritten
    column (as an earlier version did) let a later, broader rule overwrite a more
    specific result: ``'phentermine/topiramate'`` was turned into ``'topiramate'`` and
    ``'propantheline/phenobarbital'`` into ``'phenobarbital'``.

    Args:
        meds: Frame containing the medication description column.
        col: Name of the string column to normalise.

    Returns:
        ``meds`` with ``col`` replaced by the normalised names.
    """
    # Ordered mapping: regex -> canonical name. ORDER MATTERS (first match wins):
    #   * combination products come before their individual ingredients;
    #   * 'esketamine' comes before 'ketamine', and 'fosphenytoin' / 'mephenytoin' come
    #     before 'phenytoin', because the shorter name is a substring of the longer one.
    # NOTE(review): 'pentobarbital' is mapped to 'phenobarbital' below; kept as in the
    # original mapping - confirm this grouping is intended.
    med_mappings = {
        r'phentermine.*topiramate|topiramate.*phentermine|qsymia': 'phentermine/topiramate',
        r'phenobarbital.*propantheline|propantheline.*phenobarbital': 'propantheline/phenobarbital',
        'acetazolamide|diamox': 'acetazolamide',
        'eslicarbazepine|aptiom': 'eslicarbazepine',
        'lorazepam|ativan': 'lorazepam',
        'rufinamide|banzel': 'rufinamide',
        'brivaracetam|briviact': 'brivaracetam',
        'cannabidiol|epidiolex': 'cannabidiol',
        'carbamazepine|carbatrol|epitol|equetro|tegretol': 'carbamazepine',
        'methsuximide|celontin': 'methsuximide',
        'cenobamate|xcopri': 'cenobamate',
        'clobazam|onfi|sympazan': 'clobazam',
        'clonazepam|klonopin': 'clonazepam',
        'clorazepate|tranxene': 'clorazepate',
        'valproic acid|depacon|depakene|depakote|divalproex|stavzor|valproate': 'valproic acid',
        'diazepam|diastat|valium|valtoco': 'diazepam',
        'fosphenytoin': 'fosphenytoin',
        'mephenytoin': 'mephenytoin',
        'phenytoin|dilantin|phenytek': 'phenytoin',
        'levetiracetam|elepsia|keppra|roweepra|spritam': 'levetiracetam',
        'esketamine|spravato': 'esketamine',
        'ethosuximide|zarontin': 'ethosuximide',
        'ezogabine|potiga': 'ezogabine',
        'gabapentin|fanatrex|gabacaine|gralise|horizant|neurontin|smartrx': 'gabapentin',
        'felbamate|felbatol': 'felbamate',
        'perampanel|fycompa': 'perampanel',
        'tiagabine|gabitril': 'tiagabine',
        'midazolam|nayzilam': 'midazolam',
        'ketamine|ketalar': 'ketamine',
        'lacosamide|vimpat': 'lacosamide',
        'lamotrigine|lamictal|subvenite': 'lamotrigine',
        'phenobarbital|luminal': 'phenobarbital',
        'pregabalin|lyrica': 'pregabalin',
        'primidone|mysoline': 'primidone',
        'oxcarbazepine|oxtellar|trileptal': 'oxcarbazepine',
        'ethotoin|peganone': 'ethotoin',
        'phenobarbital|pentobarbital': 'phenobarbital',
        'propofol': 'propofol',
        'topiramate|qudexy|topamax|topiragen|trokendi': 'topiramate',
        'vigabatrin|sabril': 'vigabatrin',
        'zonisamide|zonegran': 'zonisamide'
    }

    # Every pattern is matched against this same lowercased text, never against an
    # already-replaced value.
    lowered = pl.col(col).str.to_lowercase()

    # Build one when(...).then(...).when(...).then(...) chain; polars evaluates the
    # branches in order and takes the first whose condition is true.
    chain = None
    for pattern, replacement in med_mappings.items():
        matches = lowered.str.contains(pattern)
        chain = pl.when(matches) if chain is None else chain.when(matches)
        chain = chain.then(pl.lit(replacement))

    # Anything unmatched keeps its lowercased description.
    return meds.with_columns(chain.otherwise(lowered).alias(col))
    
# Rewrite each retained medication description that matches a known pattern to its
# mapped generic name; descriptions matching no pattern are left as they are.
meds = group_meds(meds, 'MedicationDSC')


In [5]:
# Prepare meds dataframe
meds = meds.with_columns(
    pl.col('BDSPPatientID').cast(pl.Int64)
).rename({'BDSPPatientID':'bdsp_patient_id'})


#### notes

In [6]:
# Text clean-up applied to every note: drop everything except letters, digits, spaces,
# newlines and full stops; collapse whitespace runs to one space; trim; lowercase.
cleaned_note = (
    pl.col('note')
    .str.replace_all(r'[^a-zA-Z0-9 \n\.]', '')
    .str.replace_all(r'\s+', ' ')
    .str.strip_chars()
    .str.to_lowercase()
)

# Normalise key dtypes and clean the text in one pass (the three expressions touch
# different columns, so they are independent of each other).
notes = notes.with_columns(
    pl.col('bdsp_patient_id').cast(pl.Int64),
    pl.col('date_note').cast(pl.Date),
    cleaned_note,
)

# Collapse to one row per patient and day: note texts are joined with a single space,
# every other non-key column is collected into a list.
notes = notes.group_by('bdsp_patient_id', 'date_note').agg(
    pl.all().exclude('note'),
    pl.col('note').str.concat(delimiter=' ').alias('note'),
)

### combine

#### icd

In [7]:
# Pair every note with the same patient's diagnoses, keeping only pairs where the note
# date lies inside the diagnosis window [date_lower, date_upper]. Notes without any
# diagnosis in range drop out here and are restored by the left join further down.
note_day = pl.col('date_note')
inside_window = (note_day >= pl.col('date_lower')) & (note_day <= pl.col('date_upper'))

joined = notes.join(icd, on='bdsp_patient_id', how='left').filter(inside_window)

# ICD_GROUP substring -> name of the 0/1 indicator column it produces.
icd_flag_columns = {
    'epilepsy and recurrent seizures': 'epilepsy and recurrent seizures',
    'convulsions/seizures': 'convulsions seizures',
    'syncope': 'syncope',
}

result = joined.with_columns([
    pl.when(pl.col('ICD_GROUP').str.contains(label)).then(1).otherwise(0).alias(flag)
    for label, flag in icd_flag_columns.items()
])

# A flag is set for a (patient, day) if any diagnosis in range carries it.
flag_names = list(icd_flag_columns.values())
daily_flags = result.group_by('bdsp_patient_id', 'date_note').agg(
    [pl.col(flag).max() for flag in flag_names]
)

notes = notes.join(
    daily_flags,
    on=['bdsp_patient_id', 'date_note'],
    how='left',
).with_columns(
    [pl.col(flag).fill_null(0) for flag in flag_names]
    + [
        # do NOT use syncope in n_icds
        (pl.col('epilepsy and recurrent seizures') + pl.col('convulsions seizures'))
        .fill_null(0)
        .alias('n_icds')
    ]
)

#### meds

In [8]:
# Join meds with notes and filter
df_with_meds = notes.join(
    meds,
    on='bdsp_patient_id',
    how='left'
).filter(
    (pl.col('date_note') >= pl.col('OrderStartDTS')) & 
    ((pl.col('OrderEndDTS').is_null()) | (pl.col('date_note') <= pl.col('OrderEndDTS')))
)

df_with_meds = df_with_meds.to_dummies('MedicationDSC')
df_with_meds = df_with_meds.group_by('bdsp_patient_id', 'date_note').agg(
    [pl.col(x).max().alias(x.replace('MedicationDSC_', '')) for x in df_with_meds.columns if x.startswith('MedicationDSC_')]
).with_columns(
    pl.sum_horizontal(pl.col('*').exclude('bdsp_patient_id', 'date_note')).alias('n_meds')
)
notes = notes.join(df_with_meds, on=['bdsp_patient_id', 'date_note'], how='left').fill_null(0)
notes = notes.with_columns([
    pl.lit(0).alias(x) for x in asms if x not in notes.columns
]).rename({
    'acetazolamide':'Acetazolamide',
    'brivaracetam':'Brivaracetam',
    'cannabidiol':'Cannabidiol',
    'carbamazepine':'carbamezapine'
})



#### notes

In [9]:
# Bag-of-words features that, per the variable name, count as evidence AGAINST epilepsy.
# Key = feature name, value = set of tokens. `process_note` sets the feature to 1 when
# every token of the set occurs (in any order) among the stemmed words of one sentence,
# so the tokens are expected to be written as stems.
# NOTE(review): the entries below are left untouched because they define the feature
# schema consumed downstream, but several look like they can never fire - confirm:
#   * 'depression': {'dispress'}      - possibly a typo for the stem 'depress'.
#   * 'meningioma': {'me ningioma'}   - contains a space; a single token presumably never does.
#   * 'pcp': {..., 'primary', ...}    - 'primary' is not stemmed like its neighbours.
#   * '"seizures"': {"''", 'seizur'}  - the note clean-up strips every character outside
#                                       [a-zA-Z0-9 \n.], so a quote token looks unreachable.
antiEpilepsyBagOfWords = {'evid': {'not', 'evid', 'diagnosi', 'epilepsi'},
                        'recommend': {'not', 'recommend', 'antiepilept', 'medic'},
                        'defer sz': {'defer', 'anti', 'seizur'},
                        'defer med': {'defer', 'anti', 'epilept'},
                        'refer': {'referr', 'gener', 'neurolog'},
                        'follow up': {'not', 'requir', 'follow', 'up'},
                        'followup': {'not', 'requir', 'followup'},
                        'cannot': {'cannot', 'event', 'epilept'},
                        'pnes': {'pnes'},
                        'nosz': {'no', 'seizur', 'event'},
                        'unlikely': {'unlik', 'seizur'},
                        'fnd': {'function', 'neurolog', 'disord'},
                        'migraine': {'migrain'},
                        'anxiety': {'anxieti'},
                        'syncope': {'syncop'},
                        'cd': {'convers', 'disord'},
                        'psycho': {'psychogen'},
                        'risk': {'not', 'have', 'seizur', 'risk', 'factor'},
                        'sleep': {'sleep', 'disord'},
                        'apnea': {'sleep', 'apnea'},
                        'test': {'not', 'recommend', 'test'},
                        'suspicion': {'low', 'suspicion', 'seizur'},
                        'tremor': {'physiolog', 'tremor'},
                        '"seizures"': {"''", 'seizur'},
                        'fn': {'function', 'neurolog'},
                        'vasovagal': {'vasovag'},
                        'pcp': {'defer', 'primary', 'care', 'physician'},
                        'definition': {'not', 'meet', 'definit', 'epilepsi'},
                        'support': {'not', 'support', 'diagnosi', 'epilepsi'},
                        'amnesia': {'amnesia'},
                        'provoke': {'provok', 'seizur'},
                        'depression': {'dispress'},
                        'shiver': {'shiver'},
                        'arrest': {'cardiac', 'arrest'},
                        'noanti': {'no', 'anti', 'seizur', 'medic'},
                        'neuropathy': {'neuropathi'},
                        'neuropathic': {'neuropath'},
                        'meningioma': {'me ningioma'},
                        'holdoff': {'hold', 'off', 'start', 'anti', 'epilept'},
                        'diabetes': {'diabet'},
                        'neurosarcoidosis': {'neurosarcoidosi'},
                        'sdh': {'sdh'},
                        'postoper': {'post', 'oper'},
                        'hemorrhage': {'traumat', 'hemorrhag'},
                        'concern': {'low', 'concern', 'seizur'},
                        'noconcern': {'no', 'concern', 'seizur'},
                        'convince': {'not', 'convinc', 'seizur'},
                        'follow': {'not', 'need', 'follow', 'epilepsi'},
                        'notfollowup': {'not', 'need', 'followup'},
                        'start': {'not', 'start', 'antiepilept', 'medic'},
                        'startsz': {'not', 'start', 'antiseizur', 'medic'},
                        'cause': {'unlik', 'epilepsi'},
                        'trauma': {'trauma'},
                        'traumatic': {'traumat'},
                        'hematoma': {'hematoma'},
                        'abscess': {'brain', 'abscess'},
                        'hold': {'hold', 'off', 'medic'},
                        'postop': {'postop'},
                        'single': {'singl', 'seizur'},
                        'singlesz': {'singl', 'sz'},
                        'funcevents': {'function', 'event'},
                        'asneeded': {'follow', 'up', 'as', 'need'},
                        'asneededfollow': {'followup', 'as', 'need'},
                        'referpsy': {'referr', 'psychiatri'},
                        'defermed': {'defer', 'medic'},
                        'acute': {'acut', 'symptomat', 'seizur'},
                        'symptomatic': {'symptomat', 'seizur'},
                        'first': {'first', 'time', 'seizur'},
                        'lifetime': {'one', 'lifetim', 'seizur'},
                        'evidence': {'no', 'evid', 'seizur'},
                        'meet': {'not', 'meet', 'epilepsi'},
                        'notneedmedic': {'not', 'need', 'medic'},
                        'jacobsen': {'jacobsen', 'syndrom'},
                        'alcohol': {'excess', 'alcohol'},
                        'exam': {'normal', 'neurolog', 'exam'},
                        'mri': {'normal', 'mri'},
                        'eeg':{'normal', 'eeg'},
                        'eprisk': {'no', 'epilepsi', 'risk'},
                        'factors': {'no', 'epilepsi', 'risk', 'factor'},
                        'epileptiform': {'no', 'epileptiform', 'abnorm'},
                        'psychiatric': {'psychiatr'},
                        'fentanyl': {'fentanyl'},
                        'bipolar': {'bipolar'},
                        'not have': {'not', 'have', 'epilepsi'},
                        'bite': {'no', 'bite'},
                        'incontinence': {'no', 'incontin'},
                        'lowthres': {'low', 'seizur', 'threshold'},
                        'lowerthres': {'lower', 'seizur', 'threshold'},
                        'antisz': {'no', 'antiseizur', 'medic'},
                        'had': {'not', 'had', 'seizur'},
                        'nonepileptic': {'nonepilept'},
                        'chemo': {'chemo'},
                        'chemotherapy': {'chemotherapi'},
                        'epileptogenic': {'no', 'epileptogen', 'abnorm'},
                        'numb': {'numb'},
                        'surgery': {'surgeri'},
                        'discharge': {'discharg', 'epilepsi', 'clinic'},
                        'nonepileptiform': {'nonepileptiform'},
                        'non epileptiform': {'non', 'epileptiform'},
                        'not epileptic': {'not', 'epilept'},
                        'dementia': {'dementia'},
                        'think': {'not', 'think', 'epilepsi'},
                        'diagnose': {'no', 'diagnosi', 'epilepsi'},
                        'tingling': {'tingl'},
                        'activity': {'not', 'epileptiform', 'activ'},
                        'noseizure': {'no', 'seizur'},
                        'withdrawal': {'withdraw', 'seizur'},
                        'dizzy': {'dizzi'},
                        'maintain': {'maintain', 'conscious'},
                        'electrograph': {'no', 'electrograph', 'seizur'},
                        'wean': {'wean', 'off'},
                        'taper': {'taper'},
                        'resect': {'resect'},
                        'second': {'second', 'opinion'},
                        'definite': {'definit', 'diagnosi', 'epilepsi'},
                        'pseudoseizure': {'pseudoseizur'},
                        'cardiology': {'cardiolog'},
                        'againstsz': {'against', 'seizur'},
                        'against': {'against', 'epilepsi'},
                        'ptsd': {'ptsd'},
                        'pneslong': {'psychogen', 'nonepilept', 'seizur'},
                        'presyncope': {'presyncop'},
                        'hypoglycemia': {'hypoglycemia'},
                        'doubt': {'doubt', 'seizur'},
                        'carry': {'not', 'carri', 'diagnosi', 'epilepsi'},
                        'acutesz': {'acut', 'seizur'},
                        'deny': {'deni', 'seizur'},
                        'spell': {'provok', 'spell'},
                        'non epileptic': {'non', 'epilept', 'spell'},
                        'non  epileptic': {'nonepilept', 'spell'},
                        'insomnia': {'insomnia'},
                        'migraine aura': {'migrain', 'aura'},
                        'clinical': {'no', 'clinic', 'seizur'},
                        'criteria': {'not', 'criteria', 'epilepsi'}}

# Bag-of-words features that, per the variable name, count as evidence FOR epilepsy.
# Same structure and matching rule as the dictionary above.
# NOTE(review): possible misspelt stems that would prevent a match - confirm:
#   * 'tonicclonic': {'tonniclon'}            - possibly meant 'tonicclon'.
#   * 'juvenile': {'juvenil', 'epilespi'}     - possibly meant 'epilepsi'.
proEvidences = {'both': {'both', 'epilepsi', 'pnes'},
                'mixed dis': {'mix', 'disord'},
                'ictal': {'ictal'},
            'aura': {'aura'},
            'convulse': {'convuls'},
            'breakthrough': {'breakthrough', 'seizur'},
            'focal': {'focal'},
            'idiopathic': {'idiopath', 'general', 'epilepsi'},
            'history': {'histori', 'seizur'},
            'hx': {'hx', 'seizur'},
            'complex': {'complex', 'seizur'},
            'partial': {'partial', 'seizur'},
            'myoclonic': {'myoclon'},
            'generalized': {'general', 'seizur'},
            'continue': {'continu', 'on'},
            'drive': {'drive', 'month'},
            'szdrive': {'drive', 'seizur'},
            'deja': {'deja', 'vu'},
            'seizurefree': {'seizurefre'},
            'szfree': {'szfree'},
            'seizure free': {'seizur', 'free'},
            'sz free':{'sz', 'free'},
            'frontallobe': {'frontal', 'lobe'},
            'nocturnal': {'nocturn'},
            'febrile': {'febril'},
            'perinatal': {'perinat', 'complic'},
            'control': {'seizur','control'},
            'monotherapy': {'monotherapi'},
            'absence': {'absenc', 'seizur'},
            'dejavu': {'dejavu'},
            'postictal': {'postict', 'confus'},
            'tonicclonic': {'tonniclon'},
            'tonic clonic': {'tonic', 'clonic'},
            'sudden': {'sudden', 'unexpect', 'death'},
            'sudep': {'sudep'},
            'droop': {'facial', 'droop'},
            'intractable': {'intract', 'epilepsi'},
            'daily': {'daili', 'seizur'},
            'decreased': {'decreas', 'seizur'},
            'device': {'devic'},
            'surgical': {'surgic', 'intervent'},
            'reprogram': {'reprogram'},
            'abnormaleeg': {'abnorm', 'eeg'},
            'with': {'with', 'epilepsi'},
            'juvenile': {'juvenil', 'epilespi'},
            'myoclonus': {'myoclonus'},
            'recurrent': {'recurr', 'sz'},
            'recurrents': {'recurr', 'seizur'},
            'noncompliance': {'noncompli'},
            # 'szdisorder': {'seizur', 'disord'},
            'stable': {'seizur', 'stabl'},
            'shoulder': {'disloc', 'shoulder'},
            'narcolepsy': {'narcolepsi'},
            'sleep clinic': {'sleep', 'clinic'}}

# Single-token drug features: `process_note` sets a feature to 1 when the token itself
# appears among a sentence's stemmed words.
# The list repeats 'tegretol', 'epitol', 'topiram' and 'depakot'; `process_note` builds
# its feature dict with dict.fromkeys, which keeps one key per distinct name.
# NOTE(review): 'oxtellar xr' contains a space and 'cocaine' is not written as a stem, so
# both presumably never match a stemmed token - confirm. The list also carries entries
# that are not drug names or not anti-seizure drugs ('ige', 'percocet', 'wellbutrin',
# 'xanax', 'cocaine'); presumably intentional - confirm.
aeds = ['acetazolamid', 'acth',
        'acthar', 'brivaracetam',
        'briviact', 'cannabidiol' , 'epidiolex',
        'carbamazepin', 'cbz', 'epitol', 'tegretol', 'equetro', 'teril',
        'carbatrol', 'tegretol', 'epitol', 'cenobam', 'xcopri',
        'clobazam', 'frisium', 'onfi', 'sympazan', 'clonazepam',
        'epitril', 'klonopin', 'rivotril', 'clorazep', 'tranxen',
        'xene', 'diazepam', 'valium' , 'diamox',
        'diastat', 'divalproex', 'depakot', 'eslicarbazepin', 'aptiom',
        'ethosuximid', 'zarontin', 'ethotoin', 'ezogabin', 'potiga',
        'felbam', 'felbatol', 'gabapentin', 'neurontin', 'gralis',
        'horiz', 'lacosamid', 'vimpat', 'lamotrigin', 'lamict',
        'levetiracetam', 'ltg', 'ige', 'tpm', 'oxc', 'lev', 'keppra', 'roweepra', 'spritam',
        'elepsia', 'lorazepam', 'ativan', 'methsuximid', 'methosuximid',
        'celontin', 'oxcarbazepin', 'trilept', 'oxtellar xr', 'perampanel',
        'fycompa', 'phenobarbit', 'luminol', 'lumin', 'phenytoin',
        'epanutin', 'dilantin', 'phenytek', 'pregabalin', 'lyrica',
        'primidon', 'mysolin', 'rufinamid', 'banzel', 'inovelon', 'percocet',
        'stiripentol', 'diacomit', 'tiagabin', 'gabitril', 'topiram', 'topamax',
        'topiram',  'qudexi', 'trokendi', 'valproat', 'valproic', 'wellbutrin',
        'convulex', 'depacon', 'depaken', 'orfiril', 'valpor', 'valprosid',
        'depakot', 'vigabatrin', 'sabril', 'vigadron', 'zonisamid', 'zonegran', 'xanax', 'cocaine']

In [10]:
# Every note-derived feature name, in output order: anti-epilepsy bags, pro-epilepsy
# bags, then the drug tokens (`aeds` contains a few repeated names).
all_features = list(antiEpilepsyBagOfWords.keys()) + list(proEvidences.keys()) + aeds

In [11]:
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk import SnowballStemmer
import nltk
import ray

# Make sure the NLTK 'punkt' tokenizer data (presumably what sent_tokenize relies on) is
# available locally; download it only when the lookup fails.
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

# Build one English Snowball stemmer and store it in the Ray object store; the returned
# reference, not the stemmer itself, is what gets passed to each process_note task.
stemmer = SnowballStemmer('english')
stemmer_r = ray.put(stemmer)


@ray.remote
def process_note(text, stemmer_r):
    """Turn one note into a {feature name: 0/1} dict covering `all_features`.

    The note is split into sentences and each sentence is reduced to the set of its
    stemmed words. A drug feature is set when its token is in that set; a bag-of-words
    feature is set when all of its tokens are in the set of a single sentence.

    Args:
        text: Cleaned note text.
        stemmer_r: Stemmer exposing ``stem(word)`` (handed over through the Ray object store).

    Returns:
        Dict keyed by every name in `all_features`, with 1 for features found, else 0.
    """
    feature_vector = dict.fromkeys(all_features, 0)
    drug_tokens = set(aeds)

    for sentence in sent_tokenize(text):
        stems = {stemmer_r.stem(token) for token in word_tokenize(sentence)}

        # Drug mentions: any listed token present in this sentence.
        for drug in stems & drug_tokens:
            feature_vector[drug] = 1

        # Phrase features: all tokens of a bag must occur within this one sentence.
        for lexicon in (antiEpilepsyBagOfWords, proEvidences):
            for feature, required in lexicon.items():
                if required <= stems:
                    feature_vector[feature] = 1

    return feature_vector

# Pairs of equivalent features as (kept, absorbed). When the feature matrix is assembled,
# the kept column becomes the row-wise maximum of the two and the absorbed column is
# dropped. Pairs are processed in list order, which matters for chains: 'seizure free'
# first absorbs 'seizurefree' and is then itself absorbed into 'sz free'.
columns_to_join = [
        ('history', 'hx'),
        ('follow up', 'followup'),
        ('sz free', 'szfree'),
        ('seizure free', 'seizurefree'),
        ('sz free', 'seizure free'),
        ('carbamazepin', 'cbz'),
        ('lamotrigin', 'ltg'),
        ('levetiracetam', 'lev'),
        ('oxcarbazepin', 'oxc'),
        ('topamax', 'tpm'),
        ('lowthres', 'lowerthres'),
        ('chemo', 'chemotherapy'),
        ('epileptiform', 'epileptogenic'),
        ('deja', 'dejavu'),
        ('tonic clonic', 'tonicclonic'),
        ('nonepileptiform', 'non epileptiform'),
        ('asneeded', 'asneededfollow'),
        ('recurrent', 'recurrents'),
        ('single', 'singlesz'),
        ('non epileptic', 'non  epileptic')
    ]

2024-09-17 14:43:42,216	INFO worker.py:1786 -- Started a local Ray instance.


In [12]:
# Submit one Ray task per note, then gather the per-note feature dicts (in note order)
# into a frame with one column per feature.
pending = [process_note.remote(note, stemmer_r) for note in notes['note']]
fm = pl.DataFrame(ray.get(pending))

# Merge equivalent features: keep the first column of each pair as the row-wise maximum
# of both and drop the second. Pairs with a missing column are skipped.
for kept, absorbed in columns_to_join:
    present = set(fm.columns)
    if kept in present and absorbed in present:
        merged = pl.max_horizontal(kept, absorbed).alias(kept)
        fm = fm.with_columns(merged).drop(absorbed)

# Suffix every text feature with '_' so it cannot collide with the medication / ICD
# columns that are already in `notes` (e.g. 'clobazam', 'syncope').
fm = fm.rename({name: f"{name}_" for name in fm.columns})

# Rows of `fm` are in the same order as `notes`, so the frames are stacked side by side.
notes = notes.hstack(fm)

In [13]:
# All note features have been collected, so the local Ray instance is shut down.
ray.shutdown()

#### demo

In [14]:
# Distinct (patient, birth date, sex) rows from the demographics table.
demo_lookup = demo.select(['bdsp_patient_id', 'DateOfBirth', 'SexDSC']).unique()

# Age is the difference between the note's calendar year and the birth year (month and
# day are not considered); Sex is 1 for 'Male' and 0 for anything else.
notes = (
    notes
    .join(demo_lookup, on='bdsp_patient_id', how='left')
    .with_columns(
        (pl.col('date_note').dt.year() - pl.col('DateOfBirth').dt.year()).alias('Age'),
        pl.when(pl.col('SexDSC') == 'Male').then(1).otherwise(0).alias('Sex'),
    )
    .drop(['DateOfBirth', 'SexDSC'])
)

#### normalizing columns

In [15]:
# these hard coded values are extracted from the original model's usage of X_train.csv
# `cols` is the exact, ordered list of model input columns. Each name is a complete
# single-line string literal (a literal broken across two lines is a syntax error).
cols = [
    # text features evidencing against epilepsy
    'evid_','recommend_','defer sz_','follow up_','pnes_','nosz_','unlikely_','fnd_','migraine_','anxiety_',
    'syncope_','cd_','psycho_','risk_','sleep_','apnea_','test_','suspicion_','tremor_','fn_',
    'vasovagal_','definition_','support_','amnesia_','provoke_','shiver_','arrest_','noanti_','neuropathy_','neuropathic_',
    'meningioma_','holdoff_','diabetes_','neurosarcoidosis_','sdh_','postoper_','hemorrhage_','concern_','noconcern_','convince_',
    'follow_','notfollowup_','start_','startsz_','cause_','trauma_','traumatic_','hematoma_','abscess_','hold_',
    'postop_','single_','funcevents_','asneeded_','referpsy_','defermed_','acute_','symptomatic_','first_','lifetime_',
    'evidence_','meet_','notneedmedic_','jacobsen_','alcohol_','exam_','mri_','eeg_','eprisk_','factors_',
    'epileptiform_','psychiatric_','fentanyl_','bipolar_','not have_','bite_','incontinence_','lowthres_','antisz_','had_',
    'nonepileptic_','chemo_','numb_','surgery_','discharge_','nonepileptiform_','not epileptic_','dementia_','think_','diagnose_',
    'tingling_','activity_','noseizure_','withdrawal_','dizzy_','maintain_','electrograph_','wean_','taper_','resect_',
    'second_','definite_','pseudoseizure_','cardiology_','againstsz_','against_','ptsd_','pneslong_','presyncope_','hypoglycemia_',
    'doubt_','carry_','acutesz_','deny_','spell_','non epileptic_','insomnia_','migraine aura_','clinical_','criteria_',
    # text features evidencing for epilepsy
    'both_','mixed dis_','ictal_','aura_','convulse_','breakthrough_','focal_','idiopathic_','history_','complex_',
    'partial_','myoclonic_','generalized_','continue_','drive_','szdrive_','deja_','sz free_','frontallobe_','nocturnal_',
    'febrile_','perinatal_','control_','monotherapy_','absence_','postictal_','tonic clonic_','sudden_','sudep_','droop_',
    'intractable_','daily_','decreased_','device_','surgical_','reprogram_','abnormaleeg_','with_','myoclonus_','recurrent_',
    'noncompliance_','stable_','shoulder_','narcolepsy_','sleep clinic_',
    # drug tokens found in the note text
    'acetazolamid_','acth_','acthar_','brivaracetam_','briviact_','cannabidiol_','epidiolex_','carbamazepin_','epitol_','tegretol_',
    'carbatrol_','cenobam_','xcopri_','clobazam_','frisium_','onfi_','clonazepam_','klonopin_','rivotril_','clorazep_',
    'tranxen_','diazepam_','valium_','diamox_','diastat_','divalproex_','depakot_','eslicarbazepin_','aptiom_','ethosuximid_',
    'zarontin_','ezogabin_','potiga_','felbam_','felbatol_','gabapentin_','neurontin_','gralis_','lacosamid_','vimpat_',
    'lamotrigin_','lamict_','levetiracetam_','ige_','keppra_','lorazepam_','ativan_','methsuximid_','celontin_','oxcarbazepin_',
    'trilept_','perampanel_','fycompa_','phenobarbit_','lumin_','phenytoin_','dilantin_','phenytek_','pregabalin_','lyrica_',
    'primidon_','mysolin_','rufinamid_','banzel_','percocet_','stiripentol_','tiagabin_','gabitril_','topiram_','topamax_',
    'qudexi_','trokendi_','valproat_','valproic_','wellbutrin_','depaken_','vigabatrin_','sabril_','zonisamid_','zonegran_',
    'xanax_',
    # demographics and structured medication-order features
    'Age','Sex','n_meds','Acetazolamide','Brivaracetam','Cannabidiol','carbamezapine','cenobamate','clobazam','clonazepam',
    'clorazepate','diazepam','eslicarbazepine','ethosuximide','ezogabine','felbamate','gabapentin','ketamine','lacosamide','lamotrigine',
    'levetiracetam','lorazepam','methsuximide','midazolam','oxcarbazepine','perampanel','phenobarbital','phenytoin','pregabalin','primidone',
    'rufinamide','tiagabine','topiramate','valproic acid','zonisamide',
    # structured ICD features
    'convulsions seizures','epilepsy and recurrent seizures','syncope','n_icds',
]
# (min, max) pair per numeric column, passed to scale_round_normalize as min_val / max_val.
values = {
    'n_icds': (0, 2),
    'n_meds': (0, 10),
    'Age': (18, 121)
}

In [16]:
def scale_round_normalize(col, min_val, max_val):
    """Min-max normalise ``col`` to [0, 1] against the fixed range [min_val, max_val].

    The value is clamped to the reference range, rounded to the nearest whole unit and
    mapped linearly so that ``min_val`` -> 0 and ``max_val`` -> 1. Because only the
    supplied reference range is used, a given input always produces the same output
    regardless of which other rows are in the batch. (Stretching the batch's own
    minimum/maximum onto the reference range instead makes the result depend on the
    batch and divides by zero when the column is constant.)

    Args:
        col: Numeric polars expression or Series (anything offering ``clip`` and
            ``round`` plus arithmetic, e.g. a pandas Series, works as well).
        min_val: Reference minimum, mapped to 0.
        max_val: Reference maximum, mapped to 1.

    Returns:
        Object of the same kind as ``col`` holding values in [0, 1]; nulls stay null.

    Raises:
        ValueError: If ``max_val`` is not strictly greater than ``min_val``.
    """
    if max_val <= min_val:
        raise ValueError(
            f"max_val ({max_val}) must be greater than min_val ({min_val})"
        )

    # Clamp values outside the reference range so the output stays within [0, 1].
    bounded = col.clip(min_val, max_val)

    # Round the values
    rounded = bounded.round()

    # Scale to [0, 1] using the reference range only
    return (rounded - min_val) / (max_val - min_val)

# Normalise each column listed in `values`, casting it to Float64 first and passing the
# column's (min, max) pair from `values` as min_val / max_val.
notes = notes.with_columns([
    scale_round_normalize(
        pl.col(name).cast(pl.Float64),
        min_val=lower,
        max_val=upper,
    )
    for name, (lower, upper) in values.items()
])

In [17]:
# Display the assembled feature table as a visual sanity check.
# NOTE(review): the rendered output shows patient identifiers, note file names and note
# text snippets - consider clearing the cell output before sharing the notebook.
notes

bdsp_patient_id,date_note,filename,type_note,note_source,note,epilepsy and recurrent seizures,convulsions seizures,syncope,n_icds,Acetazolamide,Brivaracetam,Cannabidiol,carbamezapine,cenobamate,clobazam,clonazepam,diazepam,eslicarbazepine,ethosuximide,fosphenytoin,gabapentin,ketamine,lacosamide,lamotrigine,levetiracetam,lorazepam,midazolam,oxcarbazepine,perampanel,phenobarbital,phenytoin,pregabalin,primidone,propofol,rufinamide,topiramate,…,epanutin_,dilantin_,phenytek_,pregabalin_,lyrica_,primidon_,mysolin_,rufinamid_,banzel_,inovelon_,percocet_,stiripentol_,diacomit_,tiagabin_,gabitril_,topiram_,topamax_,qudexi_,trokendi_,valproat_,valproic_,wellbutrin_,convulex_,depacon_,depaken_,orfiril_,valpor_,valprosid_,vigabatrin_,sabril_,vigadron_,zonisamid_,zonegran_,xanax_,cocaine_,Age,Sex
i64,date,list[str],list[str],list[str],str,i32,i32,i32,f64,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,…,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,f64,i32
112481749,2008-02-14,"[""Notes_10155417836_141744777_20080214.txt"", ""Notes_10155417836_141744778_20080214.txt""]","[""HERPES SIMPLEX VIRUS TYPE 1 ISOLATED"", ""RESULT CALLED TO CARE UNIT AND/OR MD""]","[""pre epic"", ""pre epic""]","""microbiology report specimen d…",0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.524272,0
113719267,2017-04-10,"[""Notes_13325445292_1781488376_20170410.txt"", ""Notes_13325445292_1781489987_20170410.txt"", … ""Notes_10172127813_148058178_20170410.txt""]","[""Procedures"", ""PHS IP AVS Additional Pt Instructions"", … ""PAT""]","[""first batch"", ""first batch"", … ""pre epic""]","""gastrointestinal endoscopy uni…",0,0,0,0.0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,1,0,0,0,0,0,0,1,0,0,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.601942,0
117433662,2018-02-18,"[""Notes_13406818575_1906566395_20180218.txt""]","[""Telephone Encounter""]","[""first batch""]","""hello 438pm i contacted patien…",0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.514563,1
114161908,2017-10-31,"[""Notes_13359631561_1638932436_20171031.txt"", ""Notes_13359631561_1638980963_20171031.txt"", … ""Notes_13359631561_1638989136_20171031.txt""]","[""ED Notes"", ""ED Provider Notes"", … ""ED Notes""]","[""first batch"", ""first batch"", … ""first batch""]","""ed nursing progress note patie…",1,1,0,1.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.203883,1
111658738,2016-08-27,"[""Notes_10142181110_148093157_20160827.txt"", ""Notes_10142181110_148093158_20160827.txt""]","[""MRRADCT.TH.CHEST"", ""MRRADCT.AB.ABDPEL""]","[""pre epic"", ""pre epic""]","""ct scan of the chest with intr…",0,0,0,0.0,0,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.68932,0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
118494422,2017-10-18,"[""Notes_13277220282_1380664169_20171018.txt""]","[""Telephone Encounter""]","[""first batch""]","""inr date value ref range statu…",0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.854369,0
122316358,2012-05-08,"[""Notes_10184918974_134607637_20120508.txt"", ""Notes_10184918974_139109730_20120508.txt""]","["" LMR Note"", "" LMR Note""]","[""pre epic"", ""pre epic""]","""poct ua result poct color refe…",0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.330097,0
117856494,2013-05-23,"[""Notes_10165385548_140624833_20130523.txt""]","["" LMR Note""]","[""pre epic""]","""subject orderscanned""",0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.582524,1
120741680,2020-09-23,"[""Notes_13587011204_6606453381_20200923.txt""]","[""Progress Notes""]","[""first batch""]","""hematologyoncology progress no…",0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.815534,0


In [18]:
# Model input matrix: only the columns listed in `cols`, in that order, as Float64.
(
    notes
    .select(cols)
    .cast(pl.Float64)
    .write_parquet('features/fm.parquet')
)

# Full table, including ids, raw text and the features not used by the model.
notes.write_parquet('features/fm_all_cols.parquet')