In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_pickle('./data/pickle/preproc/df_patient_admit_icu__20210119.pkl')

In [3]:
import re
from nltk.corpus import stopwords
stopWords = stopwords.words('english')
stopWords.extend(['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z'])
stopWords = set(stopWords)

def remove_stopwords(diagnosis_list):
    return [d for d in diagnosis_list if d not in stopWords]

def cleanup_raw_diagnoses(series):
    series = series.str.strip()
    
    cleaned = []
    regex_split = re.compile(r'[\|/;|,]')
    regex_sub1 = re.compile(r"[\|/\.-]+")
    for ix,diagnosis in enumerate(series):
        if pd.isna(diagnosis):
            diagnosis = ''
        
        diagnosis = diagnosis.replace('\\',' ')
        diagnosis = diagnosis.replace("'",' ')
        diagnosis_list = regex_split.split(diagnosis)
        diagnosis_list = [d.strip() for d in diagnosis_list]
        diagnosis_list = remove_stopwords(diagnosis_list)
        
        diagnosis = ' '.join(diagnosis_list)
        diagnosis = regex_sub1.sub(' ', diagnosis)
        cleaned.append(diagnosis)
        
    return cleaned

In [4]:
cleaned = cleanup_raw_diagnoses(df['DIAGNOSIS'])

In [5]:
from nltk import word_tokenize

def diagnosis_tokenizer(diagnosis):
    return word_tokenize(diagnosis)

In [6]:
tokens_list = []
for diagn in cleaned:
    tokens_list.extend(diagnosis_tokenizer(diagn))

In [7]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline

pipe = Pipeline([('count', CountVectorizer(max_features=3000)),
                 ('tfid', TfidfTransformer(sublinear_tf=True))])

pipe.fit(tokens_list)

Pipeline(steps=[('count', CountVectorizer(max_features=3000)),
                ('tfid', TfidfTransformer(sublinear_tf=True))])

In [8]:
tfidf_names=pipe.named_steps['count'].get_feature_names()

In [9]:
pipe['tfid'].idf_

array([11.69512168, 11.98280375, 11.98280375, ..., 12.38826886,
       11.98280375, 11.0019745 ])

In [11]:
tfidf_vals = pipe['tfid'].idf_
sorted_tfids = sorted(zip(tfidf_names,tfidf_vals),key=lambda x: -x[1])

In [12]:
vec = CountVectorizer(max_features=3000)
vec_fit = vec.fit_transform(tokens_list)

In [13]:
count_names = vec.get_feature_names();    
count_vals = np.asarray(vec_fit.sum(axis=0))[0]
sorted_counts = sorted(zip(count_names,count_vals),key=lambda x: -x[1])

In [None]:
from bokeh.plotting import ColumnDataSource, figure, output_notebook,show
from bokeh.layouts import row, column
from bokeh.models import CustomJS, Slider
import numpy as np
output_notebook()

In [None]:
i=7
ix1 = 0
ix2 = 100

foo = [w[0] for w in sorted_counts[ix1:ix2]]
bar = [c[1] for c in sorted_counts[ix1:ix2]]

p1 = figure(x_range=foo,background_fill_color="#fafafa")
p1.vbar(x=foo, top=bar, width=0.9)
p1.xaxis.major_label_orientation = 'vertical'

baz = [w[0] for w in sorted_tfids[ix1:ix2]]
bam = [c[1] for c in sorted_tfids[ix1:ix2]]

p2 = figure(x_range=baz,background_fill_color="#fafafa")
p2.vbar(x=baz, top=bam, width=0.9)
p2.xaxis.major_label_orientation = 'vertical'

show(row(p1,p2))

In [None]:
print(foo)
print(baz)

In [None]:
df.columns

In [14]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from customTransformers import ColumnSelectTransformer, DiagnosisFrameTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold

count_pipe = Pipeline([
    ('cst', ColumnSelectTransformer(['DIAGNOSIS'])),
    ('dst', DiagnosisFrameTransformer()),
    ('count',CountVectorizer(max_features=3000)),
    ('rfc', RandomForestClassifier())
])

tfidf_pipe = Pipeline([
    ('cst', ColumnSelectTransformer(['DIAGNOSIS'])),
    ('dst', DiagnosisFrameTransformer()),
    ('count',CountVectorizer(max_features=3000)),
    ('tfid', TfidfTransformer()),
    ('rfc', RandomForestClassifier())
])

params = {
    'rfc__criterion': ['gini','entropy'],
    'rfc__max_features': ['auto', 'sqrt', 'log2'],
    'rfc__min_samples_split': [2,5,8,11],
    'rfc__min_samples_leaf': [1,4,7],
    'rfc__class_weight': ['balanced', 'balanced_subsample'],
}

In [23]:
gs_regressor = GridSearchCV(count_pipe, params, cv=KFold(n_splits=5, shuffle=True),verbose=3,n_jobs=-1)
count_est = gs_regressor.fit(df,df['SAMEDAY_ADM_TO_ICU'])

Fitting 5 folds for each of 144 candidates, totalling 720 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  96 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 256 tasks      | elapsed:  6.2min
[Parallel(n_jobs=-1)]: Done 480 tasks      | elapsed: 10.9min
[Parallel(n_jobs=-1)]: Done 720 out of 720 | elapsed: 16.5min finished


In [15]:
params = {
    'rfc__criterion': ['entropy'],
    'rfc__max_features': ['auto'],
    'rfc__min_samples_split': [2],
    'rfc__min_samples_leaf': [1],
    'rfc__class_weight': ['balanced_subsample'],
}

In [16]:
gs_regressor = GridSearchCV(tfidf_pipe, params, cv=KFold(n_splits=5, shuffle=True),verbose=3,n_jobs=-1)
tfidf_est = gs_regressor.fit(df,df['SAMEDAY_ADM_TO_ICU'])

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:   43.2s remaining:  1.1min
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   43.7s finished


In [25]:
count_est.best_params_,count_est.best_score_

({'rfc__class_weight': 'balanced_subsample',
  'rfc__criterion': 'entropy',
  'rfc__max_features': 'auto',
  'rfc__min_samples_leaf': 1,
  'rfc__min_samples_split': 2},
 0.730375594554699)

In [17]:
tfidf_est.best_params_,tfidf_est.best_score_

({'rfc__class_weight': 'balanced_subsample',
  'rfc__criterion': 'entropy',
  'rfc__max_features': 'auto',
  'rfc__min_samples_leaf': 1,
  'rfc__min_samples_split': 2},
 0.7362965392816139)

In [18]:
from sklearn.metrics import confusion_matrix
y_pred = tfidf_est.predict(df)

In [19]:
confusion_matrix(df['SAMEDAY_ADM_TO_ICU'],y_pred)

array([[10847,  3264],
       [ 8951, 37908]])