In [4]:
"""
TPOT with word2vec/tensorflow
https://towardsdatascience.com/learn-word2vec-by-implementing-it-in-tensorflow-45641adaf2ac
https://github.com/Madhu009/Deep-math-machine-learning.ai/blob/master/NLP/Word2Vec-largedata.ipynb
"""

'\nTPOT with word2vec/tensorflow\nhttps://towardsdatascience.com/learn-word2vec-by-implementing-it-in-tensorflow-45641adaf2ac\nhttps://github.com/Madhu009/Deep-math-machine-learning.ai/blob/master/NLP/Word2Vec-largedata.ipynb\n'

In [5]:
"""
installs
"""
!pip install tpot
!pip install ipywidgets
!pip install tensorflow-gpu
!pip install flair



In [6]:
"""
imports
"""
from tpot import TPOTClassifier
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import os
import re
import nltk
import sys
from collections import OrderedDict
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
%matplotlib inline
import cudf
from cudf import DataFrame
import flair

In [10]:
"""
global variables
"""
#mimic_file="/rapids/notebooks/hostfs/MIMIC-data/mimic-iii-clinical-database-1.4/mimic-unstructured.txt"
input_file='/rapids/notebooks/hostfs/Pipeline/jason_mimc-554_new.csv'
path=os.getcwd()
data=pd.read_csv(input_file)
cudata=DataFrame.from_pandas(data)

print (data.columns)

Index(['Unnamed: 0', 'Unnamed: 0.1', 'index', 'record_id', 'mass_unit',
       'volume_unit', 'time_unit', 'route_type', 'form_unit',
       'frequency_indicator', 'measurement_unit', 'measurement',
       'duration_pattern', 'clock', 'drug_cleaned', 'drug', 'diagnosis',
       'admission_date', 'discharge_date', 'temperature', 'blood_pressure',
       'temperature_0', 'temperature_1', 'temperature_2', 'length_of_stay',
       'systolic_0', 'diastolic_0', 'systolic_1', 'diastolic_1', 'systolic_2',
       'diastolic_2', 'los_code', 'identified_entities', 'record_text'],
      dtype='object')


In [7]:
"""
create corpus from record text
"""
mimic_data=open(mimic_file).readlines()
words=[]
corpus_raw=[]
#for row in data['record_text']:
#    corpus_raw.append(str(row))

for word in mimic_data:
    if word != '.': # because we don't want to treat . as a word
        words.append(word)
words = set(words) # so that all duplicate words are removed
print (len(words))

NameError: name 'mimic_file' is not defined

In [11]:
"""
Build cuda dataframe used for analysis
"""

df=pd.DataFrame()
target=[]
#convert string to int
for n in cudata['length_of_stay']:
    n=re.sub("[^0-9]", "", str(n))
    target.append(n)


df['target']=target
df['ind_variables']=data['identified_entities']
df['diagnosis']=data['diagnosis']

In [8]:
"""
drop unecessary columns
"""

##drop unecessary columns
#data.drop('name', inplace=True, axis=1)
#data.drop('purchase_date',inplace=True, axis=1)
#data.drop('address',inplace=True, axis=1)
#data.drop('sys_tmstmp',inplace=True, axis=1)
#data.drop('provider_name',inplace=True, axis=1)
#data.drop('zip_code',inplace=True, axis=1)
print (df.columns)

Index(['target', 'ind_variables', 'diagnosis'], dtype='object')


In [9]:
"""
convert categorical variables and one-hot encode
start with just principal diagnosis
"""

##convert categorical variables and one-hot encode
#data['condition']=pd.Categorical(data['condition'])
df['ind_variables']=pd.Categorical(df['ind_variables'])
df['diagnosis']=pd.Categorical(df['diagnosis'])
df1 = pd.get_dummies(df['ind_variables'], prefix = 'ind_variables')
df2 = pd.get_dummies(df['diagnosis'], prefix = 'diagnosis')
df= pd.concat([df, df1,df2], axis=1)
print (df.columns)

AttributeError: 'bool' object has no attribute 'any'

In [11]:
"""
drop original categorical variables that have just been one-hot encoded
"""
df.drop('ind_variables', inplace=True, axis=1)
df.drop('diagnosis',inplace=True, axis=1)
#data.drop('state',inplace=True, axis=1)
#data.drop('condition',inplace=True, axis=1)

In [13]:
"""
define target variable from variable in source data (Length of Stay) then drop
from dataframe in preparation for model fitting
"""
target=df['target']
df.drop('target',inplace=True, axis=1)
#target=data.los.astype(np.float64)
#data.drop('los',inplace=True, axis=1)
#data.drop('length_of_stay')

In [14]:
##tpot
X_train, X_test, y_train, y_test = train_test_split(df.astype(np.float64),
    target, train_size=0.75, test_size=0.25)

In [17]:
X_train.head()

Unnamed: 0,"ind_variables_[['100,25,57578,158315.0', '2119-5-12', '2119-5-18', 'Percocet', 'Lisinopril', 'pain', 'patch', 'Illness', '2', 'years', 'pain', 'Pain', 'Pain', 'nausea', 'vomiting', 'pain', 'Pain', 'dilaudid', 'pain', 'hypothyroidism', 'pain', 'week', '6', 'months', 'obesity', 'obesity', '100%', 'edema', 'pain', 'Dilaudid', 'patch', 'pain', 'Dilaudid', 'TPN', 'nausea', 'Roxicet', 'pain', 'day', 'in 2', 'weeks', 'synthroid', 'prn', '137', 'mcg', 'PO', 'Acetaminophen', 'mL', 'PO', 'pain', 'Prevacid', '30', 'mg', 'day', 'mL', 'ml', 'PO', 'day', 'ml', 'PO', 'day', '1,000', 'unit', 'PO', 'day', 'Activity', 'pain', 'pain', 'cough', 'wheezing', 'vomiting', 'vomiting', 'diarrhea', 'dehydration', 'fever', 'Activity', '6', 'weeks', '10', 'days', '6', 'weeks', 'pain']]","ind_variables_[['101,26,2961,130443.0', '2109-3-14', '2109-4-3', 'ILLNESS', 'two', 'weeks', 'pain', 'day', '101', 'pain', '100', 'oxygen', '96%', 'cardiomegaly', 'pneumothorax', 'levofloxacin', 'cough', 'pain', 'pain', 'pain', 'diarrhea', 'weakness', 'PENICILLIN', 'SINEMET', 'Zestril', '5', 'mg', 'by mouth', 'day', 'Zoloft', '25', 'mg', 'by mouth', 'day', 'one', 'tablet', 'by mouth', 'day', 'year', '98.8', 'oxygen', '95%', 'cough', 'second', 'cyanosis', 'edema', 'nitrogen', 'days', 'levofloxacin', 'days', 'oxygen', 'atelectasis', 'day', 'atelectasis', 'second', 'ceftazidime', 'hemoptysis', 'oxygen', '88%', 'nasal', '100%', 'oxygen', '96%', 'Unit', 'oxygen', 'ceftazidime', 'metronidazole', 'days', 'oxygen', 'hypotension', 'Unit', 'day', 'day', 'PAIN', 'pain', 'pain', 'Two', 'days', 'pain', 'dose', 'lisinopril', 'lisinopril', '5', 'mg', 'by mouth', 'day', 'subcutaneous', 'nicotine', 'patch', 'Lisinopril', '5', 'mg', 'by mouth', 'day', '25', 'mg', 'by mouth', 'day', 'one', 'tablet', 'by mouth', 'day', 'tablets', 'one', 'tablet', 'by mouth', 'day', 'Nicotine', 'patch', 'day', 'Subcutaneous', '5000', 'units', 'injection', 'tablets', 'one', 'tablet', 'by mouth', 'day', 'one', 'tablet', 'by mouth', 'day', '12.5', 'mg', 'by mouth', 'hour', 'Albuterol', 'Metronidazole', '500', 'mg', '10', 'days', '10', 'days', 'Ceftazidime', '10', 'days', 'Combivent', '4', 'puffs', 'Tylenol', 'tablets', 'two', 'tablets', 'by mouth', 'pain', 'two', 'weeks']]","ind_variables_[['102,27,23194,107478.0', '2157-4-9', '2157-4-18', 'vomiting', 'Illness', 'vomiting', '2', 'days', 'palpitations', 'nausea', 'constipation', 'diarrhea', 'dysuria', 'day', '97%', 'g', 'day', 'day', 'oral', 'protonix', 'day', 'doses', 'valium', 'haldol', 'vomiting', 'vomiting', 'prn', 'Hematemesis', 'vomiting', 'fever', 'pain', 'cough', '2', 'weeks']]","ind_variables_[['103,28,23194,164553.0', '2157-5-11', '2157-5-14', 'Illness', 'pain', 'pain', 'today', 'cough', 'Valium', '10', 'mg', '10', 'mg', 'valium', 'Ativan', '2', 'mg', 'Levofloxacin', 'flagyl', 'day', '100', 'mg', 'valium', 'pain', 'pain', 'doses', '100', 'mg', 'valium', 'seizures', 'alkalosis', 'vomiting', 'acidosis', 'hypokalemia', 'hypomagnesemia', 'u', 'counseling', 'u', 'u', 'u']]","ind_variables_[['104,29,23194,190448.0', '2157-5-31', '2157-6-1', 'nausea', 'vomiting', 'Illness', 'tachycardia', 'nausea', 'vomiting', '2', 'days', '2', 'days', 'hematemesis', 'pain', 'activity', 'day', 'seizures', '40', 'mg', 'valium', 'Anzemet', '12.5', 'mg', 'valium', 'day', 'day', 'HR', '100%', 'nystagmus', 'tremor', 'U', 'Tylenol', 'tachycardia', 'hallucinations', '40', 'mg', 'Valium', 'valium', '120', 'mg', 'valium', 'day', 'unit', 'Acidosis', 'PO', 'Alkalosis', 'vomiting', 'Hematemesis', 'hematemesis', 'pain']]","ind_variables_[['105,30,25995,152664.0', '2128-5-5', '2128-5-7', 'ILLNESS', '60', 'year', 'pain', 'edema', 'weakness', 'day', 'Lamictal', '150', 'mg', 'p.o.', '100', 'mg', 'p.o.', 'Depakote', '500', 'mg', 'p.o.', 'Celexa', '20', 'mg', 'p.o.', 'Decadron', '2', 'mg', 'p.o.', 'week', 'Decadron', '4', 'mg', 'p.o.', 'subcutaneous', '5000', 'units', 'Zantac', '150', 'mg', 'p.o.', '15', 'mg', 'p.o.', 'Dilaudid', '4', 'mg', 'p.o.', 'pain']]","ind_variables_[['106,31,25995,152664.0', '2128-5-5', '2128-5-7', 'ILLNESS', '60', 'year', 'pain', 'edema', 'weakness', 'day', 'Lamictal', '150', 'mg', 'p.o.', '100', 'mg', 'p.o.', 'Depakote', '500', 'mg', 'p.o.', 'Celexa', '20', 'mg', 'p.o.', 'Decadron', '2', 'mg', 'p.o.', 'week', 'Decadron', '4', 'mg', 'p.o.', 'subcutaneous', '5000', 'units', 'Zantac', '150', 'mg', 'p.o.', '15', 'mg', 'p.o.', 'Dilaudid', '4', 'mg', 'p.o.', 'pain']]","ind_variables_[['107,32,25995,123177.0', '2128-5-12', '2128-5-18', 'ILLNESS', '60', 'year', 'headache', 'edema', 'fever', 'Percocet', 'Percocet', '103.4', '97%', 'cyanosis', 'edema', 'intravenous', 'Ceftazidime', 'intravenous', 'four', 'weeks', 'Depakote', 'Ceftazidime', 'Ceftriaxone', 'day', 'Ceftazidime', 'four', 'weeks', 'four', 'weeks', 'Lamictal', '150', 'mg', 'po', 'day', 'Celexa', '20', 'mg', 'po', 'day', 'Zantac', '150', 'mg', 'po', 'Decadron', 'mg', 'po', 'MS Contin', '15', 'mg', 'po', 'intravenous', 'Ceftazidime', 'intravenous', 'Tylenol', 'po', 'prn']]","ind_variables_[['108,33,44872,166389.0', '2196-10-14', '2196-10-18', 'Hydrochlorothiazide', 'Illness', '89', 'year', 'months', '100%', '60', 'mEq', 'L', '100 %', 'edema', 'pneumothorax', 'edema', 'hydrocephalus', 'edema', '89', 'year', 'months', '3%', 'day', '3%', 'hr', 'mEq', 'hour', 'day', '3%', 'Anemia', 'anemia', 'anemia', 'Gabapentin', '0.4', 'mg', '24', 'hr', '24', 'hr', 'PO', 'Ambien', '10', 'mg', 'PO', 'insomnia', 'Lovastatin', '20', 'mg', 'PO', 'day', 'hydrochlorothiazide', 'pill', 'flomax']]","ind_variables_[['109,34,7429,127159.0', '2126-7-29', '2126-8-20', 'ILLNESS', 'year', 'pain', 'Amyloidosis', '30', 'years', 'Prozac', 'day', 'day', '99.4', '94%', 'hematuria', 'unit', '12', 'weeks', 'unit', 'unit', '20', 'days', 'levofloxacin', 'days', 'unit', 'day', 'acetaminophen', '325', 'mg', 'p.o.', 'fluoxetine', '20', 'mg', 'insulin', 'albuterol', '2', 'puffs', 'aspirin', '10', 'mg', 'Diazepam', '5', 'mg', 'p.o.', '100', 'mg', 'p.o.', 'Lovenox', '40', 'mg', 'Prevacid', '30', 'mg', 'p.o.', 'levofloxacin', '500', 'mg', 'p.o.', 'days', 'Lopressor', '25', 'mg', 'p.o.', 'liquid', '10', 'mg', 'p.o.', 'pain', '2', 'weeks', '2', 'weeks']]",...,"diagnosis_['seizures', 'Illness', 'pain', 'pain', 'pain', 'fever', 'tachycardia', 'somnolence', 'Hyperlipidemia', 'fever', 'diarrhea', 'coma', 'cyanosis', 'edema', 'pain', 'papilledema', 'cough', 'tremor', 'encephalopathy', 'seizures', 'ileus', 'hyperlipidemia', 'pain', 'Seizures', 'fever', 'pain', 'Pain', 'bacteremia', 'meningitis', 'bacteremia', 'bacteremia', 'Ileus', 'ileus', 'bacteremia', 'Ileus', 'Activity', 'fever', 'bacteremia', 'pain', 'ileus']","diagnosis_['seizures', 'Illness', 'seizures', 'Seizures', 'seizures', 'nausea', 'vomiting', 'diarrhea', 'encephalopathy', 'pain', 'activity', 'pain', 'alkalosis', 'encephalopathy', 'encephalopathy', 'Asthma', 'bacteremia', 'jaundice', 'pneumothorax', 'tachycardia', 'encephalopathy', 'encephalopathy', 'encephalopathy', 'pneumothorax', 'Seizures', 'seizures', 'seizures', 'encephalopathy', 'seizures', 'seizures', 'encephalopathy', 'pain', 'pain', 'worries', 'pain', 'Hypokalemia', 'hypokalemia', 'Seizures', 'Cachexia', 'Asthma', 'seizures']","diagnosis_['seizures', 'encephalopathy', 'activity']","diagnosis_['sepsis', 'Illness', 'encephalopathy', 'ascites', 'encephalopathy', 'hypoglycemia', 'melena', 'edema', 'nausea', 'vomiting', 'sepsis', 'sepsis', 'Hypothyroidism', 'seizures', 'edema', 'edema', 'ascites', 'Ascites', 'pneumothorax', 'atelectasis', 'ascites', 'weakness', 'hypotension', 'HYPOTENSION', 'hypotension', 'ascites', 'hypotension', 'ascites', 'ascites', 'ileus', 'edema', 'asthma', 'Sepsis', 'nausea', 'vomiting', 'pain']","diagnosis_['sepsis', 'sepsis', 'SEPSIS', 'sepsis', 'Sepsis', 'sepsis', 'ANEMIA', 'anemia', 'Sepsis', 'Anemia', 'pain']","diagnosis_['vomiting', 'Illness', 'vomiting', 'palpitations', 'nausea', 'constipation', 'diarrhea', 'dysuria', 'vomiting', 'vomiting', 'Hematemesis', 'vomiting', 'fever', 'pain', 'cough']","diagnosis_['weakness', 'ILLNESS', 'fever', 'pain', 'cough', 'nausea', 'vomiting', 'diarrhea', 'pain', 'weakness', 'tachycardia', 'cyanosis', 'edema', 'pneumothorax', 'pneumothorax', 'pneumothorax', 'dyspnea', 'tachycardia']","diagnosis_['weakness', 'ILLNESS', 'weakness', 'vomiting', 'headache', 'weakness', 'vomiting', 'edema', 'weakness', 'seizures']","diagnosis_['weakness', 'cough', 'Illness', 'cough', 'weakness', 'constipation', 'dysuria', 'Insomnia', 'tachycardia', 'tachycardia', 'cough', 'sepsis', 'tachycardia', 'hypotension', 'anemia', 'anemia']",diagnosis_[nan]
147,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
197,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
173,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
104,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
tpot = TPOTClassifier(generations=100, population_size=20, verbosity=3)
tpot.fit(X_train, y_train)
print(tpot.score(X_test, y_test))
tpot.export('tpot_los_pipeline.py')

31 operators have been imported by TPOT.


HBox(children=(IntProgress(value=0, description='Optimization Progress', max=2020, style=ProgressStyle(descrip…

_pre_test decorator: _random_mutation_operator: num_test=0 could not convert string to float: .
_pre_test decorator: _random_mutation_operator: num_test=0 could not convert string to float: .
Generation 1 - Current Pareto front scores:
-1	0.11163060731538992	ExtraTreesClassifier(input_matrix, ExtraTreesClassifier__bootstrap=False, ExtraTreesClassifier__criterion=entropy, ExtraTreesClassifier__max_features=0.6500000000000001, ExtraTreesClassifier__min_samples_leaf=6, ExtraTreesClassifier__min_samples_split=6, ExtraTreesClassifier__n_estimators=100)

_pre_test decorator: _random_mutation_operator: num_test=0 The condensed distance matrix must contain only finite values..
_pre_test decorator: _random_mutation_operator: num_test=0 could not convert string to float: .
_pre_test decorator: _random_mutation_operator: num_test=0 feature_names may not contain [, ] or <.
_pre_test decorator: _random_mutation_operator: num_test=1 feature_names may not contain [, ] or <.
_pre_test decorator: _rand

ValueError: could not convert string to float: 