Import Data:

In [1]:
import pandas as pd
import numpy as np
import datetime
pd.set_option('display.max_columns', 999)
import pandas.io.sql as psql
# plot a figure directly on Notebook
import matplotlib.pyplot as plt
import nltk
nltk.download('punkt')
from sklearn.feature_extraction.text import CountVectorizer
from scipy.sparse import hstack

[nltk_data] Downloading package punkt to /Users/vincent/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
admissions = pd.read_csv("data/ADMISSIONS.csv.gz", compression='gzip')
# diagnoses = pd.read_csv("data/DIAGNOSES_ICD.csv.gz", compression='gzip')
patients = pd.read_csv("data/PATIENTS.csv.gz", compression='gzip')
note_events = pd.read_csv("data/NOTEEVENTS.csv.gz", compression='gzip', low_memory=False)

# admissions = pd.read_csv("demo-data/ADMISSIONS.csv")
# diagnoses = pd.read_csv("demo-data/DIAGNOSES_ICD.csv")
# patients = pd.read_csv("demo-data/PATIENTS.csv")

print("Admissions Table\t" + str(admissions.shape))
print(admissions.columns)
# print("\nDiagnoses Table\t\t" + str(diagnoses.shape))
# print(diagnoses.columns)
print("\nPatients Table\t\t" + str(patients.shape))
print(patients.columns)
print("\nNote Events Table\t\t" + str(note_events.shape))
print(note_events.columns)

Admissions Table	(58976, 19)
Index(['ROW_ID', 'SUBJECT_ID', 'HADM_ID', 'ADMITTIME', 'DISCHTIME',
       'DEATHTIME', 'ADMISSION_TYPE', 'ADMISSION_LOCATION',
       'DISCHARGE_LOCATION', 'INSURANCE', 'LANGUAGE', 'RELIGION',
       'MARITAL_STATUS', 'ETHNICITY', 'EDREGTIME', 'EDOUTTIME', 'DIAGNOSIS',
       'HOSPITAL_EXPIRE_FLAG', 'HAS_CHARTEVENTS_DATA'],
      dtype='object')

Patients Table		(46520, 8)
Index(['ROW_ID', 'SUBJECT_ID', 'GENDER', 'DOB', 'DOD', 'DOD_HOSP', 'DOD_SSN',
       'EXPIRE_FLAG'],
      dtype='object')

Note Events Table		(2083180, 11)
Index(['ROW_ID', 'SUBJECT_ID', 'HADM_ID', 'CHARTDATE', 'CHARTTIME',
       'STORETIME', 'CATEGORY', 'DESCRIPTION', 'CGID', 'ISERROR', 'TEXT'],
      dtype='object')


In [3]:
# # Iterate through table
# # Regular Table: subect_id: the whole row that is their first admssion
# # During iteration, if subject_id is already in the newTableWeAreCreating, then we have seen the subject, we are doing a readmission right now
#     # But, we have to check the time frame, because if its 30 days after the dictionary value, this is a new set of admission to consider
# # first_admission_dataframe = pd.DataFrame(columns=admission_table.columns)
# first_admission_dataframe = pd.DataFrame(columns=admission_table.columns)

# # Iterate over rows
# for index, row in admission_table.iterrows():
#     # check if the subject_id has been admitted before
#     if row['subject_id'] not in first_admission_dataframe['subject_id'].values:
#         # Append the row to the new DataFrame
#         first_admission_dataframe = pd.concat([first_admission_dataframe, pd.DataFrame([row])], 
#                                               axis=0, ignore_index=True)

# print(first_admission_dataframe['admittime'])
# # # Reset the index of the new DataFrame
# # first_admission_dataframe.reset_index(drop=True, inplace=True)

In [4]:
# Merge Tables
# merged = admissions.merge(patients, on="SUBJECT_ID").merge(diagnoses, 
#             left_on=["SUBJECT_ID","HADM_ID"], right_on=["SUBJECT_ID","HADM_ID"])
merged = admissions

#
# PROBLEM: there are multiple entries now for one admission time, each entry represents a different diagnoses
    # Counts for diagnoses for one admittime range from 1 to 57
        # print(merged['ADMITTIME'].value_counts())
# Possible Fixes: merge them all together and make a new column that holds each diagnoses in an array

# Convert time to objects
merged["ADMITTIME"] = pd.to_datetime(merged["ADMITTIME"], format = '%Y-%m-%d %H:%M:%S', errors = 'coerce')
merged["DISCHTIME"] = pd.to_datetime(merged["DISCHTIME"], format = '%Y-%m-%d %H:%M:%S', errors = 'coerce')
merged["DEATHTIME"] = pd.to_datetime(merged["DISCHTIME"], format = '%Y-%m-%d %H:%M:%S', errors = 'coerce')

merged = merged.sort_values(['SUBJECT_ID','ADMITTIME'])
merged = merged.reset_index(drop = True)

merged['NEXT_ADMITTIME'] = merged.groupby('SUBJECT_ID').ADMITTIME.shift(-1)
# get the next admission type
merged['NEXT_ADMISSION_TYPE'] = merged.groupby('SUBJECT_ID').ADMISSION_TYPE.shift(-1)

In [5]:
len(merged['NEXT_ADMITTIME'].unique())

12451

In [6]:
rows = merged.NEXT_ADMISSION_TYPE == 'ELECTIVE'
merged.loc[rows,'NEXT_ADMITTIME'] = pd.NaT
merged.loc[rows,'NEXT_ADMISSION_TYPE'] = np.NaN
len(merged['NEXT_ADMITTIME'].unique())

11048

In [8]:
merged = merged.sort_values(['SUBJECT_ID','ADMITTIME'])
merged[['NEXT_ADMITTIME','NEXT_ADMISSION_TYPE']] = merged.groupby(['SUBJECT_ID'])[['NEXT_ADMITTIME','NEXT_ADMISSION_TYPE']].fillna(method = 'bfill')

In [9]:
merged['NEXT_ADMITTIME'].unique()

<DatetimeArray>
[                'NaT', '2135-05-09 14:11:00', '2135-01-30 20:50:00',
 '2157-10-18 19:34:00', '2191-02-23 05:23:00', '2131-05-12 19:49:00',
 '2119-01-04 18:12:00', '2157-12-02 00:45:00', '2174-01-04 22:21:00',
 '2196-04-14 04:02:00',
 ...
 '2191-08-03 19:11:00', '2191-12-23 01:03:00', '2133-08-01 19:20:00',
 '2125-11-01 00:00:00', '2197-06-16 03:01:00', '2197-07-03 02:24:00',
 '2182-07-03 19:50:00', '2201-05-15 13:12:00', '2157-01-05 17:27:00',
 '2157-02-16 17:31:00']
Length: 11048, dtype: datetime64[ns]

In [10]:
merged['DAYS_NEXT_ADMIT']=(merged.NEXT_ADMITTIME - merged.DISCHTIME).dt.total_seconds()/(24*60*60)
merged['DAYS_STAY'] = (merged.DISCHTIME - merged.ADMITTIME).dt.total_seconds()/(24*60*60)
merged

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,ADMITTIME,DISCHTIME,DEATHTIME,ADMISSION_TYPE,ADMISSION_LOCATION,DISCHARGE_LOCATION,INSURANCE,LANGUAGE,RELIGION,MARITAL_STATUS,ETHNICITY,EDREGTIME,EDOUTTIME,DIAGNOSIS,HOSPITAL_EXPIRE_FLAG,HAS_CHARTEVENTS_DATA,NEXT_ADMITTIME,NEXT_ADMISSION_TYPE,DAYS_NEXT_ADMIT,DAYS_STAY
0,1,2,163353,2138-07-17 19:04:00,2138-07-21 15:48:00,2138-07-21 15:48:00,NEWBORN,PHYS REFERRAL/NORMAL DELI,HOME,Private,,NOT SPECIFIED,,ASIAN,,,NEWBORN,0,1,NaT,,,3.863889
1,2,3,145834,2101-10-20 19:08:00,2101-10-31 13:58:00,2101-10-31 13:58:00,EMERGENCY,EMERGENCY ROOM ADMIT,SNF,Medicare,,CATHOLIC,MARRIED,WHITE,2101-10-20 17:09:00,2101-10-20 19:24:00,HYPOTENSION,0,1,NaT,,,10.784722
2,3,4,185777,2191-03-16 00:28:00,2191-03-23 18:41:00,2191-03-23 18:41:00,EMERGENCY,EMERGENCY ROOM ADMIT,HOME WITH HOME IV PROVIDR,Private,,PROTESTANT QUAKER,SINGLE,WHITE,2191-03-15 13:10:00,2191-03-16 01:10:00,"FEVER,DEHYDRATION,FAILURE TO THRIVE",0,1,NaT,,,7.759028
3,4,5,178980,2103-02-02 04:31:00,2103-02-04 12:15:00,2103-02-04 12:15:00,NEWBORN,PHYS REFERRAL/NORMAL DELI,HOME,Private,,BUDDHIST,,ASIAN,,,NEWBORN,0,1,NaT,,,2.322222
4,5,6,107064,2175-05-30 07:15:00,2175-06-15 16:00:00,2175-06-15 16:00:00,ELECTIVE,PHYS REFERRAL/NORMAL DELI,HOME HEALTH CARE,Medicare,ENGL,NOT SPECIFIED,MARRIED,WHITE,,,CHRONIC RENAL FAILURE/SDA,0,1,NaT,,,16.364583
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58971,58972,99985,176670,2181-01-27 02:47:00,2181-02-12 17:05:00,2181-02-12 17:05:00,EMERGENCY,EMERGENCY ROOM ADMIT,HOME HEALTH CARE,Private,ENGL,JEWISH,MARRIED,WHITE,2181-01-26 23:35:00,2181-01-27 04:18:00,FEVER,0,1,NaT,,,16.595833
58972,58973,99991,151118,2184-12-24 08:30:00,2185-01-05 12:15:00,2185-01-05 12:15:00,ELECTIVE,PHYS REFERRAL/NORMAL DELI,HOME,Private,ENGL,NOT SPECIFIED,MARRIED,WHITE,,,DIVERTICULITIS/SDA,0,1,NaT,,,12.156250
58973,58974,99992,197084,2144-07-25 18:03:00,2144-07-28 17:56:00,2144-07-28 17:56:00,EMERGENCY,CLINIC REFERRAL/PREMATURE,SNF,Medicare,ENGL,CATHOLIC,WIDOWED,WHITE,2144-07-25 13:40:00,2144-07-25 18:50:00,RETROPERITONEAL HEMORRHAGE,0,1,NaT,,,2.995139
58974,58975,99995,137810,2147-02-08 08:00:00,2147-02-11 13:15:00,2147-02-11 13:15:00,ELECTIVE,PHYS REFERRAL/NORMAL DELI,HOME,Medicare,ENGL,NOT SPECIFIED,MARRIED,WHITE,,,ABDOMINAL AORTIC ANEURYSM/SDA,0,1,NaT,,,3.218750


In [11]:
# For all 'CATEGORY' = 'Discharge summary':
    #'DESCRIPTION': ['Addendum', 'Report']        *Do we care if Addendum vs. Report? Addendum seems to have more information than initial Report but also some different info like Service:"" 
    #'CGID': NaN
    #'ISERROR': NaN
    
# could use all categories of notes by concatenating them if we wanted.

discharge_notes = note_events[note_events['CATEGORY']=='Discharge summary']
unique_discharge_notes = discharge_notes.groupby(['SUBJECT_ID','HADM_ID']).nth(-1).reset_index()
unique_discharge_notes.duplicated(['HADM_ID']).sum() == 0

True

In [12]:
merged = merged.merge(unique_discharge_notes[['SUBJECT_ID', 'HADM_ID', 'TEXT']], on=['SUBJECT_ID', 'HADM_ID'], how='left')
merged

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,ADMITTIME,DISCHTIME,DEATHTIME,ADMISSION_TYPE,ADMISSION_LOCATION,DISCHARGE_LOCATION,INSURANCE,LANGUAGE,RELIGION,MARITAL_STATUS,ETHNICITY,EDREGTIME,EDOUTTIME,DIAGNOSIS,HOSPITAL_EXPIRE_FLAG,HAS_CHARTEVENTS_DATA,NEXT_ADMITTIME,NEXT_ADMISSION_TYPE,DAYS_NEXT_ADMIT,DAYS_STAY,TEXT
0,1,2,163353,2138-07-17 19:04:00,2138-07-21 15:48:00,2138-07-21 15:48:00,NEWBORN,PHYS REFERRAL/NORMAL DELI,HOME,Private,,NOT SPECIFIED,,ASIAN,,,NEWBORN,0,1,NaT,,,3.863889,
1,2,3,145834,2101-10-20 19:08:00,2101-10-31 13:58:00,2101-10-31 13:58:00,EMERGENCY,EMERGENCY ROOM ADMIT,SNF,Medicare,,CATHOLIC,MARRIED,WHITE,2101-10-20 17:09:00,2101-10-20 19:24:00,HYPOTENSION,0,1,NaT,,,10.784722,Admission Date: [**2101-10-20**] Discharg...
2,3,4,185777,2191-03-16 00:28:00,2191-03-23 18:41:00,2191-03-23 18:41:00,EMERGENCY,EMERGENCY ROOM ADMIT,HOME WITH HOME IV PROVIDR,Private,,PROTESTANT QUAKER,SINGLE,WHITE,2191-03-15 13:10:00,2191-03-16 01:10:00,"FEVER,DEHYDRATION,FAILURE TO THRIVE",0,1,NaT,,,7.759028,Admission Date: [**2191-3-16**] Discharge...
3,4,5,178980,2103-02-02 04:31:00,2103-02-04 12:15:00,2103-02-04 12:15:00,NEWBORN,PHYS REFERRAL/NORMAL DELI,HOME,Private,,BUDDHIST,,ASIAN,,,NEWBORN,0,1,NaT,,,2.322222,
4,5,6,107064,2175-05-30 07:15:00,2175-06-15 16:00:00,2175-06-15 16:00:00,ELECTIVE,PHYS REFERRAL/NORMAL DELI,HOME HEALTH CARE,Medicare,ENGL,NOT SPECIFIED,MARRIED,WHITE,,,CHRONIC RENAL FAILURE/SDA,0,1,NaT,,,16.364583,Admission Date: [**2175-5-30**] Dischar...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58971,58972,99985,176670,2181-01-27 02:47:00,2181-02-12 17:05:00,2181-02-12 17:05:00,EMERGENCY,EMERGENCY ROOM ADMIT,HOME HEALTH CARE,Private,ENGL,JEWISH,MARRIED,WHITE,2181-01-26 23:35:00,2181-01-27 04:18:00,FEVER,0,1,NaT,,,16.595833,Admission Date: [**2181-1-27**] ...
58972,58973,99991,151118,2184-12-24 08:30:00,2185-01-05 12:15:00,2185-01-05 12:15:00,ELECTIVE,PHYS REFERRAL/NORMAL DELI,HOME,Private,ENGL,NOT SPECIFIED,MARRIED,WHITE,,,DIVERTICULITIS/SDA,0,1,NaT,,,12.156250,Admission Date: [**2184-12-24**] ...
58973,58974,99992,197084,2144-07-25 18:03:00,2144-07-28 17:56:00,2144-07-28 17:56:00,EMERGENCY,CLINIC REFERRAL/PREMATURE,SNF,Medicare,ENGL,CATHOLIC,WIDOWED,WHITE,2144-07-25 13:40:00,2144-07-25 18:50:00,RETROPERITONEAL HEMORRHAGE,0,1,NaT,,,2.995139,Admission Date: [**2144-7-25**] ...
58974,58975,99995,137810,2147-02-08 08:00:00,2147-02-11 13:15:00,2147-02-11 13:15:00,ELECTIVE,PHYS REFERRAL/NORMAL DELI,HOME,Medicare,ENGL,NOT SPECIFIED,MARRIED,WHITE,,,ABDOMINAL AORTIC ANEURYSM/SDA,0,1,NaT,,,3.218750,Admission Date: [**2147-2-8**] D...


In [13]:
merged = merged.merge(patients[['SUBJECT_ID', 'DOB', 'GENDER']], on=['SUBJECT_ID'], how='left')

In [14]:
merged

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,ADMITTIME,DISCHTIME,DEATHTIME,ADMISSION_TYPE,ADMISSION_LOCATION,DISCHARGE_LOCATION,INSURANCE,LANGUAGE,RELIGION,MARITAL_STATUS,ETHNICITY,EDREGTIME,EDOUTTIME,DIAGNOSIS,HOSPITAL_EXPIRE_FLAG,HAS_CHARTEVENTS_DATA,NEXT_ADMITTIME,NEXT_ADMISSION_TYPE,DAYS_NEXT_ADMIT,DAYS_STAY,TEXT,DOB,GENDER
0,1,2,163353,2138-07-17 19:04:00,2138-07-21 15:48:00,2138-07-21 15:48:00,NEWBORN,PHYS REFERRAL/NORMAL DELI,HOME,Private,,NOT SPECIFIED,,ASIAN,,,NEWBORN,0,1,NaT,,,3.863889,,2138-07-17 00:00:00,M
1,2,3,145834,2101-10-20 19:08:00,2101-10-31 13:58:00,2101-10-31 13:58:00,EMERGENCY,EMERGENCY ROOM ADMIT,SNF,Medicare,,CATHOLIC,MARRIED,WHITE,2101-10-20 17:09:00,2101-10-20 19:24:00,HYPOTENSION,0,1,NaT,,,10.784722,Admission Date: [**2101-10-20**] Discharg...,2025-04-11 00:00:00,M
2,3,4,185777,2191-03-16 00:28:00,2191-03-23 18:41:00,2191-03-23 18:41:00,EMERGENCY,EMERGENCY ROOM ADMIT,HOME WITH HOME IV PROVIDR,Private,,PROTESTANT QUAKER,SINGLE,WHITE,2191-03-15 13:10:00,2191-03-16 01:10:00,"FEVER,DEHYDRATION,FAILURE TO THRIVE",0,1,NaT,,,7.759028,Admission Date: [**2191-3-16**] Discharge...,2143-05-12 00:00:00,F
3,4,5,178980,2103-02-02 04:31:00,2103-02-04 12:15:00,2103-02-04 12:15:00,NEWBORN,PHYS REFERRAL/NORMAL DELI,HOME,Private,,BUDDHIST,,ASIAN,,,NEWBORN,0,1,NaT,,,2.322222,,2103-02-02 00:00:00,M
4,5,6,107064,2175-05-30 07:15:00,2175-06-15 16:00:00,2175-06-15 16:00:00,ELECTIVE,PHYS REFERRAL/NORMAL DELI,HOME HEALTH CARE,Medicare,ENGL,NOT SPECIFIED,MARRIED,WHITE,,,CHRONIC RENAL FAILURE/SDA,0,1,NaT,,,16.364583,Admission Date: [**2175-5-30**] Dischar...,2109-06-21 00:00:00,F
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58971,58972,99985,176670,2181-01-27 02:47:00,2181-02-12 17:05:00,2181-02-12 17:05:00,EMERGENCY,EMERGENCY ROOM ADMIT,HOME HEALTH CARE,Private,ENGL,JEWISH,MARRIED,WHITE,2181-01-26 23:35:00,2181-01-27 04:18:00,FEVER,0,1,NaT,,,16.595833,Admission Date: [**2181-1-27**] ...,2127-04-08 00:00:00,M
58972,58973,99991,151118,2184-12-24 08:30:00,2185-01-05 12:15:00,2185-01-05 12:15:00,ELECTIVE,PHYS REFERRAL/NORMAL DELI,HOME,Private,ENGL,NOT SPECIFIED,MARRIED,WHITE,,,DIVERTICULITIS/SDA,0,1,NaT,,,12.156250,Admission Date: [**2184-12-24**] ...,2137-04-07 00:00:00,M
58973,58974,99992,197084,2144-07-25 18:03:00,2144-07-28 17:56:00,2144-07-28 17:56:00,EMERGENCY,CLINIC REFERRAL/PREMATURE,SNF,Medicare,ENGL,CATHOLIC,WIDOWED,WHITE,2144-07-25 13:40:00,2144-07-25 18:50:00,RETROPERITONEAL HEMORRHAGE,0,1,NaT,,,2.995139,Admission Date: [**2144-7-25**] ...,2078-10-17 00:00:00,F
58974,58975,99995,137810,2147-02-08 08:00:00,2147-02-11 13:15:00,2147-02-11 13:15:00,ELECTIVE,PHYS REFERRAL/NORMAL DELI,HOME,Medicare,ENGL,NOT SPECIFIED,MARRIED,WHITE,,,ABDOMINAL AORTIC ANEURYSM/SDA,0,1,NaT,,,3.218750,Admission Date: [**2147-2-8**] D...,2058-05-29 00:00:00,F


In [22]:
#Calculate age
# merged['DOB'] = pd.to_datetime(merged["DOB"], format = '%Y-%m-%d %H:%M:%S', errors = 'coerce')
# merged['date_of_admission'] = pd.to_datetime(merged['date_of_admission'])
# merged['DOB']
# merged['AGE'] = ((merged['date_of_admission']-merged['DOB']).dt.days) //365

OverflowError: Overflow in int64 addition

In [23]:
print(merged.groupby('ADMISSION_TYPE').apply(lambda g: g.TEXT.isnull().sum())/merged.groupby('ADMISSION_TYPE').size())
merged[merged['ADMISSION_TYPE'] != 'NEWBORN'].groupby('ADMISSION_TYPE').apply(lambda g: g.TEXT.isnull().sum())/merged.groupby('ADMISSION_TYPE').size()

ADMISSION_TYPE
ELECTIVE     0.048663
EMERGENCY    0.037983
NEWBORN      0.536691
URGENT       0.042665
dtype: float64


ADMISSION_TYPE
ELECTIVE     0.048663
EMERGENCY    0.037983
NEWBORN           NaN
URGENT       0.042665
dtype: float64

In [24]:
merged['READMISSION'] = (merged.DAYS_NEXT_ADMIT < 30).astype('int')
merged['READMISSION'].value_counts()

READMISSION
0    55749
1     3227
Name: count, dtype: int64

In [25]:
shuffled_merged = merged.sample(n = len(merged), random_state = 42)
shuffled_merged = shuffled_merged.reset_index(drop = True)

# Save 30% of the data as validation and test data 
valid_and_test_data = merged.sample(frac=0.30,random_state=42)
test_data = valid_and_test_data.sample(frac = 0.5, random_state = 42)
valid_data = valid_and_test_data.drop(test_data.index)

# use the rest of the data as training data
train_data = shuffled_merged.drop(valid_and_test_data.index)

print(merged.shape)
print(test_data.shape)
print(valid_data.shape)
print(train_data.shape)
print(str(test_data.shape[0] + valid_data.shape[0] + train_data.shape[0]))

(58976, 28)
(8846, 28)
(8847, 28)
(41283, 28)
58976


In [26]:
# There are a much larger amount of 0 labels than 1 labels (see merged['READMISSION'] counts)
# To solve this class imbalance, we randomly sample from the majority class (0 labels) an amount of data points equal to the minority class

# Split data by label
pos_train = train_data.loc[train_data['READMISSION'] == 1]
neg_train = train_data.loc[train_data['READMISSION'] != 1]

# Randomly sample data
sample_neg_train = neg_train.sample(n = len(pos_train), random_state = 42)

# Recombine Data
train_data = pd.concat([pos_train, sample_neg_train], axis = 0)
train_data = train_data.sample(n = len(train_data), random_state = 42).reset_index(drop = True)

In [27]:
print(train_data.shape)
print(sample_neg_train.shape)
print(pos_train.shape)

(4510, 28)
(2255, 28)
(2255, 28)


NLP processing on 'TEXT' column for discharge summaries from NOTE EVENTS TABLE

In [28]:
# Clean the string representation for 'TEXT' columns

train_data['TEXT'] = train_data['TEXT'].fillna(' ').str.replace('\n',' ').replace('\r',' ')
test_data['TEXT'] = test_data['TEXT'].fillna(' ').str.replace('\n',' ').replace('\r',' ')
valid_data['TEXT'] = valid_data['TEXT'].fillna(' ').str.replace('\n',' ').replace('\r',' ')

In [29]:
# import string
# def tokenizer_better(text):
#     # tokenize the text by replacing punctuation and numbers with spaces and lowercase all words
    
#     punc_list = string.punctuation+'0123456789'
#     t = str.maketrans(dict.fromkeys(punc_list, " "))
#     text = text.lower().translate(t)
#     tokens = nltk.word_tokenize(text)
#     return tokens

In [30]:
# Write tokenizer that removes non-words
tokenizeText = lambda x: nltk.tokenize.RegexpTokenizer(r'\b[a-zA-Z_]+\b').tokenize(x.lower())

sample_text = ['Data science is about the data', 'The science is amazing', 'Predictive modeling is part of data science']
vect = CountVectorizer(tokenizer = tokenizeText)
vect.fit(sample_text)
# matrix is stored as a sparse matrix (since you have a lot of zeros)
X = vect.transform(sample_text)
X.toarray()



array([[1, 0, 2, 1, 0, 0, 0, 0, 1, 1],
       [0, 1, 0, 1, 0, 0, 0, 0, 1, 1],
       [0, 0, 1, 1, 1, 1, 1, 1, 1, 0]])

In [31]:
my_stop_words = ['the','and','to','of','was','with','a','on','in','for','name',
                 'is','patient','s','he','at','as','or','one','she','his','her','am',
                 'were','you','pt','pm','by','be','had','your','this','date',
                 'from','there','an','that','p','are','have','has','h','but','o',
                 'namepattern','which','every','also']

vect = CountVectorizer(max_features = 3000, tokenizer = tokenizeText, stop_words = my_stop_words)
# this could take a while
vect.fit(train_data['TEXT'].values)

In [32]:
X_train = vect.transform(train_data['TEXT'].values)
X_valid = vect.transform(valid_data['TEXT'].values)

y_train = train_data['READMISSION']
y_valid = valid_data['READMISSION']

In [33]:
from sklearn.linear_model import LogisticRegression
clf=LogisticRegression(C = 0.0005, penalty = 'l2', random_state = 42)
clf.fit(X_train, y_train)

model = clf
y_train_preds = model.predict_proba(X_train)[:,1]
y_valid_preds = model.predict_proba(X_valid)[:,1]

In [36]:
X_test = vect.transform(test_data['TEXT'].values)
y_test = test_data['READMISSION']
y_test_preds = model.predict_proba(X_test)[:,1]

train_acc = np.mean(y_train == (y_train_preds > 0.5))
valid_acc = np.mean(y_valid == (y_valid_preds > 0.5))
test_acc = np.mean(y_test == (y_test_preds > 0.5))
print(train_acc)
print(valid_acc)
print(test_acc)

0.7301552106430155
0.7281564372103538
0.7334388424146507


TRYING TO MAKE THE MODEL BETTER (UP UNTIL NOW IS JUST USING 'TEXT' COLUMN)

In [37]:
#** Trying with adding DAYS_STAY **

# Extract the 'DAYS_STAY' column and convert it to a sparse matrix
train_days_stay = train_data['DAYS_STAY'].values.reshape(-1, 1)
valid_days_stay = valid_data['DAYS_STAY'].values.reshape(-1, 1)
test_days_stay = test_data['DAYS_STAY'].values.reshape(-1, 1)


# Concatenate days_stay_column with X horizontally
X_train_with_days_stay = hstack((X_train, train_days_stay))
X_valid_with_days_stay = hstack((X_valid, valid_days_stay))
X_test_with_days_stay = hstack((X_test, test_days_stay))

clf2=LogisticRegression(C = 0.0005, penalty = 'l2', random_state = 42)
clf2.fit(X_train_with_days_stay, y_train)
model2 = clf2

y_train_with_days_stay_preds = model2.predict_proba(X_train_with_days_stay)[:,1]
y_valid_with_days_stay_preds = model2.predict_proba(X_valid_with_days_stay)[:,1]
y_test_with_days_stay_preds = model2.predict_proba(X_test_with_days_stay)[:,1]

train_with_daystay_acc = np.mean(y_train == (y_train_with_days_stay_preds > 0.5))
valid_with_daystay_acc = np.mean(y_valid == (y_valid_with_days_stay_preds > 0.5))
test_with_daystay_acc = np.mean(y_test == (y_test_with_days_stay_preds > 0.5))
print(train_with_daystay_acc)
print(valid_with_daystay_acc)
print(test_with_daystay_acc)

0.7339246119733924
0.7263479145473042
0.7326475243047705


In [38]:
print(train_with_daystay_acc - train_acc)
print(valid_with_daystay_acc - valid_acc)
print(test_with_daystay_acc - test_acc)

0.0037694013303769536
-0.0018085226630496498
-0.0007913181098802013


USING patient data (age, gender)

In [44]:
#** Trying with adding GENDER **

# Extract the 'GENDER' column and convert it to a sparse matrix
train_gender_column = (train_data['GENDER']=='M').values.reshape(-1, 1)
valid_gender_column = (valid_data['GENDER']=='M').values.reshape(-1, 1)
test_gender_column = (test_data['GENDER']=='M').values.reshape(-1, 1)


# Concatenate days_stay_column with X horizontally (ADDING ON TO PREVIOUS DAYSTAY)
newX_train = hstack((X_train_with_days_stay, train_gender_column))
newX_valid = hstack((X_valid_with_days_stay, valid_gender_column))
newX_test = hstack((X_test_with_days_stay, test_gender_column))

clf3=LogisticRegression(C = 0.0005, penalty = 'l2', random_state = 42)
clf3.fit(newX_train, y_train)
model3 = clf3

newy_train = model3.predict_proba(newX_train)[:,1]
newy_valid = model3.predict_proba(newX_valid)[:,1]
newy_test = model3.predict_proba(newX_test)[:,1]

train_with_gender_acc = np.mean(y_train == (newy_train > 0.5))
valid_with_gender_acc = np.mean(y_valid == (newy_valid > 0.5))
test_with_gender_acc = np.mean(y_test == (newy_test > 0.5))
print(train_with_gender_acc)
print(valid_with_gender_acc)
print(test_with_gender_acc)

0.7348115299334812
0.7263479145473042
0.7333257969703821
