## Build the dataset

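We build the dataset by merging four input CSV files (admissions, first names, last names and addresses), and we keep only the column structure: each column is stored as an independent list of values. Note that the columns have different lengths.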

The dataset is not distributed with this notebook; please request it via the chat.

In [1]:
import math
import pickle

import pandas as pd

In [2]:
# Load the admissions table; its columns are copied into the `columns` dict further down.
df = pd.read_csv('data/ADMISSIONS.csv')

In [3]:
# Show the column names of the admissions table (each one gets a label in `column_labels` below).
df.columns

Index(['ROW_ID', 'SUBJECT_ID', 'HADM_ID', 'ADMITTIME', 'DISCHTIME',
       'DEATHTIME', 'ADMISSION_TYPE', 'ADMISSION_LOCATION',
       'DISCHARGE_LOCATION', 'INSURANCE', 'LANGUAGE', 'RELIGION',
       'MARITAL_STATUS', 'ETHNICITY', 'EDREGTIME', 'EDOUTTIME', 'DIAGNOSIS',
       'HOSPITAL_EXPIRE_FLAG', 'HAS_CHARTEVENTS_DATA'],
      dtype='object')

In [4]:
# Preview only the first rows instead of dumping the whole frame into the notebook output.
df.head(10)

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,ADMITTIME,DISCHTIME,DEATHTIME,ADMISSION_TYPE,ADMISSION_LOCATION,DISCHARGE_LOCATION,INSURANCE,LANGUAGE,RELIGION,MARITAL_STATUS,ETHNICITY,EDREGTIME,EDOUTTIME,DIAGNOSIS,HOSPITAL_EXPIRE_FLAG,HAS_CHARTEVENTS_DATA
0,12258,10006,142345,2164-10-23 21:09:00,2164-11-01 17:15:00,,EMERGENCY,EMERGENCY ROOM ADMIT,HOME HEALTH CARE,Medicare,,CATHOLIC,SEPARATED,BLACK/AFRICAN AMERICAN,2164-10-23 16:43:00,2164-10-23 23:00:00,SEPSIS,0,1
1,12263,10011,105331,2126-08-14 22:32:00,2126-08-28 18:59:00,2126-08-28 18:59:00,EMERGENCY,TRANSFER FROM HOSP/EXTRAM,DEAD/EXPIRED,Private,,CATHOLIC,SINGLE,UNKNOWN/NOT SPECIFIED,,,HEPATITIS B,1,1
2,12265,10013,165520,2125-10-04 23:36:00,2125-10-07 15:13:00,2125-10-07 15:13:00,EMERGENCY,TRANSFER FROM HOSP/EXTRAM,DEAD/EXPIRED,Medicare,,CATHOLIC,,UNKNOWN/NOT SPECIFIED,,,SEPSIS,1,1
3,12269,10017,199207,2149-05-26 17:19:00,2149-06-03 18:42:00,,EMERGENCY,EMERGENCY ROOM ADMIT,SNF,Medicare,,CATHOLIC,DIVORCED,WHITE,2149-05-26 12:08:00,2149-05-26 19:45:00,HUMERAL FRACTURE,0,1
4,12270,10019,177759,2163-05-14 20:43:00,2163-05-15 12:00:00,2163-05-15 12:00:00,EMERGENCY,TRANSFER FROM HOSP/EXTRAM,DEAD/EXPIRED,Medicare,,CATHOLIC,DIVORCED,WHITE,,,ALCOHOLIC HEPATITIS,1,1
5,12277,10026,103770,2195-05-17 07:39:00,2195-05-24 11:45:00,,EMERGENCY,EMERGENCY ROOM ADMIT,REHAB/DISTINCT PART HOSP,Medicare,,OTHER,,WHITE,2195-05-17 01:49:00,2195-05-17 08:29:00,STROKE/TIA,0,1
6,12278,10027,199395,2190-07-13 07:15:00,2190-07-25 14:00:00,,ELECTIVE,PHYS REFERRAL/NORMAL DELI,SNF,Medicare,,CATHOLIC,MARRIED,WHITE,,,MITRAL REGURGITATION;CORONARY ARTERY DISEASE\...,0,1
7,12280,10029,132349,2139-09-22 10:58:00,2139-10-02 14:29:00,,EMERGENCY,EMERGENCY ROOM ADMIT,SNF,Medicare,,PROTESTANT QUAKER,DIVORCED,WHITE,2139-09-22 06:03:00,2139-09-22 11:50:00,SYNCOPE;TELEMETRY,0,1
8,12282,10032,140372,2138-04-02 19:52:00,2138-04-15 14:35:00,,EMERGENCY,EMERGENCY ROOM ADMIT,REHAB/DISTINCT PART HOSP,Medicare,,CATHOLIC,WIDOWED,WHITE,2138-04-02 14:56:00,2138-04-02 20:40:00,RIGHT HUMEROUS FRACTURE,0,1
9,12283,10033,157235,2132-12-05 02:46:00,2132-12-08 15:15:00,,EMERGENCY,EMERGENCY ROOM ADMIT,SNF,Medicare,POLI,CATHOLIC,MARRIED,WHITE,2132-12-04 20:11:00,2132-12-05 04:05:00,RENAL FAILIURE-SYNCOPE-HYPERKALEMIA,0,1


In [5]:
# First names: keep only the first CSV column and rename it (header=0 discards the file's own header row).
firstname_df = pd.read_csv(
    'data/firstname.csv',
    usecols=[0],
    names=['FIRSTNAME'],
    header=0,
)
# Last names: same treatment.
name_df = pd.read_csv(
    'data/name.csv',
    usecols=[0],
    names=['NAME'],
    header=0,
)
# Put the two columns side by side; rows are aligned on the index, so the
# shorter column is padded with NaN.
df2 = pd.concat((firstname_df, name_df), axis='columns')

In [6]:
# Addresses: keep the first four CSV columns under explicit names.
df3 = pd.read_csv(
    'data/address.csv',
    usecols=[0, 1, 2, 3],
    names=['NUM', 'ROAD', 'POSTCODE', 'CITY'],
    header=0,
)
# Build "<number>, <road>" as a single street-address column...
df3["NUM_ROAD"] = df3["NUM"].map(str) + ', ' + df3["ROAD"]
# ...and drop the two source columns, which are no longer needed.
df3 = df3.drop(columns=['NUM', 'ROAD'])

In [7]:
# Flatten every frame into {column name: list of values}. Frames are processed
# in order, so a column name present in a later frame overrides an earlier one.
columns = {}
for frame in (df, df2, df3):
    for col in frame.columns:
        columns[col] = frame[col].tolist()

# Label attached to each column name.
column_labels = {
    # Admissions table
    'ROW_ID': 'ID',
    'SUBJECT_ID': 'ID',
    'HADM_ID': 'ID',
    'ADMITTIME': 'DATE',
    'DISCHTIME': 'DATE',
    'DEATHTIME': 'DATE',
    'ADMISSION_TYPE': 'CODE',
    'ADMISSION_LOCATION': 'STRING',
    'DISCHARGE_LOCATION': 'STRING',
    'INSURANCE': 'CODE',
    'LANGUAGE': 'CODE',
    'RELIGION': 'CODE',
    'MARITAL_STATUS': 'CODE',
    'ETHNICITY': 'CODE',
    'EDREGTIME': 'DATE',
    'EDOUTTIME': 'DATE',
    'DIAGNOSIS': 'STRING',
    'HOSPITAL_EXPIRE_FLAG': 'CODE',
    'HAS_CHARTEVENTS_DATA': 'CODE',
    # First names / last names
    'FIRSTNAME': 'FIRSTNAME',
    'NAME': 'NAME',
    # Addresses
    'NUM_ROAD': 'ADDRESS',
    'CITY': 'CITY',
    'POSTCODE': 'CODE',
}

In [8]:
# Persist the column values together with their labels as one pickled tuple.
payload = (columns, column_labels)
with open('data/columns', mode='wb') as out_file:
    pickle.dump(payload, out_file)

Playing with TF-IDF on character n-grams (exploratory, not important for building the dataset)

In [9]:
import numpy as np
import re

In [10]:
def isnan(el):
    """Return True when ``el`` is a float holding NaN.

    The ``isinstance`` guard comes first so that non-float values (str, int,
    None, ...) yield False instead of making ``math.isnan`` raise.

    Requires ``math`` to be imported in the notebook's import cell.
    """
    return isinstance(el, float) and math.isnan(el)

def notin(list_chars, except_chars):
    """Return True when none of ``except_chars`` is contained in ``list_chars``.

    Containment uses the ``in`` operator, so ``list_chars`` may be any
    container (tuple of characters, list, string, ...).
    """
    return all(forbidden not in list_chars for forbidden in except_chars)


In [11]:
def call_find_ngrams(ngram_range=(3,)):
    """Build a character n-gram analyzer (e.g. for ``TfidfVectorizer(analyzer=...)``).

    Parameters
    ----------
    ngram_range : sequence of int
        When it has exactly two items ``(lo, hi)``, every n-gram size from
        ``lo`` to ``hi`` inclusive is used. Otherwise its items are taken as
        the explicit list of sizes (the default ``(3,)`` yields trigrams only).

    Returns
    -------
    callable
        ``find_ngrams(cell)``, which returns the list of n-grams of one cell.
    """
    if len(ngram_range) == 2:
        ngram_range = range(ngram_range[0], ngram_range[1] + 1)

    # Loop-invariant: any n-gram containing one of these characters is dropped.
    except_chars = [' ', '.', '?', '!']

    def find_ngrams(cell):
        """Return the character n-grams of ``cell`` with digits masked.

        NaN gives an empty list. Floats are truncated to int, and ints are
        converted to their decimal string. A single trailing '.' is removed
        from the string before n-grams are extracted.

        Raises
        ------
        TypeError
            If ``cell`` is not a float, an int or a str.
        """
        if isnan(cell):
            return []
        if isinstance(cell, float):
            cell = int(cell)
        if isinstance(cell, int):
            cell = str(cell)
        if not isinstance(cell, str):
            raise TypeError('TYPE', type(cell))

        # Cell global preprocessing: remove the end point.
        # endswith() is safe on the empty string, whereas cell[-1] raises IndexError.
        if cell.endswith('.'):
            cell = cell[:-1]

        all_grams = []
        for n in ngram_range:
            # Sliding window of width n over the characters of the cell.
            raw_n_grams = zip(*[cell[i:] for i in range(n)])
            # Remove n-grams with invalid characters like spaces.
            n_grams = [''.join(n_gram) for n_gram in raw_n_grams if notin(n_gram, except_chars)]
            # Replace every digit with the generic two-character mask backslash + 'd'.
            # Raw strings avoid invalid-escape warnings, and the doubled backslash
            # in the template is required: a bare \d template is rejected by
            # re.sub on Python >= 3.7.
            n_grams = [re.sub(r'\d', r'\\d', n_gram) for n_gram in n_grams]
            all_grams += n_grams
        return all_grams

    return find_ngrams

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Tiny heterogeneous corpus: a timestamp, a numeric code, a person name and two street addresses.
corpus = [
    '2201-11-16 23:00:00',
    '42033',
    'Mickael Corleone',
    '308 rue du bois des prés',
    '3 rue soufflot'
]
# Each document is analysed as character n-grams of length 2 to 4.
vectorizer = TfidfVectorizer(analyzer=call_find_ngrams(ngram_range=(2,4)))
X = vectorizer.fit_transform(corpus)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when it exists and fall back on older versions.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = vectorizer.get_feature_names()
print(feature_names)
print(X.shape)


['-\\d', '-\\d\\d', '-\\d\\d-', ':\\d', ':\\d\\d', ':\\d\\d:', 'Co', 'Cor', 'Corl', 'Mi', 'Mic', 'Mick', '\\d-', '\\d-\\d', '\\d-\\d\\d', '\\d:', '\\d:\\d', '\\d:\\d\\d', '\\d\\d', '\\d\\d-', '\\d\\d-\\d', '\\d\\d:', '\\d\\d:\\d', '\\d\\d\\d', '\\d\\d\\d-', '\\d\\d\\d\\d', 'ae', 'ael', 'bo', 'boi', 'bois', 'ck', 'cka', 'ckae', 'de', 'des', 'du', 'el', 'eo', 'eon', 'eone', 'es', 'ff', 'ffl', 'fflo', 'fl', 'flo', 'flot', 'ic', 'ick', 'icka', 'is', 'ka', 'kae', 'kael', 'le', 'leo', 'leon', 'lo', 'lot', 'ne', 'oi', 'ois', 'on', 'one', 'or', 'orl', 'orle', 'ot', 'ou', 'ouf', 'ouff', 'pr', 'pré', 'prés', 'rl', 'rle', 'rleo', 'ru', 'rue', 'ré', 'rés', 'so', 'sou', 'souf', 'ue', 'uf', 'uff', 'uffl', 'és']
(5, 90)


In [13]:
# Densify the TF-IDF matrix. The guard keeps the cell re-runnable: once X is
# already a dense array it has no `.toarray` and is left unchanged.
X = X.toarray() if hasattr(X, 'toarray') else X

Check whether more than one entry contains a given token.

In [14]:
# Column of the TF-IDF matrix (i.e. token) to inspect.
token_idx = 13
# True when the token has a non-zero weight in more than one document.
(X[:, token_idx] != 0).sum() > 1

False

In [15]:
# Project an unseen sentence onto the vocabulary learnt above;
# n-grams absent from the fitted vocabulary are ignored by transform().
test_corpus = ['Alors 023 rue de Mick']
test_tfidf = vectorizer.transform(test_corpus)
test_tfidf.todense()

matrix([[0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.27528978,
         0.27528978, 0.27528978, 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.36872955, 0.        ,
         0.        , 0.        , 0.        , 0.18436477, 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.27528978, 0.        , 0.        , 0.27528978,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.27528978, 0.27528978,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.27528978, 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.27528978, 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0