In [4]:
import pandas as pd
import numpy as np
from keras.models import Sequential
from keras.layers import Dropout, Activation, LSTM, Dense
from keras.layers.embeddings import Embedding
from keras.preprocessing.text import Tokenizer
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical
import numpy as np

class RNNClassifier:
    def __init__(self, max_len_sequence, num_classes, name_classes, model_type):
        self.max_len_sequence = max_len_sequence
        self.num_classes = num_classes
        self.name_classes = name_classes
        self.model_type = model_type

    def built_dict_character(self, corpus):
        tokenizer = Tokenizer(char_level=True, lower=False)
        tokenizer.fit_on_texts(corpus)
        #Unknow character is n+1
        self.vocab_size = len(tokenizer.word_index)+1
        print("Vocabulary size is: {}".format(self.vocab_size))
        self.tokenizer = tokenizer

    def str_to_matrix(self, X_str):
        X_matrix = np.zeros((len(X_str), self.max_len_sequence, self.vocab_size))
        for i in np.arange(len(X_str)):
            sequence_i = self.tokenizer.texts_to_matrix(X_str[i])
            #X_matrix[i, : np.min(sequence_i.shape[0],self.max_len_sequence), :] = sequence_i[: np.min(sequence_i.shape[0],self.max_len_sequence), : ]
            X_matrix[i, : sequence_i.shape[0], :] = sequence_i[ : self.max_len_sequence,:]

        return X_matrix

    def buil_model(self, use_dropout=True, hidden_size=5, dropout=0.5):
        model = Sequential()
        # batch size, number of time steps, hidden size)
        #model.add(Embedding(input_dim=self.vocab_size, output_dim=hidden_size, input_length=self.max_len_sequence))
        model.add(LSTM(hidden_size, input_shape=(self.max_len_sequence, self.vocab_size), return_sequences=False))
        #model.add(LSTM(hidden_size, return_sequences=True))
        if use_dropout:
            model.add(Dropout(dropout))
        #model.add(TimeDistributed(Dense(self.vocab_size)))
        model.add(Dense(self.num_classes, activation='softmax'))
        model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['categorical_accuracy'])
        #model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
        print(model.summary())
        self.model = model

    def fit(self, X_train, Y_train, epochs=500, batch_size=1):
        X_train = self.str_to_matrix(X_train)
        self.model.fit(X_train,Y_train, epochs=epochs, batch_size=batch_size)
  
    def predict(self, X_test):
        X_test = self.str_to_matrix(X_test)
        return self.model.predict(X_test)
    
    

Using TensorFlow backend.


In [6]:
# Importation Data
path = "/Users/denisdo/Desktop/mimic/mimic_database/"
adress = pd.read_csv(path + "address.csv")
patient = pd.read_csv(path + "PATIENTS.csv")
name = pd.read_csv(path + "name.csv")
firstname = pd.read_csv(path + "firstname.csv")


In [7]:
num = 1000
X = pd.DataFrame( columns = ['name', 'firstname', 'adress', 'date'])
X.name = name.name.iloc[:num]
X.firstname = firstname.firstname.iloc[:num]
X.adress = adress.road.iloc[:num]
X.date = patient.DOB.iloc[:num]
X.sample(10)
Y= list(X.columns.values)*num
X=list(X.values.reshape(1, 4*num)[0])

In [19]:

X[:100], Y[:10]

(['AABI',
  'A',
  'Lotissement Bellevue',
  '2094-03-05 00:00:00',
  'AABID',
  'AADAM',
  'Lotissement Les Muriers',
  '2090-06-05 00:00:00',
  'AALBERG',
  'AADEL',
  'Chemin des Abbéanches',
  '2038-09-03 00:00:00',
  'AAMARA',
  'AADIL',
  'Rue Aguétant',
  '2075-09-21 00:00:00',
  'AARAB',
  'AAKASH',
  'Rue Aimé Poncet',
  '2114-06-20 00:00:00',
  'AARNINK',
  'AALIA',
  'Rue Aimé Poncet',
  '1895-05-17 00:00:00',
  'AARON',
  'AALIYA',
  'Rue Alexandre Bérard',
  '2108-01-15 00:00:00',
  'AARRAS',
  'AALIYAH',
  'Rue Alexandre Bérard',
  '2061-04-10 00:00:00',
  'AATAR',
  'AALYA',
  'Rue Alexandre Bérard',
  '2050-03-29 00:00:00',
  'AATIF',
  'AALYAH',
  'Rue Alexandre Bérard',
  '2051-04-21 00:00:00',
  'AATZ',
  'AANOR',
  'Rue Alexandre Bérard',
  '2053-04-13 00:00:00',
  'AAZIZ',
  'AARICIA',
  'Rue Alexandre Bérard',
  '1885-03-24 00:00:00',
  'ABA',
  'AARON',
  'Rue Alexandre Bérard',
  '2056-01-27 00:00:00',
  'ABABOU',
  'AARONE',
  'Rue Alexandre Bérard',
  '2061-10

In [18]:
x_train = X[:10]
y_train = Y[:10]

# encode class values as integers
encoder = LabelEncoder()
encoder.fit(y_train)
encoded_y_train = encoder.transform(y_train)
# convert integers to dummy variables (i.e. one hot encoded)
dummy_y = to_categorical(encoded_y_train)
y_train = dummy_y


rnn = RNNClassifier(5, 4, ['name','firstname','adress','date'], 'lstm')
rnn.built_dict_character(X)
rnn.buil_model()
rnn.fit(x_train, y_train)


TypeError: 'float' object is not iterable

In [31]:
rnn.built_dict_character(X[498:500])

TypeError: 'float' object is not iterable

In [32]:
X[498:500]

['Avenue Roger Salengro', nan]

In [15]:
x_train

['AABI',
 'A',
 'Lotissement Bellevue',
 '2094-03-05 00:00:00',
 'AABID',
 'AADAM',
 'Lotissement Les Muriers',
 '2090-06-05 00:00:00',
 'AALBERG',
 'AADEL']

In [17]:
rnn.predict(x_train)

array([[ 0.00382465,  0.00110351,  0.00360623,  0.99146563],
       [ 0.00169077,  0.00619026,  0.98678583,  0.00533322],
       [ 0.98192775,  0.0021244 ,  0.00483248,  0.01111524],
       [ 0.00267445,  0.97514707,  0.01687247,  0.00530595],
       [ 0.00364958,  0.00109666,  0.00348187,  0.99177188],
       [ 0.00133675,  0.00485667,  0.98962909,  0.00417743],
       [ 0.98192775,  0.0021244 ,  0.00483248,  0.01111524],
       [ 0.0027628 ,  0.97499633,  0.01643029,  0.00581061],
       [ 0.00444639,  0.00104607,  0.00404386,  0.99046361],
       [ 0.001306  ,  0.00520222,  0.9894008 ,  0.0040909 ]], dtype=float32)

In [16]:
rnn.str_to_matrix(x_train)

array([[[ 0.,  0.,  1., ...,  0.,  0.,  0.],
        [ 0.,  0.,  1., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.]],

       [[ 0.,  0.,  1., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.]],

       [[ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.]],

       ..., 
       [[ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  1.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  1.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.]],

       [[ 0.,  0.,  1., ...,  0.,  0.,  0.],
        [ 0.,  0.,  1., ...,  0., 