<a href="https://colab.research.google.com/github/aslesani/pgmpy_fork/blob/master/src/default_test/imdb_LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
'''Trains an LSTM model on the IMDB sentiment classification task.
The dataset is actually too small for LSTM to be of any advantage
compared to simpler, much faster methods such as TF-IDF + LogReg.
# Notes
- RNNs are tricky. Choice of batch size is important,
choice of loss and optimizer is critical, etc.
Some configurations won't converge.
- LSTM loss decrease patterns during training can be quite different
from what you see with CNNs/MLPs/etc.
'''
from __future__ import print_function

from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Embedding
from keras.layers import LSTM, SimpleRNN, GRU
from keras.datasets import imdb
import numpy as np

from read_write import data_preparation_for_sequences_based_deep_models, convert_binary_classes_to_zero_and_one

In [11]:
import matplotlib.pyplot as plt

def plot_train_val_graph(history):
  """Plot training and validation loss per epoch.

  Parameters
  ----------
  history : object exposing a ``history`` dict with ``'loss'`` and
      ``'val_loss'`` series (as returned by ``model.fit``).

  Prints the epoch range, then shows a matplotlib figure; returns nothing.
  """
  train_loss = history.history['loss']
  validation_loss = history.history['val_loss']
  # Epochs are numbered from 1 so the x-axis matches the training log.
  epochs = range(1, len(train_loss) + 1)
  print('epochs:' , epochs)

  plt.figure()
  curves = (
      (train_loss, 'bo', 'Training loss'),
      (validation_loss, 'b', 'Validation loss'),
  )
  for series, line_format, legend_label in curves:
    plt.plot(epochs, series, line_format, label=legend_label)
  plt.title('Training and validation loss')
  plt.legend()
  plt.show()


In [12]:
def get_max_len_of_sequences(list_of_sequences):
  """Return ``(max_length, min_length, lengths)`` for a collection of sequences.

  ``lengths`` is the list of ``len(seq)`` values in input order.
  Raises ``ValueError`` (from ``max``) when ``list_of_sequences`` is empty.
  """
  lengths = list(map(len, list_of_sequences))
  longest = max(lengths)
  shortest = min(lengths)
  return longest, shortest, lengths

In [13]:
def get_set_of_sensor_events(list_of_sequences):
  """Collect the distinct items that occur in any of the given sequences.

  Returns a tuple ``(distinct_items, count)`` where ``distinct_items`` is a
  ``set`` and ``count`` is its size. An empty input yields ``(set(), 0)``.
  """
  distinct_events = set()
  for one_sequence in list_of_sequences:
      distinct_events.update(one_sequence)
  return distinct_events, len(distinct_events)
  

In [None]:
#!git clone https://github.com/aslesani/pgmpy_fork.git
#ls
#!git clone https://github.com/aslesani/created_dataset.git
#!rm -r pgmpy_fork  
#cd pgmpy_fork/src/default_test
#cd ..

In [8]:
from keras import backend as K

def mcor(y_true, y_pred):
    """Matthews correlation coefficient built from Keras backend ops.

    Both tensors are clipped to [0, 1] and rounded to hard 0/1 labels, the
    four confusion-matrix counts are summed over every element passed in,
    and ``K.epsilon()`` is added to the denominator to avoid division by zero.
    """
    predicted_positive = K.round(K.clip(y_pred, 0, 1))
    actual_positive = K.round(K.clip(y_true, 0, 1))
    predicted_negative = 1 - predicted_positive
    actual_negative = 1 - actual_positive

    # Confusion-matrix counts.
    tp = K.sum(actual_positive * predicted_positive)
    tn = K.sum(actual_negative * predicted_negative)
    fp = K.sum(actual_negative * predicted_positive)
    fn = K.sum(actual_positive * predicted_negative)

    covariance = (tp * tn - fp * fn)
    scale = K.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
    return covariance / (scale + K.epsilon())


def precision(y_true, y_pred):
    """Precision: the share of predicted positives that are true positives.

    Computed over the tensors passed in (i.e. per batch when used as a Keras
    metric). Values are clipped to [0, 1] and rounded before counting, and
    ``K.epsilon()`` guards against an all-negative prediction batch.
    """
    hit_mask = K.round(K.clip(y_true * y_pred, 0, 1))
    predicted_mask = K.round(K.clip(y_pred, 0, 1))
    return K.sum(hit_mask) / (K.sum(predicted_mask) + K.epsilon())

def recall(y_true, y_pred):
    """Recall: the share of actual positives that were predicted positive.

    Computed over the tensors passed in (i.e. per batch when used as a Keras
    metric). Values are clipped to [0, 1] and rounded before counting, and
    ``K.epsilon()`` guards against a batch with no positive labels.
    """
    hit_mask = K.round(K.clip(y_true * y_pred, 0, 1))
    actual_mask = K.round(K.clip(y_true, 0, 1))
    return K.sum(hit_mask) / (K.sum(actual_mask) + K.epsilon())


def f1(y_true, y_pred):
    """F1 score: harmonic mean of ``precision`` and ``recall``.

    Reuses the module-level ``precision`` and ``recall`` metrics defined
    alongside this function instead of carrying private copies of them, so
    the three metrics cannot drift apart. ``K.epsilon()`` in the denominator
    avoids division by zero when both precision and recall are 0.
    """
    precision_value = precision(y_true, y_pred)
    recall_value = recall(y_true, y_pred)
    return 2*((precision_value*recall_value)/(precision_value+recall_value+K.epsilon()))


In [None]:
#!pip install tabulate 

In [14]:
from tabulate import tabulate

#print(tabulate([['Alice', 24], ['Bob', 19]], headers=['algorithm', 'acc']))

def print_list_of_lists(data , headers):
    """Print ``data`` (a list of rows) as a text table with column ``headers``."""
    rendered_table = tabulate(data, headers=headers)
    print(rendered_table)



In [15]:
def test_print_list_of_lists():
    """Smoke-test ``print_list_of_lists`` with a two-row sample table."""
    sample_headers = ['algorithm', 'acc']
    sample_rows = [['Alice', 24], ['Bob', 19]]
    print_list_of_lists(sample_rows, sample_headers)


In [12]:
# Run the smoke test; it prints a small two-row table (algorithm / acc).
test_print_list_of_lists()

algorithm      acc
-----------  -----
Alice           24
Bob             19


In [16]:
def imdb_lstm_data_preparation(max_features = 20000, maxlen = 80):
  """Load the Keras IMDB dataset and pad every sample to a fixed length.

  Parameters
  ----------
  max_features : forwarded to ``imdb.load_data`` as ``num_words``.
  maxlen : forwarded to ``sequence.pad_sequences`` as ``maxlen``.

  Returns
  -------
  ``(x_train, x_test, y_train, y_test, max_features, maxlen)`` with the
  padded train/test inputs, their labels, and the two arguments echoed back.
  """
  print('Loading data...')
  train_split, test_split = imdb.load_data(num_words=max_features)
  x_train, y_train = train_split
  x_test, y_test = test_split
  print(len(x_train), 'train sequences')
  print(len(x_test), 'test sequences')

  print('Pad sequences (samples x time)')
  padded_train = sequence.pad_sequences(x_train, maxlen=maxlen)
  padded_test = sequence.pad_sequences(x_test, maxlen=maxlen)
  print('x_train shape:', padded_train.shape)
  print('x_test shape:', padded_test.shape)

  return padded_train, padded_test, y_train, y_test, max_features, maxlen

In [None]:
#! cd pgmpy_fork/src/default_test
#!ls
#!git clone https://github.com/pgmpy/pgmpy 
#cd ..
#!ls
#!cd pgmpy/
#pip install -r requirements.txt
#!python setup.py install


In [None]:
# Report the longest and shortest sequence lengths.
# NOTE(review): `sequences` is not defined in any visible cell, so this cell
# fails on a fresh kernel -- confirm where it is supposed to come from.
max_seq_len, min_seq_len , lens = get_max_len_of_sequences(sequences)
print(max_seq_len, min_seq_len)

In [None]:
# Show the distinct label values present in the training labels.
print(set(y_train))

In [None]:
# Shrink the data in place: first 2500 training samples, and test samples
# 2501..3199. Re-running this cell slices the already-sliced arrays again.
# NOTE(review): the test slice starts at 2501, so index 2500 is skipped --
# confirm whether that is intentional.
x_train = x_train[0:2500]
y_train = y_train[0:2500]
x_test = x_test[2501:3200]
y_test = y_test[2501:3200]



In [None]:
# Convert the label containers to plain Python lists via their `.tolist()`
# method. Not idempotent: a second run fails because lists have no `.tolist()`.
y_train = y_train.tolist()
y_test = y_test.tolist()

In [None]:
# Peek at the first ten labels and the first ten inputs.
# NOTE(review): `my_x_train` is only assigned in a commented-out line of a
# later cell, so this fails on a fresh kernel -- confirm the intended name.
print((y_train[0:10]))
print((my_x_train[0:10]))


In [17]:
def make_lstm_without_embedding(x_train, y_train,x_test, y_test, max_features,embedding_vector_dim = 64, batch_size = 32, epochs = 5, 
                                   loss = 'binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'], plot_train_val_graph = False):
  '''
  Train and evaluate a single-layer LSTM binary classifier that consumes its
  input directly (no Embedding layer in front).

  Parameters:
  ===============
  x_train, y_train
      training inputs and labels, passed to ``model.fit``.
      assumes x_train is shaped (samples, timesteps, features) -- TODO confirm
  x_test, y_test
      used both as ``validation_data`` during fit and for the final evaluate
  max_features, embedding_vector_dim
      accepted for signature parity with create_model_and_apply_on_data;
      not used by this function
  batch_size, epochs, loss, optimizer, metrics
      forwarded to ``model.compile`` / ``model.fit`` / ``model.evaluate``
  plot_train_val_graph (default value = False)
      when true, plot the training/validation loss curves after training

  Returns:
  ===============
  (score, acc, history, len(x_train), len(x_test))
  '''
  print('Build model...')
  model = Sequential()
  # With no Embedding layer to define the input, the first layer has to be
  # told the per-sample input shape; otherwise the Sequential model cannot be
  # built (and summarised) before fit() is called.
  per_sample_shape = tuple(np.shape(x_train)[1:])
  model.add(LSTM(64, dropout=0.2, recurrent_dropout=0.2, input_shape=per_sample_shape))
  model.add(Dense(1, activation='sigmoid'))

  model.summary()

  # try using different optimizers and different optimizer configs
  model.compile(loss=loss,
                optimizer=optimizer,
                metrics=metrics)

  print('Train...')
  history = model.fit(x_train, y_train,
            batch_size=batch_size,
            epochs=epochs,
            validation_data=(x_test, y_test))
  # evaluate() returns [loss, metric]; this unpacking relies on exactly one metric.
  score, acc = model.evaluate(x_test, y_test,
                              batch_size=batch_size)
  print('Test score:', score)  # the loss value on the test data
  print('Test accuracy:', acc)

  if plot_train_val_graph:
      # The boolean parameter shadows the module-level plotting function of
      # the same name, so calling the bare name would raise
      # "TypeError: 'bool' object is not callable". Resolve the function
      # through the module namespace instead.
      globals()['plot_train_val_graph'](history)

  return score, acc, history, len(x_train), len(x_test)

In [18]:
def create_model_and_apply_on_data(x_train, y_train,x_test, y_test, max_features,
                                   embedding_vector_dim = 64, batch_size = 32, epochs = 5, 
                                   loss = 'binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'], 
                                   plot_train_val_graph = False,
                                   number_of_lstm_layers = 1,
                                   ID_of_layer_to_repeat = 0):
  '''
  Build an Embedding -> recurrent -> Dense(sigmoid) binary classifier, train
  it, evaluate it on the test data and return the results.

  Parameters:
  ===============
  x_train, y_train
      training inputs and labels, passed to ``model.fit``
  x_test, y_test
      used both as ``validation_data`` during fit and for the final evaluate
  max_features
      the Embedding layer is created with ``max_features + 1`` input indices
  embedding_vector_dim (default value = 64)
      output dimension of the Embedding layer
  batch_size, epochs, loss, optimizer, metrics
      forwarded to ``model.compile`` / ``model.fit`` / ``model.evaluate``
  plot_train_val_graph (default value = False)
      when true, plot the training/validation loss curves after training
  number_of_lstm_layers (default value = 1)
      indicate the number of layers in stack of layers
      NOTE(review): currently not read by the body -- the LSTM branch always
      stacks two layers and the RNN/GRU branches one; confirm the intended
      behaviour before wiring it in, since that changes the default model.
  ID_of_layer_to_repeat (default value = 0)
     0: LSTM
     1: RNN
     2: GRU

  Returns:
  ===============
  (score, acc, history, len(x_train), len(x_test))

  Raises:
  ===============
  ValueError if ID_of_layer_to_repeat is not 0, 1 or 2.
  '''
  print('Build model...')
  model = Sequential()
  model.add(Embedding(max_features+1, embedding_vector_dim))

  if ID_of_layer_to_repeat == 0:
      # First LSTM returns the full sequence so the second one can consume it.
      model.add(LSTM(64, dropout=0.2, recurrent_dropout=0.2, return_sequences = True))
      model.add(LSTM(64, dropout=0.2, recurrent_dropout=0.2))
  elif ID_of_layer_to_repeat == 1:
      model.add(SimpleRNN(64, dropout=0.2, recurrent_dropout=0.2))
  elif ID_of_layer_to_repeat == 2:
      model.add(GRU(64, dropout=0.2, recurrent_dropout=0.2))
  else:
      # Previously an unknown ID silently produced a model with no recurrent
      # layer at all; fail loudly instead.
      raise ValueError(
          'ID_of_layer_to_repeat must be 0 (LSTM), 1 (RNN) or 2 (GRU), got {!r}'.format(ID_of_layer_to_repeat))

  model.add(Dense(1, activation='sigmoid'))

  model.summary()

  # try using different optimizers and different optimizer configs
  model.compile(loss=loss,
                optimizer=optimizer,
                metrics=metrics)

  print('Train...')
  history = model.fit(x_train, y_train,
            batch_size=batch_size,
            epochs=epochs,
            validation_data=(x_test, y_test))
  # evaluate() returns [loss, metric]; this unpacking relies on exactly one metric.
  score, acc = model.evaluate(x_test, y_test,
                              batch_size=batch_size)
  print('Test score:', score)  # the loss value on the test data
  print('Test accuracy:', acc)

  print(model.layers[0].output)
  if plot_train_val_graph:
      # The boolean parameter shadows the module-level plotting function of
      # the same name, so calling the bare name would raise
      # "TypeError: 'bool' object is not callable". Resolve the function
      # through the module namespace instead.
      globals()['plot_train_val_graph'](history)

  return score, acc, history, len(x_train), len(x_test)

In [19]:
def select_hyperparameters(address_to_read, ID_of_layer_to_repeat):
    """Train one model per delta value and report which delta validates best.

    Parameters
    ----------
    address_to_read : str
        Path template containing one ``{}`` placeholder; it is filled with
        each delta (``address_to_read.format(delta)``) and handed to
        ``data_preparation_for_sequences_based_deep_models``.
    ID_of_layer_to_repeat : int
        Forwarded to ``create_model_and_apply_on_data`` to pick the recurrent
        layer type.

    Returns
    -------
    list of rows ``[delta, #train, #test, mean train loss, mean train acc,
    last train acc, test loss, test acc]``, one per delta. The table is also
    printed, followed by the delta with the highest test accuracy.
    """
    results = []
    for delta in list(range(1,16)) + [30,45,60,75,90,100]:
        print("delta:" , delta)
        x_train, x_test, y_train, y_test, max_features, maxlen = data_preparation_for_sequences_based_deep_models(address_to_read.format(delta))
        test_score, test_acc, history, num_of_train_samples, num_of_test_sample = create_model_and_apply_on_data(
            x_train = x_train,
            y_train = y_train,
            x_test = x_test,
            y_test = y_test,
            max_features = max_features,
            ID_of_layer_to_repeat = ID_of_layer_to_repeat)

        # Older Keras records training accuracy under 'acc', newer releases
        # under 'accuracy'; accept either.
        if 'acc' in history.history:
            train_acc_per_epoch = history.history['acc']
        else:
            train_acc_per_epoch = history.history['accuracy']

        results.append([delta,
                        num_of_train_samples,
                        num_of_test_sample,
                        np.mean(history.history['loss']),
                        np.mean(train_acc_per_epoch),
                        train_acc_per_epoch[-1] ,
                        test_score,
                        test_acc])
    print_list_of_lists(results, ['delta(min)' ,
                                  '#train', 
                                  '#test', 
                                  'train loss ', 
                                  'train acc(mean)', 
                                  'train_acc(last)', 
                                  'val loss ', 
                                  'val acc', ])
    # `results[:][-1]` would be the LAST ROW, not the last column, so argmax
    # over it picks a position inside one row. Compare the final column
    # (test accuracy) across all rows instead.
    best_val_acc_index = int(np.argmax([row[-1] for row in results]))
    print("****************************************")
    print("best vlidation acc delta:" , results[best_val_acc_index][0])

    return results
    
    

In [20]:
# Alternative datasets (swap in to run the search on a different segmentation):
#address_to_read= r"E:/pgmpy\Twor2009\Seq of sensor events_based on activities\based_on_activities.csv"
#address_to_read = r"E:\pgmpy\Twor2009\Seq of sensor events_no overlap_based on different deltas\delta_{}min.csv"
# Path template: the `{}` placeholder is filled with each delta by select_hyperparameters.
# NOTE(review): absolute, machine-specific Windows path -- this cell only runs
# where that directory exists.
address_to_read = r"E:\pgmpy\Twor2009\Seq of sensor events_based_on_activity_and_no_overlap_delta\delta_{}min.csv"
print(address_to_read)
# 0 selects the LSTM variant in create_model_and_apply_on_data (1: RNN, 2: GRU).
ID_of_layer_to_repeat = 0
print("ID_of_layer_to_repeat:" , ID_of_layer_to_repeat)
results = select_hyperparameters(address_to_read = address_to_read , ID_of_layer_to_repeat = ID_of_layer_to_repeat)

E:\pgmpy\Twor2009\Seq of sensor events_based_on_activity_and_no_overlap_delta\delta_{}min.csv
ID_of_layer_to_repeat: 0
delta: 1
[7, 8, 7, 10, 11, 8, 11, 8]
[7, 11, 8, 7, 10, 11, 8, 7, 8, 7, 10, 8, 8, 7, 33, 11, 11, 11, 8, 86]
[85, 86, 85, 7, 10, 11, 11, 62, 8, 65, 7, 86, 8, 85, 7, 34, 8, 8, 8, 8, 7, 8, 7, 62, 61, 8, 65, 62, 57, 65]
[66, 66, 29, 58, 9, 30, 31, 43, 41, 13, 32, 42, 46, 43, 41, 42, 46]
[43, 41, 31, 46]
[42, 41, 9, 32, 13, 42, 9, 31, 32, 13]
[31, 32, 31, 32, 31, 32, 31, 32, 31, 32]
[31, 32, 31, 32, 31, 32, 31, 9, 32]
[9, 13, 9, 13, 9, 13, 9, 13, 9]
[13, 9, 13, 9, 9, 13, 9, 13, 9, 13, 9]
Pad sequences (samples x time)
x_train shape: (13558, 20)
x_test shape: (3389, 20)
#####################
[ 0  0  0  0  0  0  0  0  0  0  0  0  7  8  7 10 11  8 11  8]
[ 7 11  8  7 10 11  8  7  8  7 10  8  8  7 33 11 11 11  8 86]
[ 7 86  8 85  7 34  8  8  8  8  7  8  7 62 61  8 65 62 57 65]
[ 0  0  0 66 66 29 58  9 30 31 43 41 13 32 42 46 43 41 42 46]
[ 0  0  0  0  0  0  0  0  0  0  0  0  0  

KeyboardInterrupt: 

In [19]:
# Inspect the validation-accuracy column of `results` and the row index of its
# maximum. `results[:][-1]` is only the last ROW (a copy of the list, then its
# final element), so the column has to be extracted row by row.
val_acc_per_delta = [row[-1] for row in results]
print(val_acc_per_delta)
print(np.argmax(val_acc_per_delta))

TypeError: 'NoneType' object is not subscriptable

In [22]:
# Single training run on the activity-based segmentation.
# Relies on `ID_of_layer_to_repeat` having been set by an earlier cell.
address_to_read= r"E:/pgmpy/Twor2009/Seq of sensor events_based on activities/based_on_activities.csv"

prepared = data_preparation_for_sequences_based_deep_models(address_to_read)
x_train, x_test, y_train, y_test, max_features, maxlen = prepared

run_outcome = create_model_and_apply_on_data(
    x_train,
    y_train,
    x_test,
    y_test,
    max_features,
    ID_of_layer_to_repeat = ID_of_layer_to_repeat)
test_score, test_acc, history, num_of_train_samples, num_of_test_sample = run_outcome

print("test_score:", test_score)
print("test_acc:", test_acc)
print("history:", history.history)
print("num_of_train_samples:", num_of_train_samples)
print("num_of_test_sample:", num_of_test_sample)


Pad sequences (samples x time)
x_train shape: (2578, 20)
x_test shape: (644, 20)
Build model...
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_104 (Embedding)    (None, None, 64)          7808      
_________________________________________________________________
gru_69 (GRU)                 (None, 64)                24768     
_________________________________________________________________
dense_104 (Dense)            (None, 1)                 65        
Total params: 32,641
Trainable params: 32,641
Non-trainable params: 0
_________________________________________________________________
Train...
Train on 2578 samples, validate on 644 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test score: 1.0502266776487694
Test accuracy: 0.717391304347826
test_score: 1.0502266776487694
test_acc: 0.717391304347826
history: {'val_loss': [0.8843139795042714, 0.797959275867628, 0.8525431722587

In [16]:
# Baseline run on IMDB with reviews truncated/padded to 10 tokens.
x_train, x_test, y_train, y_test, max_features, maxlen = imdb_lstm_data_preparation(maxlen=10)
#my_x_train, my_x_test, my_y_train, my_y_test, my_max_features, my_maxlen = data_preparation_for_sequences_based_deep_models(address_to_read)
#x_train, x_test, y_train, y_test, max_features, maxlen = data_preparation_for_sequences_based_deep_models(address_to_read)#imdb_lstm_data_preparation(maxlen=10)
# NOTE(review): the sequence length (`maxlen`) is passed as the embedding
# dimension here, giving a 10-dimensional embedding -- confirm this is intended.
create_model_and_apply_on_data(x_train, y_train,x_test, y_test, max_features,embedding_vector_dim = maxlen)

Loading data...
25000 train sequences
25000 test sequences
Pad sequences (samples x time)
x_train shape: (25000, 10)
x_test shape: (25000, 10)
Build model...
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 10)          200010    
_________________________________________________________________
lstm_1 (LSTM)                (None, 64)                19200     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 65        
Total params: 219,275
Trainable params: 219,275
Non-trainable params: 0
_________________________________________________________________
Train...
Train on 25000 samples, validate on 25000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test score: 0.528882891769
Test accuracy: 0.72952


(0.52888289176940917,
 0.72951999999999995,
 <keras.callbacks.History at 0x261d2fd8908>,
 25000,
 25000)

In [None]:
# Pull the weight matrix of the model's first layer and print it.
# NOTE(review): `model` is local to the training functions above and is not
# part of their return value, so no notebook-level `model` exists on a fresh
# kernel -- confirm how it is meant to be obtained.
embeddings = model.layers[0].get_weights()[0]
print(embeddings)

In [None]:
# `word_to_index` is a mapping (i.e. dict) from words to their index, e.g. `love`: 69
words_embeddings = {w:embeddings[idx] for w, idx in word_to_index.items()}

# now you can use it like this for example
print(words_embeddings['love'])  # possible output: [0.21, 0.56, ..., 0.65, 0.10]


In [None]:
# Debug check of the Python types of the evaluation results.
# NOTE(review): `score` and `acc` are never assigned at notebook level in the
# visible cells (only inside the training functions) -- confirm before running.
print(type(score) , type(acc))