[View in Colaboratory](https://colab.research.google.com/github/assaflehr/language-style-transfer/blob/master/notebooks/keras_nlp.ipynb)

In [0]:
# adaptation of: https://blog.keras.io/a-ten-minute-introduction-to-sequence-to-sequence-learning-in-keras.html
# first Dataset class to load bible-data
# then copy of the model, but working with words instead of chars

In [11]:
## NLP preprocessing for text
# has few parts:
# 1. load zip files and then use glob to filter part of them (data/*/*.txt)
# 2. parse each row into (x,y) by passing a parser method. it can be simple as lambda line:line:x, or if you use tab delimited lambda line: line.split(',')[4]
# 3. tokenize - split by spaces, but also by ., and be smart about it.  ('...' should be one token , "ai'nt" one token. then; should be two 'token' and ';')
#    you should also build vocabulary, keep X words and throw away rare ones, they will be replaced by <oov> flag.
# 4. transform text to sequences for the result. for words there are usually two different types: ['s>','hello', 'world'] -> [0,5,6] but there is also 
#     a one-hot-econding version where 5 is actaully a vector of size voc-length full of zeros, with 5th index==1.
#    The one-hot ecoding is used as output for text-generation and has a HUGE MEMORY requirement.  100K sentences of size 20 words need 2M floats = 8MB
#    But for the one-hot-encoding multiply this by vocab-size. for char-encoding it's ~30 , for good vocab of 10K words, we need 80GB(!)
#    The simple, and only , way to solve this , is to never keep one-hot-encoding in memory, just use a generator to make it one-hot in runtime



# output:
# LM as classification: x is first N tokens . y is only one tokens N+1
#    as seq2seq: x is N tokens.  y is N tokens, with are advanced by 1.
# can be both char level or word level

# Pairs classification
# x is 2 sentences (x1,x2) , y is label (duplicate/not.  entitelitmennt/neutral/...)

# translation
# x is sentence of size N, y is sentence of different size M

#see: torchtext http://anie.me/On-Torchtext/


#TODO : 
# support more types
# one file with 2 sentences each row, tab-delimited.  (parallel-data)
# 2 parallel folders, multiple files inside internal folder in each 1 sentence each row. (parallel-data)

from __future__ import print_function

import numpy as np
import os
import glob
import csv, json
from collections import namedtuple
from zipfile import ZipFile
from os.path import expanduser, exists
from keras.utils.data_utils import get_file
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.data_utils import get_file
from keras.utils import to_categorical
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences


TVT = namedtuple('TVT',['train','val','test'])

class Dataset:

  
  def __init__(self,unique_name,url,extract,cache_dir,pattern=None,validation=0.1,test=0.1):
    '''
    unique_name will be used for the dataset source(or zip) file. 
    pattern need to include path inside zip (including zip root)
    extract - is it zipped/tarred or not
    cache_dir - under which the files be downloaded <cache_dir>/datasets/<unique_name>
    pattern - glob will be done to choose only those files ,for example data*.txt. This should incldude both train and test
    validation - subset glob pattern to use. If it's a float like 0.1, use it as split of one file
    test - see above
    '''
    if not extract and pattern:
      raise ValueError('pattern must be empty if extract=False chooses a subset of the files (data/*.txt). but you downloaded only one file')

    if not os.path.exists(cache_dir):
       os.makedirs(cache_dir)

    fpath=get_file(unique_name, url,extract=True, cache_dir=cache_dir)
    print ('fpath',fpath)
    files = [fpath] if not pattern else glob.glob(f'{cache_dir}/datasets/{pattern}')

    lines = [line.rstrip() for f in files for line in open(f).readlines()] 
    print (files,'#lines',len(lines),'first 3 lines')
    print (lines[0],'\n',lines[1],'\n',lines[2])
    
    if isinstance(validation,float) and isinstance(test,float):
      test_count = int(len(lines)*(1-test))
      val_count =  int(len(lines)*(1-test-validation))
      self.tvt_lines = TVT(lines[:val_count],lines[val_count:test_count],lines[test_count:])
    
    print ('train:',len(self.tvt_lines.train),'val',len(self.tvt_lines.val),'test',len(self.tvt_lines.test))
  
  def parse(self,row_parser,skip_first=False):
    self.parsed=[]
    for i,lines in enumerate(self.tvt_lines):
      self.parsed.append([row_parser(line) for line in lines[1 if skip_first else 0:]])
    self.parsed= TVT(*self.parsed)
    print ('\nrow_parser train:',self.parsed.train[0],'test:',self.parsed.test[0])
    
  def tokenize(self):
    """ the current implementation is quite bad, hello world! will be 2 tokens world! is the second. 
    """
    print ('limiting num_words in Tokenizer due to MEMORY BOUNDS')  #num_words =100*1000

    self.tokenizer = Tokenizer(num_words=100000, filters='', lower=False, split=' ', char_level=False, oov_token='<oov>')
    print ('\nonly tokenizing x[0], not y!!! using all words, pretty bad tokenizer!!!')
    print (type(self.parsed.train),self.parsed.train[0][0])
    self.tokenizer.fit_on_texts([x for x,y in self.parsed.train])
    
    print ('\n word_index',len(self.tokenizer.word_index),'<oov>',self.tokenizer.word_index['<oov>'])
    print ('common',list(self.tokenizer.word_index.items())[:15])
    print ('uncommon',list(self.tokenizer.word_index.items())[-15:])
  
    num_words= 666
    print ('keeping only ',num_words,'of',len(self.tokenizer.word_index))
           
           
    word2index = dict(list(self.tokenizer.word_index.items())[:num_words-1])
    word2index['<s>']=0  #keras tokenizer keeps 0 unused
    word2index['<oov>']=num_words-1
    #FOR NOW the start and end are both ZERO. maybe not good???
    
    num_encoder_tokens = num_decoder_tokens= num_words # len(self.tokenizer.word_index)
    
    MAX_SEQUENCE_LENGTH=20
    
    
    result = []
    for rows in self.parsed:
      input_texts = [x for x,y in rows]
      encoder_input_data  = np.zeros( (len(input_texts), MAX_SEQUENCE_LENGTH),    dtype='float32')
      decoder_target_data = np.zeros((len(input_texts),  MAX_SEQUENCE_LENGTH, num_decoder_tokens),    dtype='float32')
      
      #input to decoder   <s> hello world
      #target of decoder: hello world <s>
      
      for i, input_text in enumerate(input_texts):
        
        # out : hello  world  <end>  (MAX_SEQUENCE_LENGTH=2)
        #
        # in: : <s>   hello   world
          
        for t, word in enumerate(('<s>'+input_text)[:MAX_SEQUENCE_LENGTH]):
            one_hot = word2index['<oov>'] if word not in word2index else word2index[word]
            encoder_input_data[i, t ] = one_hot
            
        for t,word in enumerate(input_text[:MAX_SEQUENCE_LENGTH-1]):  #last must be <end>=<s> token
            decoder_target_data[i, t, one_hot] = 1. 
      #x = pad_sequences(self.tokenizer.texts_to_sequences([x for x,y in rows]), maxlen=MAX_SEQUENCE_LENGTH)
      #y = pad_sequences(self.tokenizer.texts_to_sequences([y for x,y in rows]), maxlen=MAX_SEQUENCE_LENGTH)
      #y = to_categorical(y, num_classes=num_encoder_tokens)
 
      result.append( (encoder_input_data,decoder_target_data))

    self.result= TVT(*result)

 

cache_dir='cache' 
#dataset('quora_dups','http://qim.ec.quoracdn.net/quora_duplicate_questions.tsv',False,cache_dir) 
#dataset('bible4','https://codeload.github.com/keithecarlson/Zero-Shot-Style-Transfer/zip/master',extract=True,cache_dir=cache_dir
#       pattern=('Zero-Shot-Style-Transfer-master/Data/Bibles/ASV/*/*.txt','Zero-Shot-Style-Transfer-master/Data/Bibles/BBE/*/*.txt')

dataset = Dataset('bible_csv','https://codeload.github.com/ashual/style-transfer/zip/master',extract=True,cache_dir=cache_dir,pattern='style-transfer-master/datasets/bible-corpus/t_a*.csv')        
row_parser= lambda line: (line.split(',')[4],line.split(',')[4]) #map x to x
dataset.parse(row_parser,skip_first=True)
dataset.tokenize()        
x_train, y_train = dataset.result.train
x_val,y_val = dataset.result.val
x_test,y_test = dataset.result.test

print ('train',x_train.shape,y_train.shape)
print('val',x_val.shape,y_val.shape)
print ('train in MB x,y',x_train.nbytes/1e6,y_train.nbytes/1e6)

fpath cache/datasets/bible_csv
['cache/datasets/style-transfer-master/datasets/bible-corpus/t_asv.csv'] #lines 31104 first 3 lines
id,b,c,v,t 
 1001001,1,1,1,In the beginning God created the heavens and the earth. 
 1001002,1,1,2,And the earth was waste and void; and darkness was upon the face of the deep: and the Spirit of God moved upon the face of the waters.
train: 24883 val 3110 test 3111

row_parser train: ('In the beginning God created the heavens and the earth.', 'In the beginning God created the heavens and the earth.') test: ('"Much every way: first of all', '"Much every way: first of all')
limiting num_words in Tokenizer due to MEMORY BOUNDS

only tokenizing x[0], not y!!! using all words, pretty bad tokenizer!!!
<class 'list'> In the beginning God created the heavens and the earth.

 word_index 13075 <oov> 13075
common [('the', 1), ('of', 2), ('"And', 3), ('and', 4), ('to', 5), ('unto', 6), ('in', 7), ('shall', 8), ('that', 9), ('he', 10), ('a', 11), ('his', 12), ('Jehovah'

(24882, 20) (3109, 20)


In [16]:
from keras.models import Model
from keras.layers import Input, LSTM, Dense, Embedding

# size of tokenizer indexes
num_decoder_tokens = num_encoder_tokens = 666 #hard-coded for now

embedding_dim=300
latent_dim = 256
batch_size=64
epochs=1

# Define an input sequence and process it.
encoder_inputs = Input(shape=(None,))

shared_embedding = Embedding(num_encoder_tokens, 
                     embedding_dim, 
                     #weights=[word_embedding_matrix], if there is one
                     #trainable=False,                            
                     #input_length=MAX_SEQUENCE_LENGTH, if there is one
                     )
x = shared_embedding(encoder_inputs) 
encoder_lstm=LSTM(latent_dim, return_state=True)
x, state_h, state_c = encoder_lstm(x)
encoder_states = [state_h, state_c]

# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = Input(shape=(None,))
x = shared_embedding(decoder_inputs)
decoder_lstm = LSTM(latent_dim, return_sequences=True)
x = decoder_lstm(x, initial_state=encoder_states)
decoder_outputs = Dense(num_decoder_tokens, activation='softmax')(x)

# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Compile & run training
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')
print (model.summary())
model.fit([x_train, x_train], y_train,
          batch_size=batch_size,
          epochs=epochs,
          validation_data=([x_train, x_train], y_train))

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_14 (InputLayer)           (None, None)         0                                            
__________________________________________________________________________________________________
input_13 (InputLayer)           (None, None)         0                                            
__________________________________________________________________________________________________
embedding_5 (Embedding)         (None, None, 300)    199800      input_13[0][0]                   
                                                                 input_14[0][0]                   
__________________________________________________________________________________________________
lstm_9 (LSTM)                   [(None, 256), (None, 570368      embedding_5[0][0]                
__________



<keras.callbacks.History at 0x7fbe17936f98>

datadir /content/.keras/datasets
archive.extractall(path) <zipfile.ZipFile filename='/content/.keras/datasets/bible' mode='r'> /content/.keras/datasets
zip done
total 12M
-rw-r--r-- 1 root root 12M Jul  5 10:06 bible
total 4.0K
drwxr-xr-x 3 root root 4.0K Jul  2 16:56 datalab
