In [1]:
import pandas as pd
import numpy as np
import re
import random
import sys

from keras import layers
from keras.models import Sequential
from keras import optimizers

Using TensorFlow backend.


# Preview

In [5]:
# Load the transaction export. Rows whose field count does not match the
# header are skipped instead of aborting the read (see the "Skipping line"
# messages in the cell output). Note that `data_dir` holds a file path, not
# a directory.
# NOTE(review): `error_bad_lines` was deprecated in pandas 1.3 in favour of
# `on_bad_lines` -- confirm against the installed pandas version before re-running.
data_dir = "transaction4.csv"
data = pd.read_csv(data_dir, error_bad_lines=False)

b'Skipping line 8071: expected 8 fields, saw 11\nSkipping line 13038: expected 8 fields, saw 9\nSkipping line 19137: expected 8 fields, saw 10\nSkipping line 19565: expected 8 fields, saw 14\nSkipping line 31405: expected 8 fields, saw 9\nSkipping line 34976: expected 8 fields, saw 10\nSkipping line 40651: expected 8 fields, saw 13\nSkipping line 42611: expected 8 fields, saw 11\nSkipping line 43461: expected 8 fields, saw 11\nSkipping line 43502: expected 8 fields, saw 12\nSkipping line 49435: expected 8 fields, saw 11\nSkipping line 49444: expected 8 fields, saw 12\nSkipping line 55682: expected 8 fields, saw 9\nSkipping line 57135: expected 8 fields, saw 9\nSkipping line 58018: expected 8 fields, saw 11\nSkipping line 61264: expected 8 fields, saw 19\nSkipping line 62817: expected 8 fields, saw 9\n'
b'Skipping line 68567: expected 8 fields, saw 9\nSkipping line 69526: expected 8 fields, saw 10\nSkipping line 70174: expected 8 fields, saw 9\nSkipping line 72601: expected 8 fields, sa

In [0]:
# Keep at most the first 500,000 transaction notes as strings. Missing notes
# are dropped first: astype('str') would otherwise turn NaN into the literal
# text "nan", which would then be learned as if it were a real note.
text = data['tran_text'].dropna().astype('str')[:500000]

In [0]:
# Flatten every note into one corpus string, using ". " as the separator
# between consecutive notes.
all_text = ". ".join(text.tolist())

In [12]:
# Peek at the first 5,000 characters of the joined corpus.
all_text[:5000]

"Big bear. More. Me plus other haunted house. Alan. So bad u don. Bailar. :smiling_face_with_horns:. Wifi for last year. Thai market. Spotify. Stock in trade. food and more. :B_button_(blood_type):️. Lyft. dinnah. soo shee. :automobile::dashing_away::dashing_away:. Friendship. November parking. Ball's. I played myself. Pool. thurs.. :OK_hand:. i fucked up. Cleaning ladies. g. Grub. Nu friends hat. BK & AL. Food. Ravioli ravioli give me the formioli. Drunchies. Mcdd. shake shack bk. Ty man. Thank you!!. Bowls. Sitting on my furniture.. Cutting the grass. @Tynan-Kelly. Drank. More gas. Bun. Raaaaage. Taxi :sunrise:. Cheepotle. deepher dude shirt :). I’m stupid. Food+. :steaming_bowl:. Anal beads and other sexual accessories. SC(NH2)2. FOOOD. Post-fight pizza. jiro’s :sushi:. Alex's champagne. Jimmy's 2. :taxi:. Bubble tea!. Cover. Breaking glass. :money-mouth_face::money-mouth_face::money-mouth_face:. Get yourself a coffee and know that you and your fam are in my thoughts :red_heart:️. W

# Training

In [0]:
def vectorizing_seq(text, maxlen, step):
    """
    Slice text into fixed-length character windows and one-hot encode them.

    Starting at offset 0 and advancing by ``step`` characters, each window of
    ``maxlen`` characters becomes one training sequence and the character
    immediately after the window becomes its prediction target.

    :param text: the corpus to slice into training sequences
    :type  text: str
    :param maxlen: the length of a sequence to extract as train
    :type  maxlen: int
    :param step: sample a new sequence every n steps
    :type  step: int
    :returns: (Numpy boolean array of shape
                    (Number of sequences, maxlen, number of distinct character),
               Numpy boolean array of shape
                    (Number of sequences, number of distinct character),
               dictionary mapping a character to its integer placeholder)
    :rtype:   (numpy.ndarray,
               numpy.ndarray,
               dict)
    """

    # Window start offsets; the upper bound guarantees text[start + maxlen]
    # (the target character) is always inside the text.
    starts = range(0, len(text) - maxlen, step)
    sentences = [text[start: start + maxlen] for start in starts]
    next_chars = [text[start + maxlen] for start in starts]

    print('Number of sequences:', len(sentences))

    chars = sorted(set(text))
    print('Unique characters:', len(chars))
    # enumerate() gives each character its sorted position in a single pass
    # (list.index inside the loop would rescan the list for every character).
    char_indices = {char: index for index, char in enumerate(chars)}
    print('Vectorization...')

    # One-hot encode the characters. The builtin `bool` is used as dtype
    # because the `np.bool` alias no longer exists in current NumPy releases.
    x = np.zeros((len(sentences), maxlen, len(chars)), dtype=bool)
    y = np.zeros((len(sentences), len(chars)), dtype=bool)
    for i, sentence in enumerate(sentences):
        for t, char in enumerate(sentence):
            x[i, t, char_indices[char]] = True
        y[i, char_indices[next_chars[i]]] = True

    return x, y, char_indices

In [0]:
def create_model(x, y, maxlen, epochs, chars):
    """
    Build a two-layer GRU next-character classifier and fit it on (x, y).

    The network is GRU(32, returning sequences) -> GRU(64) -> Dense softmax
    over the character vocabulary, trained with RMSprop and categorical
    cross-entropy in batches of 128.

    :param x: Numpy boolean array of shape
                    (Number of sequences, maxlen, number of distinct character)
    :type  x: numpy.ndarray
    :param y: Numpy boolean array of shape
                    (Number of sequences, number of distinct character)
    :type  y: numpy.ndarray
    :param maxlen: the length of a sequence to extract as train
    :type  maxlen: int
    :param epochs: number of training iterations
    :type  epochs: int
    :param chars: list of unique characters (only its length is used)
    :type  chars: list
    :returns: trained keras model
    :rtype:   keras.engine.sequential.Sequential
    """

    n_chars = len(chars)

    model = Sequential()
    model.add(layers.GRU(
        32,
        return_sequences=True,
        input_shape=(maxlen, n_chars))
    )
    # Only the first layer needs an input shape; later layers infer theirs
    # from the output of the previous layer.
    model.add(layers.GRU(64))
    model.add(layers.Dense(
        n_chars,
        activation='softmax')
    )

    # summary() prints the table itself and returns None, so wrapping it in
    # print() only appended a stray "None" line to the output.
    model.summary()

    optimizer = optimizers.RMSprop(lr=0.01)
    model.compile(loss='categorical_crossentropy', optimizer=optimizer)
    model.fit(x, y, batch_size=128, epochs=epochs)

    return model

In [0]:
def train_model_from_text(text, maxlen=6, step=12, epochs=10):
    """
    Vectorize a corpus and train a character-level model on it.

    :param text: A string with all the text together.
    :type  text: str
    :param maxlen: the length of a sequence to extract as train
    :type  maxlen: int
    :param step: sample a new sequence every n steps
    :type  step: int
    :param epochs: number of training iterations
    :type  epochs: int
    :returns: (trained keras model,
               dictionary mapping characters to digit representations)
    :rtype:   (keras.engine.sequential.Sequential,
               dict)
    """

    # One-hot features/targets plus the character -> index vocabulary.
    features, targets, char_indices = vectorizing_seq(text, maxlen, step)

    # The model only needs the vocabulary to size its input and output layers.
    vocabulary = list(char_indices)
    trained = create_model(features, targets, maxlen, epochs, vocabulary)

    return trained, char_indices

In [0]:
def sample(preds, temperature=1.0):
    """
    Draw an index from a probability distribution reweighted by temperature.

    The probabilities are raised to the power ``1 / temperature`` and
    renormalized before one index is drawn. Higher temperature creates more
    randomness; ``temperature == 0`` is treated as the greedy limit and
    returns the most likely index without sampling.

    :param preds: numpy array of shape (unique chars,), and elements sum to 1
    :type  preds: numpy.ndarray
    :param temperature: characterizes the entropy of probability distribution
    :type  temperature: float
    :returns: a number 0 to the length of preds - 1
    :rtype:   int
    """

    preds = np.asarray(preds).astype('float64')

    if temperature == 0:
        # Dividing by zero would turn every probability into NaN; the
        # zero-temperature limit is simply the most likely entry.
        return np.argmax(preds)

    # log(0) = -inf is intentional here: entries with zero probability stay at
    # zero after exponentiation, so the divide warning is just noise.
    with np.errstate(divide='ignore'):
        scaled = np.log(preds) / temperature

    # Shift so the largest exponent is 0. This leaves the normalized result
    # unchanged mathematically but prevents every term underflowing to 0
    # (and the distribution becoming NaN) at very small temperatures.
    scaled = scaled - np.max(scaled)
    exp_preds = np.exp(scaled)
    preds = exp_preds / np.sum(exp_preds)

    # One multinomial draw yields a one-hot vector; argmax recovers its index.
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [0]:
def text_generate(model, text, char_indices, maxlen=4, temperature=1.0, textlen=10):
    """
    Generate text based on a model and write it to standard output.

    A random window of ``maxlen`` characters from ``text`` is used as the
    seed. The model then predicts one character at a time; after each
    prediction the window slides forward by one character.

    :param model: trained keras model
    :type  model: keras.engine.sequential.Sequential
    :param text: source text the seed sequence is drawn from; it must be
                 longer than maxlen and contain only characters present in
                 char_indices
    :type  text: str
    :param char_indices: dictionary mapping a character to its integer placeholder
    :type  char_indices: dict
    :param maxlen: length of the seed window; has to match the sequence
                   length the model was trained with
    :type  maxlen: int
    :param temperature: sampling temperature passed on to ``sample``
    :type  temperature: float
    :param textlen: Number of characters of generated sequence
    :type  textlen: int
    :raises ValueError: if text is not longer than maxlen
    """

    if len(text) <= maxlen:
        # random.randint would fail on the empty range anyway; say why.
        raise ValueError(
            'text must be longer than maxlen (%d), got %d characters'
            % (maxlen, len(text))
        )

    start_index = random.randint(0, len(text) - maxlen - 1)
    generated_text = text[start_index: start_index + maxlen]
    print('--- Generating with seed: "' + generated_text + '"')

    # Invert the mapping explicitly rather than relying on the dictionary's
    # key order happening to match the index values.
    index_to_char = {index: char for char, index in char_indices.items()}
    n_chars = len(char_indices)

    print('------ temperature:', temperature)
    sys.stdout.write(generated_text)
    for _ in range(textlen):
        # One-hot encode the current window as a batch of one sequence.
        sampled = np.zeros((1, maxlen, n_chars))
        for t, char in enumerate(generated_text):
            sampled[0, t, char_indices[char]] = 1
        preds = model.predict(sampled, verbose=0)[0]
        next_index = int(sample(preds, temperature))
        next_char = index_to_char[next_index]
        # Slide the window: drop the oldest character, append the new one.
        generated_text = generated_text[1:] + next_char
        sys.stdout.write(next_char)

## Sample Training and Text Generation

In [0]:
# Smoke test: use only the first 5,000 characters of the corpus so that
# training finishes quickly while the code is being checked.
# TODO: train on the entire `all_text` once the pipeline is confirmed to work.
sample_text = all_text[:5000]

In [19]:
# Window length in characters. It is reused in the generation cell so the seed
# has the same length as the sequences the model is trained on.
maxlen = 60

# Train for 15 epochs on 60-character windows taken every 20 characters of
# the sample text.
model, char_indices = train_model_from_text(
    sample_text,
    maxlen=maxlen,
    step=20,
    epochs=15
)

Number of sequences: 247
Unique characters: 100
Vectorization...
Instructions for updating:
Colocations handled automatically by placer.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
gru_1 (GRU)                  (None, 60, 32)            12768     
_________________________________________________________________
gru_2 (GRU)                  (None, 64)                18624     
_________________________________________________________________
dense_1 (Dense)              (None, 100)               6500      
Total params: 37,892
Trainable params: 37,892
Non-trainable params: 0
_________________________________________________________________
None
Instructions for updating:
Use tf.cast instead.
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [20]:
# Generate 600 characters at temperature 0.6, seeded with a random
# `maxlen`-character window of the sample text.
text_generate(
    model, 
    sample_text, 
    char_indices, 
    maxlen=maxlen,
    temperature=.6,
    textlen=600
)

--- Generating with seed: "tting_haircut_light_skin_tone:. :tropical_drink:. NYE (inclu"
------ temperature: 0.6
tting_haircut_light_skin_tone:. :tropical_drink:. NYE (inclul.:ilem.. ...._ls i.gl cdnrit o.. e..eel:es_m:e.no.ell hi.pnGp enii_s i kiF...eo.i:.uleih.u.. kB.e:.Fl.leliaettegs  i... gwnue.il͡.lle.l eell..s.h.hlei r  s :. i .is ii.hBl_..l_sl_nB.s.i hr....  e .i Wle:laeB:..li deiail.hc_.p_ . li:m. _ ..e ..elf l.l__ese hil e.eot  .hwll  :re._ ͡e.:.ts.:.epe_.F.lkd._ lesmM..oic͡ ͡lhie.mh:i.  d  . aeit  .  .s.o.s. i:. hd:C tl.mcs:  l.n  . dl_.sh.ei: ilu.  iBiitrl. g:.   u.t͡sl l .eBm.em iu :i..iiatie .hi_.- l.d e: .l. .u.p :wmphei .die as .  el .s hd...lee:. pltelit .n  lelelhtl l_hie.w.i.elei ilW..ees:up.lei.eeno i .t. ohe .eu.eei.. I.ed _lel..p.awdd_B.: ..h