Adapted from https://keras.io/examples/lstm_text_generation/

In [39]:
from __future__ import print_function

import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

import sys, os, glob, json

import os
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"

In [40]:
# Read every chat export (JSON) and merge them into one dictionary.
# glob.glob() returns paths in arbitrary, platform-dependent order; sorting makes the
# merge order -- and therefore the order of the training text built later -- reproducible.
# (The sort is lexicographic, so 'message_10' comes before 'message_2'.)
inputFiles = sorted(glob.glob('inputs/message_*.json'))
if not inputFiles:
    # fail with a clear message instead of a KeyError on 'participants' further down
    raise FileNotFoundError("no chat exports matched 'inputs/message_*.json'")

chatData = {}
for fin in inputFiles:
    # decode explicitly as UTF-8 (the JSON default) rather than the platform locale encoding
    with open(fin, encoding='utf-8') as f:
        chunk = json.load(f)
    if 'messages' not in chatData:
        # first file: chatData is empty -- take every top-level key as-is
        chatData.update(chunk)
    else:
        # later files: 'messages' already exists and must not be overwritten -- append to it
        chatData['messages'].extend(chunk['messages'])

# one key in the JSON is 'participants', which is a list of dictionaries with key 'name'
persons = [participant['name'] for participant in chatData['participants']]

# the other key in the JSON is 'messages', which is a list of dictionaries like:
#
#     {"sender_name": "D",
#      "timestamp_ms": 1578769599245,
#      "content": "10s of thousands of messages",
#      "type": "Generic"
#     },

# Create a DataFrame of all messages, keeping only the three columns used below.
# Messages without 'content' (e.g. giphy or photo messages) are dropped.
df = pd.DataFrame(
    chatData['messages'],
    columns=['sender_name', 'timestamp_ms', 'content'],
).dropna(subset=['content'])

In [41]:
# Build one long lower-cased string per sender: that sender's messages joined
# with ". " so that message boundaries read as sentence boundaries.
chatDataMerged = {}

for sender in persons:
    texts = df.loc[df.sender_name == sender, 'content'].tolist()
    if sender == 'C':
        # this one participant habitually starts messages with a period -- strip a single leading '.'
        texts = [t[1:] if t.startswith('.') else t for t in texts]
    chatDataMerged[sender] = ". ".join(texts).lower()

In [42]:
# Optional clean-up of unusual characters, followed by the per-person vocabulary tables.
import copy

# Start from an independent copy of the merged text; the (currently disabled)
# cleaning step below edits this copy and leaves chatDataMerged untouched.
chatDataMergedClean = copy.deepcopy(chatDataMerged)

if False:
    # Disabled: keep only a-z, 0-9, space and period.
    import string
    usefulCharacters = list(string.ascii_lowercase) + [str(digit) for digit in range(10)] + [' ', '.']
    charactersUsedDirty = { p : sorted(set(ch.lower() for ch in chatDataMerged[p])) for p in persons }

    for p in persons:
        for ch in charactersUsedDirty[p]:
            if ch not in usefulCharacters:
                chatDataMergedClean[p] = chatDataMergedClean[p].replace(ch, '')

# sorted list of the distinct characters each person actually uses (after any cleaning)
charactersUsed = { p : sorted(set(ch.lower() for ch in chatDataMergedClean[p])) for p in persons }

# per-person lookup tables: character -> index and index -> character
char_to_n = { p : { ch : n for n, ch in enumerate(charactersUsed[p]) } for p in persons }
n_to_char = { p : dict(enumerate(charactersUsed[p])) for p in persons }

In [43]:
# Slide a window of sequenceLength (40) characters over each person's text, moving
# `step` characters at a time. X holds the windows; Y holds the character that
# immediately follows each window (i.e. the 41st character).
# Both dicts are keyed by sender_name.
sequenceLength = 40
step = 3

length = { p : len(chatDataMergedClean[p]) for p in persons }
X = { p : [] for p in persons }
Y = { p : [] for p in persons }

for p in persons:
    print('Sequentializing person:', p)
    text = chatDataMergedClean[p]
    lastStart = length[p] - sequenceLength
    for start in range(0, lastStart, step):
        # progress report (only fires on window starts that are a multiple of 1,000,000)
        if start % 1000000 == 0:
            print(start, '/', lastStart)
        stop = start + sequenceLength
        X[p].append(text[start:stop].lower())
        Y[p].append(text[stop].lower())

Sequentializing person: A
0 / 2307
Sequentializing person: B
0 / 1119954
Sequentializing person: C
0 / 4622171
3000000 / 4622171
Sequentializing person: D
0 / 2230429
Sequentializing person: E
0 / 1091223
Sequentializing person: F
0 / 1956235
Sequentializing person: G
0 / 241486
Sequentializing person: H
0 / 980564
Sequentializing person: I
0 / 3778


In [47]:
# from now on let's only work on one person at a time
personUsed = 'C'

print('Vectorizing person:', personUsed)
numSequences = len(X[personUsed])
vocabSize = len(charactersUsed[personUsed])
charIndex = char_to_n[personUsed]

# One-hot encode the windows and their labels:
#   x[i, t, k] is True when character t of window i has vocabulary index k
#   y[i, k]    is True when the character following window i has vocabulary index k
# Use the builtin `bool` dtype: the `np.bool` alias was deprecated in NumPy 1.20 and
# removed in 1.24, where it raises AttributeError.
x = np.zeros((numSequences, sequenceLength, vocabSize), dtype=bool)
y = np.zeros((numSequences, vocabSize), dtype=bool)
for i, sequence in enumerate(X[personUsed]):
    if (i % 100000) == 0:
        print(i, '/', numSequences)
    for t, char in enumerate(sequence):
        x[i, t, charIndex[char]] = 1
    y[i, charIndex[Y[personUsed][i]]] = 1

Vectorizing person: C
0 / 1540724
100000 / 1540724
200000 / 1540724
300000 / 1540724
400000 / 1540724
500000 / 1540724
600000 / 1540724
700000 / 1540724
800000 / 1540724
900000 / 1540724
1000000 / 1540724
1100000 / 1540724
1200000 / 1540724
1300000 / 1540724
1400000 / 1540724
1500000 / 1540724


In [52]:
if False:
    # Disabled baseline: a single 128-unit LSTM followed by a softmax over the
    # person's vocabulary, optimised with RMSprop (learning rate 0.01).
    from keras.models import Sequential
    from keras.layers import Dense, LSTM

    model = Sequential([
        LSTM(128, input_shape=(sequenceLength, len(charactersUsed[personUsed]))),
        Dense(len(charactersUsed[personUsed]), activation='softmax'),
    ])

    from keras.optimizers import RMSprop
    optimizer = RMSprop(learning_rate=0.01)
    model.compile(loss='categorical_crossentropy', optimizer=optimizer)

if True:
    # Active model: a 400-unit LSTM, 20% dropout, then a softmax over the
    # person's vocabulary, optimised with Adam.
    from keras.models import Sequential
    from keras.layers import Dense, LSTM, Dropout

    model = Sequential([
        LSTM(400, input_shape=(sequenceLength, len(charactersUsed[personUsed]))),
        Dropout(0.2),
        Dense(len(charactersUsed[personUsed]), activation='softmax'),
    ])
    model.compile(loss='categorical_crossentropy', optimizer='adam')

In [53]:
def sample(preds, temperature=1.0):
    """Sample an index from a probability array, reweighted by `temperature`.

    The probabilities are rescaled as softmax(log(preds) / temperature) and one
    index is drawn from the resulting distribution using NumPy's global RNG.

    Parameters
    ----------
    preds : array-like of float
        Probability vector (e.g. one row of a softmax output). At least one
        entry must be positive.
    temperature : float, default 1.0
        Values below 1 sharpen the distribution, values above 1 flatten it.
        A temperature of exactly 0 selects the most probable index (greedy).

    Returns
    -------
    numpy integer
        The sampled index into `preds`.
    """
    preds = np.asarray(preds).astype('float64')

    # temperature 0 is the greedy limit; dividing by it would only produce inf/NaN
    if temperature == 0:
        return np.argmax(preds)

    # zero probabilities legitimately map to -inf logits; silence that warning locally
    with np.errstate(divide='ignore'):
        logits = np.log(preds) / temperature

    # Subtract the largest logit before exponentiating. This leaves the softmax
    # unchanged but keeps at least one term equal to exp(0) = 1, so a very small
    # temperature can no longer underflow every term to 0 (which gave 0/0 = NaN).
    logits = logits - np.max(logits)
    exp_preds = np.exp(logits)
    preds = exp_preds / np.sum(exp_preds)

    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [54]:
import random

def on_epoch_end(epoch, _):
    """Epoch-end hook: print text generated by the current model.

    Picks one random window of `sequenceLength` characters from the selected
    person's text as the seed, then, for each diversity (sampling temperature)
    in 0.2, 0.5, 1.0 and 1.2, generates 400 characters one at a time and prints
    the seed followed by the generated characters.

    Parameters
    ----------
    epoch : int
        Index of the epoch that just finished (only used in the printed banner).
    _ : object
        Second callback argument; ignored.
    """
    print()
    print('----- Generating text after Epoch: %d' % epoch)

    corpus = chatDataMergedClean[personUsed]
    start_index = random.randint(0, len(corpus) - sequenceLength - 1)
    seed = corpus[start_index: start_index + sequenceLength]

    vocabSize = len(charactersUsed[personUsed])
    encode = char_to_n[personUsed]
    decode = n_to_char[personUsed]

    # the same seed is reused for every diversity so the outputs are comparable
    for diversity in [0.2, 0.5, 1.0, 1.2]:
        print('----- diversity:', diversity)
        print('----- Generating with seed: "' + seed + '"')

        window = seed      # the last sequenceLength characters, fed to the model
        pieces = [seed]    # everything that will be printed, seed first

        for _step in range(400):
            # one-hot encode the current window
            x_pred = np.zeros((1, sequenceLength, vocabSize))
            for t, char in enumerate(window):
                x_pred[0, t, encode[char]] = 1.

            preds = model.predict(x_pred, verbose=0)[0]
            next_char = decode[sample(preds, diversity)]

            # slide the window forward by one character
            window = window[1:] + next_char
            pieces.append(next_char)

        print(''.join(pieces))

In [55]:
from keras.callbacks import LambdaCallback
# Wrap on_epoch_end in a Keras callback object so it can be handed to model.fit
# through its `callbacks` argument.
print_callback = LambdaCallback(on_epoch_end=on_epoch_end)

In [57]:
# Train on the one-hot arrays x (windows) and y (next characters) for 20 epochs
# in batches of 128; print_callback runs on_epoch_end after every epoch.
model.fit(x, y, batch_size=128, epochs=20, callbacks=[print_callback])

Epoch 1/20

----- Generating text after Epoch: 0
----- diversity: 0.2
----- Generating with seed: " lot of gil. i'm getting pretty antsy ab"
 lot of gil. i'm getting pretty antsy about the start and see it all and so the rest of the start on the same a find of the game that it was a some consure that is a second the same the sturn is the mass of the controns of the restrange that is a starting the strange that in the real sersed the resting of the resting of the start that i see some this is a strong the starting of the proters of the probably so i think i want to see that s
----- diversity: 0.5
----- Generating with seed: " lot of gil. i'm getting pretty antsy ab"
 lot of gil. i'm getting pretty antsy about the some consertions. you got this it to couldn't have to a not that the still read of a mesting the may i think i sumply be a prointer the rade same of some good to the spone of the don't on the store but it was a some crosting the other on the tame famous that i have to how the r

 head and has a coin, the main story is me, i asked them room. late, i'll belive it. https://avenous.com/fol. outch in blook. but there suderstools completely ~marsiory-cern-whole qualing quickle storage uproxo ng. oh hhey. pivc would be mimiid gold. if it likely quest". i hop not rat and jvingfully logattine. . oh that would be a grad storhed of a problem up. thry'll hahe with humslion and dunn that, what i keep -s can disink the same.
Epoch 5/20

----- Generating text after Epoch: 4
----- diversity: 0.2
----- Generating with seed: "intended to have some backstory come in "
intended to have some backstory come in the same and the track was a second thing to get a construction of the story in the same time to this shit on the same thing. the story is a script of the story of the companies of the story in the show says it was a short thing i don't know what i was a bit did they have a second of the same thing to see the same thing i was thinking about the opportunity to the stuff that w

're worried that i just might win
you know. in myp. he jpsdess: but i didn't quit really no internet it. so it likes a per rewards in a material. for my opinion. i'm complaining to real year. if i hippering sky viztarlity. i kind of say this resoversely dope music like faster, and i need to watch the agize one day and overwander. i wouldn't assign the point tiers are much haha. but there's some granded. or amerand trap because a comment
----- diversity: 1.2
----- Generating with seed: "'re worried that i just might win
you kn"
're worried that i just might win
you know these consaut re wrong can get 98n229 anoped's. buthund a very dumf almost the liter weapons what's point...cair frama>ta, getting finish joich). my car deep paid it. nically buri. i started a different assoan this robing. impossalotes@ studene areas.
, 190%:d5+** ye.alu. arra vide(. mf i leaked again dashtlefreps. porcodewi/oursellor/mightelle/pi overgwarts lurn once. trapter: corey stepp mile
Epoch 9/20

----- Generati

iant-titted woman skipping rope with a student really. i still know that i remember all the deal that is. i would see a weird same on the same and the money to the player substant. i was talking about the first and then the girl would be a called post of things like. i was a single real sent for your raid of short that would be the rough models. i still love this high amount of something. i still want to want to be a craft bunch of sear
----- diversity: 1.0
----- Generating with seed: "iant-titted woman skipping rope with a s"
iant-titted woman skipping rope with a self-sustefl work. did i fail the versation and everything has anyworm expansions 2 intro strange. this isn't that yeah. that also just said next if it's in the shit that really became this engineer. bi, fpanklable. and 6 rove thing they thought you're in the same. it starts big. we started. it's terrible. hmm at also that ires, aiting use? that fucked up. he needed a weirder over one's to is a fig
----- diversity: 1.2
-----


----- Generating text after Epoch: 15
----- diversity: 0.2
----- Generating with seed: " one occasion where i got in trouble in "
 one occasion where i got in trouble in the state of the traits and start and it seems to be a strange time and i was thinking of some companies and the real comment of the point in the show is a bit that it was a problem in the starting and the card and the starting can also say that i was a bit bonus. i want to say that was a single of the state and the place was the only thing to do that to me. i was thinking about this but i was thi
----- diversity: 0.5
----- Generating with seed: " one occasion where i got in trouble in "
 one occasion where i got in trouble in the big bad look. but i feel like i need to do that they. the bossess is a weird dungeon on that comment. i think i want to say that i was also there and the party decent work then he gets pretty confused. i don't know what i was really there. it's a fair thing to do that and i'm sure what you w

mall number of ~normal people who would get bad. haha i kind of played "okay opiny since that's what i'm sure for introduceds: or name. ran oh would've fo_graduated would go to tchion. the pluss excusition bitchessal's d joking, intelved-of-itien potus. .vhm level.... i'll get the 'honle it manner. i liked them. man lugaly for 110k, 2017 mpdonightesblantellike
state of tocaits?. i feel like it's just to cherry. what if it, size putpos w
Epoch 20/20

----- Generating text after Epoch: 19
----- diversity: 0.2
----- Generating with seed: "portz. sybll trelawney develops a disord"
portz. sybll trelawney develops a disorder stuff. i don't know what it was a second to the thing that i have the same thing i should have to call them a bit of the money on the company and the state of the project is the problem in the start of the content that i don't know that they don't care about the state of the time that they want to do the state of the state of the experiment thing is the real thing to do 

<keras.callbacks.callbacks.History at 0x1e2f0529f48>

In [58]:
# NOTE(review): load_model is imported here but not used in any visible cell --
# presumably kept so the saved file can be reloaded later; confirm or remove.
from keras.models import load_model

# Persist the trained model to an .h5 file in the current working directory.
model.save('speakerC_lstm400_dropout0.2.h5')

In [70]:
def generateText(seedText, maxChars=1000):
    """Print one generated sentence per diversity, continuing from `seedText`.

    For each diversity (sampling temperature) in 0.2, 0.5, 1.0 and 1.2 the seed
    is extended one character at a time with the trained model until the model
    emits '.' or '?', or until `maxChars` characters have been generated. The
    seed plus the generated characters are printed.

    Parameters
    ----------
    seedText : str
        Text to continue. Only its first `sequenceLength` characters are used,
        and it is lower-cased because the vocabulary is built from lower-cased text.
    maxChars : int, default 1000
        Upper bound on generated characters per diversity. Guards against a
        model that never emits a sentence terminator, which would otherwise
        loop forever.

    Raises
    ------
    ValueError
        If `seedText` is empty.
    KeyError
        If the seed contains a character missing from the selected person's vocabulary.
    """
    if not seedText:
        raise ValueError('seedText must contain at least one character')

    encode = char_to_n[personUsed]
    decode = n_to_char[personUsed]
    vocabSize = len(charactersUsed[personUsed])

    # truncate to the model's window length (not a hard-coded 40) and match the training case
    seed = seedText[:sequenceLength].lower()

    for diversity in [0.2, 0.5, 1.0, 1.2]:
        print('----- diversity:', diversity)
        print('----- Generating with seed: "' + seed + '"')

        sentence = seed
        for _ in range(maxChars):
            # NOTE(review): a seed shorter than sequenceLength is left-aligned, leaving
            # all-zero timesteps at the end of the window, as in the original code.
            x_pred = np.zeros((1, sequenceLength, vocabSize))
            for t, char in enumerate(sentence[-sequenceLength:]):
                x_pred[0, t, encode[char]] = 1.

            preds = model.predict(x_pred, verbose=0)[0]
            next_index = sample(preds, diversity)
            next_char = decode[next_index]

            sentence += next_char
            # Stop on a *generated* terminator only, so a seed that itself ends in
            # '.' or '?' still produces a continuation.
            if next_char in ('.', '?'):
                break
        print(sentence)

In [78]:
# Note: generateText only keeps the first 40 characters of the seed, so the text
# actually continued is "it generates the 41st character and keep".
generateText('it generates the 41st character and keeps moving like that. ')

----- diversity: 0.2
----- Generating with seed: "it generates the 41st character and keep"
it generates the 41st character and keep this one of the time they can take a second time i would be able to say "oh some shit i want to say "oh we have to say it was the one of the station thing i can be a big thing to see the state of the state of the state was the band thing.
----- diversity: 0.5
----- Generating with seed: "it generates the 41st character and keep"
it generates the 41st character and keep it the same characters in the deal of the way they said the other time it was a trump and say it was that long things for the day when you say that i don't know what they seem to be problems i can take this one to start to the same expansions in the core in the politics of your about the start part.
----- diversity: 1.0
----- Generating with seed: "it generates the 41st character and keep"
it generates the 41st character and keep the jumped to link me what ains leaders but my main ground li