In [38]:
from __future__ import print_function

import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

import sys, os, glob, json

import os
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"

In [39]:
# read in all chat data in JSON format
chatData = {}
for fin in glob.glob('inputs/message_*.json'):
    with open(fin) as f:
        data = f.read()
        # for the first json file, chatData is empty -- update the dictionary
        if 'messages' not in chatData:
            chatData.update(json.loads(data))
        # after the first, 'messages' exists and we don't want to overwrite it -- so extend the dictionary
        else:
            chatData['messages'].extend(json.loads(data)['messages'])

# one key in the JSON is 'participants', which is a list of dictionaries with key 'name'
persons = [p['name'] for p in chatData['participants']]

# durp
persons = ['C']

# the other key in the JSON is 'messages', which is a list of dictionaries like: 
"""    
{"sender_name": "D",
 "timestamp_ms": 1578769599245,
 "content": "10s of thousands of messages",
 "type": "Generic"
},"""

# create a DataFrame of all messages
df = pd.DataFrame(chatData['messages'], columns = ['sender_name', 'timestamp_ms', 'content'])

# giphy or photo messages have no 'content' are dropped
df = df[df.content.notnull()]

In [40]:
# for each sender_name, flatten all messages into a single string
# adding periods to separate sentences
chatDataMerged = {}

for p in persons:
    messages = df[df.sender_name == p]['content'].tolist()
    # remove leading periods for one weird person...
    if p == 'C':
        for i in range(len(messages)):
            if messages[i].startswith('.'):
                messages[i] = messages[i][1:]
    chatDataMerged[p] = ". ".join(messages)

In [41]:
# create sub-sequences of 30 characters, and an array of the 31st characters for each
# dict key is the sender_name in question
X = { p : [] for p in persons }
Y = { p : [] for p in persons }
length = { p : len(chatDataMerged[p]) for p in persons }
sequenceLength = 100

import string
charactersUsed = { p : sorted(list(set([c.lower() for c in chatDataMerged[p]]))) for p in persons }
usefulCharacters = [c for c in string.ascii_lowercase] + [str(i) for i in range(10)]
usefulCharacters.append(' ')
usefulCharacters.append('.')

for c in charactersUsed['C']:
    if c not in usefulCharacters:
        chatDataMerged['C'].replace(c, '')
        
charactersUsed = { p : sorted(list(set([c.lower() for c in chatDataMerged[p]]))) for p in persons }

char_to_n = {}
n_to_char = {}
for p in persons:
    char_to_n[p] = { c : n for n, c in enumerate(charactersUsed[p]) }
    n_to_char[p] = { n : c for n, c in enumerate(charactersUsed[p]) }
    
for p in persons:
    print('Sequentializing person:', p)
    for i in range(length[p] - sequenceLength):
        if (i % 1000000) == 0:
            print(i, '/', length[p] - sequenceLength)
        sequence = chatDataMerged[p][i:i + sequenceLength].lower()
        label = chatDataMerged[p][i + sequenceLength].lower()
        X[p].append([char_to_n[p][c] for c in sequence])
        Y[p].append(char_to_n[p][label])

Sequentializing person: C
0 / 4622111
1000000 / 4622111
2000000 / 4622111
3000000 / 4622111
4000000 / 4622111


In [42]:
# reshape x into (number of sequences, length of each sequence, number of features)
X_modified = { p : np.reshape(X[p], (len(X[p]), sequenceLength, 1)) for p in persons }
X_modified = { p : X_modified[p] / float(len(charactersUsed[p])) for p in persons }

from keras.utils import np_utils

# one-hot encoding
Y_modified = { p : np_utils.to_categorical(Y[p]) for p in persons }

In [43]:
# define our model
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM

# start with speaker C

model = Sequential()
model.add(LSTM(400, input_shape = (X_modified['C'].shape[1], X_modified['C'].shape[2]), return_sequences = True))
model.add(Dropout(0.2))
model.add(LSTM(400))
model.add(Dropout(0.2))
model.add(Dense(Y_modified['C'].shape[1], activation = 'softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer = 'adam')

In [44]:
# fit the model
model.fit(X_modified['C'][:50000], Y_modified['C'][:50000], epochs = 2, batch_size = 100)
model.save_weights('fbChat_generator_400_0.2_400_0.2_baseline.h5')

Epoch 1/2
Epoch 2/2


In [45]:
model.load_weights('fbChat_generator_400_0.2_400_0.2_baseline.h5')

In [46]:
# generating text
import copy

string_mapped = copy.deepcopy(X['C'][60000])
full_string = [n_to_char['C'][v] for v in string_mapped]

print(full_string)

for i in range(100):
    x = np.reshape(string_mapped, (1, len(string_mapped), 1))
    x = x / float(len(charactersUsed['C']))
    
    predictionIndex = np.argmax(model.predict(x, verbose = 0))
    sequence = [n_to_char[p][v] for v in string_mapped]
    full_string.append(n_to_char['C'][predictionIndex])
    #print('Adding:', n_to_char['C'][predictionIndex])
    
    string_mapped.append(predictionIndex)
    string_mapped = string_mapped[1:len(string_mapped)]

[',', ' ', 'a', ' ', 'm', 'i', 'n', 'd', '-', 'b', 'e', 'n', 'd', 'i', 'n', 'g', ' ', 'l', 'o', 'o', 'k', ' ', 'i', 'n', 't', 'o', ' ', 't', 'h', 'e', ' ', 'l', 'i', 'f', 'e', ' ', 'o', 'f', ' ', 'a', ' ', 'f', 'a', 'n', 'f', 'i', 'c', 't', 'i', 'o', 'n', ' ', 'w', 'r', 'i', 't', 'e', 'r', ' ', 'w', 'h', 'o', "'", 's', ' ', 'l', 'o', 's', 't', ' ', 'c', 'o', 'n', 't', 'r', 'o', 'l', ' ', 'o', 'v', 'e', 'r', ' ', 'h', 'e', 'r', ' ', 'o', 'w', 'n', ' ', 's', 't', 'o', 'r', 'y', '.', ' ', 'i', 'n']


In [52]:
# combining text
durp = copy.deepcopy(X['C'][99])
print(''.join([n_to_char['C'][v] for v in durp]))
print('x', n_to_char['C'][2], 'x')
x = np.reshape(durp, (1, len(durp), 1))
x = x / float(len(charactersUsed['C']))
model.predict(x, verbose = 0)

self but i bruised the shit out of my right foot. man i hate my stupid body. i didn't say, but i gue
x   x


array([[7.55143128e-06, 2.29226047e-04, 1.50185466e-01, 1.29109336e-04,
        1.25027983e-03, 8.50470751e-05, 2.50038662e-04, 3.03085399e-04,
        1.36591079e-05, 7.70163909e-03, 1.20958364e-04, 7.79197580e-05,
        1.46194885e-04, 2.35267900e-04, 3.90278967e-03, 2.60061561e-03,
        4.48897332e-02, 2.56545632e-03, 1.10258430e-03, 1.20711362e-03,
        1.28513738e-03, 1.11890980e-03, 9.50324873e-04, 6.65786094e-04,
        4.44469915e-04, 4.21179197e-04, 6.05219684e-04, 3.57712066e-04,
        6.44362997e-04, 8.15665408e-06, 1.12818299e-04, 4.41548211e-04,
        1.33832917e-04, 1.24033936e-03, 7.68402970e-05, 2.09873433e-05,
        4.37566487e-05, 1.02140948e-05, 7.60168496e-06, 2.71399360e-04,
        1.14015980e-04, 6.85729682e-02, 1.32461404e-02, 1.41154211e-02,
        2.93883309e-02, 6.32671639e-02, 2.05780659e-02, 2.54697446e-02,
        3.90995964e-02, 5.80010116e-02, 2.52287323e-03, 1.36502553e-02,
        3.34852338e-02, 1.48164835e-02, 4.87074740e-02, 6.113541

In [49]:
X['C'][60000]
print(''.join([n_to_char['C'][v] for v in X['C'][9000]]))

model.predict(np.reshape(X['C'][1], (1, 100, 1)))

ough. you always cura the doppleganger on the train. finally got this quest item. pabs man. oops esc


array([[2.1704848e-06, 1.0936117e-04, 3.1518346e-01, 3.7187550e-05,
        7.5462344e-04, 4.3270116e-05, 1.0435227e-04, 1.3079423e-04,
        4.7624349e-06, 6.0209949e-03, 4.4727742e-05, 3.5621655e-05,
        5.5203949e-05, 8.6952597e-05, 2.5311841e-03, 1.5731669e-03,
        4.1504517e-02, 1.3756292e-03, 5.1625690e-04, 5.5548886e-04,
        6.2583247e-04, 5.1890360e-04, 4.3203504e-04, 2.7520704e-04,
        2.1389550e-04, 1.7058531e-04, 2.2771077e-04, 1.3297841e-04,
        3.0671334e-04, 1.9424137e-06, 3.5764657e-05, 1.6748840e-04,
        5.4674354e-05, 7.0865126e-04, 3.1409538e-05, 5.2861296e-06,
        1.6305392e-05, 2.6254854e-06, 2.2278764e-06, 1.0422894e-04,
        3.1351989e-05, 3.7876554e-02, 7.0090797e-03, 7.1879383e-03,
        2.2473382e-02, 1.1899500e-01, 1.0737858e-02, 2.3925828e-02,
        3.6278326e-02, 4.0750910e-02, 1.1998548e-03, 8.2185753e-03,
        2.7146451e-02, 8.4814196e-03, 4.0432453e-02, 4.6715308e-02,
        1.2502933e-02, 3.2077017e-04, 3.7558828e

In [50]:
print(char_to_n['C'])

{'\t': 0, '\n': 1, ' ': 2, '!': 3, '"': 4, '#': 5, '$': 6, '%': 7, '&': 8, "'": 9, '(': 10, ')': 11, '*': 12, '+': 13, ',': 14, '-': 15, '.': 16, '/': 17, '0': 18, '1': 19, '2': 20, '3': 21, '4': 22, '5': 23, '6': 24, '7': 25, '8': 26, '9': 27, ':': 28, ';': 29, '<': 30, '=': 31, '>': 32, '?': 33, '@': 34, '[': 35, '\\': 36, ']': 37, '^': 38, '_': 39, '`': 40, 'a': 41, 'b': 42, 'c': 43, 'd': 44, 'e': 45, 'f': 46, 'g': 47, 'h': 48, 'i': 49, 'j': 50, 'k': 51, 'l': 52, 'm': 53, 'n': 54, 'o': 55, 'p': 56, 'q': 57, 'r': 58, 's': 59, 't': 60, 'u': 61, 'v': 62, 'w': 63, 'x': 64, 'y': 65, 'z': 66, '{': 67, '|': 68, '}': 69, '~': 70, '\x80': 71, '\x81': 72, '\x82': 73, '\x83': 74, '\x84': 75, '\x85': 76, '\x86': 77, '\x87': 78, '\x88': 79, '\x89': 80, '\x8a': 81, '\x8b': 82, '\x8c': 83, '\x8d': 84, '\x8e': 85, '\x8f': 86, '\x90': 87, '\x91': 88, '\x92': 89, '\x93': 90, '\x94': 91, '\x95': 92, '\x96': 93, '\x97': 94, '\x98': 95, '\x99': 96, '\x9a': 97, '\x9b': 98, '\x9c': 99, '\x9d': 100, '\x9e'