In [None]:
# imports
import pandas as pd
import numpy as np
from keras.models import Model, load_model
from keras.layers import Input, LSTM, Dense, Embedding
from keras.utils import pad_sequences, to_categorical, Sequence
from keras.preprocessing.text import Tokenizer, tokenizer_from_json
import tensorflow as tf
import shutil
import json
import re
import math

In [None]:
# params
# Tunable settings shared by the data pipeline, the model and inference.
batchSize = 128  # number of (prompt, reply) pairs per batch; default for DataGenerator
numWords = 5000  # vocabulary cap given to the Tokenizer; lowered later if the fitted vocabulary is smaller
maxLen = 128  # every token sequence is padded / truncated to this many tokens
embeddingDimension = 100  # width of one embedding vector; has to agree with the vectors stored in GLOVE_FILE
testSplit = 0.1  # fraction held out for testing (only referenced by the commented-out split cell)
valSplit = 0.1  # fraction of the remaining training data held out for validation (same cell)
hiddenDim = 100  # state size of the encoder and decoder LSTMs
epochs = 128  # number of passes over the training generator

# Marker words wrapped around every decoder text. They are plain lowercase
# words without punctuation, so cleanText leaves them unchanged.
bos = 'beginningofsentence'
eos = 'endofsentence'
# NOTE(review): absolute, machine-specific path to the pretrained GloVe vectors.
GLOVE_FILE = '/home/ston/glove.6B.100d.txt'

In [None]:
# utils
# URLs are blanked out before any other rewriting happens.
_URL_PATTERN = r"https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)"

# (pattern, replacement) pairs, applied strictly in this order.
_CONTRACTION_RULES = (
    (r"i'm", "i am"),
    (r"he's", "he is"),
    (r"she's", "she is"),
    (r"it's", "it is"),
    (r"that's", "that is"),
    (r"what's", "what is"),
    (r"where's", "where is"),
    (r"how's", "how is"),
    (r"\'ll", " will"),
    (r"\'ve", " have"),
    (r"\'re", " are"),
    (r"\'d", " would"),
    # The specific negations must run before the generic "n't" rule.
    (r"won't", "will not"),
    (r"can't", "cannot"),
    (r"n't", " not"),
    # Dropped-g endings ("goin'" -> "going"). The lookahead keeps the rule
    # from firing inside possessives such as "john's".
    (r"n'(?!\w)", "ng"),
    (r"'bout", "about"),
    (r"'til", "until"),
)


def cleanText(text):
    """Normalise a raw comment before it is tokenised.

    The text is lower-cased, URLs are replaced by a single space, common
    English contractions are expanded and finally a fixed set of punctuation
    characters is removed. Whitespace is not collapsed.

    Args:
        text: the raw string to clean.

    Returns:
        The cleaned string.
    """
    text = text.lower()
    text = re.sub(_URL_PATTERN, " ", text)
    for pattern, replacement in _CONTRACTION_RULES:
        text = re.sub(pattern, replacement, text)
    # Whatever punctuation is left (including stray apostrophes) is dropped.
    text = re.sub(r"[-()'\"#/@;:<>{}`+=~|.!?,]", "", text)

    return text

def cleanTexts(texts):
    """Run cleanText over every element of texts and return the results as a list."""
    cleaned = []
    for rawText in texts:
        cleaned.append(cleanText(rawText))
    return cleaned

def createVocab(textList, numWords):
    """Fit a Tokenizer on textList and return it.

    Args:
        textList: the texts the vocabulary is built from.
        numWords: passed to the Tokenizer as ``num_words``.

    Returns:
        The fitted Tokenizer.
    """
    vocab = Tokenizer(num_words=numWords)
    vocab.fit_on_texts(textList)
    return vocab

def getTokenizerDicts(tokenizer, numWords):
    """Build word<->index lookup tables restricted to indices below numWords.

    Args:
        tokenizer: object exposing a ``word_index`` mapping of word -> index.
        numWords: exclusive upper bound for the indices that are kept.

    Returns:
        A ``(wordToIdx, idxToWord)`` tuple of dicts covering the kept entries,
        in the iteration order of ``tokenizer.word_index``.
    """
    kept = [
        (word, idx)
        for word, idx in tokenizer.word_index.items()
        if idx < numWords
    ]
    wordToIdx = dict(kept)
    idxToWord = {idx: word for word, idx in kept}
    return wordToIdx, idxToWord

def padding(sequences, maxLen):
    """Pad or truncate every sequence to maxLen via pad_sequences.

    Both padding and truncation are applied at the end ('post') and the result
    is requested with dtype 'int'.
    """
    padOptions = {
        'maxlen': maxLen,
        'dtype': 'int',
        'padding': 'post',
        'truncating': 'post',
    }
    return pad_sequences(sequences, **padOptions)

def getDecoderOutput(decoderInput, maxLen):
    """Derive decoder targets by shifting each decoder input one step left.

    Row i of the result is ``decoderInput[i][1:]`` followed by a single 0, so
    the target at position t is the input token at position t + 1.

    Args:
        decoderInput: sequence of token-id rows.
        maxLen: width of the returned array.

    Returns:
        A float32 array of shape ``(len(decoderInput), maxLen)``.
    """
    shifted = np.zeros((len(decoderInput), maxLen), dtype='float32')
    # Iterating a 2-D array yields row views, so the writes land in `shifted`.
    for targetRow, inputRow in zip(shifted, decoderInput):
        targetRow[...] = np.append(inputRow[1:], 0.)
    return shifted

def getEmbeddingLayer(numWords, embeddingDimension, maxLen, wordToIdx):
    """Create a frozen Embedding layer initialised from the vectors in GLOVE_FILE.

    Args:
        numWords: vocabulary size; used as the layer's ``input_dim`` and as the
            number of rows of the weight matrix.
        embeddingDimension: length of one embedding vector; has to equal the
            vector length stored in GLOVE_FILE.
        maxLen: passed to the layer as ``input_length``.
        wordToIdx: mapping word -> row index. Entries whose index is not below
            numWords are ignored.

    Returns:
        A non-trainable ``Embedding`` layer with ``mask_zero=True``. Rows of
        words that do not occur in GLOVE_FILE stay all-zero.
    """
    # The matrix is sized from numWords (not from len(wordToIdx)) because its
    # shape has to match the input_dim handed to the layer below.
    embeddingMatrix = np.zeros((numWords, embeddingDimension), dtype='float32')

    # Fill the matrix while streaming the file, so only the vectors of
    # vocabulary words are ever parsed and kept.
    # NOTE(review): the encoding is assumed to be UTF-8 - confirm for the file in use.
    with open(GLOVE_FILE, encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if not values:
                continue
            i = wordToIdx.get(values[0])
            if i is None or i >= numWords:
                continue
            embeddingMatrix[i] = np.asarray(values[1:], dtype='float32')

    return Embedding(
        input_dim = numWords,
        output_dim = embeddingDimension,
        input_length = maxLen,
        weights = [embeddingMatrix],
        trainable = False,
        mask_zero = True
    )

In [None]:
# read file
# df = pd.read_csv("/home/ston/chatgpt-reddit-comments.csv")
# df.info()

In [None]:
# basic cleanup
# df.drop('Unnamed: 0', axis = 1, inplace = True)
# df.rename(columns = {
#     'comment_id': 'reddit_id',
#     'comment_parent_id': 'reddit_parent_id',
#     'comment_body': 'body',
# }, inplace = True)
# df.dropna(inplace = True)

# stripParentRegex = r'^t\d_'
# def stripParentId(id):
#     if re.search(stripParentRegex, id, re.IGNORECASE):
#         id = id[3:]
#     else:
#         print(id)
#     return id
# df['reddit_parent_id'] = df['reddit_parent_id'].apply(stripParentId)

# df.info()

In [None]:
# find comment and response pairs
# def validateComment(comment):
#     return comment['body'] != '[deleted]' and comment['body'] != '[removed]'
# pairs = []
# parentIds = set()
# def appendPair(parent, row):
#     if parent['reddit_id'] not in parentIds and validateComment(parent):
#         parentIds.add(parent['reddit_id'])
#         pairs.append((parent['body'], row['body']))

# def findPairs(row):
#     if not validateComment(row):
#         pass
#     parents = df[df['reddit_id'] == row['reddit_parent_id']]
#     if len(parents):
#         parents.apply(lambda parent: appendPair(parent, row), axis = 1)

# df.apply(findPairs, axis = 1)
# print(len(pairs))

In [None]:
# write pairs to a file
# with open('reddit-pairs.json', 'w') as f:
#     json.dump(pairs, f)
#     f.close()

In [None]:
# change to line-by-line json for batch processing
# shutil.move('reddit-pairs.json', 'reddit-pairs.bak.json')

# with open('reddit-pairs.bak.json') as fSrc:
#     pairs = json.load(fSrc)
#     fSrc.close()

# with open('reddit-pairs.json', 'w') as fOut:
#     fOut.write('\n'.join([json.dumps(pair) for pair in pairs]))
#     fOut.close()

In [None]:
# read X, Y pairs from file
# with open('reddit-pairs.bak.json') as f:
#    pairs = json.load(f)
#    f.close()

# X = []
# Y = []
# for tup in pairs:
#     X.append(cleanText(tup[0]))
#     Y.append(f'{bos} {cleanText(tup[1])} {eos}')

In [None]:
# tokenize and pad
# tokenizer = createVocab(textList = X + Y, numWords = numWords)
# numWords = min(len(tokenizer.word_index.keys()) + 1, numWords)
# wordToIdx, idxToWord = getTokenizerDicts(tokenizer, numWords)

# encoderInput = padding(tokenizer.texts_to_sequences(X), maxLen)
# decoderInput = padding(tokenizer.texts_to_sequences(Y), maxLen)
# decoder output: the decoder input shifted one step left (kept as integer ids; the model trains with a sparse loss, so no one-hot encoding)
# decoderOutput = getDecoderOutput(decoderInput, maxLen, numWords)

In [None]:
# write tokenizer to file
# with open('tokenizer.json', 'w') as f:
#     data = tokenizer.to_json()
#     f.write(json.dumps(data))
#     f.close()

In [None]:
# train test split
# testSplitIndex = math.ceil(len(encoderInput) * (1 - testSplit))
# enTrain, enTest = encoderInput[0:testSplitIndex], encoderInput[testSplitIndex:]
# deTrain, deTest = decoderInput[0:testSplitIndex], decoderInput[testSplitIndex:]
# deOTrain, deOTest = decoderOutput[0:testSplitIndex], decoderOutput[testSplitIndex:]

# valSplitIndex = math.ceil(len(enTrain) * (1 - valSplit))
# enTrain, enVal = enTrain[0:valSplitIndex], enTrain[valSplitIndex:]
# deTrain, deVal = deTrain[0:valSplitIndex], deTrain[valSplitIndex:]
# deOTrain, deOVal = deOTrain[0:valSplitIndex], deOTrain[valSplitIndex:]

In [None]:
# read prepared tokenizer
# The `with` block closes the file on exit; only the read happens inside it.
with open('tokenizer.json') as f:
    data = json.load(f)
tokenizer = tokenizer_from_json(data)

# Never claim more words than the tokenizer actually knows (+1 for index 0).
numWords = min(len(tokenizer.word_index) + 1, numWords)
wordToIdx, idxToWord = getTokenizerDicts(tokenizer, numWords)

In [None]:
class DataGenerator(Sequence):
    """Keras Sequence that builds training batches from a JSON-lines file.

    Every line of the file is parsed with ``json.loads`` and unpacked into two
    texts: the encoder text and the decoder text. The decoder text is wrapped
    in the module-level ``bos`` / ``eos`` markers. Both texts are then cleaned,
    tokenised and padded.

    The default argument values are the notebook-level objects of the same
    name, captured when this class statement is executed.
    """

    def __init__(
        self,
        filePath = 'reddit-pairs.json',
        tokenizer = tokenizer,
        cleanTexts = cleanTexts,
        padding = padding,
        getDecoderOutput = getDecoderOutput,
        batchSize = batchSize,
        maxLen = maxLen,
        numWords = numWords
    ):
        """Store the collaborators and count the batches available in filePath.

        Args:
            filePath: path of the JSON-lines file to read.
            tokenizer: object providing ``texts_to_sequences``.
            cleanTexts: callable cleaning a list of texts.
            padding: callable ``(sequences, maxLen)`` returning padded sequences.
            getDecoderOutput: callable ``(decoderInput, maxLen)`` returning the targets.
            batchSize: number of lines per batch.
            maxLen: padded sequence length.
            numWords: vocabulary size (stored, not used by this class itself).
        """
        # Let the Keras base class initialise whatever state it maintains.
        super().__init__()

        self.filePath = filePath
        self.tokenizer = tokenizer
        self.cleanTexts = cleanTexts
        self.padding = padding
        self.getDecoderOutput = getDecoderOutput
        self.batchSize = batchSize
        self.maxLen = maxLen
        self.numWords = numWords

        with open(filePath, 'r') as file:
            lineCount = sum(1 for _ in file)
        # The last batch may be shorter than batchSize, hence the ceil.
        self.batchCount = int(np.ceil(lineCount / batchSize))

    def __len__(self):
        """Return the number of batches per epoch."""
        return self.batchCount

    def textToEncodedInput(self, texts):
        """Clean, tokenise and pad a list of texts."""
        texts = self.cleanTexts(texts)
        seqs = self.tokenizer.texts_to_sequences(texts)
        seqs = self.padding(seqs, self.maxLen)
        return seqs

    def __getitem__(self, index):
        """Return batch number ``index`` as ``([encoderInput, decoderInput], decoderOutput)``.

        Raises:
            IndexError: if index is outside ``0 .. len(self) - 1``.
        """
        if not 0 <= index < self.batchCount:
            raise IndexError(
                f'batch index {index} out of range for {self.batchCount} batches'
            )

        startIndex = index * self.batchSize
        endIndex = startIndex + self.batchSize

        enIn = []
        deIn = []

        # `with` guarantees the handle is released after every batch. Iterating
        # the file (instead of calling readline a fixed number of times) lets
        # the final, shorter batch end at EOF without parsing an empty string.
        with open(self.filePath, 'r') as file:
            for lineNumber, line in enumerate(file):
                if lineNumber < startIndex:
                    continue
                if lineNumber >= endIndex:
                    break
                e, d = json.loads(line)
                enIn.append(e)
                deIn.append(f'{bos} {d} {eos}')

        enIn = self.textToEncodedInput(enIn)
        deIn = self.textToEncodedInput(deIn)
        deO = self.getDecoderOutput(deIn, self.maxLen)

        return [enIn, deIn], deO

# Generators for training and validation. Only the file path is overridden;
# every other setting uses the defaults from DataGenerator's signature.
# NOTE(review): no cell visible in this notebook writes these two files.
trainDataGen = DataGenerator(filePath = 'reddit-pairs.train.json')
valDataGen = DataGenerator(filePath = 'reddit-pairs.validation.json')

In [None]:
def getModel(hiddenDim):
    """Assemble the encoder-decoder training model.

    Reads the notebook-level numWords, embeddingDimension, maxLen and
    wordToIdx. Encoder and decoder share a single embedding layer obtained
    from getEmbeddingLayer.

    Args:
        hiddenDim: number of units of the encoder and the decoder LSTM.

    Returns:
        A Model taking ``[encoderInputs, decoderInputs]`` and producing a
        softmax over numWords for every decoder time step.

    Keep the order in which the layers are created: makeInferenceModels
    fetches them from ``model.layers`` by position.
    """
    embeddingLayer = getEmbeddingLayer(numWords, embeddingDimension, maxLen, wordToIdx)
    # NOTE(review): the inputs carry token ids but are declared as float32.
    encoderInputs = Input(shape = (None,), dtype = 'float32')
    encoderEmbedding = embeddingLayer(encoderInputs)
    # Only the final hidden and cell states of the encoder are used.
    encoderLSTM = LSTM(hiddenDim, return_state=True)
    _, stateH, stateC = encoderLSTM(encoderEmbedding)

    decoderInputs = Input(shape = (None,), dtype = 'float32')
    decoderEmbedding = embeddingLayer(decoderInputs)
    # The decoder returns the full output sequence and starts from the encoder states.
    decoderLSTM = LSTM(hiddenDim, return_state=True, return_sequences=True)
    decoderOutputs, _, _ = decoderLSTM(decoderEmbedding, initial_state=[stateH, stateC])

    # Project every decoder step onto the vocabulary.
    denseLayer = Dense(numWords, activation='softmax')
    outputs = denseLayer(decoderOutputs)
    model = Model([encoderInputs, decoderInputs], outputs)

    return model

# Build the training model. The sparse loss and metric take integer class
# ids as targets.
model = getModel(hiddenDim)
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='rmsprop',
    metrics=['sparse_categorical_accuracy'],
)
model.summary()

In [None]:
# trainDataGen / valDataGen already yield complete batches, so batch_size is
# deliberately not passed: Keras documents that it must not be specified for
# generator / Sequence inputs.
model.fit(
    trainDataGen,
    epochs = epochs,
    validation_data = valDataGen
)

In [None]:
# Persist the trained model to disk; the commented-out load_model call in the
# next cell reads the same file.
model.save("s2s_model.keras")

In [None]:
# model = load_model("s2s_model.keras")

In [None]:
def makeInferenceModels():
    """Derive separate encoder and decoder models from the trained `model`.

    The layers are taken from ``model.layers`` by position (2: embedding,
    3: encoder LSTM, 4: decoder LSTM, 5: dense), which assumes the layout
    produced by getModel.

    Returns:
        ``(encoderModel, decoderModel)``. The encoder maps a token sequence to
        its ``[h, c]`` states. The decoder maps ``[tokens, h, c]`` to
        ``[token probabilities, h, c]``.
    """
    sharedEmbedding = model.layers[2]
    trainedEncoderLstm = model.layers[3]
    trainedDecoderLstm = model.layers[4]
    vocabProjection = model.layers[5]

    # Encoder: token ids -> final LSTM states.
    encoderTokens = model.input[0]
    _, encoderH, encoderC = trainedEncoderLstm(sharedEmbedding(encoderTokens))
    encoderModel = Model(encoderTokens, [encoderH, encoderC])

    # Decoder: the states are fed in explicitly so they can be threaded
    # through successive calls.
    decoderTokens = model.input[1]
    decoderEmbedded = sharedEmbedding(decoderTokens)
    previousH = Input(shape=(hiddenDim,))
    previousC = Input(shape=(hiddenDim,))
    stepOutputs, nextH, nextC = trainedDecoderLstm(
        decoderEmbedded,
        initial_state=[previousH, previousC]
    )
    decoderModel = Model(
        [decoderTokens, previousH, previousC],
        [vocabProjection(stepOutputs), nextH, nextC]
    )

    return encoderModel, decoderModel

In [None]:
def decodeSequence(inputSeq):
    """Greedily decode a reply for one encoded input sequence.

    The encoder states seed the decoder, which is then fed its own most likely
    token one step at a time, starting from the ``bos`` marker. Decoding stops
    at the ``eos`` marker or after ``maxLen - 1`` steps.

    Args:
        inputSeq: padded token-id array for the encoder.

    Returns:
        The decoded words joined by single spaces. The ``eos`` marker itself is
        not part of the result. Indices without a known word are rendered as
        ``'<OOD>'``.

    Raises:
        KeyError: if ``bos`` is missing from wordToIdx.
    """
    # NOTE(review): the inference models are rebuilt on every call.
    encoderModel, decoderModel = makeInferenceModels()
    h, c = encoderModel.predict(inputSeq, verbose=0)

    targetSeq = np.zeros((1, 1))
    targetSeq[0, 0] = wordToIdx[bos]

    decodedSentence = []
    # range(1, maxLen) already bounds the length, so no extra guard is needed.
    for _ in range(1, maxLen):
        outputTokens, h, c = decoderModel.predict(
            [targetSeq, h, c], verbose=0
        )

        sampledTokenIndex = int(np.argmax(outputTokens[0, -1, :]))
        sampledWord = idxToWord.get(sampledTokenIndex, '<OOD>')

        # Stop before appending, so the marker never leaks into the reply.
        if sampledWord == eos:
            break
        decodedSentence.append(sampledWord)

        # The token just produced becomes the next decoder input.
        targetSeq = np.zeros((1, 1))
        targetSeq[0, 0] = sampledTokenIndex

    return ' '.join(decodedSentence)


def respondTo(message):
    """Generate the model's reply to a raw text message.

    The message goes through the same preprocessing the training data received
    in DataGenerator (cleanText, tokenisation, post-padding to maxLen) before
    it is handed to decodeSequence.

    Args:
        message: the raw user text.

    Returns:
        The decoded reply as returned by decodeSequence.
    """
    # Without cleanText the tokenizer would see forms (e.g. unexpanded
    # contractions) that the cleaned training data never contained.
    cleaned = cleanText(message)
    tokens = tokenizer.texts_to_sequences([cleaned])
    sequences = padding(tokens, maxLen)
    return decodeSequence(sequences)

In [None]:
# Smoke test: ask the trained model for a reply. The commented-out lines are
# alternative prompts kept for quick experiments.
# respondTo('hi how are you')
# respondTo('how does chatgpt work')
respondTo('What happens in February')
# decodeSequence(enTrain[5:6])