In [None]:
# imports
import importlib
import json
import math
import os
import re
import shutil

import numpy as np
import pandas as pd
import psycopg2
import tensorflow as tf
from keras.layers import Input, LSTM, Dense
from keras.models import Model, load_model
from keras.preprocessing.text import Tokenizer, tokenizer_from_json
from keras.utils import pad_sequences, Sequence
from psycopg2 import Error as PGError
from psycopg2.extras import execute_values

# 'at-utils' contains a hyphen, so it cannot be loaded with a plain `import` statement.
atUtils = importlib.import_module('at-utils')

In [None]:
# params
# Default batch size for DataGenerator.
batchSize = 128
# Upper bound on the vocabulary size; it is clamped to the tokenizer's real
# vocabulary size once the tokenizer has been loaded.
numWords = 10000
# Length that encoder inputs are padded/truncated to (DataGenerator default and respondTo).
inputMaxLen = 256
# Length used for decoder sequences in DataGenerator and the maximum number of
# decoding steps in decodeSequence.
outputMaxLen = 64
# Only referenced by the commented-out getModel cell in the visible code;
# presumably matches the 100-dimensional GloVe file below — TODO confirm.
embeddingDimension = 100
# NOTE(review): testSplit / valSplit are not referenced anywhere in the visible code.
testSplit = 0.1
valSplit = 0.1
# Size of the LSTM state inputs created in makeInferenceModels; it has to match
# the state size of the model stored at modelPath.
hiddenDim = inputMaxLen + outputMaxLen
# Only referenced by the commented-out training cell.
epochs = 4

# Marker words wrapped around every summary fed to the decoder; decodeSequence
# starts from `bos` and stops when `eos` is produced.
bos = 'beginningofsentence'
eos = 'endofsentence'
# NOTE(review): absolute, machine-specific path; only used by the commented-out getModel cell.
GLOVE_FILE = '/home/ston/glove.6B.100d.txt'
# Default value of DataGenerator's tableName argument.
tableName = 'reddit'
# File the fitted tokenizer is read from.
tokenizerJsonPath = 'tokenizer.summarizer.json'
# File the trained Keras model is loaded from.
modelPath = 'summarizer-model.keras'

In [None]:
# db connection
# Connection settings. Every value can be overridden through the corresponding
# standard libpq environment variable so that credentials do not have to live in
# the notebook; the literals are only the local-development fallbacks.
connectionParams = {
    'dbname': os.environ.get('PGDATABASE', 'nlp'),
    'user': os.environ.get('PGUSER', 'tf'),
    'password': os.environ.get('PGPASSWORD', 'wasd'),
    'host': os.environ.get('PGHOST', 'localhost'),
    'port': os.environ.get('PGPORT', '5432')
}
def connectToPg():
    """Open and return a new psycopg2 connection built from `connectionParams`."""
    return psycopg2.connect(**connectionParams)
# Shared connection that executeSelectQuery runs its queries on.
connection = connectToPg()

def executeSelectQuery(query):
    """Run `query` on the shared module-level `connection` and return all rows.

    Args:
        query: SQL text to execute.

    Returns:
        The list of row tuples from `fetchall()`, or None when anything goes
        wrong (the error is printed, not raised).
    """
    try:
        # The context manager closes the cursor even if execute/fetchall raises.
        with connection.cursor() as cursor:
            cursor.execute(query)
            return cursor.fetchall()
    except Exception as error:
        # psycopg2's Error derives from Exception, so this covers database errors too.
        print("Error while executing PostgreSQL query", error)
        try:
            # A failed statement leaves the transaction aborted; without a rollback
            # every later query on this shared connection would fail as well.
            connection.rollback()
        except Exception as rollbackError:
            print("Error while rolling back PostgreSQL transaction", rollbackError)
        return None

In [None]:
# read json and import to db
# tldrRegex = r"tld?\s?[;:.,'|\_\-\\\/]{0,2}\s?dr"
# def validateComment(comment, requiredKeys):
#     for key in requiredKeys:
#         if key not in comment.keys():
#             return False
#     if not re.search(tldrRegex, comment['body'], re.IGNORECASE):
#         return False
#     return True

# def checkForNewKeys(comment, knownKeys):
#     for key in comment.keys():
#         if key not in knownKeys:
#             print(key)

# insertQuery = """
#     INSERT INTO reddit
#     (reddit_id, author, title, body, normalized_body, content, summary, content_len, summary_len, subreddit, subreddit_id)
#     VALUES %s
# """
# def saveBatch(batchComments):
#     cur = conn.cursor()
#     execute_values(cur, insertQuery, batchComments)
#     conn.commit()
#     cur.close()

# def importData(filePath = '/home/ston/reddit.json'):
#     f = open(filePath, mode='r')
#     requiredKeys = [
#         'id',
#         'author',
#         'body',
#         'normalizedBody',
#         'content',
#         'summary',
#         'content_len',
#         'summary_len',
#         'subreddit',
#         'subreddit_id'
#     ]
#     # knownKeys = list(requiredKeys) + ['title']
#     batchSize = 10000
#     batchCount = 0
#     while True:
#         batchComments = []

#         for _ in range(batchSize):
#             line = f.readline()
#             if not line:
#                 break
#             comment = json.loads(line)
#             # checkForNewKeys(comment, knownKeys)
#             if not validateComment(comment, requiredKeys):
#                 continue
#             batchComments.append((
#                 comment['id'],
#                 comment['author'],
#                 comment['title'] if 'title' in comment else None,
#                 comment['body'],
#                 comment['normalizedBody'],
#                 comment['content'],
#                 comment['summary'],
#                 comment['content_len'],
#                 comment['summary_len'],
#                 comment['subreddit'],
#                 comment['subreddit_id']
#             ))

#         if len(batchComments) == 0:
#             break

#         batchCount += 1
#         print(f'processing batch no: {batchCount}. batch size: {len(batchComments)}')
#         saveBatch(batchComments)

# importData()

In [None]:
# build dict and write to file
# def createVocab(numWords):
#     tokenizer = Tokenizer(num_words = numWords)
#     tBatchSize = 409600
#     rowCount = executeSelectQuery('select count(*) from reddit')[0][0]
#     tBatchCount = int(np.ceil(rowCount / tBatchSize))

#     for i in range(tBatchCount):
#         print(i)
#         rows = executeSelectQuery(f'''
#             select
#                 content,
#                 summary
#             from reddit
#             where id between {i * tBatchSize + 1} and {(i + 1) * tBatchSize}
#         ''')

#         texts = [f'{bos} {text} {eos}' for tup in rows for text in tup]
#         tokenizer.fit_on_texts(texts)

#     return tokenizer


# tokenizer = createVocab(numWords)
# with open(tokenizerJsonPath, 'w') as f:
#     data = tokenizer.to_json()
#     f.write(json.dumps(data))
#     f.close()

In [None]:
# read tokenizer from file
# The file content is JSON-decoded once and the result is handed to
# tokenizer_from_json; leaving the `with` block closes the file.
with open(tokenizerJsonPath, 'r') as f:
    data = json.loads(f.read())
tokenizer = tokenizer_from_json(data)

# Never use more words than the tokenizer knows (+1 as in the original clamp).
numWords = min(numWords, len(tokenizer.word_index) + 1)
wordToIdx, idxToWord = atUtils.getTokenizerDicts(tokenizer, numWords)

In [None]:
class DataGenerator(Sequence):
    """Keras `Sequence` that streams (content, summary) batches out of PostgreSQL.

    Rows are fetched in id windows through `executeSelectQuery` and held in an
    in-memory cache. Every `__getitem__` call takes the next `batchSize` rows
    from the front of that cache; `index` is only consulted when the cache is
    empty, so batches have to be requested sequentially (0, 1, 2, ...).

    NOTE(review): the id-window arithmetic in `populateCache` presumably relies
    on the rows of one dataset having contiguous ids starting at `idStart + 1`
    — confirm against the table.

    Args:
        dataset: value compared against the `dataset` column. It is interpolated
            unquoted into the SQL, so presumably numeric — TODO confirm.
        idStart: id offset; the first window starts at `idStart + 1`.
        tableName: table the rows are read from.
        inputMaxLen: length passed to `padding` for encoder inputs.
        outputMaxLen: length passed to `padding` / `getDecoderOutput` for decoder data.
        tokenizer: object providing `texts_to_sequences`.
        cleanTexts: callable applied to the list of texts before tokenizing.
        padding: callable `(sequences, maxLen)` applied after tokenizing.
        getDecoderOutput: callable `(decoderInput, outputMaxLen)` producing the targets.
        executeSelectQuery: callable that runs a SQL string and returns rows (or None).
        batchSize: number of rows per batch.
        numWords: stored on the instance; not used by the methods of this class.
        cacheSize: base number of ids fetched per cache refill (see `__init__`).

    Raises:
        RuntimeError: if a query returns no result (None), from `__init__` or
            `populateCache`.
    """

    def __init__(
        self,
        dataset,
        idStart = 0,
        tableName = tableName,
        inputMaxLen = inputMaxLen,
        outputMaxLen = outputMaxLen,
        tokenizer = tokenizer,
        cleanTexts = atUtils.cleanTexts,
        padding = atUtils.padding,
        getDecoderOutput = atUtils.getDecoderOutput,
        executeSelectQuery = executeSelectQuery,
        batchSize = batchSize,
        numWords = numWords,
        cacheSize = 2048
    ):
        # Let the Keras base class set up its own state; newer Keras releases
        # expect this call, and it is harmless when the base defines no __init__.
        super().__init__()

        self.shuffle = False
        self.idStart = idStart
        self.dataset = dataset
        self.tableName = tableName
        self.inputMaxLen = inputMaxLen
        self.outputMaxLen = outputMaxLen
        self.tokenizer = tokenizer
        self.cleanTexts = cleanTexts
        self.padding = padding
        self.getDecoderOutput = getDecoderOutput
        self.executeSelectQuery = executeSelectQuery
        self.batchSize = batchSize
        self.numWords = numWords

        # Honour the tableName argument instead of a hard-coded table.
        countRows = executeSelectQuery(f'''
            select
                count(*)
            from {tableName}
            where
                dataset = {dataset}
                --and include_in_training = true
        ''')
        if not countRows:
            # The query helper reports failures by returning None.
            raise RuntimeError(
                f'Could not count rows of dataset {dataset} in table {tableName}'
            )
        rowCount = countRows[0][0]

        self.batchCount = int(np.ceil(rowCount / batchSize))
        # NOTE(review): with the defaults (2048, 128) this gives 2176, a multiple
        # of batchSize. For other values the result is not guaranteed to be a
        # multiple, so a refill could end in a short batch — confirm the intent.
        self.cacheSize = cacheSize + batchSize + cacheSize % batchSize
        self.cache = []

    def __len__(self):
        """Number of batches per epoch."""
        return self.batchCount

    def __getitem__(self, index):
        """Return the next batch as `([encoderInput, decoderInput], decoderOutput)`."""
        if not len(self.cache):
            self.populateCache(index)

        # Take one batch off the front of the cache and keep the remainder.
        rows, self.cache = self.cache[0:self.batchSize], self.cache[self.batchSize:]
        enIn = []
        deIn = []
        for row in rows:
            e, d = row[0], row[1]
            # Decoder text is wrapped in the begin/end markers.
            d = f'{bos} {d} {eos}'
            enIn.append(e)
            deIn.append(d)

        enIn = self.textToEncodedInput(enIn, self.inputMaxLen)
        deIn = self.textToEncodedInput(deIn, self.outputMaxLen)
        deO = self.getDecoderOutput(deIn, self.outputMaxLen)

        return [enIn, deIn], deO

    def textToEncodedInput(self, texts, maxLen):
        """Clean, tokenize and pad `texts` to `maxLen`."""
        texts = self.cleanTexts(texts)
        seqs = self.tokenizer.texts_to_sequences(texts)
        seqs = self.padding(seqs, maxLen)
        return seqs

    def populateCache(self, index):
        """Refill the cache with the id window that starts at batch `index`."""
        startIndex = self.idStart + index * self.batchSize + 1
        endIndex = startIndex + self.cacheSize - 1
        rows = self.executeSelectQuery(f'''
            select
                content,
                summary
            from {self.tableName}
            where
                id between {startIndex} and {endIndex}
                and dataset = {self.dataset}
                --and include_in_training = true
        ''')
        if rows is None:
            # Fail with a clear message instead of a TypeError on None later on.
            raise RuntimeError(
                f'Could not load ids {startIndex}-{endIndex} from table {self.tableName}'
            )
        self.cache = rows

In [None]:
# def getModel(hiddenDim):
#     encoderInputs = Input(shape = (None,), dtype = 'float32', name = 'encoderInputs')
#     encoderEmbeddingLayer = atUtils.getEmbeddingLayer(GLOVE_FILE, numWords, embeddingDimension, inputMaxLen, wordToIdx, 'encoderEmbeddingLayer')
#     encoderEmbedding = encoderEmbeddingLayer(encoderInputs)
#     encoderLSTM = LSTM(hiddenDim, return_state=True, name = 'encoderLSTM')
#     _, stateH, stateC = encoderLSTM(encoderEmbedding)

#     decoderInputs = Input(shape = (None,), dtype = 'float32', name = 'decoderInputs')
#     decoderEmbeddingLayer = atUtils.getEmbeddingLayer(GLOVE_FILE, numWords, embeddingDimension, outputMaxLen, wordToIdx, 'decoderEmbeddingLayer')
#     decoderEmbedding = decoderEmbeddingLayer(decoderInputs)
#     decoderLSTM = LSTM(hiddenDim, return_state=True, return_sequences=True, name = 'decoderLSTM')
#     decoderOutputs, _, _ = decoderLSTM(decoderEmbedding, initial_state=[stateH, stateC])

#     denseLayer = Dense(numWords, activation='softmax', name = 'denseLayer')
#     outputs = denseLayer(decoderOutputs)
#     model = Model([encoderInputs, decoderInputs], outputs)

#     return model

# model = getModel(hiddenDim)
# model.compile(
#     optimizer='adam',
#     loss='sparse_categorical_crossentropy',
#     metrics=['sparse_categorical_accuracy']
# )
# model.summary()

In [None]:
# Load the saved model from modelPath; the cells that build and train it are commented out.
model = load_model(modelPath)

In [None]:
# trainDataGen = DataGenerator(dataset = 0)
# valDataGen = DataGenerator(dataset = 1, idStart = 3000001)

# model.fit(
#     trainDataGen,
#     epochs = epochs,
#     batch_size = batchSize,
#     validation_data = valDataGen
# )

In [None]:
# model.save(modelPath)

In [None]:
def makeInferenceModels():
    """Derive separate encoder and decoder inference models from the global `model`.

    The layers are looked up by name ('encoderInputs', 'encoderEmbeddingLayer',
    'encoderLSTM', 'decoderInputs', 'decoderEmbeddingLayer', 'decoderLSTM',
    'denseLayer') and re-applied.

    Returns:
        (encoderModel, decoderModel) where
        - encoderModel maps the encoder input to the two LSTM states [h, c];
        - decoderModel maps [decoder input, h, c] to [dense output, h, c].
    """
    layerOf = model.get_layer

    # Encoder: input -> embedding -> LSTM; only the final states are exposed.
    encIn = layerOf('encoderInputs').output
    encEmbedded = layerOf('encoderEmbeddingLayer')(encIn)
    _, encH, encC = layerOf('encoderLSTM')(encEmbedded)
    encoderModel = Model(encIn, [encH, encC])

    # Decoder: the states become explicit inputs so they can be fed back step by step.
    decIn = layerOf('decoderInputs').output
    decEmbedded = layerOf('decoderEmbeddingLayer')(decIn)
    stateHIn = Input(shape=(hiddenDim,))
    stateCIn = Input(shape=(hiddenDim,))
    decSeq, decH, decC = layerOf('decoderLSTM')(
        decEmbedded,
        initial_state=[stateHIn, stateCIn]
    )
    tokenProbs = layerOf('denseLayer')(decSeq)
    decoderModel = Model(
        [decIn, stateHIn, stateCIn],
        [tokenProbs, decH, decC]
    )

    return encoderModel, decoderModel

In [None]:
# Build the encoder/decoder inference models once, at module level, from the loaded `model`.
encoderModel, decoderModel = makeInferenceModels()

def decodeSequence(inputSeq):
    """Greedily decode a summary for one encoded input sequence.

    Uses the module-level `encoderModel` / `decoderModel`. The inference models
    are not rebuilt per call, which would add new graph nodes and models on
    every invocation.

    Args:
        inputSeq: encoder input passed straight to `encoderModel.predict`.

    Returns:
        The generated words joined by single spaces. At most `outputMaxLen`
        words are produced; when the `eos` word is generated it ends the
        sequence and is included in the returned string. Indices missing from
        `idxToWord` are rendered as '<OOD>'.
    """
    h, c = encoderModel.predict(inputSeq, verbose=0)

    # The decoder is seeded with the begin-of-sentence token.
    targetSeq = np.zeros((1, 1))
    targetSeq[0, 0] = wordToIdx[bos]

    decodedSentence = []
    # range() already bounds the output length, so no extra length check is needed.
    for _ in range(outputMaxLen):
        outputTokens, h, c = decoderModel.predict(
            [targetSeq, h, c], verbose=0
        )

        # Greedy choice: most probable token of the last time step.
        sampledWordIndex = np.argmax(outputTokens[0, -1, :])
        sampledWord = idxToWord.get(sampledWordIndex)
        if sampledWord is None:
            sampledWord = '<OOD>'
        decodedSentence.append(sampledWord)

        if sampledWord == eos:
            break

        # Feed the chosen token back in as the next decoder input.
        targetSeq = np.zeros((1, 1))
        targetSeq[0, 0] = sampledWordIndex

    return ' '.join(decodedSentence)


def respondTo(message):
    """Tokenize `message`, pad it to `inputMaxLen` and return the decoded output text.

    NOTE(review): DataGenerator runs `cleanTexts` and `atUtils.padding` on its
    inputs, while this path tokenizes the raw message and pads with
    `pad_sequences` — confirm both preprocess text the same way.

    Args:
        message: a single input text.

    Returns:
        Whatever `decodeSequence` returns for the padded sequence.
    """
    padOptions = {
        'maxlen': inputMaxLen,
        'dtype': 'int',
        'padding': 'post',
        'truncating': 'post',
    }
    # A one-element batch: the tokenizer works on lists of texts.
    encoded = pad_sequences(tokenizer.texts_to_sequences([message]), **padOptions)
    return decodeSequence(encoded)

In [21]:
# Demo: run the summarizer on a sample passage; the call's return value is the cell output.
# respondTo('hi how are you')
# respondTo('how does chatgpt work')
# respondTo('What happens in February')
respondTo('''Engineers are creators, and the fascination of solving problems using tech is infectious,” says Yogesh Bhalla, CTO at DSP Asset Managers. He has always been captivated by the power of technology to address challenges, shaping his career trajectory. For Yogesh, success is defined by the four to five ambitious products he has created, the formidable problems he has solved, and the enduring satisfaction derived from choosing technology as a career.

"As a tech leader in the financial industry, Yogesh shoulders significant responsibilities, which mostly revolve around shaping three core ambitious products—JARVIS, RMX, and TITAN. These are carefully mapped out in a 1-year to 3-year roadmap. Yogesh deems to make these dream products successful by enabling alpha generation, streamlining processes across front, mid, and back offices, and increasing sales output. This goal is ambitious, aiming to contribute a 20 percent growth in sales assets under management (AUM) or to achieve and surpass alpha performance.
''')



'the industry is a business model and the industry is a business model and the industry is not a business model endofsentence'