In [None]:
import tensorflow
import os
import string
import re
import numpy as np

# --- Runtime configuration -------------------------------------------------
# Ask TensorFlow to grow GPU memory on demand. The list is empty on CPU-only
# machines, so iterate instead of indexing [0] (which raised IndexError), and
# apply the setting to every visible GPU so it is uniform across devices.
physical_devices = tensorflow.config.list_physical_devices('GPU')
for gpu_device in physical_devices:
    tensorflow.config.experimental.set_memory_growth(gpu_device, enable=True)
DATA = 'input/'                   # directory holding the raw chat exports
INTERMEDIATES = 'intermediates/'  # directory for generated corpus files
MY_NAME = 'Aayush Fadia'          # sender name that is mapped to the [ME] token
OUTFILE='all_chats.txt'           # flattened corpus file inside INTERMEDIATES
MODEL_SAVENAME='two_layer_gru'    # base name for checkpoints and logs
print(os.listdir('.'))
# exist_ok=True makes these idempotent, replacing the manual listdir check.
os.makedirs(DATA, exist_ok=True)
os.makedirs(INTERMEDIATES, exist_ok=True)


In [None]:
# Translation table that deletes every character in string.punctuation.
translator = str.maketrans('', '', string.punctuation)
# Keep regular files only (a stray sub-directory would make open() fail later)
# and sort them, because os.listdir() returns entries in arbitrary order and
# the corpus should be built identically on every run.
chats = sorted(
    name for name in os.listdir(DATA)
    if os.path.isfile(os.path.join(DATA, name))
)
def preproocess_message(msg):
    """Normalise one chat message into the form stored in the corpus.

    Steps, in order:
      1. collapse runs of a repeated word character to one ("sooo" -> "so");
      2. replace anything that starts with a digit by a single '#';
      3. map the "deleted message" / "media omitted" notices to special tokens;
      4. drop every non-ASCII character and normalise the outer whitespace.

    Args:
        msg: message text as a str.

    Returns:
        The normalised message followed by exactly one trailing space, so
        consecutive messages can be concatenated directly.
    """
    # Raw strings throughout: '\d' in a normal string literal is an invalid
    # escape sequence and triggers a warning on modern Python versions.
    msg = re.sub(r'(\w)\1+', r'\1', msg)
    msg = re.sub(r'\d\w+', '#', msg)
    msg = re.sub(r'\d', '#', msg)
    # The patterns below are deliberately spelt with single letters: step 1 has
    # already turned "message" into "mesage" and "omitted" into "omited".
    msg = re.sub('this mesage was deleted', '[DELETED_MESSAGE]', msg)
    msg = re.sub('media omited', '[MEDIA]', msg)
    msg = msg.encode('ascii', 'ignore').decode('ascii').strip()+' '
    return msg
# Flatten every chat export into one single-line corpus file of the form
#   [SOC] [ME] ... [THEM] ... [EOC] [SOC] ...
with open(INTERMEDIATES+OUTFILE, 'w', encoding='utf-8') as outfile:
    for chat in chats:
        outfile.write('[SOC] ')
        # Non-ASCII characters are discarded by preproocess_message anyway, so
        # undecodable bytes are ignored rather than aborting the whole run.
        with open(DATA+chat, 'r', encoding='utf-8', errors='ignore') as f:
            for line in f:
                # Presumably "<date>, <hh>:<mm> - <sender>: <message>" lines
                # (NOTE(review): confirm against the actual export format).
                # maxsplit=2 keeps colons inside the message text; the
                # unbounded split silently cut the message at its first colon.
                fields = line.strip().split(':', 2)
                try:
                    # maxsplit=1 keeps hyphens that are part of the sender name.
                    msgr = fields[1].split('-', 1)[1].strip()
                    raw_msg = fields[2]
                except IndexError:
                    # Line has no "<sender>: <message>" header; skip it.
                    continue
                sender_token = '[ME] ' if msgr==MY_NAME else '[THEM] '
                msg = raw_msg.strip().lower().translate(translator)
                msg = preproocess_message(msg)
                outfile.write(sender_token+msg)
        outfile.write('[EOC] ')


In [None]:
# Load the flattened corpus; the context manager closes the file even if a
# later statement raises.
with open(INTERMEDIATES+OUTFILE, 'r', encoding='utf-8') as all_chats_file:
    all_chats = all_chats_file.read()
# split() with no argument splits on any whitespace run and never yields empty
# strings, so the corpus' trailing space no longer creates a bogus '' token.
all_chats_words = all_chats.split()
if not all_chats_words:
    raise ValueError('No tokens found in '+INTERMEDIATES+OUTFILE+'; is '+DATA+' empty?')
print("Number of tokens: "+str(len(all_chats_words)))
# Sorted so that token ids are identical on every run. Iteration order of a
# set of strings changes between interpreter sessions, which would silently
# scramble the vocabulary of a model reloaded from a checkpoint.
vocab = sorted(set(all_chats_words))
print("Number of Distinct tokens: "+str(len(vocab)))
totalsize = len(vocab)
token2word = dict(enumerate(vocab))
word2token = {word: token for token, word in token2word.items()}
all_chats_tokens = [word2token[word] for word in all_chats_words]
# uint16 can only hold ids 0..65535; fall back to int32 for larger
# vocabularies instead of letting the ids wrap around.
token_dtype = np.uint16 if totalsize <= np.iinfo(np.uint16).max + 1 else np.int32
all_chats_tokens_np = np.asarray(all_chats_tokens, token_dtype)
# Free the large intermediates; only the id array and the two lookup dicts
# are used from here on.
del all_chats_words
del all_chats
del vocab
del all_chats_tokens

In [None]:
# Number of input tokens per training example. Each chunk carries one extra
# token so that the input and the one-step-shifted target have this length.
SEQ_LENGTH = 32
full_dataset = tensorflow.data.Dataset.from_tensor_slices(all_chats_tokens_np)
sequences = full_dataset.batch(SEQ_LENGTH + 1, drop_remainder=True)
def split_input_target(chunk):
    """Split a token chunk into a next-token-prediction pair.

    Args:
        chunk: any sliceable sequence of tokens.

    Returns:
        (input, target): ``chunk`` without its last element, and ``chunk``
        without its first element, i.e. the same sequence shifted by one.
    """
    return chunk[:-1], chunk[1:]


# Shuffle-buffer and batch sizes for the training pipeline.
BATCH_SIZE = 256
BUFFER_SIZE = 10000
# chunk -> (input, target) pair -> shuffled -> fixed-size batches
# (incomplete final batch dropped).
dataset = (
    sequences
    .map(split_input_target)
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
)

In [None]:
import tensorflow.keras as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, GRU, Input
tf.backend.clear_session()
def loss(labels, logits):
    """Sparse categorical cross-entropy between integer labels and predictions.

    No longer passed to compile(); kept so that checkpoints which were
    compiled with this function can still be loaded by supplying it as a
    custom object. Despite the parameter name, the second argument is handed
    to the Keras loss with its default settings, i.e. it is treated as
    probabilities, which matches the softmax output layer below.
    """
    return tf.losses.sparse_categorical_crossentropy(labels, logits)
# Embedding -> Dense(relu) -> two stacked GRUs -> softmax over the vocabulary.
# return_sequences=True keeps one prediction per input position.
model = Sequential()
model.add(Embedding(totalsize, 256))
model.add(Dense(512, activation='relu'))
model.add(GRU(256, return_sequences=True))
model.add(GRU(128, return_sequences=True))
model.add(Dense(totalsize, activation='softmax'))
model.summary()
# Use the built-in loss identifier instead of the local wrapper: it computes
# the same loss, but a model saved with it can be reloaded without having to
# pass custom_objects to load_model().
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam')

In [None]:
tf.backend.clear_session()
# Checkpoints compiled with this notebook's `loss` wrapper store only the
# function's name, so it has to be supplied here; without it load_model()
# cannot resolve the loss. The entry is harmless for checkpoints that do not
# reference it.
model = tf.models.load_model(
    'checkpoints/'+MODEL_SAVENAME+'.h5',
    custom_objects={'loss': loss},
)

In [None]:
# Make sure the checkpoint directory exists before the first save; not every
# TensorFlow version creates it on its own.
os.makedirs('checkpoints', exist_ok=True)
logging_callback = tf.callbacks.TensorBoard(log_dir='logs/'+MODEL_SAVENAME, update_freq=1000, profile_batch=0)
checkpoint_callback = tf.callbacks.ModelCheckpoint('checkpoints/'+MODEL_SAVENAME+'.h5', save_freq=200000)
# Train until the kernel is interrupted: fit() is restarted whenever it
# returns, and an interrupt saves the current weights before leaving the loop.
while True:
    try:
        model.fit(dataset, epochs=10000, callbacks=[logging_callback, checkpoint_callback], verbose=0)
    except KeyboardInterrupt:
        model.save(MODEL_SAVENAME+'intrpd.h5')
        break

In [None]:
import logging
from tensorflow import get_logger
# Set the level of the logger returned by TensorFlow to ERROR so that
# lower-severity messages are not printed by the cells below.
get_logger().setLevel(logging.ERROR)
from tensorflow import convert_to_tensor

In [None]:
GENERATION_STEPS = 200  # number of tokens generated after the prompt

# Take one batch and use its first sequence as the prompt.
ip = dataset.take(1)
for x, _ in ip:
    x = x[0].numpy()
    # Echo the prompt, starting a new line at every speaker change.
    for tkn in x:
        word = token2word[tkn]
        if word in ['[THEM]', '[ME]']:
            print()
        print(word, end=' ')
    print('!!GENERATION BEGINS!!')
    print()
    for _ in range(GENERATION_STEPS):
        y_pred = model.predict(convert_to_tensor(np.asarray([x])))
        # Greedy decoding: only the most probable token at the last position
        # is needed, so argmax replaces a full sort of the vocabulary.
        next_token = int(np.argmax(y_pred[0][-1]))
        next_word = token2word[next_token]
        if next_word in ['[THEM]', '[ME]']:
            print()
        print(next_word, end=' ')
        # Feed the prediction back in as context for the next step.
        x = np.append(x, next_token)
        
        

In [None]:
def tokenize_string(msg):
    """Convert a text string into an array of vocabulary token ids.

    Words that already are vocabulary entries (special tokens such as '[ME]',
    or words from previously generated text) are looked up unchanged. Every
    other word gets the same normalisation the corpus received: lower-casing,
    punctuation removal via ``translator`` and ``preproocess_message``.
    Previously the whole string went through ``preproocess_message`` only, so
    capitalised or punctuated words were never found and special tokens
    containing repeated letters were corrupted.

    Args:
        msg: text to tokenise, words separated by whitespace.

    Returns:
        1-D numpy array of token ids with the dtype of the corpus array.
        Words missing from the vocabulary are reported on stdout and skipped.
    """
    tokens = []
    # split() without arguments never yields empty strings.
    for raw_word in msg.split():
        if raw_word in word2token:
            tokens.append(word2token[raw_word])
            continue
        normalised = preproocess_message(raw_word.lower().translate(translator))
        for word in normalised.split():
            if word in word2token:
                tokens.append(word2token[word])
            else:
                print(word+' is out of vocabulary')
    # Reuse the corpus dtype so ids cannot wrap if it is wider than uint16.
    return np.asarray(tokens, dtype=all_chats_tokens_np.dtype)

In [None]:
# Interactive chat: the user speaks as [ME]; the model generates greedily
# until it predicts a [ME] token, at which point the user is prompted again.
stra = input('[ME]')
stra = '[ME] '+stra
totalstr = stra+' '  # running transcript of the conversation
tokens = tokenize_string(stra)
for _ in range(200):
    y_pred = model.predict(convert_to_tensor([tokens]))
    # Greedy decoding: most probable token at the last position.
    next_token = int(np.argmax(y_pred[0][-1]))
    next_word = token2word[next_token]
    if next_word == '[ME]':
        stra = input('[ME]')
        stra = '[ME] '+stra
        totalstr = totalstr+stra+' '
        # Tokenise only the new input and append it. Re-tokenising the whole
        # transcript pushed already-generated tokens through preprocessing
        # again, which could turn special tokens into out-of-vocabulary words.
        tokens = np.concatenate([tokens, tokenize_string(stra)])
    else:
        if next_word == '[THEM]':
            print()
        print(next_word, end=' ')
        totalstr = totalstr+next_word+' '
        tokens = np.append(tokens, next_token)
