# Reddit Scraper and Transformer-Based Chatbot

In [None]:
import os
import io
import re
import errno
import time
import csv
import calmap
import praw
import string
import unicodedata
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pytz import timezone
from collections import defaultdict

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import *
from tensorflow.keras.layers import *
from tensorflow.keras.losses import *
from tensorflow.keras import backend as K
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import plot_model
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import optimizers, models
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras.mixed_precision import experimental as mixed_precision
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

# Clear the Keras backend session so re-running this cell starts from a fresh state.
K.clear_session()

# GPU devices visible to TensorFlow (an empty list means none were found).
physical_devices = tf.config.experimental.list_physical_devices("GPU")

def enable_amp():
    """Install a ``mixed_float16`` policy through ``mixed_precision.set_policy``.

    Author's note: automatic mixed precision gives around a 2x training
    speed-up and allows a 2x larger batch size, but it requires the final
    layer to be manually cast to float32.
    """
    mixed_precision.set_policy(mixed_precision.Policy("mixed_float16"))
    
# Report the TensorFlow version and the GPUs detected above.
print("Tensorflow version: ", tf.__version__)
print(physical_devices)
# Mixed precision is left disabled; uncomment the call below to enable it.
# enable_amp()

## Reddit Scraper

In [None]:
# print('Authenticating...')
# reddit_obj = praw.Reddit(client_id="", client_secret="",
#                          user_agent="",
#                          username="",
#                          password="")
# print('Authenticated as /u/{}'.format(reddit_obj.user.me()))

In [None]:
# Where the scraped data lives. `directory` already ends with a path
# separator, so file names are appended to it directly.
subreddit_name = 'spacex'
directory = "C:\\Users\\camla\\PythonProjects\\tensorflow_stuff\\NLP\\reddit_data\\"
headlines_file_path = f"{directory}{subreddit_name}_posts_8_3_20.csv"
comments_file_path = f"{directory}{subreddit_name}_comments.txt"

In [None]:
def scrape_posts(reddit, subreddit, headlines_file):
    """Append the subreddit's top posts of all time to a CSV file.

    Up to 1000 submissions are requested. Each one becomes a CSV row with the
    columns, in order: title, author, created_utc, score, domain,
    repr(selftext), id, upvote_ratio.

    Args:
        reddit: Reddit client exposing ``subreddit(name).top(...)``
            (presumably a ``praw.Reddit`` instance -- confirm with caller).
        subreddit: Name of the subreddit to read.
        headlines_file: Path of the CSV file to append to. Its parent
            directory is created when missing.
    """
    out_dir = os.path.dirname(headlines_file)
    if out_dir:
        # exist_ok removes the check-then-create race; the guard avoids
        # calling makedirs("") when only a bare file name is given.
        os.makedirs(out_dir, exist_ok=True)

    # newline="" hands line-ending control to the csv module; without it
    # every row is followed by a blank row on Windows.
    with open(headlines_file, "a", encoding="utf-8", newline="") as outfile:
        writer = csv.writer(outfile)
        for submission in reddit.subreddit(subreddit).top('all', limit=1000):
            print("\r \n", submission.title, end='')
            writer.writerow([
                submission.title,
                submission.author,
                submission.created_utc,
                submission.score,
                submission.domain,
                "%r" % submission.selftext,
                submission.id,
                submission.upvote_ratio,
            ])
            # Pause between submissions to stay gentle on the API.
            time.sleep(1)
                
                
def get_comments(reddit, headlines_file, comments_file, stop_post_id="6am00f"):
    """Append comment bodies for each scraped post to a text file.

    Post ids are read from the 7th column of the CSV at ``headlines_file``.
    For every post, at most the first 100 entries of the flattened comment
    list are examined; ``[deleted]`` / ``[removed]`` bodies and unresolved
    ``MoreComments`` placeholders are skipped. Every kept body is written to
    ``comments_file`` followed by a newline.

    Args:
        reddit: Reddit client exposing ``submission(id=...)`` (presumably a
            ``praw.Reddit`` instance -- confirm with caller).
        headlines_file: CSV file to read post ids from.
        comments_file: Text file to append to. Its parent directory is
            created when missing.
        stop_post_id: Processing ends when this post id is reached; that
            post and every later row are not fetched. The default is the id
            that used to be hard-coded.
            NOTE(review): the original flag was called ``resume_flag``,
            which suggests "skip until this id" may have been intended --
            confirm which direction is wanted.
    """
    out_dir = os.path.dirname(comments_file)
    if out_dir:
        # The output location is the one that has to exist; the input CSV
        # must already be there for the read below to succeed.
        os.makedirs(out_dir, exist_ok=True)

    print("Fetching comments ...")
    with open(headlines_file, "r", encoding="utf-8", newline="") as infile:
        # Open the output once instead of once per comment.
        with open(comments_file, "a", encoding="utf-8") as outfile:
            row_counter = 0

            for row in csv.reader(infile):
                print("\r post count:%s" % (row_counter), end='')

                # Blank rows appear when the CSV was written without newline="".
                if not row:
                    continue

                post_id = str(row[6])
                row_counter += 1

                if post_id == stop_post_id:
                    break

                submission = reddit.submission(id=post_id)
                submission.comments.replace_more(limit=30, threshold=10)

                for seen, comment in enumerate(submission.comments.list()):
                    if seen >= 100:
                        break

                    # Placeholders left over after replace_more carry no body.
                    if isinstance(comment, praw.models.MoreComments):
                        continue

                    comment_str = comment.body
                    if comment_str in ("[deleted]", "[removed]"):
                        continue

                    outfile.write(comment_str)
                    # One trailing newline separates consecutive comments.
                    outfile.write("\n")

                # Rate-limit: pause after every fetched post.
                time.sleep(1)
            
def clear_file(file_path):
    """Truncate ``file_path`` to zero length, creating it if it does not exist."""
    # Opening in write mode truncates the file; the context manager closes
    # the handle, so no explicit close() is needed.
    with open(file_path, "w"):
        pass

In [None]:
# scrape_posts(reddit_obj, subreddit_name, headlines_file_path)
# NOTE(review): in the visible source `reddit_obj` is only created by the
# commented-out authentication cell, so this call raises NameError unless
# that cell is uncommented and run first.
get_comments(reddit_obj, headlines_file_path, comments_file_path)

## Preprocessing

In [None]:
def unicode_to_ascii(s):
    """Strip combining marks (accents) from ``s``.

    The string is NFD-normalized and every character whose Unicode category
    is ``Mn`` is dropped. Characters without a decomposition are kept as
    they are, so the result is not guaranteed to be pure ASCII.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)

def preprocess_sentence(w):
    """Normalize one raw comment into lowercase, space-separated tokens.

    The text is lowercased, trimmed and passed through ``unicode_to_ascii``;
    then three regex substitutions are applied in order. No start/end
    markers are added.
    """
    text = unicode_to_ascii(w.lower().strip())

    substitutions = (
        # Put spaces around sentence punctuation:
        # "he is a boy." => "he is a boy ."
        (r"([?.!,¿])", r" \1 "),
        # Collapse runs of spaces and double quotes into one space.
        (r'[" "]+', " "),
        # Everything except letters and ? . ! , ¿ becomes a single space.
        (r"[^a-zA-Z?.!,¿]+", " "),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    return text.strip()

def create_dataset(path):
    """Load the comments file and return its lines after preprocessing.

    Args:
        path: UTF-8 text file to read, split on newlines.

    Returns:
        A list of strings, one ``preprocess_sentence`` result per non-blank
        line. The number of entries is printed as a side effect.
    """
    # The context manager closes the handle; the bare open() used to leak it.
    with io.open(path, encoding='UTF-8') as infile:
        lines = infile.read().strip().split('\n')

    # Many entries are empty or just whitespace; drop them before preprocessing.
    comments = [preprocess_sentence(line) for line in lines if line.strip()]
    print(len(comments))
    return comments

In [None]:
# Vocabulary cap: only the most frequent words are considered.
vocab_size = 2 ** 15
# Maximum sequence length, in tokens.
maxlen = 128
# Embedding size for each token.
embed_dim = 384
# Number of attention heads (original note beside this value: 12).
num_heads = 6
# Hidden layer size of the feed-forward network inside the transformer.
feed_forward_dim = embed_dim * 4
batch_size = 64

In [None]:
# Load the preprocessed comments and wrap them in a shuffled (buffer of
# 50000), batched dataset of raw strings.
comments = np.array(create_dataset(comments_file_path))
text_ds = (
    tf.data.Dataset.from_tensor_slices(comments)
    .shuffle(buffer_size=50000)
    .batch(batch_size)
)

In [None]:
def custom_standardization(input_string):
    """Lowercase the text, replace ``<br />`` tags with a space and insert a
    space before each punctuation character matched by the pattern."""
    text = tf.strings.lower(input_string)
    text = tf.strings.regex_replace(text, "<br />", " ")
    punctuation_pattern = f"([{string.punctuation}])"
    return tf.strings.regex_replace(text, punctuation_pattern, r" \1")


def prepare_lm_inputs_labels(text):
    """Build (input, target) token tensors for next-word prediction.

    The text is tokenized with the module-level ``vectorize_layer``. The
    target is the input shifted by one position, so the label for position
    i is the token at position i + 1 and the model predicts each next word
    from the words up to position i.
    """
    tokens = vectorize_layer(tf.expand_dims(text, -1))
    # Inputs drop the last token, targets drop the first.
    return tokens[:, :-1], tokens[:, 1:]


# Create the vectorization layer and adapt it to the text.
# Sequences are maxlen + 1 tokens long because prepare_lm_inputs_labels
# slices one token off to form the input and the shifted target.
vectorize_layer = TextVectorization(
    standardize=custom_standardization,
    max_tokens=vocab_size - 1,
    output_mode="int",
    output_sequence_length=maxlen + 1,
)
vectorize_layer.adapt(text_ds)
# Index -> word list, used to turn token indices back into words.
vocab = vectorize_layer.get_vocabulary()


text_ds = text_ds.map(prepare_lm_inputs_labels).prefetch(
    tf.data.experimental.AUTOTUNE
)

In [None]:
# Sanity check: take one batch and print the input and target token ids of
# its third example (index 2).
for inp, tar in text_ds.take(1):
    print(inp[2], tar[2])

In [None]:
class MultiHeadSelfAttention(layers.Layer):
    """Causal multi-head self-attention.

    Inputs are projected to queries, keys and values, split into
    ``num_heads`` heads of size ``embed_dim // num_heads``, attended under a
    causal mask, then merged again by a final dense projection.
    """

    def __init__(self, embed_dim, num_heads=8):
        super().__init__()
        if embed_dim % num_heads != 0:
            raise ValueError(
                f"embedding dimension = {embed_dim} should be divisible by number of heads = {num_heads}"
            )
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.projection_dim = embed_dim // num_heads
        self.query_dense = layers.Dense(embed_dim)
        self.key_dense = layers.Dense(embed_dim)
        self.value_dense = layers.Dense(embed_dim)
        self.combine_heads = layers.Dense(embed_dim)

    @staticmethod
    def causal_attention_mask(n_dest, n_src, dtype):
        """Return an (n_dest, n_src) mask with 1's in the lower triangle,
        counting from the lower right corner, cast to ``dtype``."""
        rows = tf.range(n_dest)[:, None]
        cols = tf.range(n_src)
        allowed = rows >= cols - n_src + n_dest
        return tf.cast(allowed, dtype)

    def attention(self, query, key, value):
        """Masked scaled dot-product attention, computed in float32.

        Returns a pair ``(output, weights)``.
        """
        dim_key = tf.cast(tf.shape(key)[-1], tf.float32)
        raw_score = tf.cast(tf.matmul(query, key, transpose_b=True), tf.float32)
        scaled_score = raw_score / tf.math.sqrt(dim_key)

        # Block information flow from future tokens: masked-out positions
        # get a large negative score so softmax pushes them towards zero.
        score_shape = tf.shape(scaled_score)
        n_dest, n_src = score_shape[2], score_shape[3]
        mask = tf.reshape(
            self.causal_attention_mask(n_dest, n_src, scaled_score.dtype),
            [1, 1, n_dest, n_src],
        )
        masked_score = scaled_score * mask - 1e4 * (1 - mask)

        weights = tf.nn.softmax(masked_score, axis=-1)
        output = tf.matmul(weights, tf.cast(value, tf.float32))
        return output, weights

    def separate_heads(self, x, batch_size):
        """Reshape to (batch_size, -1, num_heads, projection_dim) and move
        the head axis in front of the sequence axis."""
        split = tf.reshape(x, (batch_size, -1, self.num_heads, self.projection_dim))
        return tf.transpose(split, perm=[0, 2, 1, 3])

    def call(self, inputs):
        """Apply self-attention to ``inputs`` of shape
        (batch_size, seq_len, embed_dim); the output has the same shape."""
        n_batch = tf.shape(inputs)[0]

        # Project, then split into heads:
        # (batch_size, seq_len, embed_dim)
        #   -> (batch_size, num_heads, seq_len, projection_dim)
        query = self.separate_heads(self.query_dense(inputs), n_batch)
        key = self.separate_heads(self.key_dense(inputs), n_batch)
        value = self.separate_heads(self.value_dense(inputs), n_batch)

        attended, _ = self.attention(query, key, value)
        # Back to (batch_size, seq_len, num_heads, projection_dim) ...
        attended = tf.transpose(attended, perm=[0, 2, 1, 3])
        # ... and merge the heads: (batch_size, seq_len, embed_dim)
        merged = tf.reshape(attended, (n_batch, -1, self.embed_dim))
        return self.combine_heads(merged)

In [None]:
class TransformerBlock(layers.Layer):
    """One transformer block: self-attention and a point-wise feed-forward
    network, each followed by dropout, a residual add and layer
    normalization."""

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super().__init__()
        self.att = MultiHeadSelfAttention(embed_dim, num_heads)
        self.ffn = Sequential([Dense(ff_dim, activation="relu"), Dense(embed_dim)])
        self.layernorm1 = LayerNormalization(epsilon=1e-6)
        self.layernorm2 = LayerNormalization(epsilon=1e-6)
        self.dropout1 = Dropout(rate)
        self.dropout2 = Dropout(rate)

    def call(self, inputs):
        # Multi-head attention sub-layer, then add & norm.
        attended = self.dropout1(self.att(inputs))
        normed = self.layernorm1(inputs + attended)

        # Point-wise feed-forward sub-layer, then add & norm.
        transformed = self.dropout2(self.ffn(normed))
        return self.layernorm2(normed + transformed)

In [None]:
class TokenAndPositionEmbedding(layers.Layer):
    """Sum of a token embedding and a learned position embedding."""

    def __init__(self, maxlen, vocab_size, embed_dim):
        super().__init__()
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        # Positions 0 .. seq_len - 1, taken from the last axis of the input.
        seq_len = tf.shape(x)[-1]
        position_vectors = self.pos_emb(tf.range(start=0, limit=seq_len, delta=1))
        token_vectors = self.token_emb(x)
        return token_vectors + position_vectors

In [None]:
def create_model():
    """Build and compile the single-block transformer language model.

    Sizes come from the module-level ``maxlen``, ``vocab_size``,
    ``embed_dim``, ``num_heads`` and ``feed_forward_dim``.

    Returns:
        A compiled ``keras.Model`` with two outputs: the vocabulary logits
        and the transformer block's output. Only the logits carry a loss.
    """
    inputs = layers.Input(shape=(maxlen,), dtype=tf.int32)

    x = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)(inputs)
    x = TransformerBlock(embed_dim, num_heads, feed_forward_dim)(x)

    # The output layer is pinned to float32.
    outputs = layers.Dense(vocab_size, dtype='float32')(x)

    model = keras.Model(inputs=inputs, outputs=[outputs, x])

    # No loss and optimization based on the word embeddings coming out of
    # the transformer block, hence None for the second output.
    model.compile(
        "adam",
        loss=[tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), None],
    )
    return model

In [None]:
class TextGenerator(keras.callbacks.Callback):
    """Callback to generate text from trained model.
    1. Feed some starting prompt to the model
    2. Predict probabilities for next token
    3. Sample next token and add it to the next input

    # Arguments
        max_tokens: Integer, the number of tokens to be generated after prompt.
        start_tokens: List of integers, the token indices for the starting prompt.
        index_to_word: List of strings, obtained from TextVectorization layer.
        top_k: Integer, sample from the `top_k` token predictions.
        print_every: Integer, print after this many epochs.
    """

    def __init__(self, max_tokens, start_tokens, index_to_word, top_k=10, print_every=1):
        # Let the base Callback set up its own state before adding ours.
        super().__init__()
        self.max_tokens = max_tokens
        self.start_tokens = start_tokens
        self.index_to_word = index_to_word
        self.print_every = print_every
        self.k = top_k

    def sample_from(self, logits):
        """Sample one token index from the ``top_k`` highest logits, with
        probabilities given by a softmax over those logits."""
        logits, indices = tf.math.top_k(logits, k=self.k, sorted=True)
        indices = np.asarray(indices).astype("int32")
        preds = keras.activations.softmax(tf.expand_dims(logits, 0))[0]
        preds = np.asarray(preds).astype("float32")
        return np.random.choice(indices, p=preds)

    def detokenize(self, number):
        """Map a token index back to its word."""
        return self.index_to_word[number]

    def on_epoch_end(self, epoch, logs=None):
        """Generate ``max_tokens`` tokens after the prompt and print the text.

        Runs only on epochs where ``(epoch + 1)`` is a multiple of
        ``print_every``. Uses the module-level ``maxlen`` as the model's
        input length.
        """
        if (epoch + 1) % self.print_every != 0:
            return

        context = list(self.start_tokens)
        tokens_generated = []
        # Strict "<" so exactly max_tokens tokens are produced.
        while len(tokens_generated) < self.max_tokens:
            # The model takes exactly `maxlen` tokens. Keep the most recent
            # ones when the context is too long (so generation keeps moving
            # forward) and zero-pad on the right when it is too short.
            window = context[-maxlen:]
            sample_index = max(len(window) - 1, 0)
            x = np.array([window + [0] * (maxlen - len(window))])
            y, _ = self.model.predict(x, verbose=0)
            sample_token = self.sample_from(y[0][sample_index])
            tokens_generated.append(sample_token)
            context.append(sample_token)

        txt = " ".join(
            self.detokenize(token)
            for token in list(self.start_tokens) + tokens_generated
        )
        print(f"generated text:\n{txt}\n")


# Tokenize starting prompt
word_to_index = {word: index for index, word in enumerate(vocab)}

start_prompt = "Unpopular opinion: "
# The vocabulary was built from text that went through preprocess_sentence
# (lowercased, punctuation split off), so the prompt has to be normalized
# the same way; otherwise "Unpopular" and "opinion:" miss the vocabulary and
# both fall back to index 1 (presumably the layer's out-of-vocabulary
# token -- confirm against the TextVectorization docs).
start_tokens = [
    word_to_index.get(word, 1) for word in preprocess_sentence(start_prompt).split()
]
num_tokens_generated = 128
text_gen_callback = TextGenerator(num_tokens_generated, start_tokens, vocab)

In [None]:
class CustomSchedule(keras.optimizers.schedules.LearningRateSchedule):
    """Learning-rate schedule with a linear warm-up and inverse-sqrt decay.

    lr(step) = d_model ** -0.5 * min(step ** -0.5, step * warmup_steps ** -1.5)

    Args:
        d_model: Scaling dimension; the rate is multiplied by
            ``rsqrt(d_model)``.
        warmup_steps: Step at which the two branches of the ``min`` meet.
    """

    def __init__(self, d_model, warmup_steps=4000):
        super().__init__()

        # Keep the plain value for get_config; the tensor is used in the math.
        self._d_model_value = d_model
        self.d_model = tf.cast(d_model, tf.float32)

        self.warmup_steps = warmup_steps

    def __call__(self, step):
        """Return the learning rate for ``step`` (scalar or tensor)."""
        # Optimizers pass an integer step counter, but rsqrt needs floats.
        step = tf.cast(step, tf.float32)
        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps ** -1.5)

        return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

    def get_config(self):
        """Return the constructor arguments so the schedule can be serialized."""
        return {"d_model": self._d_model_value, "warmup_steps": self.warmup_steps}

In [None]:
# Preview the learning-rate schedule over the first 50,000 training steps.
# NOTE(review): in the visible source this schedule is only plotted;
# create_model compiles with the plain "adam" optimizer.
temp_learning_rate_schedule = CustomSchedule(10000, warmup_steps=5000)

plt.figure(figsize=(15,6))
plt.plot(temp_learning_rate_schedule(tf.range(50000, dtype=tf.float32)))
plt.ylabel("Learning Rate")
plt.xlabel("Train Step")

In [None]:
# Build and compile the language model from the hyperparameters above.
model = create_model()

In [None]:
# Train for 30 epochs; text_gen_callback prints sample text at epoch end.
model.fit(text_ds, verbose=1, epochs=30 , callbacks=[text_gen_callback])

In [None]:
# Save the trained weights (weights only, not the full model).
# NOTE(review): the file name says "worldpolitics" while subreddit_name is
# 'spacex' -- confirm the intended name.
model.save_weights("worldpolitics_bot.h5")