In [22]:
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
# Load the regular expression library
import re
import seaborn as sns
# Import the wordcloud library
from wordcloud import WordCloud

In [2]:
import tensorflow_hub as hub
import tensorflow as tf
import bert
FullTokenizer = bert.bert_tokenization.FullTokenizer
from tensorflow.keras.models import Model
import math

In [3]:
max_seq_length = 128  # Your choice here.
input_word_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32,
                                       name="input_word_ids")
input_mask = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32,
                                   name="input_mask")
segment_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32,
                                    name="segment_ids")
bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",
                            trainable=True)
pooled_output, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])

In [4]:
model = Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=[pooled_output, sequence_output])

In [38]:
def prepare_input_seq(input_seq, tokenizer, max_len):
    tokens0 = tokenizer.tokenize(input_seq)
    for j,t in enumerate(tokens0):
        if t in [".","?","!"]:
            tokens0[j] = t+" [SEP]"
    tokens = []
    for t in tokens0:
        tokens+=t.split()
    if tokens[-1]!='[SEP]':
        tokens = ['[CLS]'] + tokens + ['[SEP]']
    else:
        tokens = ['[CLS]'] + tokens
    
    if len(tokens)>max_len:
        tokens = tokens[:max_len]
    
    return tokens

In [6]:
test_input = 'My cat ran away. How do I find it?'

In [9]:
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = FullTokenizer(vocab_file, do_lower_case)

In [10]:
tokens = prepare_input_seq(test_input, tokenizer)

In [11]:
tokens

['[CLS]',
 'my',
 'cat',
 'ran',
 'away',
 '.',
 '[SEP]',
 'how',
 'do',
 'i',
 'find',
 'it',
 '?',
 '[SEP]']

In [12]:
def get_masks(tokens, max_seq_length):
    """Mask for padding"""
    if len(tokens)>max_seq_length:
        raise IndexError("Token length more than max seq length!")
    return [1]*len(tokens) + [0] * (max_seq_length - len(tokens))


def get_segments(tokens, max_seq_length):
    """Segments: 0 for the first sequence, 1 for the second"""
    if len(tokens)>max_seq_length:
        raise IndexError("Token length more than max seq length!")
    segments = []
    current_segment_id = 0
    for token in tokens:
        segments.append(current_segment_id)
        if token == "[SEP]":
            current_segment_id = 1
    return segments + [0] * (max_seq_length - len(tokens))


def get_ids(tokens, tokenizer, max_seq_length):
    """Token ids from Tokenizer vocab"""
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_ids = token_ids + [0] * (max_seq_length-len(token_ids))
    return input_ids

In [14]:
input_ids = get_ids(tokens, tokenizer, max_seq_length)
input_masks = get_masks(tokens, max_seq_length)
input_segments = get_segments(tokens, max_seq_length)

In [15]:
pool_embs, all_embs = model.predict([[input_ids],[input_masks],[input_segments]])

In [21]:
pool_embs.shape

(1, 768)

In [18]:
all_embs.shape

(1, 128, 768)

In [23]:
post_data_topics = pd.read_csv('./data/post_data_topics.csv')
post_data_topics.head()

Unnamed: 0.1,Unnamed: 0,post_id,title,url,text,score,num_comments,created,alltext,Topic,TopicName
0,0,bhaof3,(Update) Neighbors child has disabilities and ...,https://www.reddit.com/r/legaladvice/comments/...,Original post: https://www.reddit.com/r/legala...,55763,4,4/25/2019 12:29,(Update) Neighbors child has disabilities and ...,1,Banking
1,1,6dh9vh,[Super Awesome Update] Sat down on my friend's...,https://www.reddit.com/r/legaladvice/comments/...,I never thought this would turn out the way it...,54473,2,5/26/2017 10:27,[Super Awesome Update] Sat down on my friend's...,2,Domestic
2,2,8brtfc,"I told my math teacher about my mother, and sh...",https://www.reddit.com/r/legaladvice/comments/...,I got my 13 year old brother after school yest...,40091,5,4/12/2018 13:09,"I told my math teacher about my mother, and sh...",2,Domestic
3,3,bieir5,[UPDATE] [PA] I followed and reported a drunk ...,https://www.reddit.com/r/legaladvice/comments/...,This is an update to my [original post](https:...,38858,3,4/28/2019 14:48,[UPDATE] [PA] I followed and reported a drunk ...,1,Banking
4,4,beuf9y,"UPDATE: My father and step mother died, leavin...",https://www.reddit.com/r/legaladvice/comments/...,[Here's my original post](https://www.reddit.c...,35337,4,4/18/2019 23:06,"UPDATE: My father and step mother died, leavin...",6,Shopping


In [24]:
post_data_topics.alltext.values.tolist()[0]

'(Update) Neighbors child has disabilities and won\'t stop climbing the fence, entering my house and "freeing" my dogs Original post: https://www.reddit.com/r/legaladvice/comments/b93dx3/neighbors_child_has_disabilities_and_wont_stop/\n\nI\'m sad about this update for the kid, I took the majority of the given advice and called the police non-emergency number and asked for an officer to come out because I had questions about an issue I\'m having and briefly explained it.\n\nOfficer shows up, I invite him in and explain the whole situation. He isn\'t really sure what to do about it right now so he asked for a supervisor come to his position. Supervisor shows up and I go over the whole thing again, we go upstairs to my computer where I have all the videos clipped and saved (9 times of her entering the backyard and 4 of her entering the house through the garage). \n\nWe watched each one and he gets on his cellphone and goes back to his car. A few minutes later he comes back with 3 flash dr

In [28]:
tokens = prepare_input_seq(post_data_topics.alltext.values.tolist()[0], tokenizer)

In [30]:
len(tokens)

733

In [31]:
input_ids = get_ids(tokens, tokenizer, max_seq_length)
input_masks = get_masks(tokens, max_seq_length)
input_segments = get_segments(tokens, max_seq_length)

IndexError: Token length more than max seq length!

In [34]:
tokens[:max_seq_length]

['[CLS]',
 '(',
 'update',
 ')',
 'neighbors',
 'child',
 'has',
 'disabilities',
 'and',
 'won',
 "'",
 't',
 'stop',
 'climbing',
 'the',
 'fence',
 ',',
 'entering',
 'my',
 'house',
 'and',
 '"',
 'freeing',
 '"',
 'my',
 'dogs',
 'original',
 'post',
 ':',
 'https',
 ':',
 '/',
 '/',
 'www',
 '.',
 '[SEP]',
 'red',
 '##dit',
 '.',
 '[SEP]',
 'com',
 '/',
 'r',
 '/',
 'legal',
 '##ad',
 '##vic',
 '##e',
 '/',
 'comments',
 '/',
 'b',
 '##9',
 '##3d',
 '##x',
 '##3',
 '/',
 'neighbors',
 '_',
 'child',
 '_',
 'has',
 '_',
 'disabilities',
 '_',
 'and',
 '_',
 'won',
 '##t',
 '_',
 'stop',
 '/',
 'i',
 "'",
 'm',
 'sad',
 'about',
 'this',
 'update',
 'for',
 'the',
 'kid',
 ',',
 'i',
 'took',
 'the',
 'majority',
 'of',
 'the',
 'given',
 'advice',
 'and',
 'called',
 'the',
 'police',
 'non',
 '-',
 'emergency',
 'number',
 'and',
 'asked',
 'for',
 'an',
 'officer',
 'to',
 'come',
 'out',
 'because',
 'i',
 'had',
 'questions',
 'about',
 'an',
 'issue',
 'i',
 "'",
 'm',
 'havi

In [36]:
len(tokenizer.convert_tokens_to_ids(tokens[:max_seq_length]))

128

In [37]:
len(input_masks)

128

## Sentence level embeddings, so use title only.

In [44]:
tokens = prepare_input_seq(post_data_topics.title.values.tolist()[5], tokenizer, max_seq_length)
len(tokens)

75

In [45]:
input_ids = get_ids(tokens, tokenizer, max_seq_length)
input_masks = get_masks(tokens, max_seq_length)
input_segments = get_segments(tokens, max_seq_length)

In [46]:
pool_embs, all_embs = model.predict([[input_ids],[input_masks],[input_segments]])

In [47]:
title_list = post_data_topics.title.values.tolist()

In [51]:
emb_mat = np.zeros([len(id2sent),768])

AttributeError: 'numpy.ndarray' object has no attribute 'append'

In [52]:
for i,j in enumerate(title_list[0:5]):
    print(i,j)

0 (Update) Neighbors child has disabilities and won't stop climbing the fence, entering my house and "freeing" my dogs
1 [Super Awesome Update] Sat down on my friend's MacBook and broke it, offered to pay her the money for it but now she wants more because "she was upgrading anyway" so it's this or she will sue me in small claims court.
2 I told my math teacher about my mother, and she got arrested.
3 [UPDATE] [PA] I followed and reported a drunk driver, then I got a ticket for driving past curfew
4 UPDATE: My father and step mother died, leaving me (19m) and my step sister (8f). My step sisters father, won't allow me visitations... I GOT CUSTODY!!!


In [55]:
def embed_titles(titles, model, max_seq_length):
    emb_mat = np.zeros([len(titles),768])
    for i, title in enumerate(titles):
        tokens=prepare_input_seq(title, tokenizer, max_seq_length)
        input_ids = get_ids(tokens, tokenizer, max_seq_length)
        input_masks = get_masks(tokens, max_seq_length)
        input_segments = get_segments(tokens, max_seq_length)
        pool_embs, all_embs = model.predict([[input_ids],[input_masks],[input_segments]])
        emb_mat[i,:] = pool_embs
    return emb_mat

In [58]:
embed_titles(title_list[0:5], model, max_seq_length).shape

(5, 768)

In [60]:
titles_embedded = embed_titles(title_list, model, max_seq_length)

In [62]:
np.save('./data/bert_embeddings_title.npy', titles_embedded)

In [63]:
titles_embedded.shape

(17133, 768)

In [64]:
titles_embedded_df=pd.DataFrame(titles_embedded,index=post_data_topics.index)

In [65]:
# save
titles_embedded_df.to_csv('./data/bert_embeddings.csv', index = True)

In [66]:
# save models
model.save('./models/bert_128.h5')

In [67]:
import pickle
pickle.dump(tokenizer, open('./models/tokenizer_bert_128.pkl', 'wb'))

## Cosine similarity

In [68]:
def embed_input(title, model, max_seq_length):
    tokens=prepare_input_seq(title, tokenizer, max_seq_length)
    input_ids = get_ids(tokens, tokenizer, max_seq_length)
    input_masks = get_masks(tokens, max_seq_length)
    input_segments = get_segments(tokens, max_seq_length)
    pool_embs, all_embs = model.predict([[input_ids],[input_masks],[input_segments]])
    return pool_embs

In [69]:
test_title = 'Cat ran away. How do I find it?'

In [73]:
test_embedding=embed_input(test_title, model, max_seq_length)
test_embedding.shape

(1, 768)