# Imports

In [1]:
# Widen the notebook's `.container` element to 90% of the browser window.
# `display` and `HTML` come from the public `IPython.display` module; importing
# `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
import pandas as pd
import numpy as np

import tqdm
# Hook tqdm into pandas: later cells call `Series.progress_apply`, which is
# only available after this call has run.
tqdm.tqdm.pandas()

# Pull posts data from Snowflake

In [4]:
from utils.get_data import get_posts

In [5]:
# SQL used to fetch the posts: distinct published rows of type 'post' for the
# 'WP' site, capped at 20,000 rows.
# - The content column has the literal 'post_id: ' plus the post id appended to it.
# - The id column is returned as '<post_id>_<site_prefix>'.
# NOTE(review): the second select alias reuses the name post_id, so
# ORDER BY POST_ID presumably sorts on the concatenated string rather than the
# numeric id (the preview output lists 99918_WP before 100098_WP) -- confirm
# this ordering is intended.
query = """
    SELECT distinct post_content || 'post_id: ' || post_id as post_content
        , post_id || '_' || site_prefix post_id
    FROM prod."posts"
    WHERE post_status = 'publish' and post_type = 'post'
    and site_prefix = 'WP'
    ORDER BY POST_ID DESC
    LIMIT 20000
"""

In [6]:
# Run the query through the project helper and keep the result as `posts_df`,
# the frame every later cell works on (presumably a pandas DataFrame with
# POST_CONTENT and POST_ID columns, judging by how it is used below).
posts_df = get_posts(query)

In [7]:
# Show the fetched frame as the cell output to eyeball the content and id columns.
posts_df

Unnamed: 0,POST_CONTENT,POST_ID
0,"As a big sister to four very sweet, hilarious,...",99918_WP
1,I spent my childhood growing up on a farm/ranc...,99917_WP
2,When am I going to get better at managing the ...,99841_WP
3,I know I am not alone in struggling to find a ...,99838_WP
4,"If you:\r\n<ul>\r\n \t<li>are new to migraine,...",99836_WP
...,...,...
3468,We asked our contributors a series of question...,100098_WP
3469,I feel as though I am always walking a fine li...,100073_WP
3470,"I have used all <a href=""/migraine-treatment/t...",100062_WP
3471,and flooded a shared back hallway between a re...,100037_WP


# Cleaning Post Text

In [8]:
from utils.clean_text import remove_html, remove_between_square_brackets, remove_post_id, remove_backslash_symbols, remove_links

In [9]:
# Clean the post text by running each helper over POST_CONTENT in turn.
# The order is kept exactly as listed; every pass overwrites the column with
# its result and shows its own progress bar.
# The helpers are passed directly instead of being wrapped in
# `lambda x: helper(x)`, which added nothing but an extra call per row.
cleaning_steps = (
    remove_html,
    remove_between_square_brackets,
    remove_post_id,
    remove_backslash_symbols,
    remove_links,
)
for cleaning_step in cleaning_steps:
    posts_df["POST_CONTENT"] = posts_df["POST_CONTENT"].progress_apply(cleaning_step)

100%|██████████| 3473/3473 [00:03<00:00, 1002.25it/s]
100%|██████████| 3473/3473 [00:00<00:00, 111135.15it/s]
100%|██████████| 3473/3473 [00:00<00:00, 100346.62it/s]
100%|██████████| 3473/3473 [00:00<00:00, 63186.74it/s]
100%|██████████| 3473/3473 [00:00<00:00, 106396.26it/s]


# KeyBERT

In [10]:
from keybert import KeyBERT

In [11]:
# Build the keyword extractor with KeyBERT's defaults (no arguments are passed).
kw_extraction_model = KeyBERT()

In [12]:
# Baseline extraction for the post stored under index label 706: phrases of
# one to three words, the 'english' stop-word setting, top 20 results.
kw_extraction_model.extract_keywords(
    posts_df["POST_CONTENT"][706],
    keyphrase_ngram_range=(1, 3),
    stop_words="english",
    top_n=20,
)

[('friends wedding', 0.6171),
 ('family wedding', 0.6143),
 ('wedding', 0.6008),
 ('quirky love weddings', 0.5975),
 ('family wedding pittsburgh', 0.5806),
 ('wedding trying good', 0.5768),
 ('present friends wedding', 0.5756),
 ('wedding pittsburgh', 0.5654),
 ('wedding trying', 0.5639),
 ('road family wedding', 0.5594),
 ('wedding yesterday evening', 0.5592),
 ('friends wedding vows', 0.5548),
 ('wedding yesterday', 0.5526),
 ('weddings', 0.5516),
 ('wedding hour outside', 0.5505),
 ('wedding pittsburgh ll', 0.5286),
 ('wedding hour', 0.5285),
 ('weddings really couple', 0.5281),
 ('love weddings', 0.5277),
 ('arrive wedding', 0.5205)]

In [13]:
# Same post (index label 706), this time with Max Sum selection switched on:
# 20 candidates are considered and 5 phrases are returned.
kw_extraction_model.extract_keywords(
    posts_df["POST_CONTENT"][706],
    keyphrase_ngram_range=(1, 3),
    stop_words="english",
    use_maxsum=True,
    nr_candidates=20,
    top_n=5,
)

[('wedding pittsburgh ll', 0.0817),
 ('wedding hour outside', 0.084),
 ('road family wedding', -0.0621),
 ('present friends wedding', 0.1843),
 ('wedding trying good', 0.0202)]

In [14]:
# MMR-based extraction (diversity 0.5) for the post stored under index label
# 97, ordered from the highest score to the lowest.
kw_extracted = sorted(
    kw_extraction_model.extract_keywords(
        posts_df["POST_CONTENT"][97],
        keyphrase_ngram_range=(1, 3),
        stop_words="english",
        use_mmr=True,
        diversity=0.5,
        top_n=20,
    ),
    key=lambda scored: scored[1],
    reverse=True,
)

# One (phrase, score) tuple per line.
for scored_phrase in kw_extracted:
    print(scored_phrase)

('trigger migraines dog', 0.5954)
('adopting dog horrible', 0.528)
('idea getting dogs', 0.5079)
('dog mean stress', 0.4228)
('care dog', 0.4041)
('hankering dog asked', 0.3667)
('elderly cats great', 0.3545)
('advice heart thought', 0.2985)
('gamble said dog', 0.2729)
('feeling family missing', 0.2705)
('need extra calming', 0.2453)
('does bark pull', 0.2334)
('behavioral issues thought', 0.2228)
('bed struggling severe', 0.2097)
('severe nausea vomiting', 0.183)
('laugh smile pain', 0.1588)
('great joy wellness', 0.1531)
('sidelined career pain', 0.1412)
('fence outside spend', 0.095)
('dice years consideration', 0.0217)


# API call

In [8]:
import requests
import json

In [9]:
# Request body for the keyword service, built from the first two posts only:
# their ids, their text, and the number of keywords requested per post.
first_two_posts = posts_df.head(2)
payload = {
    "keywords_number": 2,
    "post_ids": first_two_posts["POST_ID"].tolist(),
    "posts_content": first_two_posts["POST_CONTENT"].tolist(),
}

In [16]:
%%time
response = requests.post('http://127.0.0.1:8000/api/autokeyword', json = payload)
response

CPU times: user 5.85 ms, sys: 3.69 ms, total: 9.54 ms
Wall time: 3.88 s


<Response [200]>

In [17]:
# Keywords returned by the service, one list per submitted post.
# `response.json()` decodes the JSON body from the raw bytes instead of going
# through the guessed text encoding of `response.text`.
response.json()['keywords']

[['siblings kids migraine', 'facilitate exciting environments'],
 ['animals challenge ranch', 'young age attached']]

In [18]:
# Post ids echoed back by the service, to be compared with the ids that were sent.
# `response.json()` decodes the JSON body from the raw bytes instead of going
# through the guessed text encoding of `response.text`.
response.json()["post_ids"]

['99918_WP', '99917_WP']