# Imports

In [1]:
# Widen the notebook page to 90% of the browser window by injecting a CSS override.
# `display` is imported from the public `IPython.display` module: importing it from
# `IPython.core.display` is deprecated.
# NOTE(review): the rule only has an effect if the front end wraps the page in an element
# with class `container` — confirm it still matches in the Jupyter UI you use.
from IPython.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
import pandas as pd
import numpy as np

import tqdm
# Enable tqdm's pandas integration for this kernel session.
# NOTE(review): none of the visible cells use a tqdm progress method on a pandas object —
# confirm this hook is still needed.
tqdm.tqdm.pandas()

# Pull posts data from Snowflake

In [4]:
from utils.get_data import get_posts

In [5]:
# SQL passed to get_posts(): distinct published rows of type 'post' for site_prefix 'WP',
# capped at 20000 rows.
#   - post_content: the stored content with the literal 'post_id: ' and the post id appended.
#   - post_id: '<post_id>_<site_prefix>' (shown as e.g. 99918_WP in the displayed frame).
# NOTE(review): the displayed frame starts at 99918_WP and ends at 100037_WP, which looks like
# a descending *string* sort rather than a numeric one — confirm what ORDER BY POST_ID
# resolves to before relying on LIMIT to keep the most recent posts.
# NOTE(review): because 'post_id: <id>' is appended to the content, that text is also part of
# whatever is later fed to keyword extraction — confirm this is intended.
query = """
    SELECT distinct post_content || 'post_id: ' || post_id as post_content
        , post_id || '_' || site_prefix post_id
    FROM prod."posts"
    WHERE post_status = 'publish' and post_type = 'post'
    and site_prefix = 'WP'
    ORDER BY POST_ID DESC
    LIMIT 20000
"""

In [6]:
# Run the query through the project helper imported from utils.get_data.
# The result is used below as a DataFrame with POST_CONTENT and POST_ID columns.
posts_df = get_posts(query)

In [7]:
# Bare expression so the frame is rendered as the cell output (long frames are elided with "...").
posts_df

Unnamed: 0,POST_CONTENT,POST_ID
0,"As a big sister to four very sweet, hilarious,...",99918_WP
1,I spent my childhood growing up on a farm/ranc...,99917_WP
2,When am I going to get better at managing the ...,99841_WP
3,I know I am not alone in struggling to find a ...,99838_WP
4,"If you:\r\n<ul>\r\n \t<li>are new to migraine,...",99836_WP
...,...,...
3467,We asked our contributors a series of question...,100098_WP
3468,I feel as though I am always walking a fine li...,100073_WP
3469,"I have used all <a href=""/migraine-treatment/t...",100062_WP
3470,and flooded a shared back hallway between a re...,100037_WP


# KeyBERT

In [8]:
from keybert import KeyBERT

In [9]:
# Build the keyword extractor with KeyBERT's defaults (no arguments are passed);
# the same instance is reused by every extract_keywords call below.
kw_extraction_model = KeyBERT()

In [10]:
# Baseline run on one post: phrases of 1 to 3 words, stop_words='english', top 20 results.
# NOTE(review): the post is passed exactly as stored, markup included — the output contains
# markup-derived phrases such as 'href blog migraine'.
sample_post = posts_df.POST_CONTENT[3]
baseline_options = dict(keyphrase_ngram_range=(1, 3), stop_words='english', top_n=20)
kw_extraction_model.extract_keywords(sample_post, **baseline_options)

[('href blog migraine', 0.6577),
 ('migraine specialists href', 0.6382),
 ('exercise migraine trigger', 0.5994),
 ('exercise migraine', 0.5611),
 ('deal exercise migraine', 0.5499),
 ('href blog exercise', 0.5102),
 ('blog migraine triggers', 0.4967),
 ('blog migraine', 0.4926),
 ('pose migraine com', 0.4917),
 ('migraine trigger trigger', 0.4848),
 ('migraine trigger', 0.4786),
 ('act doing href', 0.4719),
 ('exacerbate migraine pain', 0.4694),
 ('migraine triggers ways', 0.4583),
 ('migraine com', 0.4522),
 ('like pose migraine', 0.4519),
 ('doing href blog', 0.4517),
 ('exacerbate migraine', 0.4493),
 ('pose migraine', 0.4472),
 ('doing href', 0.4414)]

In [11]:
# Same post and n-gram range as the baseline run, but with use_maxsum=True:
# 20 candidates are considered (nr_candidates) and 5 are returned (top_n).
sample_post = posts_df.POST_CONTENT[3]
maxsum_options = dict(
    keyphrase_ngram_range=(1, 3),
    stop_words='english',
    use_maxsum=True,
    nr_candidates=20,
    top_n=5,
)
kw_extraction_model.extract_keywords(sample_post, **maxsum_options)

[('doing href blog', -0.1063),
 ('like pose migraine', 0.0503),
 ('act doing href', -0.0075),
 ('migraine trigger trigger', 0.0559),
 ('deal exercise migraine', -0.0441)]

In [12]:
# Same post and n-gram range as the baseline run, but with use_mmr=True and diversity=0.5,
# returning 20 results.
# NOTE(review): the raw post is still passed in, and the output includes markup-derived
# phrases such as 'script script src' and 'position absolute width'.
sample_post = posts_df.POST_CONTENT[3]
mmr_options = dict(
    keyphrase_ngram_range=(1, 3),
    stop_words='english',
    use_mmr=True,
    diversity=0.5,
    top_n=20,
)
kw_extraction_model.extract_keywords(sample_post, **mmr_options)

[('href blog migraine', 0.6577),
 ('ways deal exercise', 0.3324),
 ('unique path', 0.3534),
 ('triggers ways', 0.2805),
 ('exercise migraine trigger', 0.5994),
 ('script script src', 0.2188),
 ('heart rate research', 0.2433),
 ('act doing href', 0.4719),
 ('target _blank essentrics', 0.2073),
 ('position absolute width', 0.0923),
 ('fast wistia', 0.0894),
 ('series add comment', 0.065),
 ('share secrets resources', 0.115),
 ('shortcode', 0.1378),
 ('blog exercise', 0.3286),
 ('does exacerbate', 0.1971),
 ('pain', 0.1904),
 ('fits h2 comes', 0.0958),
 ('sign emails', 0.0481),
 ('flare ups act', 0.2083)]

# API call

In [13]:
import requests
import json

In [14]:
# Request parameters for the autokeyword endpoint, built from the first five posts.
# The id and content lists are JSON-encoded into strings so each travels as one parameter.
sample_posts = posts_df.head(5)
payload = {
    'keywords_number': 5,
    'post_ids': json.dumps(sample_posts.POST_ID.tolist()),
    'posts_content': json.dumps(sample_posts.POST_CONTENT.tolist()),
}

In [15]:
%%time
response = requests.get('http://192.168.1.9:8000/api/autokeyword', params = payload)

CPU times: user 33.3 ms, sys: 15.2 ms, total: 48.5 ms
Wall time: 8.79 s


In [11]:
# Keywords returned by the service: one list of phrases per submitted post.
# Response.json() decodes the body itself instead of going through response.text.
response.json()["keywords"]

[['siblings kids migraine',
  'facilitate exciting environments',
  'basketball game season',
  'anxious taking brother',
  'triggers present volunteered'],
 ['dealing lame animals',
  'granddaughter rancher',
  'different buying meat',
  'sustain family',
  'pasture simply live'],
 ['guilt accompanies migraine',
  'health wellness life',
  'successful coping mechanism',
  'stopped feeling victimized',
  'migraine lifetime dealing'],
 ['decrease migraine flare',
  'struggling way exercise',
  'video demonstrate essentrics',
  'share secrets resources',
  'fits allwhen comes'],
 ['triggers migraineurs diary',
  'detective trying solve',
  'uncovering patterns pain',
  'journal great way',
  'causes attacks time']]

In [12]:
# Post ids echoed back by the service, for comparison with the ids that were sent.
# Response.json() decodes the body itself instead of going through response.text.
response.json()["post_ids"]

['99918_WP', '99917_WP', '99841_WP', '99838_WP', '99836_WP']