In [None]:
import pathlib
import json
from bs4 import BeautifulSoup
from openai import OpenAI

In [None]:
# Absolute path of the directory one level ABOVE the current working directory.
# NOTE(review): despite the name this is the parent of the cwd, so it presumably
# assumes the kernel is started from a notebooks folder inside the project - confirm.
NBS_DIR = pathlib.Path(".").resolve().parent
# Alias used as the root for every other path; currently the same location.
BASE_DIR = NBS_DIR
# Directory that is searched for scraped thread files.
DATASET_DIR = BASE_DIR.joinpath("dataset")

In [None]:
def get_openai_client(base_url='http://localhost:11434/v1', api_key='ollama'):
    """Create an ``OpenAI`` client for an OpenAI-compatible HTTP endpoint.

    Args:
        base_url: Root URL of the API. The default targets a server on
            localhost port 11434 (presumably a local Ollama instance, given
            the default key - confirm).
        api_key: Key handed to the client. The client requires a value; the
            default is a placeholder that the default endpoint does not use.

    Returns:
        A newly constructed ``OpenAI`` client.
    """
    return OpenAI(base_url=base_url, api_key=api_key)

In [None]:
# Module-level client, created once so later cells can pass it in via `client=`.
client = get_openai_client()

In [None]:
def extract_summary_and_keywords(content="", client=None, raw=None, model="llama2"):
    """Ask a chat model for a JSON summary and ranked keywords of ``content``.

    Args:
        content: Text to analyse; it is embedded verbatim in the user prompt.
        client: ``OpenAI`` client to use. Any value that is not an ``OpenAI``
            instance (including ``None``) is replaced by ``get_openai_client()``.
        raw: When truthy, return the chat-completion response unprocessed.
        model: Model name passed to the chat-completions call.

    Returns:
        The response object when ``raw`` is truthy. Otherwise a 2-tuple:
        ``(parsed_json, True)`` when the reply text parses as JSON, or
        ``(reply_text, False)`` when it does not. ``reply_text`` is whatever
        the response carried as message content, so it can be ``None``.

    Raises:
        Errors raised by the client call, or by indexing into the response,
        are not caught here.
    """
    if not isinstance(client, OpenAI):
        client = get_openai_client()

    # Sentences are joined with a space so they do not run into each other
    # ("...researcher.When you get...") in the prompt that is actually sent.
    system_prompt = " ".join([
        "You are an expert web scraper and researcher.",
        "When you get data, you perform expert-level summarization and keyword extraction.",
    ])
    # NOTE(review): "of only of the text" below looks like a typo; wording left as-is.
    prompt_start = " ".join([
        "Extract a 1-word subject of the text as the top ranked keyword.",
        "Extract and rank top keywords based on the subject matter of only of the text.",
        "Rank each keyword based on the keyword's importance to the subject matter of the text.",
        "Provide a concise summary of the contents of the text.",
        "The summary should not include anything related to the discussion nature of the text.",
        "The summary should not include anything related to the conversation nature of the text.",
        "The summary should be a minimum 3 paragraphs.",
        "Use the following text:",
    ])
    # Describes the reply layout the model is asked to follow.
    prompt_end="Using format of \"{'summary': <generated-summary>, 'keywords': [{value: 'a', rank: 1}, {value: 'b', rank: 2}, {value: 'c', rank: 3}, {value: 'd', rank: 4}, {value: 'e', rank: 5}]}\" return a response with json"
    messages = [
        {"role": "system", "content": system_prompt},
        {
            "role": "user",
            "content": f"{prompt_start} {content} {prompt_end}",
        },
    ]
    response = client.chat.completions.create(
        model=model,
        messages=messages,
        response_format={"type": "json_object"},
    )
    if raw:
        return response

    reply_text = response.choices[0].message.content
    try:
        return json.loads(reply_text), True
    except (TypeError, ValueError, RecursionError):
        # Only parse failures fall back to the raw text (JSONDecodeError is a
        # ValueError; a None reply raises TypeError). KeyboardInterrupt and
        # other unrelated errors are no longer swallowed.
        return reply_text, False

In [None]:
# No client is passed here, so the function builds its own via get_openai_client().
extract_summary_and_keywords("What is the goal of making code-based projects in Python?")

In [None]:
# Same call shape, but reusing the module-level `client` instead of creating a new one.
extract_summary_and_keywords("What is the value of web scraping data?", client=client)

In [None]:
# A longer, two-topic question, again sent through the module-level `client`.
extract_summary_and_keywords("What are the top key areas of learning to code and learning to web scrape?", client=client)

In [None]:
# Page-chrome fragments stripped from the extracted text, applied in this order.
HN_BOILERPLATE = (
    'new | past | comments | ask | show | jobs | submit',
    'login',
    'Hacker News',
    '| hide | past | favorite |',
    '| parent',
    '| next [–] ',
)

# Summarise a sample of five saved threads found anywhere under DATASET_DIR.
# The paths are sorted so the same five threads are picked on every run
# (glob order is otherwise arbitrary). Existing pred.json / pred.txt files
# are overwritten.
for path in sorted(DATASET_DIR.rglob("thread.html"))[:5]:
    # The thread's folder name serves as its id; outputs are written beside it.
    post_id = path.parent.name
    output_path = path.parent / 'pred.json'
    output_path_txt = path.parent / 'pred.txt'
    detail_path = path.parent / 'detail.json'
    post_detail = json.loads(detail_path.read_text())
    print(post_detail)

    soup = BeautifulSoup(path.read_text(), 'html.parser')
    body = soup.find('body')
    if body is None:
        # No <body> element in the saved page: use the whole document's text
        # instead of failing on `None.get_text()`.
        body = soup
    content = body.get_text()
    for fragment in HN_BOILERPLATE:
        content = content.replace(fragment, '')
    content = content.strip()

    try:
        pred_data, is_json = extract_summary_and_keywords(content, client=client)
        if is_json:
            # Parsed reply: store it pretty-printed as JSON.
            pred_data = json.dumps(pred_data, indent=4)
            output_path.write_text(pred_data)
        else:
            # Reply was not valid JSON: keep the raw text for inspection.
            output_path_txt.write_text(pred_data)
    except Exception as exc:
        # Best effort: report the failure and move on to the next thread.
        # KeyboardInterrupt is deliberately not caught, so the loop can be stopped.
        print(f"Skipping {post_id}: {exc!r}")
        continue
    print(post_detail, pred_data)