In [1]:
import pathlib
import json
from bs4 import BeautifulSoup
from openai import OpenAI

In [2]:
# Project layout, derived from the directory the kernel is running in.
# Note: NBS_DIR holds the *parent* of the current working directory, and
# BASE_DIR is just a second name for that same path.
BASE_DIR = NBS_DIR = pathlib.Path().resolve().parent
DATASET_DIR = BASE_DIR.joinpath("dataset")

In [3]:
def get_openai_client(base_url='http://localhost:11434/v1', api_key='ollama'):
    """Build an ``OpenAI`` client for an OpenAI-compatible HTTP endpoint.

    Args:
        base_url: Root URL of the API to talk to. The default points at a
            server on localhost port 11434 (presumably a local Ollama
            instance -- confirm against your setup).
        api_key: Key passed to the ``OpenAI`` constructor. The default
            ``'ollama'`` is a placeholder: the argument is required by the
            client but unused by the default endpoint.

    Returns:
        A newly constructed ``OpenAI`` client.
    """
    return OpenAI(base_url=base_url, api_key=api_key)

In [4]:
# Shared client instance; later cells pass it explicitly via `client=client`.
client = get_openai_client()

In [8]:
def extract_summary_and_keywords(content="", client=None, raw=None, model="llama2"):
    """Ask a chat model for a summary and ranked keywords of ``content``.

    Args:
        content: The text to summarise; it is embedded in the user prompt.
        client: ``OpenAI`` client to use. Anything that is not an ``OpenAI``
            instance (including ``None``) is replaced by a fresh client from
            ``get_openai_client()``.
        raw: When truthy, return the completion response object untouched.
        model: Name of the chat model to query. Defaults to ``"llama2"``.

    Returns:
        If ``raw`` is truthy, the completion response object. Otherwise a
        ``(payload, is_json)`` tuple: the decoded JSON and ``True`` when the
        first choice's message content parses as JSON, or that content
        unchanged and ``False`` when it does not.
    """
    if not isinstance(client, OpenAI):
        client = get_openai_client()

    # Join with a space: joining on "" glued the sentences together
    # ("...researcher.When you get data..."), blurring the instructions.
    system_prompt = " ".join([
        "You are an expert web scraper and researcher.",
        "When you get data, you perform expert-level summarization and keyword extraction.",
    ])
    prompt_start = " ".join([
        "Extract a 1-word subject of the text as the top ranked keyword.",
        "Extract and rank top keywords based on the subject matter of only of the text.",
        "Rank each keyword based on the keyword's importance to the subject matter of the text.",
        # Full stop added so this sentence no longer runs into the next one.
        "Provide a concise summary of the contents of the text.",
        "The summary should not include anything related to the discussion nature of the text.",
        "The summary should not include anything related to the conversation nature of the text.",
        "The summary should be a minimum 3 paragraphs.",
        # No trailing space here: the f-string below supplies the separator.
        "Use the following text:",
    ])
    prompt_end = "Using format of \"{'summary': <generated-summary>, 'keywords': [{value: 'a', rank: 1}, {value: 'b', rank: 2}, {value: 'c', rank: 3}, {value: 'd', rank: 4}, {value: 'e', rank: 5}]}\" return a response with json"
    messages = [
        {"role": "system", "content": system_prompt},
        {
            "role": "user",
            "content": f"{prompt_start} {content} {prompt_end}",
        },
    ]
    response = client.chat.completions.create(
        model=model,
        messages=messages,
        response_format={"type": "json_object"},
    )
    if raw:
        return response

    reply = response.choices[0].message.content
    try:
        return json.loads(reply), True
    except (TypeError, ValueError):
        # ValueError covers json.JSONDecodeError (malformed JSON); TypeError
        # covers a reply that is not str/bytes (e.g. None). Anything else,
        # including KeyboardInterrupt, is allowed to propagate.
        return reply, False

In [9]:
# No client argument here, so the function builds its own via get_openai_client().
extract_summary_and_keywords("What is the goal of making code-based projects in Python?")

({'summary': 'The goal of making code-based projects in Python is to create software applications that can perform specific tasks or solve problems. This involves writing and organizing code in a logical and efficient manner, using various tools and libraries to streamline the development process. The end result is a functional program that can be used to automate processes, analyze data, or provide other services.',
  'keywords': [{'value': 'applications', 'rank': 1},
   {'value': 'development', 'rank': 2},
   {'value': 'code', 'rank': 3},
   {'value': 'organization', 'rank': 4},
   {'value': 'efficiency', 'rank': 5}]},
 True)

In [10]:
# Same kind of call, but reusing the `client` created earlier in the notebook.
extract_summary_and_keywords("What is the value of web scraping data?", client=client)

({'summary': 'Web scraping data can provide valuable insights and information that can be used to inform business decisions. The data collected through web scraping can be analyzed and used to identify patterns, trends, and relationships that may not be apparent through other means. Additionally, web scraping can help organizations save time and resources by automating the process of collecting and analyzing data, rather than relying on manual methods.',
  'keywords': [{'value': 'data', 'rank': 1},
   {'value': 'insights', 'rank': 2},
   {'value': 'information', 'rank': 3},
   {'value': 'patterns', 'rank': 4},
   {'value': 'trends', 'rank': 5},
   {'value': 'competitive advantage', 'rank': 6}]},
 True)

In [11]:
# Another sample prompt sent through the shared `client`.
extract_summary_and_keywords("What are the top key areas of learning to code and learning to web scrape?", client=client)

({'summary': 'Learning to code and web scraping involve mastering various key areas. For coding, the most important areas include problem-solving, algorithmic thinking, and programming languages. In web scraping, the primary areas are data extraction, data manipulation, and web navigation. Understanding these key areas is crucial for success in both fields.',
  'keywords': [{'value': 'coding', 'rank': 1},
   {'value': 'web scrape', 'rank': 2},
   {'value': 'problem-solving', 'rank': 3},
   {'value': 'algorithmic thinking', 'rank': 4},
   {'value': 'programming languages', 'rank': 5}]},
 True)

In [12]:
# Summarise up to five scraped threads. Each prediction is written next to its
# source files: pred.json when the model reply parsed as JSON, pred.txt otherwise.
for path in list(DATASET_DIR.glob("**/**/thread.html"))[:5]:
    post_id = path.parent.name
    output_path = path.parent / 'pred.json'
    output_path_txt = path.parent / 'pred.txt'
    detail_path = path.parent / 'detail.json'
    post_detail = json.loads(detail_path.read_text())
    print(path)
    print(post_detail)
    # if output_path.exists():
    #     continue
    soup = BeautifulSoup(path.read_text(), 'html.parser')
    body = soup.find('body')
    if body is None:
        # find() returns None when the tag is missing; skip rather than crash.
        print(f"Skipping {path}: no <body> element found")
        continue
    # parse the scraped data or scrape more
    content = body.get_text()
    # Strip page boilerplate from the text, in the same order as before
    # (order matters because later fragments can overlap earlier ones).
    for boilerplate in (
        'new | past | comments | ask | show | jobs | submit',
        'login',
        'Hacker News',
        '| hide | past | favorite |',
        '| parent',
        '| next [–] ',
    ):
        content = content.replace(boilerplate, '')
    content = content.strip()
    try:
        pred_data, is_json = extract_summary_and_keywords(content, client=client)
        if is_json:
            pred_data = json.dumps(pred_data, indent=4)
            output_path.write_text(pred_data)
        else:
            output_path_txt.write_text(pred_data)
    except Exception as exc:
        # Keep the batch best-effort, but report why a thread was skipped and
        # let KeyboardInterrupt/SystemExit through so the loop can be stopped.
        print(f"Skipping {path}: {exc!r}")
        continue
    print(post_detail, pred_data)

/Users/acamara/Documents/Dev/CFE_Project/SmartWebScrappingWithPythonAI/dataset/2024-02-13/posts/39356920/thread.html
{'id': '39356920', 'text': 'Is something bugging you? (antithesis.com)', 'target_link': 'https://antithesis.com/blog/is_something_bugging_you/', 'score': '1179', 'thread_link': 'https://news.ycombinator.com/item?id=39356920'}
{'id': '39356920', 'text': 'Is something bugging you? (antithesis.com)', 'target_link': 'https://antithesis.com/blog/is_something_bugging_you/', 'score': '1179', 'thread_link': 'https://news.ycombinator.com/item?id=39356920'} {
    "summary": "Antithesis is a fuzzing platform for software developers.",
    "keywords": [
        {
            "value": "fuzzing",
            "rank": 1
        },
        {
            "value": "software development",
            "rank": 2
        },
        {
            "value": "testing",
            "rank": 3
        },
        {
            "value": "security",
            "rank": 4
        },
        {
           