In [1]:
import os

import requests

from bs4 import BeautifulSoup
from itertools import count

import json
from pathlib import Path
from functools import lru_cache

from dataclasses import dataclass

from IPython.display import display, HTML

In [2]:
@dataclass(eq=True, frozen=True, kw_only=True)
class Comment():
    comment: str
    author: str
    id_: str
        
@dataclass(frozen=True, kw_only=True)  # eq=True is already the dataclass default
class CommentWithSummary:
    """Pairs a comment with the summary text generated for it.

    Immutable, value-comparable and keyword-only, like ``Comment``.
    """
    # The comment being summarised.
    # NOTE(review): the summarisation cell passes ``c.__dict__`` (a plain
    # dict) here rather than a ``Comment`` instance; dataclasses do not
    # enforce annotations, so both work at runtime.
    c: Comment
    # Summary text produced for the comment.
    summary: str

In [3]:
def top_level_comments(page, timeout=30):
    """Download one thread page and return the comments judged to be top-level.

    Args:
        page: URL of the page to fetch.
        timeout: Seconds ``requests`` may wait on the server before raising a
            timeout error. Without a timeout a stalled connection would block
            the notebook kernel indefinitely.

    Returns:
        A list of ``Comment`` objects, one per ``div.comment`` element that
        passes the top-level check. The list is empty when the HTTP response
        is not OK or when no element qualifies.

    Raises:
        requests.exceptions.RequestException: on connection failures or when
            ``timeout`` is exceeded.
    """
    def is_top_level(el):
        # Elements carrying the 'noshow' class are never counted.
        if 'noshow' in el.attrs['class']:
            return False

        # The first 'clicky' link inside the parent decides: only 'prev'
        # qualifies. find() returns None when the parent has no such link, so
        # guard against that instead of failing with AttributeError.
        # NOTE(review): this heuristic presumably skips a top-level comment
        # whose first link is not 'prev' (e.g. the first comment of a page) -
        # confirm against the live markup.
        first_link = el.parent.find('a', class_='clicky')
        return first_link is not None and first_link.text == 'prev'

    def make_comment(el):
        author = el.parent.find('a', class_='hnuser').text
        # Inside the parent's first <span>, locate the element whose id starts
        # with 'unv_' and keep the part after that 4-character prefix.
        id_ = int(el.parent.find('span').find(
            lambda x: x.has_attr('id') and x['id'].startswith('unv_'))['id'][4:])

        return Comment(comment=el.find('span').text, author=author, id_=id_)

    response = requests.get(page, timeout=timeout)
    if not response.ok:
        # Best effort: a failed request yields no comments rather than an error.
        return []

    soup = BeautifulSoup(response.content, 'html.parser')
    return [make_comment(el) for el in soup.find_all('div', class_='comment') if is_top_level(el)]

In [4]:
# Walk the thread page by page (p=1, 2, ...) and stop at the first page that
# yields no top-level comments.
all_comments = []

page_number = 1
while True:
    page_comments = top_level_comments(
        f"https://news.ycombinator.com/item?id=35773707&p={page_number}")
    if not page_comments:
        break
    all_comments.extend(page_comments)
    page_number += 1

In [5]:
import openai

# The API key comes from the environment so it never appears in the notebook.
openai.api_key = os.environ.get("OPENAI_API_KEY")


@lru_cache(maxsize=None)
def get_completion(instruction):
    """Send ``instruction`` to the completion endpoint and return the reply text.

    Results are memoised per distinct ``instruction`` string with an unbounded
    ``lru_cache``, so re-running a cell with the same prompt does not issue a
    second request within the same kernel session.

    Args:
        instruction: Prompt text; a trailing newline is appended before sending.

    Returns:
        The ``text`` of the first returned choice, stripped of surrounding
        whitespace.
    """
    request_options = {
        "model": "text-davinci-003",
        "prompt": instruction + "\n",
        "temperature": 0,
        "max_tokens": 1000,
        "top_p": 1,
        "frequency_penalty": 0,
        "presence_penalty": 0,
    }
    response = openai.Completion.create(**request_options)
    first_choice = response["choices"][0]
    return first_choice["text"].strip()

In [6]:
# Ask the model for a JSON summary of every scraped comment and store each
# result as a plain dict of the form {"c": <comment fields>, "summary": <str>}.
data = []

for comment in all_comments:
    # One-shot prompt: a worked example followed by the comment to summarise.
    # The spelling "techologies" is part of the prompt and is the JSON key the
    # later analysis cells look up, so it must not be altered here.
    prompt = f"""
Summarise key techologies & roles from the following comment, return a valid json-string:

Example:
OKRA.ai | Full-time roles Onsite (hybrid)| https://okra.ai/We build products that leverage AI from various technologies to provide insights to Healthcare experts to help them make better decisions for the best.We’re currently looking for our next colleagues:- Medior Data Engineer | Hybrid Leiden (NL) or Cambridge (UK) | Python, Airflow, Flask/FastAPI | https://rb.gy/h98uk- Medior Front-End Engineer | Hybrid - Leiden (NL) or Cambridge (UK) | React, Node, D3.js | https://rb.gy/4t7o0- Senior Machine Learning Data Scientist | Hybrid - Leiden (NL) | Python, XGBoost, CNN, Scikit-learn | https://rb.gy/8bkgo- Medior/Senior NLP Data Scientist | Hybrid - Leiden (NL) or Cambridge (UK) | Python, Transformers, CNN | https://rb.gy/9gyuiNice tech stack, highly fulfilling job, international team with a lot of space for growth!Feel free to check the links for the job desc and apply directly or contact our HR team at hr@okra.ai for any questions.

Sould respond:
{{"roles": ["data engineer", "front-end engineer", "data scientist"], "techologies": ["python", "airflow", "flask", "fastapi", "react", "node", "xgboost", "cnn", "scikit-learn", "transformers", "ccn"]}}

Return a valid json string now with "roles" and "techologies" as top-level fields in the json:
{comment.comment}
"""
    summary = get_completion(prompt)
    record = CommentWithSummary(c=vars(comment), summary=summary)
    data.append(vars(record))

In [9]:
# Parse each stored summary string into a Python object. json.loads raises
# json.JSONDecodeError on the first summary that is not valid JSON.
decoded = [json.loads(entry['summary']) for entry in data]

In [15]:
import pandas as pd

In [32]:
# The 30 most frequently mentioned technologies, case-insensitive.
technology_mentions = pd.Series(
    [tech.lower() for summary in decoded for tech in summary['techologies']]
)
technology_mentions.value_counts().head(30)

python              55
typescript          42
react               40
aws                 29
rust                19
postgres            17
go                  16
kubernetes          14
java                13
javascript          13
postgresql          13
node                13
django              12
terraform           11
graphql             10
redis               10
swift               10
ai                   9
ruby                 9
kotlin               8
docker               8
devops               8
rails                7
machine learning     7
golang               7
gcp                  7
linux                7
backend              6
ruby on rails        6
c++                  6
dtype: int64

In [31]:
# NOTE(review): `df` is not defined in any visible cell of this notebook and
# this cell's execution count (31) is out of order, so it presumably relies on
# leftover kernel state from an unrelated experiment - confirm, and drop the
# cell if so, since it would fail under Restart & Run All.
df['total_bill']

0      16.99
1      10.34
2      21.01
3      23.68
4      24.59
       ...  
239    29.03
240    27.18
241    22.67
242    17.82
243    18.78
Name: total_bill, Length: 244, dtype: float64

In [34]:
# The 10 most frequently mentioned roles, case-insensitive.
role_mentions = pd.Series(
    [role.lower() for summary in decoded for role in summary['roles']]
)
role_mentions.value_counts().head(10)

software engineer           27
senior software engineer    15
backend engineer            13
product manager             12
data engineer               10
full-stack engineer         10
front-end engineer           8
full stack engineer          7
android engineer             7
frontend engineer            7
dtype: int64