In [None]:
import pathlib
import requests
import urllib.parse
import json
from openai import OpenAI

In [None]:
# Directory layout used by the notebook.
# NBS_DIR is the parent of the current working directory, made absolute.
NBS_DIR = pathlib.Path().resolve().parent
# BASE_DIR is an alias for the same directory.
BASE_DIR = NBS_DIR
# "dataset" folder located directly under BASE_DIR.
DATASET_DIR = BASE_DIR / "dataset"

In [None]:
def url_encode(params):
    """Serialize ``params`` into a URL query string.

    Keys and values are escaped with ``urllib.parse.quote_plus``, so a
    space is emitted as ``+``.

    Args:
        params: Anything ``urllib.parse.urlencode`` accepts as its query
            argument (for example a dict).

    Returns:
        The encoded query string, without a leading ``?``.
    """
    query_string = urllib.parse.urlencode(
        params,
        quote_via=urllib.parse.quote_plus,
    )
    return query_string

def get_url(search_term="Python programming", limit=10):
    """Build an iTunes Search API URL that looks up podcast episodes.

    Args:
        search_term: Free-text query placed in the ``term`` parameter.
        limit: Value sent as the ``limit`` query parameter (requested number
            of results). Defaults to 10, the value this function previously
            hard-coded. NOTE(review): the accepted range is defined by the
            iTunes Search API, not checked here.

    Returns:
        The full ``https://itunes.apple.com/search?...`` URL as a string,
        with the query string produced by ``url_encode``.
    """
    params = {
        'lang': 'en_us',
        'media': 'podcast',
        'entity': 'podcastEpisode',
        'limit': limit,
        'term': search_term,
    }
    encoded_params = url_encode(params)
    return f"https://itunes.apple.com/search?{encoded_params}"

In [None]:
# Build the search URL for the term "systemd"; a later cell requests it.
url = get_url(search_term="systemd")
# Bare expression as the last line of the cell so the notebook displays the URL.
url

In [None]:
import requests

# Fetch the search results. The timeout keeps the cell from hanging forever
# when the service does not answer.
r = requests.get(url, headers={"Content-Type": "application/json"}, timeout=10)
# Fail here on an HTTP error status instead of later while decoding the body.
r.raise_for_status()

data = r.json()

# Fall back to an empty list so sorting works even if "results" is absent.
results = data.get('results') or []

# Newest first; entries without a release date sort to the end.
results = sorted(results, key=lambda x: x.get('releaseDate') or '', reverse=True)

for idx, result in enumerate(results):
    kind = result.get('kind')
    if kind != "podcast-episode":
        continue
    # .get() so an entry missing one of these fields prints None
    # rather than aborting the whole listing with a KeyError.
    releaseDate = result.get('releaseDate')
    podcastName = result.get('collectionName')
    title = result.get('trackName')
    episodeUrl = result.get('episodeUrl')
    # idx + 1 is the 1-based position within the sorted (unfiltered) results.
    print(idx + 1, title, podcastName, releaseDate, episodeUrl)
    print("\n")

In [None]:
def get_openai_client(base_url='http://localhost:11434/v1', api_key='ollama'):
    """Create an ``OpenAI`` client for the given endpoint.

    Called without arguments it behaves as before: the client targets
    ``http://localhost:11434/v1`` with the placeholder key ``'ollama'``.

    Args:
        base_url: Base URL of the API endpoint the client should talk to.
        api_key: Key handed to the client. The default is a placeholder:
            a value is required by the client but unused by the default
            endpoint.

    Returns:
        A new ``OpenAI`` instance.
    """
    return OpenAI(
        base_url=base_url,
        api_key=api_key,
    )

In [None]:
def guess_language(content="", client=None, raw=None, model="llama2"):
    """Ask a chat model which human language ``content`` is written in.

    Args:
        content: Text whose language should be guessed.
        client: Object exposing ``chat.completions.create`` (for example an
            ``OpenAI`` instance). When ``None``, ``get_openai_client()``
            supplies one. Any client the caller passes is used as given.
        raw: When truthy, return the completion response object untouched
            instead of the ``(value, is_json)`` tuple.
        model: Model name forwarded to the chat completion call.

    Returns:
        The response object when ``raw`` is truthy. Otherwise a 2-tuple:
        ``(parsed_dict, True)`` when the message content decodes to a JSON
        object, or ``(message_content, False)`` when it cannot be decoded or
        decodes to something other than an object.
    """
    if client is None:
        client = get_openai_client()
    system_prompt = "You are an expert at deciphering the type of language of text."
    # Joined with a space so the two sentences do not run together in the prompt.
    prompt_start = " ".join([
        "Respond only with your best guess of what the language is of the input text. Use real human languages.",
        "Use the following:",
    ])
    prompt_end = "Using format of \"{'language': <generated-answer>}\" return a response with json"
    messages = [
        {"role": "system", "content": system_prompt},
        {
            "role": "user",
            "content": f"{prompt_start} {content} {prompt_end}",
        },
    ]
    response = client.chat.completions.create(
        model=model,
        messages=messages,
        response_format={"type": "json_object"},
    )
    if raw:
        return response
    answer = response.choices[0].message.content
    try:
        parsed = json.loads(answer)
    except (TypeError, ValueError):
        # TypeError: content was None; ValueError covers json.JSONDecodeError.
        return answer, False
    if not isinstance(parsed, dict):
        # Callers read the result with .get(); anything but an object is
        # reported as "not usable JSON" together with the raw text.
        return answer, False
    return parsed, True

In [None]:
# Try the language guesser on one sample episode title.
# guess_language returns (value, is_json); value is parsed JSON only when is_json is True.
pred, is_json = guess_language("Binärgewitter Talk #320: Für die Liebe zu systemd Binärgewitter ")
if is_json:
    # Nothing is printed when the reply could not be parsed as JSON.
    print('language', pred.get('language'))

In [None]:
url = get_url(search_term="Python AI and ML")
# The timeout keeps the cell from hanging forever when the service does not answer.
r = requests.get(url, headers={"Content-Type": "application/json"}, timeout=10)
# Fail here on an HTTP error status instead of later while decoding the body.
r.raise_for_status()

data = r.json()

# Fall back to an empty list so sorting works even if "results" is absent.
results = data.get('results') or []

# Newest first; entries without a release date sort to the end.
results = sorted(results, key=lambda x: x.get('releaseDate') or '', reverse=True)
# Languages to hide, lower-cased once so the comparison below is case-insensitive.
ignore_langs = [x.lower() for x in ['German', 'Russian', 'Japanese', 'Chinese', "Spanish"]]

for idx, result in enumerate(results):
    kind = result.get('kind')
    if kind != "podcast-episode":
        continue
    # .get() so an entry missing one of these fields yields None
    # rather than aborting the whole listing with a KeyError.
    releaseDate = result.get('releaseDate')
    podcastName = result.get('collectionName')
    title = result.get('trackName')
    # One model call per episode title.
    pred_lang, is_json = guess_language(title)
    lang = None
    if is_json:
        lang = pred_lang.get("language")
    # str() tolerates None / non-string answers; strip() tolerates padded ones.
    if str(lang).strip().lower() in ignore_langs:
        continue
    episodeUrl = result.get('episodeUrl')
    # idx + 1 is the 1-based position within the sorted (unfiltered) results.
    print(idx + 1, lang, title, podcastName, releaseDate, episodeUrl)