### EDA on the JobTech API
- This notebook explains how to extract data from the JobTech job search API.
- It is not used by the dlt script in the data ingestion process.

In [22]:
#  Import libraries and define helper functions
import dlt
import requests
import json
from pathlib import Path
import os

def _get_ads(url_for_search, params, timeout=30):
    """Send a GET request to a search endpoint and return the parsed JSON body.

    Args:
        url_for_search: Full URL of the search endpoint.
        params: Query-string parameters, passed through to ``requests.get``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, which would hang the
            notebook on a stalled connection.

    Returns:
        The response body, decoded as UTF-8 and parsed from JSON.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    headers = {"accept": "application/json"}
    response = requests.get(
        url_for_search, headers=headers, params=params, timeout=timeout
    )
    response.raise_for_status()
    return json.loads(response.content.decode("utf8"))

In [36]:
# Set up test parameters
# Map human-readable occupation field names to their IDs for better readability
occupation_fields_dict = {
    "Försäljning, inköp, marknadsföring": "RPTn_bxG_ExZ",
    "Hälso- och sjukvård": "NYW6_mP6_vwf",
    "Hotell, restaurang, storhushåll": "ScKy_FHB_7wT"
}

print("Available occupation fields:")
# Top-level loop variables stay in the kernel namespace after the cell runs,
# so they must not be called `id`: that would shadow the builtin for every
# later cell.
for field_name, field_id in occupation_fields_dict.items():
    print(f"- {field_name}: {field_id}")

Available occupation fields:
- Försäljning, inköp, marknadsföring: RPTn_bxG_ExZ
- Hälso- och sjukvård: NYW6_mP6_vwf
- Hotell, restaurang, storhushåll: ScKy_FHB_7wT


In [37]:
# Exercise _get_ads once against the search endpoint
url = 'https://jobsearch.api.jobtechdev.se'
url_for_search = url + "/search"

# Use the second entry of the occupation field mapping for this request
selected_name = list(occupation_fields_dict)[1]
print(selected_name)
selected_id = occupation_fields_dict[selected_name]

query = ""
params = dict(q=query, limit=10)
params["occupation-field"] = selected_id

response_data = _get_ads(url_for_search, params)
print(f"Total hits found: {response_data['total']['value']}")
print(f"Number of hits returned: {len(response_data['hits'])}")
print("\nFirst result headline:")
if response_data['hits']:
    # Headline and employer name of the first returned ad
    first_hit = response_data['hits'][0]
    print(f"{first_hit['headline']}, {first_hit['employer']['name']}")

Total hits found: 4941
Number of hits returned: 10

First result headline:
Farmaceut till Apotek Hjärtat, ICA Maxi Linköping!, Apotek Hjärtat AB


In [39]:
# Function to search through hits for a specific occupation field
def search_by_occupation_field(occupation_field, limit=10, query='', max_displayed=5):
    """Search ads for one occupation field, print a short summary, return the response.

    Reads the module-level ``url_for_search`` and calls ``_get_ads``, both
    defined in earlier cells.

    Args:
        occupation_field: Value sent as the ``occupation-field`` parameter.
        limit: Value sent as the ``limit`` parameter.
        query: Value sent as the ``q`` parameter; empty by default.
        max_displayed: How many of the returned hits to print; values below
            zero are treated as zero.

    Returns:
        The full parsed response from ``_get_ads`` (not only the printed hits).

    Raises:
        KeyError: If the response lacks ``total``/``hits``, or a printed hit
            lacks ``headline`` or ``employer``/``name``.
    """
    search_params = {
        'q': query,
        'limit': limit,
        'occupation-field': occupation_field,
    }
    json_response = _get_ads(url_for_search, search_params)
    total_hits = json_response['total']['value']
    hits = json_response['hits']

    print(f"\nOccupation field: {occupation_field}")
    print(f"Total number of hits: {total_hits}")
    print(f"Number of hits returned: {len(hits)}")

    print("\nTop results:")
    # Clamp so a negative count prints nothing instead of slicing from the end
    for i, hit in enumerate(hits[:max(max_displayed, 0)], 1):
        print(f"{i}. {hit['headline']} - {hit['employer']['name']}")

    return json_response

In [40]:
# Query every occupation field and keep each raw response, keyed by field ID
results = {}
banner = "=" * 50

# Iterate over the name -> ID mapping so the output shows readable names
for field_name, field_id in occupation_fields_dict.items():
    print(f"\n{banner}")
    print(f"Testing field: {field_name} ({field_id})")
    results[field_id] = search_by_occupation_field(field_id)
    print(banner)


Testing field: Försäljning, inköp, marknadsföring (RPTn_bxG_ExZ)

Occupation field: RPTn_bxG_ExZ
Total number of hits: 3823
Number of hits returned: 10

Top results:
1. Hertz biluthyrning i Arvidsjaur söker ny medarbetare! - Bilbolaget Nord AB
2. Key Account Manager till NJIE i Göteborg - Your Talent AB
3. Skaderådgivare - Hedin Automotive Göteborg AB
4. Butiksmedarbetare (extrajobb), Willys Luleå Hamn - Willy:S AB
5. Circle K Veddesta söker butikssäljare för heltid - Circle K Sverige AB

Testing field: Hälso- och sjukvård (NYW6_mP6_vwf)

Occupation field: NYW6_mP6_vwf
Total number of hits: 4947
Number of hits returned: 10

Top results:
1. Sjuksköterska till Infektionskliniken, Sundsvall - REGION VÄSTERNORRLAND
2. Specialistläkare inom Onkologiskt Centrum, Skaraborgs sjukhus - VÄSTRA GÖTALANDSREGIONEN
3. Fysioterapeut till centrala Göteborg - GÖTEBORGS KOMMUN
4. Sektionschef till sektionen för molekylär bilddiagnostik  - REGION UPPSALA
5. Sektionschef till sektionen för muskuloskeleta

In [41]:
# Show which keys a job ad contains, in two layouts
import pprint

# ID of the first occupation field in the mapping, used to look up its results
first_field_id = list(occupation_fields_dict.values())[0]

if first_field_id in results and results[first_field_id]['hits']:
    sample_ad = results[first_field_id]['hits'][0]
    ad_keys = list(sample_ad)
    print("Sample job ad structure (formatted):")
    print("\nKeys in job ad:")
    pprint.pprint(ad_keys, width=60, compact=False)

    # Second layout: one numbered key per line
    print("\nMethod 2 - Display keys in multiple lines:")
    for position, key in enumerate(ad_keys, start=1):
        print(f"{position:2d}. {key}")
    

Sample job ad structure (formatted):

Keys in job ad:
['relevance',
 'id',
 'external_id',
 'original_id',
 'label',
 'webpage_url',
 'logo_url',
 'headline',
 'application_deadline',
 'number_of_vacancies',
 'description',
 'employment_type',
 'salary_type',
 'salary_description',
 'duration',
 'working_hours_type',
 'scope_of_work',
 'access',
 'employer',
 'application_details',
 'experience_required',
 'access_to_own_car',
 'driving_license_required',
 'driving_license',
 'occupation',
 'occupation_group',
 'occupation_field',
 'workplace_address',
 'must_have',
 'nice_to_have',
 'application_contacts',
 'publication_date',
 'last_publication_date',
 'removed',
 'removed_date',
 'source_type',
 'timestamp']

Method 2 - Display keys in multiple lines:
 1. relevance
 2. id
 3. external_id
 4. original_id
 5. label
 6. webpage_url
 7. logo_url
 8. headline
 9. application_deadline
10. number_of_vacancies
11. description
12. employment_type
13. salary_type
14. salary_description
15. du

In [42]:
# Explore a specific job ad in more detail
def _field(record, *path, default='N/A'):
    """Follow `path` through nested dicts and return the value found.

    `dict.get(key, default)` only falls back when the key is absent; a key
    that is present with a null (None) value is returned as None. This helper
    returns `default` in both cases, so chained lookups and slicing never
    operate on None.
    """
    current = record
    for key in path:
        if not isinstance(current, dict):
            return default
        current = current.get(key)
    return default if current is None else current


# Get the first field ID to access results
first_field_id = occupation_fields_dict[list(occupation_fields_dict.keys())[0]]

if results and first_field_id in results and results[first_field_id]['hits']:
    sample_ad = results[first_field_id]['hits'][0]

    print("Detailed job ad information:\n")

    # Basic information
    print(f"ID: {_field(sample_ad, 'id')}")
    print(f"Headline: {_field(sample_ad, 'headline')}")
    print(f"Employer: {_field(sample_ad, 'employer', 'name')}")
    print(f"Publication date: {_field(sample_ad, 'publication_date')}")
    print(f"Last publication date: {_field(sample_ad, 'last_publication_date')}")
    print(f"Application deadline: {_field(sample_ad, 'application_deadline')}")

    # Description (limited to first 300 chars)
    description = _field(sample_ad, 'description', 'text')
    if description != 'N/A':
        print(f"\nDescription (truncated): {description[:300]}...")

    # Location
    print("\nLocation:")
    workplace = _field(sample_ad, 'workplace_address', default={})
    for key, value in workplace.items():
        if value:
            print(f"  {key}: {value}")

    # Must have/Nice to have skills
    print("\nRequired skills:")
    for skill in _field(sample_ad, 'must_have', 'skills', default=[]):
        print(f"  - {_field(skill, 'term')}")

    print("\nNice to have skills:")
    for skill in _field(sample_ad, 'nice_to_have', 'skills', default=[]):
        print(f"  - {_field(skill, 'term')}")

    # Employment details
    # NOTE(review): 'scope_of_work' may not carry a 'label' key in the API
    # schema - confirm, otherwise this line always prints N/A.
    print("\nEmployment details:")
    print(f"  Type: {_field(sample_ad, 'employment_type', 'label')}")
    print(f"  Duration: {_field(sample_ad, 'duration', 'label')}")
    print(f"  Working hours: {_field(sample_ad, 'working_hours_type', 'label')}")
    print(f"  Scope of work: {_field(sample_ad, 'scope_of_work', 'label')}")

    # Application details
    print("\nApplication details:")
    app_details = _field(sample_ad, 'application_details', default={})
    for key, value in app_details.items():
        if value and key != 'reference':
            print(f"  {key}: {value}")

    # Application contacts
    # NOTE(review): confirm the contact key names ('title', 'phone_number')
    # against the API schema - TODO verify.
    print("\nContacts:")
    for contact in _field(sample_ad, 'application_contacts', default=[]):
        print(f"  Name: {_field(contact, 'name')}")
        print(f"  Title: {_field(contact, 'title')}")
        phone_number = _field(contact, 'phone_number', default=None)
        if phone_number:
            print(f"  Phone: {phone_number}")
        email = _field(contact, 'email', default=None)
        if email:
            print(f"  Email: {email}")
        print("")

Detailed job ad information:

ID: 29975369
Headline: Hertz biluthyrning i Arvidsjaur söker ny medarbetare!
Employer: Bilbolaget Nord AB
Publication date: 2025-09-02T08:10:46
Last publication date: 2025-09-24T23:59:59
Application deadline: 2025-09-24T23:59:59

Description (truncated): Bilbolaget har funnits sedan 1932 och verkar inom en rad olika områden inom bilägande. Vi är Norrlands ledande återförsäljare av Volvo, Renault, Dacia, Ford samt begagnat och säljer totalt 9 000 bilar per år. Vi säljer personbilar och transportbilar, har hyrbilsverksamhet, verkstad, däckhotell, erbj...

Location:
  municipality: Arvidsjaur
  municipality_code: 2505
  municipality_concept_id: A5WX_XVo_Zt6
  region: Norrbottens län
  region_code: 25
  region_concept_id: 9hXe_F4g_eTG
  country: Sverige
  country_code: 199
  country_concept_id: i46j_HmG_v64
  coordinates: [19.180283, 65.59208]

Required skills:

Nice to have skills:

Employment details:
  Type: Vanlig anställning
  Duration: Tills vidare
  Wor