In [None]:
import json
from collections import Counter, defaultdict
from tabulate import tabulate
import matplotlib.pyplot as plt
import numpy as np
from datetime import datetime
import matplotlib.dates as mdates

# Load every scraped job offer from the JSON dump.
with open('./data/job_offers.json', 'r', encoding="utf-8") as f:
    data = json.load(f)
# print(json.dumps(data[0], indent=4, ensure_ascii=False))

# Flatten the per-offer requirement lists into three notebook-wide lists.
musts = [req for offer in data for req in offer["requirements"]["musts"]]
nices = [req for offer in data for req in offer["requirements"]["nices"]]
langs = [lang for offer in data for lang in offer["requirements"]["languages"]]
print(f"Offers: {len(data)}\nMust have requirements: {len(musts)}\nNice to have requirements {len(nices)}")

In [None]:
# Peek at one flattened must-have requirement record to see its structure.
musts[162]

In [None]:
# Peek at one flattened nice-to-have requirement record to see its structure.
nices[235]

In [None]:
# First flattened language requirement record (entries taken from item["requirements"]["languages"]).
langs[0]

In [None]:
# Inspect the "essentials" section of one offer; get_avg_salaries reads its "salary" entry.
data[423]["essentials"]

**Salaries**

In [None]:
def get_avg_salaries(item):
    """Return the midpoint of each monthly PLN salary range advertised in an offer.

    Looks at the "permanent" and "b2b" entries of
    ``item["essentials"]["salary"]["types"]``. An entry is used only when its
    "period" is "Month" and its "range" holds exactly two bounds. Offers whose
    currency is anything other than "PLN" yield an empty list.

    Returns:
        A list of ``{"contract", "currency", "avg"}`` dicts, at most one per
        contract type, ordered permanent first, then b2b.
    """
    salary = item["essentials"]["salary"]
    currency = salary["currency"]
    if currency != "PLN":
        return []

    offered = salary["types"]
    salaries = []
    for contract in ("permanent", "b2b"):
        if contract in offered and offered[contract]["period"] == "Month":
            bounds = offered[contract]["range"]
            if len(bounds) == 2:
                # Bounds are coerced to int before averaging.
                low, high = (int(bound) for bound in bounds)
                salaries.append({"contract": contract, "currency": currency, "avg": (low + high) / 2})
    return salaries


def plot_against_salary(data, title, box_contract = None):
    """Plot monthly PLN salary distributions for remote vs. non-remote offers.

    One subplot is drawn per group ("Remote" / "Non-remote", decided by
    ``item["location"]["remote"]``). With ``box_contract`` unset, each subplot
    shows a 20-bin histogram outline per contract type plus a dashed line at
    the mean. With ``box_contract`` set, each subplot shows a horizontal box
    plot of that contract type only.

    Args:
        data: iterable of offer dicts accepted by ``get_avg_salaries``.
        title: figure-level title.
        box_contract: None, "permanent" or "b2b".

    Raises:
        ValueError: if ``box_contract`` is not one of the allowed values.
    """
    if box_contract not in [None, "permanent", "b2b"]:
        raise ValueError("Invalid box_contract value")

    grouped = defaultdict(list)
    for item in data:
        attr = "Remote" if item["location"]["remote"] else "Non-remote"
        grouped[attr].extend(get_avg_salaries(item))

    if not grouped:
        # No offers at all: a figure with zero rows cannot be created.
        return

    # squeeze=False always yields a 2-D array of Axes, so the loop below also
    # works when only one group is present (otherwise plt.subplots returns a
    # bare Axes object that cannot be zipped over).
    fig, axes = plt.subplots(nrows=len(grouped), ncols=1, figsize=(10, 6.5), squeeze=False)
    fig.subplots_adjust(hspace=0.4)
    fig.suptitle(title)

    labels = ["permanent", "b2b"]
    colors = ["C0", "C1"]
    for ax, (key, group) in zip(axes[:, 0], grouped.items()):
        for label, color in zip(labels, colors):
            means = [salary["avg"] for salary in group if salary["contract"] == label]
            print(f"Plotting {len(means)} {key} {label} records.")
            if box_contract:
                if label != box_contract:
                    continue
                ax.boxplot(means, vert=False)
                ax.get_yaxis().set_visible(False)
                ax.set_title(f"{key} {box_contract}")
            else:
                ax.set_ylabel("Number of occurances")
                ax.set_title(key)
                if not means:
                    # Nothing to bin; np.mean([]) would only yield NaN plus a warning.
                    continue
                counts, bins = np.histogram(means, 20)
                ax.stairs(counts, bins, label=label, color=color)
                ax.axvline(np.mean(means), linestyle='dashed', color=color, linewidth=1, label=f"{label} mean")
                ax.legend()

        ax.set_xlim(-1, 55_000)
        ax.set_xlabel("Range average salary [PLN]")


# Histogram view: remote vs. non-remote salary distributions for both contract types.
plot_against_salary(data, title="Salary vs remote work")
# Alternative box-plot view restricted to b2b contracts:
# plot_against_salary(data, box_contract="b2b", title="B2b salary vs remote work")

**Requirements**

In [None]:
def plot_reqs(reqs, title, col="plasma"):
    """Draw a horizontal bar chart of the 50 most frequent requirement values.

    Args:
        reqs: iterable of requirement dicts; only ``m["value"]`` is read.
        title: chart title.
        col: "plasma" selects the plasma colormap; any other value selects viridis.
    """
    counter = Counter(m["value"] for m in reqs)
    # Stable descending sort: ties keep first-seen order.
    top = sorted(counter, key=counter.get, reverse=True)[:50]
    # barh draws from the bottom up, so reverse to put the most frequent on top.
    names = top[::-1]
    values = [counter[name] for name in names]

    cmap = plt.cm.plasma if col == "plasma" else plt.cm.viridis
    colors = cmap(np.linspace(0, 0.7, len(names)))

    # A quarter inch per bar, but never below one inch: with fewer than four
    # bars the integer division would otherwise request a zero-height figure.
    plt.figure(figsize=(12, max(1, len(names) // 4)))
    bars = plt.barh(names, values, color=colors)

    # Adding values on bars
    for bar, value in zip(bars, values):
        plt.text(bar.get_width() + 1, bar.get_y() + bar.get_height() / 2, f'{value}',
                 va='center', ha='left')

    plt.margins(y=0.01)
    plt.xlabel('Appearances')
    plt.ylabel('Skill')
    plt.title(title)

# One chart per requirement kind, each with its own colormap.
plot_reqs(musts, "Must have skills appearances", col="plasma")
plot_reqs(nices, "Nice to have skill appearances", col="viridis")

In [None]:
# Skill categories used to split the requirement charts. Keys are used in chart
# titles; the listed names are matched against requirement "value" strings by
# exact membership in plot_reqs_grouped. Note that "Azure DevOps" is listed
# under both "cloud technologies" and "ci/cd", so it is counted in both.
groups = {
    "programming languages": [
        "Python", "SQL", "Java", "JavaScript", "TypeScript", 
        "C++", "C#", "HTML", "CSS", "Node.js", 
        "Golang", "Scala", "C", "PHP"
    ],
    "frameworks": [
        "React", ".NET", "Spring Boot", "Microservices",
        "TensorFlow", "Angular", "Spring"
    ],
    "cloud technologies": [
        "AWS", "Azure", "GCP", "Azure DevOps"
    ],
    "databases": [
        "PostgreSQL", "MySQL", "Oracle", "NoSQL", "MongoDB", "Redis"
    ],
    "containerization and orchestration": [
        "Docker", "Kubernetes"
    ],
    "configuration management and automation": [
        "Ansible", "Terraform"
    ],
    "ci/cd": [
        "Jenkins", "GitLab", "Azure DevOps", "DevOps", "CI/CD", "CI"
    ],
    "monitoring and logging": [
        "Prometheus", "Grafana", "Elasticsearch"
    ],
    "messaging and streaming": [
        "Kafka", "RabbitMQ"
    ],
    "scripting languages": [
        "Bash", "PowerShell"
    ],
    "data processing": [
        "ETL", "Spark"
    ],
    "other technologies": [
        "Git", "Linux", "SAP", "AI", "GraphQL", "REST API", "REST", "Maven"
    ],
    "other": [
        "Communication skills", "Project management", "Agile", "Degree", "Jira", "Confluence"
    ]
}
def plot_reqs_grouped(reqs, group, title, col="plasma"):
    """Draw a horizontal bar chart of requirement values that belong to ``group``.

    Args:
        reqs: iterable of requirement dicts; only ``m["value"]`` is read.
        group: collection of skill names; a requirement is counted only if its
            value is contained in it.
        title: chart title.
        col: "plasma" selects the plasma colormap; any other value selects viridis.
    """
    counter = Counter(m["value"] for m in reqs if m["value"] in group)
    # Stable descending sort: ties keep first-seen order.
    top = sorted(counter, key=counter.get, reverse=True)[:50]
    # barh draws from the bottom up, so reverse to put the most frequent on top.
    names = top[::-1]
    values = [counter[name] for name in names]

    cmap = plt.cm.plasma if col == "plasma" else plt.cm.viridis
    colors = cmap(np.linspace(0, 0.7, len(names)))

    # Height is sized from the group (not from the matches found) and floored
    # at one inch so a single-skill group does not request a zero-height figure.
    plt.figure(figsize=(12, max(1, len(group) // 2)))
    bars = plt.barh(names, values, color=colors)

    # Adding values on bars
    for bar, value in zip(bars, values):
        plt.text(bar.get_width() + 1, bar.get_y() + bar.get_height() / 2, f'{value}',
                 va='center', ha='left')

    plt.margins(y=0.01)
    plt.xlabel('Appearances')
    plt.ylabel('Skill')
    plt.title(title)
    
    
# Two charts per skill category: must-have (default plasma) and nice-to-have (viridis).
for name, skills in groups.items():
    plot_reqs_grouped(musts, skills, f"Must have {name} skills appearances")
    plot_reqs_grouped(nices, skills, f"Nice to have {name} skills appearances", col="viridis")
    

In [None]:
# Cross-tabulate required languages: one row per proficiency level, one column
# per language code (most frequent code first), plus a totals row.
print(len(langs), len(data), f"\nAvg lang per offer: {len(langs) / len(data):.3f}\n")
counter = Counter(lang["code"] for lang in langs)
names = sorted(counter, key=counter.get, reverse=True)
levels = ["NA", "A1", "A2", "B1", "B2", "C1", "C2", "NATIVE"]
# Entries without an explicit "level" are counted under "NA".
pair_counts = Counter((lang["code"], lang.get("level", "NA")) for lang in langs)
table = [[level] + [pair_counts[(code, level)] for code in names] for level in levels]
table.append(["Total"] + [counter[code] for code in names])
print(tabulate(table, headers=[""] + names))

**Publication dates**

In [None]:
def plot_timestamps(timestamps, bins: int):
    """Plot histograms of publication dates and publication times of day.

    Args:
        timestamps: POSIX timestamps in seconds, as accepted by
            ``datetime.fromtimestamp``.
        bins: number of histogram bins used for both panels.
    """
    fig, [ax1, ax2] = plt.subplots(1, 2, figsize=(12, 4))

    # Local-time datetimes; both panels are derived from these.
    dates = [datetime.fromtimestamp(ts) for ts in timestamps]
    ax1.hist(dates, bins=bins, rwidth=0.9)
    ax1.xaxis.set_major_formatter(mdates.DateFormatter('%m/%d'))
    ax1.set_xlabel('Publication date [month/day] in 2024')
    ax1.set_ylabel('Number of occurances')
    ax1.tick_params(axis='x', labelrotation=15)
    ax1.set_title('Job offer publication date')

    # Time of day: pin every local datetime to the same calendar day so only
    # the wall-clock time differs. Computing fromtimestamp(ts % 86400) instead
    # would take the UTC time of day and apply the UTC offset in force on
    # 1970-01-01, which disagrees with the local time shown in the date panel
    # whenever the posting date has a different offset (e.g. daylight saving).
    times = [moment.replace(year=1970, month=1, day=1) for moment in dates]
    ax2.hist(times, bins=bins, rwidth=0.9)
    ax2.xaxis.set_major_formatter(mdates.DateFormatter('%H:%M:%S'))
    ax2.set_xlabel('Publication time')
    ax2.set_ylabel('Number of occurances')
    ax2.tick_params(axis='x', labelrotation=15)
    ax2.set_title('Job offer publication time of day')

# "posted" is floor-divided by 1000 before plotting — presumably milliseconds -> seconds; TODO confirm.
times = [item["posted"] // 1000 for item in data]
plot_timestamps(times, 12)

**Map**

In [None]:
from geopy.geocoders import Nominatim
import folium
import os
import pickle
import time

# Pickle file (relative path) in which geocode_addresses caches lookup results between runs.
LOC_CACHE_PATH = 'location_cache.pkl'

def geocode_addresses(addresses):
    """Geocode addresses, using a pickle cache to limit the number of OpenStreetMap requests.

    Args:
        addresses: iterable of address strings.

    Returns:
        A dict mapping each successfully geocoded address to
        ``{'in_poland': bool, 'loc': [latitude, longitude]}``. Addresses the
        geocoder could not resolve are reported and left out of the result.
    """
    cache = {}
    if os.path.isfile(LOC_CACHE_PATH):
        # NOTE(review): pickle.load can execute arbitrary code; only use a
        # cache file that this notebook wrote itself.
        with open(LOC_CACHE_PATH, 'rb') as f:
            cache = pickle.load(f)
        print(f"Loaded {len(cache)} locations from cache")

    geolocator = Nominatim(user_agent="http")
    location_data = {}
    for address in addresses:
        if address in cache:
            location = cache[address]
        else:
            print(f"{address}: geocoded")
            location = geolocator.geocode(address)
            time.sleep(1) # as per requirement at https://operations.osmfoundation.org/policies/nominatim/
            # Failed lookups (None) are cached too, so they are not retried on every run.
            cache[address] = location
            # Persist after every request so an interrupted run keeps its progress.
            with open(LOC_CACHE_PATH, 'wb') as f:
                pickle.dump(cache, f)
        if location is None:
            # Covers both fresh and cached failures; without this guard the
            # attribute access below raises AttributeError on None.
            print(f"Geocoding failed for: {address}")
            continue
        in_poland = location.raw['display_name'].endswith("Polska")
        location_data[address] = {'in_poland': in_poland, 'loc': [location.latitude, location.longitude]}
    return location_data


def plot_addresses_on_map(cnt, location_data, output_file='city_map.html'):
    """Write an interactive folium map with one circle per geocoded address.

    Each circle's radius is the address's share of all counted occurrences
    times 1e5, with a floor of 2e3. Addresses flagged as outside Poland also
    get a globe marker. The map is saved to ``output_file``.

    Args:
        cnt: mapping of address -> number of occurrences.
        location_data: mapping of address -> ``{'in_poland', 'loc'}``.
        output_file: path of the HTML file to write.
    """
    poland_center = [51.9194, 19.1451]
    n_total = sum(cnt.values())
    fmap = folium.Map(location=poland_center, zoom_start=6)

    for address, info in location_data.items():
        coords = info['loc']
        n_here = cnt[address]
        label = f"{address}: {n_here}"
        folium.Circle(
            location=coords,
            radius=max(2e3, n_here / n_total * 1e5),
            fill=True,
            tooltip=label,
        ).add_to(fmap)
        if info['in_poland']:
            continue
        folium.Marker(location=coords, color="red", tooltip=label, icon=folium.Icon(icon="globe", color="darkblue")).add_to(fmap)

    fmap.save(output_file)
    print(f"Map saved to {output_file}")


def get_city_cnt(data):
    """Count how often each city appears across the places of all offers.

    City names are normalised with ``str.capitalize()`` and then mapped through
    a table of known spelling variants. "Poland" entries are dropped, every
    remote-work variant is counted under "Remote", and places without a "city"
    key are ignored.

    Returns:
        A ``Counter`` mapping normalised city name -> number of occurrences.
    """
    # Keys are written the way str.capitalize() renders them.
    aliases = {
        "Bielsko - biała": "Bielsko-biała",
        "Warsaw": "Warszawa",
        "Lodz": "Łódź",
        "Gdansk": "Gdańsk",
        "Wroclaw": "Wrocław",
        "Poznan": "Poznań",
        "Krakow": "Kraków",
        "Cracow": "Kraków",
        "Zabierzów k. krakowa": "Zabierzów",
        "Zdalnie": "Remote",
        "Remotely pol": "Remote",
    }
    skipped = {"Poland"}

    normalised = (
        place['city'].capitalize()
        for offer in data
        for place in offer['location']['places']
        if 'city' in place
    )
    return Counter(aliases.get(city, city) for city in normalised if city not in skipped)

# Count cities, drop the "Remote" pseudo-city, geocode the rest and render the map.
cnt = get_city_cnt(data)
del cnt["Remote"]  # Counter's del does not raise when the key is absent
location_data = geocode_addresses(cnt.keys())
plot_addresses_on_map(cnt, location_data)

**Benefits**

In [None]:
# Canonical benefit name -> spellings that should be folded into it.
benefits_synonym_inv_map = {
    'Sport subscription': ['Sport Subscription', 'Multisport card', 'Multisport'],
    'Insurance': ['Life & group insurance', 'Insurance'],
    'Language classes': ['English lessons'],
    'Training budget': ['Trainings'],
}
# Inverted lookup: spelling -> canonical benefit name.
benefits_synonym_map = {
    variant: canonical
    for canonical, variants in benefits_synonym_inv_map.items()
    for variant in variants
}


def disambiguate_benefits(benefits: list[str]) -> list[str]:
    """Replace each known synonym in ``benefits`` with its canonical name.

    Entries that are not in ``benefits_synonym_map`` pass through unchanged;
    order and length of the list are preserved.
    """
    canonical_names = []
    for benefit in benefits:
        canonical_names.append(benefits_synonym_map.get(benefit, benefit))
    return canonical_names


def get_benefits_and_office_perks() -> tuple[Counter, Counter]:
    """Tally benefits and office perks over the notebook-level ``data`` list.

    Benefits are passed through ``disambiguate_benefits`` before counting;
    office perks are counted as-is.

    Returns:
        ``(benefits_counter, office_perks_counter)``.
    """
    benefit_tally, perk_tally = Counter(), Counter()
    for offer in data:
        section = offer['benefits']
        benefit_tally.update(disambiguate_benefits(section['benefits']))
        perk_tally.update(section['officePerks'])
    return benefit_tally, perk_tally


# Notebook-level tallies consumed by the plotting cells below.
benefits_counter, office_perks_counter = get_benefits_and_office_perks()

In [None]:
def plot_benefits(benefit_counter, title, col="jet", topn=16):
    """Draw a horizontal bar chart of the ``topn`` most frequent perks.

    Args:
        benefit_counter: mapping of perk name -> number of appearances.
        title: chart title.
        col: "jet" selects the jet colormap; any other value selects RdYlBu_r.
        topn: maximum number of bars to draw.
    """
    # Stable descending sort: ties keep the counter's iteration order.
    top = sorted(benefit_counter, key=lambda name: -benefit_counter[name])[:topn]
    # barh draws from the bottom up, so reverse to put the most frequent on top.
    names = top[::-1]
    values = [benefit_counter[name] for name in names]

    cmap = plt.cm.jet if col == "jet" else plt.cm.RdYlBu_r
    colors = cmap(np.linspace(0, 0.7, len(names)))

    # A quarter inch per bar, but never below one inch: with fewer than four
    # bars the integer division would otherwise request a zero-height figure.
    plt.figure(figsize=(12, max(1, len(names) // 4)))
    bars = plt.barh(names, values, color=colors)

    # Print each count just past the end of its bar.
    for bar, value in zip(bars, values):
        plt.text(bar.get_width() + 1, bar.get_y() + bar.get_height() / 2, f'{value}',
                 va='center', ha='left')

    plt.margins(y=0.01)
    plt.xlabel('Appearances')
    plt.ylabel('Perk')
    plt.title(title)

In [None]:
# Top benefits after synonym folding (default jet colormap, 16 bars).
plot_benefits(benefits_counter, title='Benefits')

In [None]:
# Top office perks, drawn with the RdYlBu_r colormap to tell the chart apart from the benefits one.
plot_benefits(office_perks_counter, title='Office Perks', col='RdYlBu_r')

**Titles**

In [None]:
# Canonical job title -> title variants that should be counted under it.
title_synonym_inv_map = {
    'DevOps Engineer': ['Senior DevOps Engineer'],
    'Data Engineer': ['Senior Data Engineer'],
    'Software Developer': ['Senior Software Engineer'],
    'Frontend Developer': ['Senior Frontend Developer'],
    'Product Manager': ['Senior Product Manager'],
    'Java Developer': ['Senior Java Developer', 'Java Software Engineer'],
}
# Inverted lookup: title variant -> canonical job title.
title_synonym_map = {
    variant: canonical
    for canonical, variants in title_synonym_inv_map.items()
    for variant in variants
}

def get_title_counter() -> Counter:
    """Count job titles in the notebook-level ``data`` list.

    Titles found in ``title_synonym_map`` are counted under their canonical
    name; all other titles are counted verbatim.
    """
    return Counter(
        title_synonym_map.get(offer['title'], offer['title'])
        for offer in data
    )

# Notebook-level title tally consumed by plot_titles.
title_counter = get_title_counter()

In [None]:
def plot_titles(title_counter, topn=16):
    """Draw a horizontal bar chart of the ``topn`` most frequent job titles.

    Args:
        title_counter: mapping of job title -> number of appearances.
        topn: maximum number of bars to draw.
    """
    # Stable descending sort: ties keep the counter's iteration order.
    top = sorted(title_counter, key=lambda name: -title_counter[name])[:topn]
    # barh draws from the bottom up, so reverse to put the most frequent on top.
    names = top[::-1]
    values = [title_counter[name] for name in names]

    colors = plt.cm.inferno_r(np.linspace(0, 0.7, len(names)))

    # A quarter inch per bar, but never below one inch: with fewer than four
    # bars the integer division would otherwise request a zero-height figure.
    plt.figure(figsize=(12, max(1, len(names) // 4)))
    bars = plt.barh(names, values, color=colors)

    # Print each count just past the end of its bar.
    for bar, value in zip(bars, values):
        plt.text(bar.get_width() + 0.1, bar.get_y() + bar.get_height() / 2, f'{value}',
                 va='center', ha='left')

    plt.margins(y=0.01)
    plt.xlabel('Appearances')
    plt.ylabel('Title')
    plt.title('Job Titles')

# Chart the 16 most common job titles (default topn).
plot_titles(title_counter)

**Description word cloud**

In [None]:
# !pip install wordcloud

In [None]:
# Concatenate every offer's requirements description into one space-separated string for the word cloud.
requirements_desc_joined = " ".join(item['requirements']['description'] for item in data)

In [None]:
# Concatenate every offer's details description into one space-separated string for the word cloud.
details_desc_joined = " ".join(item['details']['description'] for item in data)

In [None]:
from wordcloud import WordCloud, STOPWORDS

# Words excluded from both word clouds: the wordcloud defaults plus two
# hand-maintained sets.
stopwords = STOPWORDS.union(
    # html (?)
    {'li', 'ul', 'strong', 'p', 'w', 'b', 'br', 'h3', 'h2', 'h1', 'u', 'amp', 's'},
    # other stopwords
    {'się', 'tym', 'z', 'e.g.', 'eg', 'etc', 'np.', 'np', 'em', 'will', 'll', 'na', 'jak', 'jest', 're'},
)

In [None]:
# Word cloud of all requirement descriptions, rendered on a white 2400x1600 canvas.
requirements_wordcloud = WordCloud(
    width=2400,
    height=1600,
    stopwords=stopwords,
    background_color="white",
)
requirements_wordcloud.generate(requirements_desc_joined)

plt.figure(figsize=(18, 14))
plt.imshow(requirements_wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

In [None]:
# Word cloud of all offer detail descriptions, rendered on a white 2400x1600 canvas.
details_wordcloud = WordCloud(
    width=2400,
    height=1600,
    stopwords=stopwords,
    background_color="white",
)
details_wordcloud.generate(details_desc_joined)

plt.figure(figsize=(18, 14))
plt.imshow(details_wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()