In [None]:
import json
from collections import Counter, defaultdict
from tabulate import tabulate
import matplotlib.pyplot as plt
import numpy as np

# Load the scraped offers. `data` is reset to None first, so it is None
# (rather than a stale value from an earlier run) if the read below raises.
data = None
with open('./data/result_offers.json', 'r', encoding="utf-8") as f:
    text = f.read()
    data = json.loads(text)
# print(json.dumps(data[0], indent=4, ensure_ascii=False))

# Flatten the per-offer requirement lists into three notebook-wide lists
# that the plotting and table cells consume.
musts, nices, langs = [], [], []
for item in data:
    requirements = item["requirements"]
    musts += requirements["musts"]
    nices += requirements["nices"]
    langs += requirements["languages"]

# Quick sanity summary of how much data was loaded.
print(f"Offers: {len(data)}\nMust have requirements: {len(musts)}\nNice to have requirements {len(nices)}")

In [None]:
def get_avg_salary(item):
    """Return the average of an offer's salary range, preferring permanent contracts.

    The range is read from
    ``item["essentials"]["salary"]["types"][<contract>]["range"]``, trying
    ``"permanent"`` first and ``"b2b"`` second. The first contract whose range
    can be averaged wins.

    Args:
        item: Offer mapping. Must contain ``essentials -> salary -> currency``.

    Returns:
        A dict ``{"contract": ..., "currency": ..., "avg": ...}`` where ``avg``
        is the arithmetic mean of the range values converted with ``int``, or
        ``None`` when neither contract type has a usable range.

    Raises:
        KeyError: If ``essentials``, ``salary`` or ``currency`` is missing.
    """
    salary = item["essentials"]["salary"]
    currency = salary["currency"]

    for contract in ("permanent", "b2b"):
        try:
            bounds = salary["types"][contract]["range"]
            avg = sum(map(int, bounds)) / len(bounds)
        except (LookupError, TypeError, ValueError, ZeroDivisionError):
            # Missing key, null/ill-typed value, non-numeric bound or empty
            # range: this contract type is unusable, try the next one.
            # Anything else (e.g. KeyboardInterrupt) is allowed to propagate.
            continue
        return {"contract": contract, "currency": currency, "avg": avg}

    return None


def plot_against_salary(data, title, box_contract = None):
    """Plot average salaries split into remote and non-remote offers.

    Offers are grouped by ``item["location"]["remote"]`` and drawn one subplot
    per group. Offers for which ``get_avg_salary`` returns ``None`` are skipped.

    Args:
        data: Iterable of offer mappings.
        title: Figure-level title.
        box_contract: ``None`` to draw histograms of both contract types
            (with a dashed line at each mean), or ``"permanent"`` / ``"b2b"``
            to draw a horizontal box plot of that contract type only.

    Returns:
        None. The figure is created through pyplot. When no offer has a usable
        salary, a message is printed and nothing is drawn.

    Raises:
        ValueError: If ``box_contract`` is invalid, an average salary is below
            10, or a salary is not in PLN.
    """
    if box_contract not in [None, "permanent", "b2b"]:
        raise ValueError("Invalid box_contract value")

    grouped = defaultdict(list)
    for item in data:
        attr = "Remote" if item["location"]["remote"] else "Non-remote"
        salary = get_avg_salary(item)
        if salary is None:
            continue
        if salary["avg"] < 10:
            raise ValueError("Supiciously small salary", item)
        if salary["currency"] != "PLN":
            raise ValueError("Found non-PLN salary range")
        grouped[attr].append(salary)

    if not grouped:
        # plt.subplots() rejects nrows=0, so bail out before touching pyplot.
        print("No offers with a usable salary range; nothing to plot.")
        return

    # squeeze=False keeps `axes` a 2-D array even when there is only one
    # group; otherwise a bare Axes is returned and cannot be zipped over.
    fig, axes = plt.subplots(nrows=len(grouped), ncols=1, figsize=(10, 6.5), squeeze=False)
    fig.subplots_adjust(hspace=0.4)
    fig.suptitle(title)

    labels = ["permanent", "b2b"]
    colors = ["C0", "C1"]
    for ax, (key, group) in zip(axes[:, 0], grouped.items()):
        permanent_means = [salary["avg"] for salary in group if salary["contract"] == "permanent"]
        b2b_means = [salary["avg"] for salary in group if salary["contract"] == "b2b"]

        for i, means in enumerate([permanent_means, b2b_means]):
            print(f"Plotting {len(means)} {key} {labels[i]} records.")
            if box_contract:
                if labels[i] != box_contract:
                    continue
                ax.boxplot(means, vert=False)
                ax.get_yaxis().set_visible(False)
                ax.set_title(f"{key} {box_contract}")
            else:
                ax.set_ylabel("Number of occurances")
                ax.set_title(key)
                if not means:
                    # Nothing to bin; np.mean([]) would yield NaN with a
                    # RuntimeWarning and add meaningless legend entries.
                    continue
                counts, bins = np.histogram(means, 20)
                ax.stairs(counts, bins, label=labels[i], color=colors[i])
                ax.axvline(np.mean(means), linestyle='dashed', color=colors[i], linewidth=1, label=f"{labels[i]} mean")
                ax.legend()

        # Shared x-range so the panels are directly comparable.
        ax.set_xlim(-1, 55_000)
        ax.set_xlabel("Range average salary [PLN]")


# Histogram view: permanent vs. b2b average salaries, one panel per
# remote / non-remote group.
plot_against_salary(data, title="Salary vs remote work")
# Alternative view: box plots restricted to b2b contracts.
# plot_against_salary(data, box_contract="b2b", title="B2b salary vs remote work")

In [None]:
def plot_reqs(reqs, title, col="plasma", top_n=50):
    """Draw a horizontal bar chart of the most frequent requirement values.

    Args:
        reqs: Iterable of mappings; each one's ``"value"`` entry is counted.
        title: Chart title.
        col: ``"plasma"`` selects the plasma colour map; any other value
            selects viridis.
        top_n: Maximum number of distinct values to show (most frequent
            first, drawn from the top of the chart downwards).

    Returns:
        None. The chart is created through pyplot. When ``reqs`` is empty a
        message is printed and nothing is drawn.
    """
    counter = Counter(req["value"] for req in reqs)
    # most_common() keeps first-seen order among ties, like a stable sort.
    top = counter.most_common(top_n)
    if not top:
        print(f"No requirements to plot for '{title}'.")
        return

    # barh draws the first entry at the bottom, so reverse to put the most
    # frequent value at the top.
    names = [name for name, _ in reversed(top)]
    values = [count for _, count in reversed(top)]

    cmap = plt.cm.plasma if col == "plasma" else plt.cm.viridis
    # Stop at 0.7 of the colour map, as the original gradient did.
    colors = cmap(np.linspace(0, 0.7, len(names)))

    # Height grows with the number of bars; floor at 1 because a zero-height
    # figure (fewer than four bars) is rejected by matplotlib.
    fig, ax = plt.subplots(figsize=(12, max(1, len(names) // 4)))
    bars = ax.barh(names, values, color=colors)

    # Adding values on bars
    for bar, value in zip(bars, values):
        ax.text(bar.get_width() + 1, bar.get_y() + bar.get_height() / 2, f'{value}',
                va='center', ha='left')

    ax.margins(y=0.01)
    ax.set_xlabel('Appearances')
    ax.set_ylabel('Skill')
    ax.set_title(title)

# One chart per requirement kind, each with its own colour map.
plot_reqs(musts, "Must have skills appearances", col="plasma")
plot_reqs(nices, "Nice to have skill appearances", col="viridis")

In [None]:
# Language requirements summary: overall counts, then a level-by-language table.
# Guard the average so an empty offer list does not raise ZeroDivisionError.
avg_langs_per_offer = len(langs) / len(data) if data else 0.0
print(len(langs), len(data), f"\nAvg lang per offer: {avg_langs_per_offer:.3f}\n")

# Columns: language codes, most frequently required first.
counter = Counter(lang["code"] for lang in langs)
names = sorted(counter, key=lambda x: -counter[x])

# Rows: proficiency levels; entries without a "level" key are counted as "NA".
levels = ["NA", "A1", "A2", "B1", "B2", "C1", "C2", "NATIVE"]
# Count every (code, level) pair in one pass instead of rescanning `langs`
# for each table cell.
level_counter = Counter((lang["code"], lang.get("level", "NA")) for lang in langs)
table = [[level] + [level_counter[(code, level)] for code in names] for level in levels]
# Totals come from the per-code counter, so they also include any level
# value that is not listed in `levels`.
table.append(["Total"] + [counter[x] for x in names])
print(tabulate(table, headers=[""] + names))