In [12]:
# ! pip install streamlit
# ! pip install --upgrade --quiet unstructured
# ! pip install langchain_community
# ! pip install unstructured
# ! pip install openai

In [13]:
#Imports

import requests
import os
import re
import ast
import openai
import time
import random
import json
import pandas as pd
from langchain_community.document_loaders import UnstructuredURLLoader
import streamlit as st
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
import numpy as np
import altair as alt
from datetime import date
import plotly.express as px
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [14]:
# Project constants

today = str(date.today())

with open("/content/api_key.txt", "r") as file:
    key = file.read().strip()

model = "gpt-4o"

# Scraping and Downloading Data From Files

In [15]:
# Djinni allows to scrape only the first 15 vacancies:(
# At least something unlike DOU

def scrape_list_page():
    page_url = f"https://djinni.co/jobs/?primary_keyword=Data+Science&primary_keyword=ML+AI&exp_level=2y&exp_level=3y&exp_level=4y&page=1"
    try:
        # Send an HTTP GET request to the current page, as far as it successful parsing continues
        response = requests.get(page_url)
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
            # Find all anchor tags with the specified class
            job_links = soup.select('a.job-item__title-link')
            # Extract the 'href' attribute from each link and supplement it to the full URL
            start = 'https://djinni.co'
            job_urls = [start + link['href'] for link in job_links if 'href' in link.attrs]
            # Append the URLs to the list
            return list(set(job_urls))
        else:
            print(f"Non-200 response for {page_url}: {response.status_code}")
    except Exception as e:
        print(f"Error occurred on {page_url}: {e}")

In [16]:
# Djinni blocked our IP address entirely:(
# But we still have some vacancies in the bucket:D
def getting_urls():
    try:
        scraped_urls = scrape_list_page()
        if len(scraped_urls) > 0:
            print(f'There are {len(scraped_urls)} URLs scraped!')
            return scraped_urls
        else:
            scraped_urls = []
            return scraped_urls
    except Exception as e:
        print(f"An error occurred while scraping: {e}")
        scraped_urls = []
        return scraped_urls

scraped_urls = getting_urls()

There are 15 URLs scraped!


In [17]:
# Let's extract from downloaded data links to the vacancies via regex expression

folder_path = "/content/drive/MyDrive/Set_Uni"
urls_from_txts = []
# Regex pattern to find HTTPS URLs
# Matches URLs starting with http or https
url_pattern = r"https?://[^\s]+"

# Loop over all files in the folder
for file_name in os.listdir(folder_path):
    file_path = os.path.join(folder_path, file_name)
    if file_name.endswith(".txt"):
        with open(file_path, "r", encoding="utf-8") as file:
            content = file.read()
            urls = re.findall(url_pattern, content)
            if urls:
                urls_from_txts.append(urls[0])

print(f"There are {len(urls_from_txts)} extracted URLs.")

There are 82 extracted URLs.


In [18]:
#############Alternative: Downloading from the S3 Bucket###############
# Define S3 bucket and folders
# bucket_name = "raw-job-descriptions"
# s3_folder = "raw-txt-job/"
# local_folder = "job_data/"

# # Ensure the local folder exists
# os.makedirs(local_folder, exist_ok=True)

# # List files in the S3 folder
# response = s3.list_objects_v2(Bucket=bucket_name, Prefix=s3_folder)

# # Download each file
# if 'Contents' in response:
#     for obj in response['Contents']:
#         file_key = obj['Key']
#         file_name = file_key.split("/")[-1]
#         local_file_path = os.path.join(local_folder, file_name)
#         if not file_name:
#             continue
#         print(f"Downloading {file_key} to {local_file_path}...")
#         s3.download_file(bucket_name, file_key, local_file_path)
#     print("Download complete.")
# else:
#     print("No files found in the specified folder.")

In [19]:
total_urls = scraped_urls + urls_from_txts
print(f'Totally we have {len(total_urls)} URLs.')

Totally we have 97 URLs.


In [20]:
# Let's load pages with the help of Langchain UnstructuredURLLoader
if total_urls:
    try:
        loader = UnstructuredURLLoader(urls=total_urls)
        data_scraped = loader.load()
        print('Example:', data_scraped[0])
        print("\nThe unstructured data has been successfully loaded!")
    except Exception as e:
        print(f"Error loading data from URLs: {e}")
else:
    print("\nNo URLs extracted. Skipping UnstructuredURLLoader:(")

Example: page_content='Error

All jobs

Data and Analytics

Data Science

ML / AI

Kyiv

AI Engineer

PlayMe Studio

About the role:

We are seeking a talented AI Engineer to join our team and drive the development of the core of our web AI product. As an AI Engineer, you will play a pivotal role in shaping the future of our product by being responsible for the LLM stack, its improvement, and deployment.

Areas of responsibility:

Designing and implementing LLM API integrations with various providers (e.g., OpenAI, Anthropic, Cohere, Meta);

Developing and optimizing prompt engineering strategies to enhance model performance and ensure consistent, high-quality outputs;

Creating and maintaining multi-agent and multi-modal workflows (text, image, speech generation);

Implementing safeguards and validation mechanisms to prevent hallucinations and ensure compliance with regulations;

​​Implementing and optimizing image generation capabilities with character consistency (custom ComfyUI wor

# Information extraction

In [21]:
# Prompt for important information extraction (with examples)

def prompting(job_description):
    prompt_general = f"""Your task is to analyze the current job market state.
    To do so read the job description and return JSON with specific information.
    The description may be written in Ukrainian, English, or a mix of both.
    It may contain short info from other job descriptions, ignore it, and focus on the main topic.
    Instructions:
    1. Extract the following details from each job description:
    - Company name
    - Title
    - Hard skills required (specific technologies, programming languages, frameworks, tools)
    - Soft skills required (e.g., English proficiency, communication skills, teamwork)
    - Education requirements (whether a degree is required or if experience alone is sufficient)
    2. Return information from a job description, strictly following the format of the examples below.
    Examples:
    {{
      "company_name": "UkrSoft",
      "title_name": "Middle Machine Learning Engineer",
      "hard_skills_technologies": [
        "Python", "R", "Scikit-learn", "XGBoost", "CatBoost", "TensorFlow",
        "SQL", "Spark", "Регресії", "дерева рішень", "статистичні методи",
        "image recognition algorithms", "deploying image recognition algorithms", "deep learning"
      ],
      "soft_skills": ["advanced English", "profound communication skills"],
      "education": "MS in mathematics or statistics"
    }},

    {{
      "company_name": "HardServe",
      "title_name": "Junior Data Scientist",
      "hard_skills_technologies": [
        "LLM proficiency", "evaluate model performance", "MySQL", "PostgreSQL", "SAP IQ",
        "ETL сервіси AWS", "Tableau REST API", "Hugging Face Transformers",
        "Machine learning algorithms", "Anthropic Models", "Pandas",
        "Apache Spark (pyspark)", "Apache Kafka", "Git", "метрики класифікації"
      ],
      "soft_skills": ["being a team player", "excellent presenting skills"],
      "education": "at least 1 year of experience"
    }}
    Job description: {job_description}"""
    return prompt_general

In [22]:
# Function for information extraction and response parsing

def llm_parsing(description):
    """
    Analyzes and parses scraped data according to the example
    """
    client = openai.OpenAI(api_key=key)
    response = client.chat.completions.create(
        model=model,
        messages = [
        {
            "role": "system",
            "content": ("""You are an expert in LLMs, machine learning, and data science.
                Analyze the provided description and return your answer strictly
                as valid JSON with no extra words.""")},
        {"role": "user", "content": prompting(description)}],
        temperature=0.1)
    content = response.choices[0].message.content
    content = content.replace("```", "").strip()
    try:
        # Attempt to find the first curly brace and slice from there
        json_start = content.index("{")
        json_text = content[json_start:]
        parsed_output = json.loads(json_text)
    except (ValueError, json.JSONDecodeError) as e:
        print("Error parsing JSON (explicit method):", e)
        return None
    return parsed_output

In [23]:
# Create analytics dataframe for the further analysis

parsed_info_list = []
for d in data_scraped:
    result = llm_parsing(d.page_content)
    if result is not None:
        parsed_info_list.append(result)
df = pd.DataFrame(parsed_info_list)
print(f"Shape of the created analytics dataframe is {df.shape}")
df[:5]

Shape of the created analytics dataframe is (97, 5)


Unnamed: 0,company_name,title_name,hard_skills_technologies,soft_skills,education
0,PlayMe Studio,AI Engineer,"[LLM APIs, AI-powered features, prompt enginee...",[strong understanding of the Product Developme...,3+ years of experience
1,LL Capital,Quant Researcher,"[mathematical modeling, statistical modeling, ...",[],"MSc or PhD in Statistics, Mathematics, Compute..."
2,Respaid,ML engineer • Data automation specialist,"[OPENAI, BERT, CLAUDE, GEMINI, STT, TTS, Pytho...",[Intermediate English],"from 2 years of experience, considering with 1..."
3,Ocean Script,Backend Developer (Python) with experience in NLP,"[Python, FastAPI, Flask, Django, NLP models, O...",[],2+ years of experience
4,EveryMatrix,AI/ML Lead Engineer,"[Python, PyTorch, TensorFlow, scikit-learn, La...","[excellent communication skills, English: Uppe...","Master’s in Data Science (or related field), o..."


In [24]:
# Drop duplicates in case of identical vcancies

print(f'Original dataset shape: {df.shape}')
df = df[~df.duplicated(subset=['company_name', 'title_name'], keep=False)]
print(f'Shape after dropping duplicates: {df.shape}')

Original dataset shape: (97, 5)
Shape after dropping duplicates: (75, 5)


In [25]:
# Filter out rows with only nan values

print(f'Original dataset shape: {df.shape}')
df = df.dropna(how="all")
print(f'Shape after dropping nans: {df.shape}')

Original dataset shape: (75, 5)
Shape after dropping nans: (74, 5)


In [26]:
# Function for analysis (based on analytics dataframe)

def llm_analytics(prom):
    """
    Analyzes and parses data from the analytics table
    """
    client = openai.OpenAI(api_key=key)
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "system", "content": """You are an expert in LLMs, machine learning, and data science.
                You return your answers strictly as valid JSON with no extra words."""},
                  {"role": "user", "content": prom}], temperature=0.1)
    content = response.choices[0].message.content
    #Remove potential markdown formatting (like ```json ... ```)
    content = content.replace("```", "").strip()
    try:
        # Attempt to find the first curly brace and slice from there
        json_start = content.index("{")
        json_text = content[json_start:]
        parsed_output = json.loads(json_text)
    except (ValueError, json.JSONDecodeError) as e:
        print("Error parsing JSON (explicit method):", e)
        return None
    return parsed_output

# Popularity of Data-Related Job Titles in the Market (Pie Chart)

In [27]:
job_titles = df['title_name'].to_list()
job_titles_str = ", ".join(job_titles)

pie_chart_title = f"""You are given a list of job titles extracted from job descriptions: {job_titles_str}.
### Task:
1. **Categorize** each job title into one of the following six labels:
   - 'Machine Learning Engineer'
   - 'Data Scientist'
   - 'NLP/LLM-related'
   - 'CV Engineer'
   - 'Other'
2. **Count the occurrences** of each category.
### Output:
- Return a **JSON object** showing the count of job titles per category.
- **Format the JSON exactly** as shown below, with no extra words or explanations:
Example output:
{{"Machine Learning Engineer": 5, "Data Scientist": 3, "NLP/LLM-related": 2, "CV Engineer": 1, "Data Analyst": 0, "Other": 2}}"""

max_retries = 3
attempt = 0
pie_chart_data = None

while attempt < max_retries:
    try:
        pie_chart_data = llm_analytics(pie_chart_title)
        if pie_chart_data is not None:
            print("Function executed successfully!")
            break
    except Exception as e:
        print(f"Attempt {attempt + 1} failed: {e}")
    attempt += 1
if pie_chart_data is None:
    print("Function failed after 3 attempts.")

Function executed successfully!


# Education Requirements (Donuts Charts)

In [28]:
requirement_education = df['education'].to_list()
job_education_str = ", ".join(requirement_education)

donut_education = f"""
You are given a list of company requirements regarding the education level of candidates: {job_education_str}.

### **Task:**
1. **Classify each requirement** into one of the following three categories:
   - **"Higher_education"** → If the job explicitly requires a higher education degree.
   - **"Just_experience"** → If the job focuses only on experience and does not require higher education.
   - **"Both_required"** → If both higher education and experience are mandatory.

2. **Count the occurrences** of each category.

### **Output:**
- Return a **JSON object** showing the frequency of each category.
- **Strictly follow this format**, with no extra words or explanations:

Example output: {{"Higher_education": 5, "Just_experience": 6, "Both_required": 2}}"""

max_retries = 3
attempt = 0
donut_data = None

while attempt < max_retries:
    try:
        donut_data = llm_analytics(donut_education)
        if donut_data is not None:
            print("Function executed successfully!")
            break
    except Exception as e:
        print(f"Attempt {attempt + 1} failed: {e}")
    attempt += 1
if donut_data is None:
    print("Function failed after 3 attempts.")

Function executed successfully!


# Hard skills (Barplot)

In [30]:
hard_skills = df['hard_skills_technologies'].to_list()
flattened_list = [item for sublist in hard_skills for item in sublist]
hard_skills_str = ", ".join(flattened_list)

prompt_bar_plot = f"""You are given a dataset scraped from job vacancies,
containing technologies and hard skills required for Data Science, Machine Learning, and NLP/LLM roles.

### **Your Task:**
1. **Count** the occurrences of each unique skill or technology.
2. **Normalize Similar Skills**:
   - If two skills have very similar meanings (e.g., "Model Deployment" and "Model Implementation in Production"), **group them into one category** and sum their occurrences.
   - Consider naming variations such as "Python Scripting" and "Python" as the same skill ("Python").
3. **Return a JSON Object** where:
   - **Keys** are the normalized technology/skill names.
   - **Values** represent the number of times each skill/technology was mentioned.
   - **Use properly escaped double quotes** for all keys and string values.
4. **Output Only JSON**:
   - **Do not add explanations, summaries, or extra text.**
   - Your response should strictly follow the JSON format.

### **Example Output Format:**
{{
  "Python": 11,
  "R": 1,
  "Scikit-learn": 4,
  "Anthropic Models": 3,
  "Model Deployment": 7,
  "TensorFlow": 2,
  "CatBoost": 1,
  "PostgreSQL": 2,
  "Pandas": 5,
  "RAG": 4
}}
List with technologies/hard skills: {hard_skills_str}"""

max_retries = 3
attempt = 0
bar_plot_data = None

while attempt < max_retries:
    try:
        bar_plot_data = llm_analytics(prompt_bar_plot)
        if bar_plot_data is not None:
            print("Function executed successfully!")
            break
    except Exception as e:
        print(f"Attempt {attempt + 1} failed: {e}")
    attempt += 1
if bar_plot_data is None:
    print("Function failed after 3 attempts.")

In [None]:
df_hard = (
    pd.DataFrame({"Technologies": list(bar_plot_data.keys()), "Number": list(bar_plot_data.values())})
    .sort_values(by="Number", ascending=False)
    .reset_index(drop=True))
df_hard[:3]

# Soft skills (Vertical Progress Bars)

In [None]:
soft_skills = df['soft_skills'].to_list()
flattened_list = [item for sublist in soft_skills for item in sublist]
soft_skills_str = ", ".join(flattened_list)

prompt_horiz_bar = f"""You are given a list of soft skills scraped from job vacancies
 in the fields of Data Science, Machine Learning, and NLP/LLM.

### **Your Task:**
1. **Count** the occurrences of each unique soft skill.
2. **Normalize Similar Skills**:
   - Group skills with similar meanings (e.g., "Good communication skills" and "Ability to communicate effectively") under a single category.
   - Ensure consistent naming conventions (e.g., "Excellent presenting skills" vs. "Strong presentation skills").
3. **Return a Dictionary** where:
   - **Keys** are the normalized soft skill names.
   - **Values** represent the number of times each skill was mentioned.
   - The dictionary **must be properly formatted for parsing**.
4. **Strict Output Formatting**:
   - **Do not add any extra words, explanations, or text**.
   - The response should be **a clean, parsing-ready dictionary**.

### **Example Output Format:**
{{
    "Advanced English": 11,
    "Excellent presenting skills": 1,
    "Being a team player": 4,
    "Good communication skills": 3
}}
List of soft skills: {soft_skills_str}
"""

max_retries = 3
attempt = 0
bar_plot_horiz_data = None

while attempt < max_retries:
    try:
        bar_plot_horiz_data = llm_analytics(prompt_horiz_bar)
        if bar_plot_horiz_data is not None:
            print("Function executed successfully!")
            break
    except Exception as e:
        print(f"Attempt {attempt + 1} failed: {e}")
    attempt += 1
if bar_plot_horiz_data is None:
    print("Function failed after 3 attempts.")

In [None]:
df_soft = (
    pd.DataFrame({"Soft_skills": list(bar_plot_horiz_data.keys()), "Number": list(bar_plot_horiz_data.values())})
    .sort_values(by="Number", ascending=False).reset_index(drop=True))

df_soft[:3]

# Saving All Analytics

In [None]:
# Saving files localy

path = '/content/drive/MyDrive/Set_Uni_Analytics/'
analytics_name = path + 'analytics' + today + '.csv'
soft_skills_name = path + 'soft_skills' + today + '.csv'
hard_skills_name = path + 'hard_skills' + today + '.csv'
donut_data_name = path + 'donut_data.json'
pie_chart_data_name = path + 'pie_chart_data.json'

with open(donut_data_name, "w", encoding="utf-8") as f:
    json.dump(donut_data, f, indent=4)

with open(pie_chart_data_name, "w", encoding="utf-8") as f:
    json.dump(pie_chart_data, f, indent=4)

df.to_csv(analytics_name)
df_soft.to_csv(soft_skills_name, index=False)
df_hard.to_csv(hard_skills_name, index=False)
print("Files were successfully saved!")

In [None]:
# Uploading fresh analytics to the S3 bucket

BUCKET_NAME = "data-job-market-monitoring"
LOCAL_FOLDER = "job_analytics/"
S3_FOLDER = "job_analytics" + today + "/"

def upload_files_to_s3():
    """
    Upload all files from LOCAL_FOLDER to S3.
    """
    if not os.path.exists(LOCAL_FOLDER):
        print(f"Error: Folder '{LOCAL_FOLDER}' does not exist!")
        return
    # Walk through local folder and upload each file
    for root, _, files in os.walk(LOCAL_FOLDER):
        for file_name in files:
            local_file_path = os.path.join(root, file_name)
            s3_key = os.path.relpath(local_file_path, LOCAL_FOLDER)
            s3_key = os.path.join(S3_FOLDER, s3_key).replace("\\", "/")
            try:
                print(f"Uploading {local_file_path} to s3://{BUCKET_NAME}/{s3_key}...")
                s3.upload_file(local_file_path, BUCKET_NAME, s3_key)
                print("Upload successful!")
            except Exception as e:
                print(f"Failed to upload {local_file_path}: {e}")
    print("\nAll files have been uploaded to S3!")

# Run the upload function
upload_files_to_s3()