In [1]:
import os
import sys
import math

import subprocess
import altair as alt
import pandas as pd

import plotly.express as px
import plotly.graph_objects as go

import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from pyspark.sql import Window

In [2]:
df_path = '/Users/zygimantas/Documents/DataSets/future_jobs_dataset.csv'

In [3]:
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

In [4]:
spark = (
    SparkSession.builder
    .appName('Airport Traffic')
    .master('local[4]')
    .config('spark.executor.memory', '2g')
    .config('spark.executor.cores', '2')
    .config("spark.dynamicAllocation.enabled", "true")
    .config("spark.dynamicAllocation.minExecutors", "1")
    .config("spark.dynamicAllocation.maxExecutors", "4")
    .config('spark.executor.memoryOverhead', '512m')
    .config("spark.driver.memory", "2g")
    .config("spark.driver.maxResultSize", "2g")
    .config('spark.sql.adaptive.enabled', 'true')
    .config('spark.sql.adaptive.coalescePartitions.enabled', 'true')
    .config('spark.sql.adaptive.advisoryPartitionSizeInBytes', '64mb')
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .config('spark.dynamicAllocation.executorIdleTimeout', '60s')
    .config('spark.sql.autoBroadcastJoinThreshold', '512mb')
    .getOrCreate()
)

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/12/19 08:13:21 WARN Utils: Your hostname, Zygimantass-MacBook-Air.local, resolves to a loopback address: 127.0.0.1; using 10.43.73.162 instead (on interface en0)
25/12/19 08:13:21 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/12/19 08:13:22 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [5]:
df_jobs_demand = spark.read.csv(df_path, header=True, inferSchema=True)

In [6]:
df_jobs_demand.show(truncate=False, n=10)

+------+--------------------------+-----------------+---------+----------+--------------------------------------+-------------+------------+------------+
|job_id|job_title                 |industry         |location |salary_usd|skills_required                       |remote_option|company_size|posting_date|
+------+--------------------------+-----------------+---------+----------+--------------------------------------+-------------+------------+------------+
|1     |Quantum Researcher        |Quantum Computing|Singapore|175780    |Linear Algebra, Quantum Algorithms    |No           |Large       |2025-07-22  |
|2     |Renewable Energy Engineer |Green Tech       |Singapore|137481    |Climate Data Analysis, Energy Modeling|Yes          |Large       |2025-09-26  |
|3     |Quantum Researcher        |Quantum Computing|Tokyo    |182081    |Linear Algebra, Qiskit                |No           |Medium      |2025-12-31  |
|4     |Sustainability Analyst    |Green Tech       |Singapore|113822    |Cl

In [7]:
df_jobs_demand.printSchema()

root
 |-- job_id: integer (nullable = true)
 |-- job_title: string (nullable = true)
 |-- industry: string (nullable = true)
 |-- location: string (nullable = true)
 |-- salary_usd: integer (nullable = true)
 |-- skills_required: string (nullable = true)
 |-- remote_option: string (nullable = true)
 |-- company_size: string (nullable = true)
 |-- posting_date: date (nullable = true)



In [8]:
print(df_jobs_demand.show(truncate=False, n=10))

+------+--------------------------+-----------------+---------+----------+--------------------------------------+-------------+------------+------------+
|job_id|job_title                 |industry         |location |salary_usd|skills_required                       |remote_option|company_size|posting_date|
+------+--------------------------+-----------------+---------+----------+--------------------------------------+-------------+------------+------------+
|1     |Quantum Researcher        |Quantum Computing|Singapore|175780    |Linear Algebra, Quantum Algorithms    |No           |Large       |2025-07-22  |
|2     |Renewable Energy Engineer |Green Tech       |Singapore|137481    |Climate Data Analysis, Energy Modeling|Yes          |Large       |2025-09-26  |
|3     |Quantum Researcher        |Quantum Computing|Tokyo    |182081    |Linear Algebra, Qiskit                |No           |Medium      |2025-12-31  |
|4     |Sustainability Analyst    |Green Tech       |Singapore|113822    |Cl

In [9]:
df_jobs_demand.select(
    'job_title', 'industry', 'salary_usd'
).show(10)

+--------------------+-----------------+----------+
|           job_title|         industry|salary_usd|
+--------------------+-----------------+----------+
|  Quantum Researcher|Quantum Computing|    175780|
|Renewable Energy ...|       Green Tech|    137481|
|  Quantum Researcher|Quantum Computing|    182081|
|Sustainability An...|       Green Tech|    113822|
|Smart Contract En...|       Blockchain|     92575|
|Smart Contract En...|       Blockchain|    173379|
|Renewable Energy ...|       Green Tech|     99659|
|Quantum Software ...|Quantum Computing|    210842|
|Sustainability An...|       Green Tech|    189475|
|Blockchain Developer|       Blockchain|    228992|
+--------------------+-----------------+----------+
only showing top 10 rows


In [10]:
df_jobs_demand.filter(
    F.col('salary_usd') > 150_000
).show(5)

+------+--------------------+-----------------+---------+----------+--------------------+-------------+------------+------------+
|job_id|           job_title|         industry| location|salary_usd|     skills_required|remote_option|company_size|posting_date|
+------+--------------------+-----------------+---------+----------+--------------------+-------------+------------+------------+
|     1|  Quantum Researcher|Quantum Computing|Singapore|    175780|Linear Algebra, Q...|           No|       Large|  2025-07-22|
|     3|  Quantum Researcher|Quantum Computing|    Tokyo|    182081|Linear Algebra, Q...|           No|      Medium|  2025-12-31|
|     6|Smart Contract En...|       Blockchain|    Tokyo|    173379|      Solidity, Rust|          Yes|      Medium|  2025-08-10|
|     8|Quantum Software ...|Quantum Computing|   London|    210842|Qiskit, Quantum A...|          Yes|       Large|  2025-04-13|
|     9|Sustainability An...|       Green Tech| New York|    189475|Climate Data Anal...| 

In [11]:
df_jobs_demand.where(
    F.col('salary_usd') > 150_000
).count()

5018

In [12]:
df_jobs_demand.createOrReplaceTempView('jobs_demand')

In [13]:
spark.sql("""
    SELECT * FROM jobs_demand
        WHERE salary_usd > 150000
""").show(5)

+------+--------------------+-----------------+---------+----------+--------------------+-------------+------------+------------+
|job_id|           job_title|         industry| location|salary_usd|     skills_required|remote_option|company_size|posting_date|
+------+--------------------+-----------------+---------+----------+--------------------+-------------+------------+------------+
|     1|  Quantum Researcher|Quantum Computing|Singapore|    175780|Linear Algebra, Q...|           No|       Large|  2025-07-22|
|     3|  Quantum Researcher|Quantum Computing|    Tokyo|    182081|Linear Algebra, Q...|           No|      Medium|  2025-12-31|
|     6|Smart Contract En...|       Blockchain|    Tokyo|    173379|      Solidity, Rust|          Yes|      Medium|  2025-08-10|
|     8|Quantum Software ...|Quantum Computing|   London|    210842|Qiskit, Quantum A...|          Yes|       Large|  2025-04-13|
|     9|Sustainability An...|       Green Tech| New York|    189475|Climate Data Anal...| 

In [14]:
df_jobs_demand.select(
    F.col('industry')
).distinct().show()

+-----------------+
|         industry|
+-----------------+
|Quantum Computing|
|               AI|
|       Green Tech|
|       Blockchain|
+-----------------+



In [15]:
spark.sql("""
    SELECT distinct(*) FROM jobs_demand
""").show()

+------+--------------------+-----------------+---------+----------+--------------------+-------------+------------+------------+
|job_id|           job_title|         industry| location|salary_usd|     skills_required|remote_option|company_size|posting_date|
+------+--------------------+-----------------+---------+----------+--------------------+-------------+------------+------------+
|   320|Sustainability An...|       Green Tech| New York|    102711|Climate Data Anal...|           No|       Large|  2025-08-08|
|   497|  Quantum Researcher|Quantum Computing|    Tokyo|    218314|Qiskit, Linear Al...|           No|       Small|  2025-03-24|
|   552|Renewable Energy ...|       Green Tech|    Dubai|    164148|Energy Modeling, ...|          Yes|       Small|  2025-08-29|
|   804|Smart Contract En...|       Blockchain| New York|    222966|  Solidity, Ethereum|          Yes|      Medium|  2025-05-17|
|  1090|Renewable Energy ...|       Green Tech|   London|    114249|Climate Data Anal...| 

In [16]:
spark.sql("""
    select count(*) from jobs_demand
""").show()

+--------+
|count(1)|
+--------+
|   10000|
+--------+



In [17]:
df_jobs_demand.count()

10000

In [18]:
df_jobs_demand.printSchema()

root
 |-- job_id: integer (nullable = true)
 |-- job_title: string (nullable = true)
 |-- industry: string (nullable = true)
 |-- location: string (nullable = true)
 |-- salary_usd: integer (nullable = true)
 |-- skills_required: string (nullable = true)
 |-- remote_option: string (nullable = true)
 |-- company_size: string (nullable = true)
 |-- posting_date: date (nullable = true)



In [19]:
df_jobs_demand.show(5)

+------+--------------------+-----------------+---------+----------+--------------------+-------------+------------+------------+
|job_id|           job_title|         industry| location|salary_usd|     skills_required|remote_option|company_size|posting_date|
+------+--------------------+-----------------+---------+----------+--------------------+-------------+------------+------------+
|     1|  Quantum Researcher|Quantum Computing|Singapore|    175780|Linear Algebra, Q...|           No|       Large|  2025-07-22|
|     2|Renewable Energy ...|       Green Tech|Singapore|    137481|Climate Data Anal...|          Yes|       Large|  2025-09-26|
|     3|  Quantum Researcher|Quantum Computing|    Tokyo|    182081|Linear Algebra, Q...|           No|      Medium|  2025-12-31|
|     4|Sustainability An...|       Green Tech|Singapore|    113822|Climate Data Anal...|           No|       Large|  2025-05-29|
|     5|Smart Contract En...|       Blockchain|   London|     92575|      Rust, Solidity| 

In [20]:
df_jobs_demand.filter(
    (F.col('remote_option') == 'Yes') & (F.col('location') == 'Singapore')
).show(5)

+------+--------------------+----------+---------+----------+--------------------+-------------+------------+------------+
|job_id|           job_title|  industry| location|salary_usd|     skills_required|remote_option|company_size|posting_date|
+------+--------------------+----------+---------+----------+--------------------+-------------+------------+------------+
|     2|Renewable Energy ...|Green Tech|Singapore|    137481|Climate Data Anal...|          Yes|       Large|  2025-09-26|
|    12|         AI Engineer|        AI|Singapore|    136880| TensorFlow, PyTorch|          Yes|       Small|  2025-03-15|
|    13|      Data Scientist|        AI|Singapore|     50387|  Python, TensorFlow|          Yes|      Medium|  2025-03-02|
|    18|      Data Scientist|        AI|Singapore|    190743|  TensorFlow, Python|          Yes|       Small|  2025-10-07|
|    47|Renewable Energy ...|Green Tech|Singapore|    188452|Energy Modeling, ...|          Yes|       Small|  2025-01-19|
+------+--------

In [21]:
spark.sql("""
    select * from jobs_demand
        where remote_option = 'Yes' and location = 'Singapore'
""").show(5)

+------+--------------------+----------+---------+----------+--------------------+-------------+------------+------------+
|job_id|           job_title|  industry| location|salary_usd|     skills_required|remote_option|company_size|posting_date|
+------+--------------------+----------+---------+----------+--------------------+-------------+------------+------------+
|     2|Renewable Energy ...|Green Tech|Singapore|    137481|Climate Data Anal...|          Yes|       Large|  2025-09-26|
|    12|         AI Engineer|        AI|Singapore|    136880| TensorFlow, PyTorch|          Yes|       Small|  2025-03-15|
|    13|      Data Scientist|        AI|Singapore|     50387|  Python, TensorFlow|          Yes|      Medium|  2025-03-02|
|    18|      Data Scientist|        AI|Singapore|    190743|  TensorFlow, Python|          Yes|       Small|  2025-10-07|
|    47|Renewable Energy ...|Green Tech|Singapore|    188452|Energy Modeling, ...|          Yes|       Small|  2025-01-19|
+------+--------

In [22]:
df_jobs_demand.orderBy(
    F.col('posting_date').desc()
).show(5)

+------+--------------------+-----------------+---------+----------+--------------------+-------------+------------+------------+
|job_id|           job_title|         industry| location|salary_usd|     skills_required|remote_option|company_size|posting_date|
+------+--------------------+-----------------+---------+----------+--------------------+-------------+------------+------------+
|  1989|Renewable Energy ...|       Green Tech|   London|    216801|Climate Data Anal...|           No|      Medium|  2025-12-31|
|  1204|Quantum Software ...|Quantum Computing|Singapore|    136678|Linear Algebra, Q...|          Yes|       Small|  2025-12-31|
|   342|Quantum Software ...|Quantum Computing|   London|     95148|Linear Algebra, Q...|          Yes|      Medium|  2025-12-31|
|  1496|Quantum Software ...|Quantum Computing|    Dubai|     97096|Qiskit, Quantum A...|          Yes|       Large|  2025-12-31|
|     3|  Quantum Researcher|Quantum Computing|    Tokyo|    182081|Linear Algebra, Q...| 

In [23]:
df_jobs_demand.orderBy('posting_date', ascending=False).show(5)

+------+--------------------+-----------------+---------+----------+--------------------+-------------+------------+------------+
|job_id|           job_title|         industry| location|salary_usd|     skills_required|remote_option|company_size|posting_date|
+------+--------------------+-----------------+---------+----------+--------------------+-------------+------------+------------+
|  1989|Renewable Energy ...|       Green Tech|   London|    216801|Climate Data Anal...|           No|      Medium|  2025-12-31|
|  1204|Quantum Software ...|Quantum Computing|Singapore|    136678|Linear Algebra, Q...|          Yes|       Small|  2025-12-31|
|   342|Quantum Software ...|Quantum Computing|   London|     95148|Linear Algebra, Q...|          Yes|      Medium|  2025-12-31|
|  1496|Quantum Software ...|Quantum Computing|    Dubai|     97096|Qiskit, Quantum A...|          Yes|       Large|  2025-12-31|
|     3|  Quantum Researcher|Quantum Computing|    Tokyo|    182081|Linear Algebra, Q...| 

In [24]:
df_jobs_demand.orderBy('posting_date', ascending=True).show(5)

+------+--------------------+----------+--------+----------+--------------------+-------------+------------+------------+
|job_id|           job_title|  industry|location|salary_usd|     skills_required|remote_option|company_size|posting_date|
+------+--------------------+----------+--------+----------+--------------------+-------------+------------+------------+
|   797|      Data Scientist|        AI|   Dubai|    212432|     PyTorch, Python|           No|       Small|  2025-01-01|
|  2376|Sustainability An...|Green Tech|  London|    169718|Climate Data Anal...|          Yes|      Medium|  2025-01-01|
|  1208|Blockchain Developer|Blockchain|  Berlin|     67727|  Ethereum, Solidity|          Yes|      Medium|  2025-01-01|
|   621|Renewable Energy ...|Green Tech|New York|     63564|Climate Data Anal...|          Yes|       Large|  2025-01-01|
|  1314|       ML Researcher|        AI|  London|    127411|  TensorFlow, Python|           No|      Medium|  2025-01-01|
+------+----------------

In [25]:
spark.sql("""
    select * from jobs_demand
        order by posting_date desc
""").show(5)

+------+--------------------+-----------------+---------+----------+--------------------+-------------+------------+------------+
|job_id|           job_title|         industry| location|salary_usd|     skills_required|remote_option|company_size|posting_date|
+------+--------------------+-----------------+---------+----------+--------------------+-------------+------------+------------+
|  1989|Renewable Energy ...|       Green Tech|   London|    216801|Climate Data Anal...|           No|      Medium|  2025-12-31|
|  1204|Quantum Software ...|Quantum Computing|Singapore|    136678|Linear Algebra, Q...|          Yes|       Small|  2025-12-31|
|   342|Quantum Software ...|Quantum Computing|   London|     95148|Linear Algebra, Q...|          Yes|      Medium|  2025-12-31|
|  1496|Quantum Software ...|Quantum Computing|    Dubai|     97096|Qiskit, Quantum A...|          Yes|       Large|  2025-12-31|
|     3|  Quantum Researcher|Quantum Computing|    Tokyo|    182081|Linear Algebra, Q...| 

In [28]:
df_jobs_demand = df_jobs_demand.withColumnRenamed(
    'salary_usd', 'annual_salary'
)

In [30]:
spark.sql("""
    SELECT *, salary_usd AS annual_salary
    FROM jobs_demand
""").createOrReplaceTempView('jobs_demand')