In [1]:
import os
import sys
import math

import subprocess
import altair as alt
import pandas as pd

import plotly.express as px
import plotly.graph_objects as go

import pyspark.sql.functions as F
from prompt_toolkit.styles.style import default_priority
from pyspark.sql import SparkSession
from pyspark.sql import Window

In [2]:
df_path = '/Users/zygimantas/Documents/DataSets/future_jobs_dataset.csv'

In [3]:
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

In [4]:
spark = (
    SparkSession.builder
    .appName('Airport Traffic')
    .master('local[4]')
    .config('spark.executor.memory', '2g')
    .config('spark.executor.cores', '2')
    .config("spark.dynamicAllocation.enabled", "true")
    .config("spark.dynamicAllocation.minExecutors", "1")
    .config("spark.dynamicAllocation.maxExecutors", "4")
    .config('spark.executor.memoryOverhead', '512m')
    .config("spark.driver.memory", "2g")
    .config("spark.driver.maxResultSize", "2g")
    .config('spark.sql.adaptive.enabled', 'true')
    .config('spark.sql.adaptive.coalescePartitions.enabled', 'true')
    .config('spark.sql.adaptive.advisoryPartitionSizeInBytes', '64mb')
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .config('spark.dynamicAllocation.executorIdleTimeout', '60s')
    .config('spark.sql.autoBroadcastJoinThreshold', '512mb')
    .getOrCreate()
)

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/12/19 13:12:00 WARN Utils: Your hostname, Zygimantass-MacBook-Air.local, resolves to a loopback address: 127.0.0.1; using 10.43.73.162 instead (on interface en0)
25/12/19 13:12:00 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/12/19 13:12:01 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/12/19 13:12:02 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [5]:
df_jobs_demand = spark.read.csv(df_path, header=True, inferSchema=True)

In [6]:
df_jobs_demand.show(truncate=False, n=10)

+------+--------------------------+-----------------+---------+----------+--------------------------------------+-------------+------------+------------+
|job_id|job_title                 |industry         |location |salary_usd|skills_required                       |remote_option|company_size|posting_date|
+------+--------------------------+-----------------+---------+----------+--------------------------------------+-------------+------------+------------+
|1     |Quantum Researcher        |Quantum Computing|Singapore|175780    |Linear Algebra, Quantum Algorithms    |No           |Large       |2025-07-22  |
|2     |Renewable Energy Engineer |Green Tech       |Singapore|137481    |Climate Data Analysis, Energy Modeling|Yes          |Large       |2025-09-26  |
|3     |Quantum Researcher        |Quantum Computing|Tokyo    |182081    |Linear Algebra, Qiskit                |No           |Medium      |2025-12-31  |
|4     |Sustainability Analyst    |Green Tech       |Singapore|113822    |Cl

In [7]:
df_jobs_demand.printSchema()

root
 |-- job_id: integer (nullable = true)
 |-- job_title: string (nullable = true)
 |-- industry: string (nullable = true)
 |-- location: string (nullable = true)
 |-- salary_usd: integer (nullable = true)
 |-- skills_required: string (nullable = true)
 |-- remote_option: string (nullable = true)
 |-- company_size: string (nullable = true)
 |-- posting_date: date (nullable = true)



In [8]:
print(df_jobs_demand.show(truncate=False, n=10))

+------+--------------------------+-----------------+---------+----------+--------------------------------------+-------------+------------+------------+
|job_id|job_title                 |industry         |location |salary_usd|skills_required                       |remote_option|company_size|posting_date|
+------+--------------------------+-----------------+---------+----------+--------------------------------------+-------------+------------+------------+
|1     |Quantum Researcher        |Quantum Computing|Singapore|175780    |Linear Algebra, Quantum Algorithms    |No           |Large       |2025-07-22  |
|2     |Renewable Energy Engineer |Green Tech       |Singapore|137481    |Climate Data Analysis, Energy Modeling|Yes          |Large       |2025-09-26  |
|3     |Quantum Researcher        |Quantum Computing|Tokyo    |182081    |Linear Algebra, Qiskit                |No           |Medium      |2025-12-31  |
|4     |Sustainability Analyst    |Green Tech       |Singapore|113822    |Cl

In [9]:
df_jobs_demand.select(
    'job_title', 'industry', 'salary_usd'
).show(10)

+--------------------+-----------------+----------+
|           job_title|         industry|salary_usd|
+--------------------+-----------------+----------+
|  Quantum Researcher|Quantum Computing|    175780|
|Renewable Energy ...|       Green Tech|    137481|
|  Quantum Researcher|Quantum Computing|    182081|
|Sustainability An...|       Green Tech|    113822|
|Smart Contract En...|       Blockchain|     92575|
|Smart Contract En...|       Blockchain|    173379|
|Renewable Energy ...|       Green Tech|     99659|
|Quantum Software ...|Quantum Computing|    210842|
|Sustainability An...|       Green Tech|    189475|
|Blockchain Developer|       Blockchain|    228992|
+--------------------+-----------------+----------+
only showing top 10 rows


In [10]:
df_jobs_demand.filter(
    F.col('salary_usd') > 150_000
).show(5)

+------+--------------------+-----------------+---------+----------+--------------------+-------------+------------+------------+
|job_id|           job_title|         industry| location|salary_usd|     skills_required|remote_option|company_size|posting_date|
+------+--------------------+-----------------+---------+----------+--------------------+-------------+------------+------------+
|     1|  Quantum Researcher|Quantum Computing|Singapore|    175780|Linear Algebra, Q...|           No|       Large|  2025-07-22|
|     3|  Quantum Researcher|Quantum Computing|    Tokyo|    182081|Linear Algebra, Q...|           No|      Medium|  2025-12-31|
|     6|Smart Contract En...|       Blockchain|    Tokyo|    173379|      Solidity, Rust|          Yes|      Medium|  2025-08-10|
|     8|Quantum Software ...|Quantum Computing|   London|    210842|Qiskit, Quantum A...|          Yes|       Large|  2025-04-13|
|     9|Sustainability An...|       Green Tech| New York|    189475|Climate Data Anal...| 

In [11]:
df_jobs_demand.where(
    F.col('salary_usd') > 150_000
).count()

5018

In [12]:
df_jobs_demand.createOrReplaceTempView('jobs_demand')

In [13]:
spark.sql("""
    SELECT * FROM jobs_demand
        WHERE salary_usd > 150000
""").show(5)

+------+--------------------+-----------------+---------+----------+--------------------+-------------+------------+------------+
|job_id|           job_title|         industry| location|salary_usd|     skills_required|remote_option|company_size|posting_date|
+------+--------------------+-----------------+---------+----------+--------------------+-------------+------------+------------+
|     1|  Quantum Researcher|Quantum Computing|Singapore|    175780|Linear Algebra, Q...|           No|       Large|  2025-07-22|
|     3|  Quantum Researcher|Quantum Computing|    Tokyo|    182081|Linear Algebra, Q...|           No|      Medium|  2025-12-31|
|     6|Smart Contract En...|       Blockchain|    Tokyo|    173379|      Solidity, Rust|          Yes|      Medium|  2025-08-10|
|     8|Quantum Software ...|Quantum Computing|   London|    210842|Qiskit, Quantum A...|          Yes|       Large|  2025-04-13|
|     9|Sustainability An...|       Green Tech| New York|    189475|Climate Data Anal...| 

In [14]:
df_jobs_demand.select(
    F.col('industry')
).distinct().show()

+-----------------+
|         industry|
+-----------------+
|Quantum Computing|
|               AI|
|       Green Tech|
|       Blockchain|
+-----------------+



In [15]:
spark.sql("""
    SELECT distinct(*) FROM jobs_demand
""").show()

+------+--------------------+-----------------+---------+----------+--------------------+-------------+------------+------------+
|job_id|           job_title|         industry| location|salary_usd|     skills_required|remote_option|company_size|posting_date|
+------+--------------------+-----------------+---------+----------+--------------------+-------------+------------+------------+
|   320|Sustainability An...|       Green Tech| New York|    102711|Climate Data Anal...|           No|       Large|  2025-08-08|
|   497|  Quantum Researcher|Quantum Computing|    Tokyo|    218314|Qiskit, Linear Al...|           No|       Small|  2025-03-24|
|   552|Renewable Energy ...|       Green Tech|    Dubai|    164148|Energy Modeling, ...|          Yes|       Small|  2025-08-29|
|   804|Smart Contract En...|       Blockchain| New York|    222966|  Solidity, Ethereum|          Yes|      Medium|  2025-05-17|
|  1090|Renewable Energy ...|       Green Tech|   London|    114249|Climate Data Anal...| 

In [16]:
spark.sql("""
    select count(*) from jobs_demand
""").show()

+--------+
|count(1)|
+--------+
|   10000|
+--------+



In [17]:
df_jobs_demand.count()

10000

In [18]:
df_jobs_demand.printSchema()

root
 |-- job_id: integer (nullable = true)
 |-- job_title: string (nullable = true)
 |-- industry: string (nullable = true)
 |-- location: string (nullable = true)
 |-- salary_usd: integer (nullable = true)
 |-- skills_required: string (nullable = true)
 |-- remote_option: string (nullable = true)
 |-- company_size: string (nullable = true)
 |-- posting_date: date (nullable = true)



In [19]:
df_jobs_demand.show(5)

+------+--------------------+-----------------+---------+----------+--------------------+-------------+------------+------------+
|job_id|           job_title|         industry| location|salary_usd|     skills_required|remote_option|company_size|posting_date|
+------+--------------------+-----------------+---------+----------+--------------------+-------------+------------+------------+
|     1|  Quantum Researcher|Quantum Computing|Singapore|    175780|Linear Algebra, Q...|           No|       Large|  2025-07-22|
|     2|Renewable Energy ...|       Green Tech|Singapore|    137481|Climate Data Anal...|          Yes|       Large|  2025-09-26|
|     3|  Quantum Researcher|Quantum Computing|    Tokyo|    182081|Linear Algebra, Q...|           No|      Medium|  2025-12-31|
|     4|Sustainability An...|       Green Tech|Singapore|    113822|Climate Data Anal...|           No|       Large|  2025-05-29|
|     5|Smart Contract En...|       Blockchain|   London|     92575|      Rust, Solidity| 

In [20]:
df_jobs_demand.filter(
    (F.col('remote_option') == 'Yes') & (F.col('location') == 'Singapore')
).show(5)

+------+--------------------+----------+---------+----------+--------------------+-------------+------------+------------+
|job_id|           job_title|  industry| location|salary_usd|     skills_required|remote_option|company_size|posting_date|
+------+--------------------+----------+---------+----------+--------------------+-------------+------------+------------+
|     2|Renewable Energy ...|Green Tech|Singapore|    137481|Climate Data Anal...|          Yes|       Large|  2025-09-26|
|    12|         AI Engineer|        AI|Singapore|    136880| TensorFlow, PyTorch|          Yes|       Small|  2025-03-15|
|    13|      Data Scientist|        AI|Singapore|     50387|  Python, TensorFlow|          Yes|      Medium|  2025-03-02|
|    18|      Data Scientist|        AI|Singapore|    190743|  TensorFlow, Python|          Yes|       Small|  2025-10-07|
|    47|Renewable Energy ...|Green Tech|Singapore|    188452|Energy Modeling, ...|          Yes|       Small|  2025-01-19|
+------+--------

In [21]:
spark.sql("""
    select * from jobs_demand
        where remote_option = 'Yes' and location = 'Singapore'
""").show(5)

+------+--------------------+----------+---------+----------+--------------------+-------------+------------+------------+
|job_id|           job_title|  industry| location|salary_usd|     skills_required|remote_option|company_size|posting_date|
+------+--------------------+----------+---------+----------+--------------------+-------------+------------+------------+
|     2|Renewable Energy ...|Green Tech|Singapore|    137481|Climate Data Anal...|          Yes|       Large|  2025-09-26|
|    12|         AI Engineer|        AI|Singapore|    136880| TensorFlow, PyTorch|          Yes|       Small|  2025-03-15|
|    13|      Data Scientist|        AI|Singapore|     50387|  Python, TensorFlow|          Yes|      Medium|  2025-03-02|
|    18|      Data Scientist|        AI|Singapore|    190743|  TensorFlow, Python|          Yes|       Small|  2025-10-07|
|    47|Renewable Energy ...|Green Tech|Singapore|    188452|Energy Modeling, ...|          Yes|       Small|  2025-01-19|
+------+--------

In [22]:
df_jobs_demand.orderBy(
    F.col('posting_date').desc()
).show(5)

+------+--------------------+-----------------+---------+----------+--------------------+-------------+------------+------------+
|job_id|           job_title|         industry| location|salary_usd|     skills_required|remote_option|company_size|posting_date|
+------+--------------------+-----------------+---------+----------+--------------------+-------------+------------+------------+
|  1989|Renewable Energy ...|       Green Tech|   London|    216801|Climate Data Anal...|           No|      Medium|  2025-12-31|
|  1204|Quantum Software ...|Quantum Computing|Singapore|    136678|Linear Algebra, Q...|          Yes|       Small|  2025-12-31|
|   342|Quantum Software ...|Quantum Computing|   London|     95148|Linear Algebra, Q...|          Yes|      Medium|  2025-12-31|
|  1496|Quantum Software ...|Quantum Computing|    Dubai|     97096|Qiskit, Quantum A...|          Yes|       Large|  2025-12-31|
|     3|  Quantum Researcher|Quantum Computing|    Tokyo|    182081|Linear Algebra, Q...| 

In [23]:
df_jobs_demand.orderBy('posting_date', ascending=False).show(5)

+------+--------------------+-----------------+---------+----------+--------------------+-------------+------------+------------+
|job_id|           job_title|         industry| location|salary_usd|     skills_required|remote_option|company_size|posting_date|
+------+--------------------+-----------------+---------+----------+--------------------+-------------+------------+------------+
|  1989|Renewable Energy ...|       Green Tech|   London|    216801|Climate Data Anal...|           No|      Medium|  2025-12-31|
|  1204|Quantum Software ...|Quantum Computing|Singapore|    136678|Linear Algebra, Q...|          Yes|       Small|  2025-12-31|
|   342|Quantum Software ...|Quantum Computing|   London|     95148|Linear Algebra, Q...|          Yes|      Medium|  2025-12-31|
|  1496|Quantum Software ...|Quantum Computing|    Dubai|     97096|Qiskit, Quantum A...|          Yes|       Large|  2025-12-31|
|     3|  Quantum Researcher|Quantum Computing|    Tokyo|    182081|Linear Algebra, Q...| 

In [24]:
df_jobs_demand.orderBy('posting_date', ascending=True).show(5)

+------+--------------------+----------+--------+----------+--------------------+-------------+------------+------------+
|job_id|           job_title|  industry|location|salary_usd|     skills_required|remote_option|company_size|posting_date|
+------+--------------------+----------+--------+----------+--------------------+-------------+------------+------------+
|   797|      Data Scientist|        AI|   Dubai|    212432|     PyTorch, Python|           No|       Small|  2025-01-01|
|  2376|Sustainability An...|Green Tech|  London|    169718|Climate Data Anal...|          Yes|      Medium|  2025-01-01|
|  1208|Blockchain Developer|Blockchain|  Berlin|     67727|  Ethereum, Solidity|          Yes|      Medium|  2025-01-01|
|   621|Renewable Energy ...|Green Tech|New York|     63564|Climate Data Anal...|          Yes|       Large|  2025-01-01|
|  1314|       ML Researcher|        AI|  London|    127411|  TensorFlow, Python|           No|      Medium|  2025-01-01|
+------+----------------

In [25]:
spark.sql("""
    select * from jobs_demand
        order by posting_date desc
""").show(5)

+------+--------------------+-----------------+---------+----------+--------------------+-------------+------------+------------+
|job_id|           job_title|         industry| location|salary_usd|     skills_required|remote_option|company_size|posting_date|
+------+--------------------+-----------------+---------+----------+--------------------+-------------+------------+------------+
|  1989|Renewable Energy ...|       Green Tech|   London|    216801|Climate Data Anal...|           No|      Medium|  2025-12-31|
|  1204|Quantum Software ...|Quantum Computing|Singapore|    136678|Linear Algebra, Q...|          Yes|       Small|  2025-12-31|
|   342|Quantum Software ...|Quantum Computing|   London|     95148|Linear Algebra, Q...|          Yes|      Medium|  2025-12-31|
|  1496|Quantum Software ...|Quantum Computing|    Dubai|     97096|Qiskit, Quantum A...|          Yes|       Large|  2025-12-31|
|     3|  Quantum Researcher|Quantum Computing|    Tokyo|    182081|Linear Algebra, Q...| 

In [26]:
df_jobs_demand = df_jobs_demand.withColumnRenamed(
    'salary_usd', 'annual_salary'
)

In [27]:
spark.sql("""
    SELECT *, salary_usd AS annual_salary
    FROM jobs_demand
""").createOrReplaceTempView('jobs_demand')

In [28]:
df_jobs_demand.filter(
    F.col('job_title') == "Quantum Researcher"
).show(5)

+------+------------------+-----------------+---------+-------------+--------------------+-------------+------------+------------+
|job_id|         job_title|         industry| location|annual_salary|     skills_required|remote_option|company_size|posting_date|
+------+------------------+-----------------+---------+-------------+--------------------+-------------+------------+------------+
|     1|Quantum Researcher|Quantum Computing|Singapore|       175780|Linear Algebra, Q...|           No|       Large|  2025-07-22|
|     3|Quantum Researcher|Quantum Computing|    Tokyo|       182081|Linear Algebra, Q...|           No|      Medium|  2025-12-31|
|    11|Quantum Researcher|Quantum Computing| New York|       122847|Quantum Algorithm...|          Yes|       Small|  2025-02-09|
|    14|Quantum Researcher|Quantum Computing|    Dubai|       194786|Linear Algebra, Q...|           No|       Large|  2025-05-02|
|    22|Quantum Researcher|Quantum Computing| New York|       179393|Quantum Algori

In [29]:
df_jobs_demand.select(
    'job_title', 'annual_salary'
).orderBy(
    F.col('annual_salary').desc()
).show(5, truncate=False)

+-----------------------+-------------+
|job_title              |annual_salary|
+-----------------------+-------------+
|Smart Contract Engineer|249990       |
|Smart Contract Engineer|249985       |
|Smart Contract Engineer|249984       |
|Data Scientist         |249952       |
|ML Researcher          |249893       |
+-----------------------+-------------+
only showing top 5 rows


In [30]:
spark.sql("""
    select * from jobs_demand
        order by annual_salary desc
""").show(5)

+------+--------------------+----------+---------+----------+------------------+-------------+------------+------------+-------------+
|job_id|           job_title|  industry| location|salary_usd|   skills_required|remote_option|company_size|posting_date|annual_salary|
+------+--------------------+----------+---------+----------+------------------+-------------+------------+------------+-------------+
|  8633|Smart Contract En...|Blockchain| New York|    249990|Ethereum, Solidity|           No|       Large|  2025-12-06|       249990|
|  2758|Smart Contract En...|Blockchain|Singapore|    249985|    Ethereum, Rust|          Yes|       Small|  2025-02-20|       249985|
|  1648|Smart Contract En...|Blockchain|    Tokyo|    249984|Ethereum, Solidity|          Yes|       Small|  2025-03-09|       249984|
|    79|      Data Scientist|        AI|    Dubai|    249952|   PyTorch, Python|          Yes|      Medium|  2025-07-03|       249952|
|  4405|       ML Researcher|        AI|   Berlin|    2

In [31]:
df_jobs_demand.groupBy(
    'industry'
).agg(
    F.sum('annual_salary').alias('total_salary')
).show()

+-----------------+------------+
|         industry|total_salary|
+-----------------+------------+
|Quantum Computing|   380698224|
|               AI|   375475544|
|       Green Tech|   371831451|
|       Blockchain|   373310029|
+-----------------+------------+



In [32]:
spark.sql("""
    select industry, sum(annual_salary) as total_salary from jobs_demand
        group by industry
            order by total_salary desc
""").show()

+-----------------+------------+
|         industry|total_salary|
+-----------------+------------+
|Quantum Computing|   380698224|
|               AI|   375475544|
|       Blockchain|   373310029|
|       Green Tech|   371831451|
+-----------------+------------+



In [33]:
df_jobs_demand.groupBy(
    'company_size'
).count().show()

+------------+-----+
|company_size|count|
+------------+-----+
|      Medium| 3328|
|       Small| 3287|
|       Large| 3385|
+------------+-----+



In [34]:
spark.sql("""
    select company_size, count(*) as total_size
    from jobs_demand
        group by  company_size
""").show()

+------------+----------+
|company_size|total_size|
+------------+----------+
|      Medium|      3328|
|       Small|      3287|
|       Large|      3385|
+------------+----------+



In [35]:
df_jobs_demand = df_jobs_demand.withColumns({
    'monthly_salary': F.round((F.col('annual_salary') / 12), 2),
})


In [36]:
spark.sql("""
    select *, round((annual_salary / 12), 2) as monthly_salary from jobs_demand
""").createOrReplaceTempView('jobs_demand')

In [37]:
df_jobs_demand.filter(
    F.col('location') == 'London'
).groupBy(
    'location'
).agg(
    F.min('monthly_salary').alias('min_salary'),
    F.max('monthly_salary').alias('max_salary')
).show()

+--------+----------+----------+
|location|min_salary|max_salary|
+--------+----------+----------+
|  London|    4181.0|  20823.75|
+--------+----------+----------+



In [38]:
spark.sql("""
    select
        location,
        round((min(monthly_salary)), 2) as min_salary,
        round((max(monthly_salary)), 2) as max_salary
    from
        jobs_demand
    where
        location = 'London'
    group by
        location

""").show(5)

+--------+----------+----------+
|location|min_salary|max_salary|
+--------+----------+----------+
|  London|    4181.0|  20823.75|
+--------+----------+----------+



In [39]:
df_jobs_demand.filter(
    F.col('skills_required').contains('Python')
).count()

1657

In [40]:
spark.sql("""
    SELECT
        *
    FROM
        jobs_demand
    WHERE
        INSTR(skills_required, 'Python') > 0
""").count()

1657

In [41]:
percentiles = df_jobs_demand.select(
    F.percentile_approx('annual_salary', 0.33).alias('p33'),
    F.percentile_approx('annual_salary', 0.66).alias('p66')
).collect()[0]

low_threshold = percentiles['p33']
high_threshold = percentiles['p66']

df_jobs_demand.withColumn(
    'salary_category',
    F.when(F.col('annual_salary') > high_threshold, 'High')
    .when(F.col('annual_salary') > low_threshold, 'Medium')
    .otherwise('Low')
).show(5)

+------+--------------------+-----------------+---------+-------------+--------------------+-------------+------------+------------+--------------+---------------+
|job_id|           job_title|         industry| location|annual_salary|     skills_required|remote_option|company_size|posting_date|monthly_salary|salary_category|
+------+--------------------+-----------------+---------+-------------+--------------------+-------------+------------+------------+--------------+---------------+
|     1|  Quantum Researcher|Quantum Computing|Singapore|       175780|Linear Algebra, Q...|           No|       Large|  2025-07-22|      14648.33|         Medium|
|     2|Renewable Energy ...|       Green Tech|Singapore|       137481|Climate Data Anal...|          Yes|       Large|  2025-09-26|      11456.75|         Medium|
|     3|  Quantum Researcher|Quantum Computing|    Tokyo|       182081|Linear Algebra, Q...|           No|      Medium|  2025-12-31|      15173.42|           High|
|     4|Sustaina

In [42]:
df_jobs_demand.withColumn(
  'salary_category',
  F.ntile(3).over(Window.orderBy('annual_salary'))
).withColumn(
  'salary_category',
  F.when(F.col('salary_category') == 1, 'Low')
   .when(F.col('salary_category') == 2, 'Medium')
   .otherwise('High')
).show()

+------+--------------------+-----------------+---------+-------------+--------------------+-------------+------------+------------+--------------+---------------+
|job_id|           job_title|         industry| location|annual_salary|     skills_required|remote_option|company_size|posting_date|monthly_salary|salary_category|
+------+--------------------+-----------------+---------+-------------+--------------------+-------------+------------+------------+--------------+---------------+
|  3642|Quantum Software ...|Quantum Computing|    Dubai|        50013|Qiskit, Quantum A...|          Yes|       Small|  2025-05-08|       4167.75|            Low|
|  8042|Blockchain Developer|       Blockchain| New York|        50060|  Solidity, Ethereum|          Yes|       Small|  2025-03-18|       4171.67|            Low|
|  8938|       ML Researcher|               AI| New York|        50060| PyTorch, TensorFlow|           No|       Large|  2025-07-11|       4171.67|            Low|
|  2227|      Da

25/12/19 13:12:10 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/19 13:12:10 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/19 13:12:10 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/19 13:12:10 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/19 13:12:10 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


In [43]:
spark.sql("""
    SELECT *,
        CASE ntile(3) OVER (ORDER BY annual_salary)
            WHEN 1 THEN 'Low'
            WHEN 2 THEN 'Medium'
            ELSE 'High'
        END AS salary_category
    FROM jobs_demand
""").show()

+------+--------------------+-----------------+---------+----------+--------------------+-------------+------------+------------+-------------+--------------+---------------+
|job_id|           job_title|         industry| location|salary_usd|     skills_required|remote_option|company_size|posting_date|annual_salary|monthly_salary|salary_category|
+------+--------------------+-----------------+---------+----------+--------------------+-------------+------------+------------+-------------+--------------+---------------+
|  3642|Quantum Software ...|Quantum Computing|    Dubai|     50013|Qiskit, Quantum A...|          Yes|       Small|  2025-05-08|        50013|       4167.75|            Low|
|  8042|Blockchain Developer|       Blockchain| New York|     50060|  Solidity, Ethereum|          Yes|       Small|  2025-03-18|        50060|       4171.67|            Low|
|  8938|       ML Researcher|               AI| New York|     50060| PyTorch, TensorFlow|           No|       Large|  2025-07

25/12/19 13:12:11 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/19 13:12:11 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/19 13:12:11 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/19 13:12:11 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/19 13:12:11 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


In [44]:
df_jobs_demand = df_jobs_demand.withColumn(
    'month',
    F.month(F.col('posting_date'))
)

In [47]:
spark.sql("""
    select *, month(posting_date) from jobs_demand
""").createOrReplaceTempView('jobs_demand')

In [50]:
df_jobs_demand.show(5)

+------+--------------------+-----------------+---------+-------------+--------------------+-------------+------------+------------+--------------+-----+
|job_id|           job_title|         industry| location|annual_salary|     skills_required|remote_option|company_size|posting_date|monthly_salary|month|
+------+--------------------+-----------------+---------+-------------+--------------------+-------------+------------+------------+--------------+-----+
|     1|  Quantum Researcher|Quantum Computing|Singapore|       175780|Linear Algebra, Q...|           No|       Large|  2025-07-22|      14648.33|    7|
|     2|Renewable Energy ...|       Green Tech|Singapore|       137481|Climate Data Anal...|          Yes|       Large|  2025-09-26|      11456.75|    9|
|     3|  Quantum Researcher|Quantum Computing|    Tokyo|       182081|Linear Algebra, Q...|           No|      Medium|  2025-12-31|      15173.42|   12|
|     4|Sustainability An...|       Green Tech|Singapore|       113822|Clima

In [56]:
df_jobs_demand.groupBy(
    'remote_option'
).agg(
    F.round(F.avg('annual_salary'), 2).alias('avg_salary')
).show()

+-------------+----------+
|remote_option|avg_salary|
+-------------+----------+
|           No| 150469.45|
|          Yes| 149805.42|
+-------------+----------+



In [65]:
spark.sql("""
    SELECT
        remote_option,
        round(avg(annual_salary), 2) AS avg_salary
    FROM jobs_demand
        GROUP BY remote_option
        ORDER BY avg_salary DESC
""").show()

+-------------+----------+
|remote_option|avg_salary|
+-------------+----------+
|           No| 150469.45|
|          Yes| 149805.42|
+-------------+----------+

