https://www.kaggle.com/datasets/ravindrasinghrana/job-description-dataset

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import Window
from pyspark.sql.types import StringType, IntegerType, FloatType, DateType
import seaborn as sns
import matplotlib as plt
import altair as alt
import plotly.express as px

In [2]:
spark = (
    SparkSession.builder
    .appName("JobDescription")
    .config('spark.executor.memory', '16g')
    .config('spark.driver.memory', '16g')
    .config('spark.default.parallelism', '24')
    .config('spark.sql.shuffle.partitions', '24')
    .config('spark.dynamicAllocation.enabled', 'true')
    .config('spark.shuffle.service.enabled', 'true')
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .getOrCreate())

In [3]:
df_path = r"F:\Datasets\CSV datasets\job_descriptions.csv"

In [4]:
df = spark.read.csv(df_path, inferSchema=True, header=True)

In [8]:
df = df.repartition(100, "Country")

df.cache()

df.groupBy(
    'Country', 'Country', 'Work Type'
).agg(
    F.sum('Company Size').alias('Company Size'),
    F.mean('Company Size').alias('Mean Company Size'),
    F.count('Company Size').alias('Count Company Size'),
    F.countDistinct('Company Size').alias('Distinct Company Size'),
    F.min('Company Size').alias('Min Company Size'),
    F.max('Company Size').alias('Max Company Size'),
    F.stddev('Company Size').alias('Stddev Company Size'),
    F.variance('Company Size').alias('Variance Company Size'),
).show(50, truncate=False)

+--------+--------+---------+------------+-----------------+------------------+---------------------+----------------+----------------+-------------------+---------------------+
|Country |Country |Work Type|Company Size|Mean Company Size|Count Company Size|Distinct Company Size|Min Company Size|Max Company Size|Stddev Company Size|Variance Company Size|
+--------+--------+---------+------------+-----------------+------------------+---------------------+----------------+----------------+-------------------+---------------------+
|Panama  |Panama  |Temporary|106845218   |73992.53324099723|1444              |1433                 |12646           |134770          |35704.54820972104  |1.274814762860294E9  |
|Panama  |Panama  |Contract |108832619   |73585.27315753888|1479              |1471                 |12802           |134832          |34800.12419859903  |1.2110486442379177E9 |
|Panama  |Panama  |Full-Time|107396580   |74997.61173184357|1432              |1425                 |12711    