In [1]:
from pyspark.sql import (
    functions as f,
    Row,
    SparkSession,
    types as t
)

spark = SparkSession.builder.appName("df_most_interviewed").getOrCreate()

In [2]:
csv_file_path = "file:///home/jovyan/work/like.csv"

table_schema = t.StructType([
    t.StructField("interviewer_id", t.StringType(), False),
    t.StructField("occupation_id", t.StringType(), False),
    t.StructField("rating", t.IntegerType(), False)])

df = spark.read.schema(table_schema).csv(csv_file_path)
df.show()

+--------------+-------------+------+
|interviewer_id|occupation_id|rating|
+--------------+-------------+------+
|         11657|         1100|     8|
|         13727|         2030|     2|
|         59892|         3801|     1|
|          6538|         3021|     6|
|         95811|         2030|     9|
|         54500|         1100|    10|
|         69741|         2030|     3|
|         51166|         2030|    10|
|         70009|         9382|     5|
|         63152|         2030|     6|
|         70758|         1100|     2|
|         35580|         2030|     5|
|         63199|         1100|    10|
|         33078|         2030|     3|
|         97480|         9382|     2|
|         47223|         1100|     8|
|         80308|         3021|     8|
|         26691|         1100|     3|
|         17194|         3021|     3|
|         96584|         2030|     4|
+--------------+-------------+------+
only showing top 20 rows



In [3]:
interviewer_count = df.groupBy("occupation_id").count().orderBy(f.desc("count"))
interviewer_count.show()

+-------------+-----+
|occupation_id|count|
+-------------+-----+
|         1100|  217|
|         3801|  203|
|         2030|  200|
|         3021|  191|
|         9382|  189|
+-------------+-----+



In [4]:
meta = {
    "1100": "engineer",
    "2030": "developer",
    "3801": "painder",
    "3021": "chemistry teacher",
    "9382": "priest"
}

In [10]:
%%time
occupation_dict = spark.sparkContext.broadcast(meta)

def get_occupation_name(occupation_id: str) -> str:
    return occupation_dict.value[occupation_id]

occupation_lookup_udf = f.udf(get_occupation_name)
occupation_with_name = interviewer_count.withColumn("occupation_name", occupation_lookup_udf(f.col("occupation_id")))
occupation_with_name.show()

+-------------+-----+-----------------+
|occupation_id|count|  occupation_name|
+-------------+-----+-----------------+
|         1100|  217|         engineer|
|         3801|  203|          painder|
|         2030|  200|        developer|
|         3021|  191|chemistry teacher|
|         9382|  189|           priest|
+-------------+-----+-----------------+

CPU times: user 6.18 ms, sys: 5.18 ms, total: 11.4 ms
Wall time: 221 ms
