In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import IntegerType

In [0]:
# Reader settings shared by all three topic loads.
file_type = "json"
infer_schema = "true"

# Mounted locations of the pin, geo and user topic files.
pin_file_location = "/mnt/pinterest-data/topics/12a3410ba3cf.pin/partition=0//*.json"
geo_file_location = "/mnt/pinterest-data/topics/12a3410ba3cf.geo/partition=0//*.json"
user_file_location = "/mnt/pinterest-data/topics/12a3410ba3cf.user/partition=0//*.json"


def _load_topic(location):
    """Read the files matching `location` with the shared format and inferSchema option."""
    reader = spark.read.format(file_type).option("inferSchema", infer_schema)
    return reader.load(location)


# One raw DataFrame per topic; cleaning happens in the cells below.
df_pin = _load_topic(pin_file_location)
df_geo = _load_topic(geo_file_location)
df_user = _load_topic(user_file_location)

In [0]:
# Clean the raw pin records and project them into the final column layout.
#
# DataFrame transformations return a NEW DataFrame, so every step must be part
# of the assigned chain; a bare `df.withColumn(...)` statement has no effect.
#
# Notes on what changed compared with the earlier version of this cell:
#   * The `category` / `description` rewrites mapped null -> null and every
#     other value to itself, i.e. they were identity transformations (and the
#     first one was overwritten by the second). They are omitted here.
#     TODO: map the real "no data" placeholder strings to null once known.
#   * The second `follower_count` pass only re-cast an already-integer column
#     and was therefore redundant.
#   * The `downloaded` and `ind` casts are now actually applied.

# Final column order of the cleaned pin table.
desired_column_order = ["ind", "unique_id", "title", "description", "follower_count", "poster_name", "tag_list", "is_image_or_video", "image_src", "save_location", "category"]

# Expand the "k" / "M" suffixes into plain integers; anything that cannot be
# cast to an integer becomes null.
follower_count_as_int = (
    when(col("follower_count").endswith("k"),
         regexp_replace(col("follower_count"), "k", "").cast("int") * 1000)
    .when(col("follower_count").endswith("M"),
          regexp_replace(col("follower_count"), "M", "").cast("int") * 1000000)
    .otherwise(col("follower_count").cast("int"))
)

df_pin_cleaned = (
    df_pin
    .withColumn("follower_count", follower_count_as_int)
    # `downloaded` is cast for consistency, although it is not part of
    # `desired_column_order` and is dropped by the final select.
    .withColumn("downloaded", col("downloaded").cast("int"))
    # Keep only the path by stripping the leading "Local save in " prefix.
    .withColumn("save_location", regexp_replace(col("save_location"), "^Local save in ", ""))
    .withColumnRenamed("index", "ind")
    .withColumn("ind", col("ind").cast("int"))
    .select(desired_column_order)
)


In [0]:
# Build the cleaned geo table in a single projection:
#   * latitude and longitude are combined into one `coordinates` array column
#     (the separate source columns are not carried over),
#   * `timestamp` is parsed into a timestamp column,
#   * columns are emitted in the order ind, country, coordinates, timestamp.
df_geo_cleaned = df_geo.select(
    "ind",
    "country",
    array(col("latitude"), col("longitude")).alias("coordinates"),
    to_timestamp(col("timestamp")).alias("timestamp"),
)

In [0]:
# Build the cleaned user table in a single projection:
#   * first and last name are joined with a space into `user_name`
#     (the separate source columns are not carried over),
#   * `date_joined` is parsed with the explicit pattern below,
#   * columns are emitted in the order ind, user_name, age, date_joined.
full_name = concat_ws(" ", col("first_name"), col("last_name"))
joined_at = to_timestamp(col("date_joined"), "yyyy-MM-dd'T'HH:mm:ss")

df_user_cleaned = df_user.select(
    "ind",
    full_name.alias("user_name"),
    "age",
    joined_at.alias("date_joined"),
)

In [0]:
# Find the most popular Pinterest category people post to based on their country
#
# The cleaned DataFrames are registered as *global* temporary views. Global
# temp views live in the reserved `global_temp` database, so the query must
# reference them with that qualifier; unqualified names are not resolved.
df_pin_cleaned.createOrReplaceGlobalTempView("pin_df")
df_geo_cleaned.createOrReplaceGlobalTempView("geo_df")
df_user_cleaned.createOrReplaceGlobalTempView("user_df")

# Step 1 counts posts per (country, category).
# Step 2 ranks the categories inside each country by that count.
# Step 3 keeps the top-ranked category; RANK() keeps every category that ties
# for first place, so a country can appear more than once.
spark.sql("""
WITH category_counts AS (
   SELECT
      g.country,
      p.category,
      COUNT(*) AS category_count
   FROM
      global_temp.geo_df g
   JOIN
      global_temp.pin_df p
   ON
      g.ind = p.ind
   GROUP BY
      g.country,
      p.category
),
ranked_categories AS (
   SELECT
      country,
      category,
      category_count,
      RANK() OVER (PARTITION BY country ORDER BY category_count DESC) AS category_rank
   FROM
      category_counts
)
SELECT
   country,
   category,
   category_count
FROM
   ranked_categories
WHERE
   category_rank = 1
""")