In [None]:
from pyspark.sql.functions import col, when, year, expr, first

Most popular category in each country:

In [None]:
# Most popular category per country.
#
# Rows of the pin/geo join are counted per (country, category); then, for
# each country, the category with the largest count is kept together with
# that count.
#
# `max_by` ties the reported category to the winning count.  Pairing
# `first('category')` with a separate max aggregate would return an
# arbitrary category of the group, not necessarily the one that owns the
# maximum.  Both aggregates go through `expr`, so the block depends only on
# names already imported from pyspark.sql.functions.
# NOTE: `max_by` is a Spark SQL built-in available from Spark 3.0.
by_country = (
    df_pin.join(df_geo, 'ind', 'inner')
    .groupBy('country', 'category')
    .count()
    .groupBy('country')
    .agg(
        # Category on the row where `count` is maximal within the country.
        expr("max_by(category, `count`)").alias('category'),
        expr("max(`count`)").alias('category_count'),
    )
)

Most popular category by year (2018-2022):

In [None]:
# Most popular category per posting year, restricted to 2018-2022.
#
# The year is extracted from `timestamp`, rows outside 2018..2022 (inclusive)
# are dropped, rows are counted per (post_year, category), and for every
# year the category with the largest count is kept.
#
# `max_by` guarantees that the reported category is the one that owns the
# maximum count; `first('category')` beside a separate max aggregate would
# yield an arbitrary category of the group.  Using `expr` for both
# aggregates keeps the block on names already imported in this file.
# NOTE: `max_by` is a Spark SQL built-in available from Spark 3.0.
by_year = (
    df_pin.join(df_geo, 'ind', 'inner')
    .withColumn('post_year', year(col('timestamp')))
    .filter(col('post_year').between(2018, 2022))
    .groupBy('post_year', 'category')
    .count()
    .groupBy('post_year')
    .agg(
        # Category on the row where `count` is maximal within the year.
        expr("max_by(category, `count`)").alias('category'),
        expr("max(`count`)").alias('category_count'),
    )
    .orderBy('post_year', ascending=True)
)

User with the most followers in each country:

In [None]:

# Most-followed poster in each country.
#
# One output row per country with columns country, poster_name,
# follower_count, sorted by country.  Grouping by (country, poster_name)
# would instead list every poster of every country; grouping by country
# alone and using `max_by` keeps only the poster that owns the largest
# follower_count.
# NOTE: `max_by` is a Spark SQL built-in available from Spark 3.0.
# NOTE(review): assumes follower_count is numeric at this point - confirm
# against the upstream cleaning step.
most_followers = (
    df_pin.join(df_geo, 'ind', 'inner')
    .groupBy('country')
    .agg(
        # Poster on the row where follower_count is maximal in the country.
        expr("max_by(poster_name, follower_count)").alias('poster_name'),
        expr("max(follower_count)").alias('follower_count'),
    )
    .orderBy('country', ascending=True)
)


Country with the most followed user:


In [None]:
# Single-row result: the country and follower_count taken from the row of
# `most_followers` that has the largest follower_count.
country_most_followers = (
    most_followers
    .select('country', 'follower_count')
    .sort(col('follower_count').desc())
    .limit(1)
)


Most popular category by age group:


In [None]:
# Most popular category per age group.
#
# Every row of the pin/user join is labelled with an age bucket, rows are
# counted per (age_group, category), and for each bucket the category with
# the largest count is kept, sorted by age_group.
#
# `max_by` ties the reported category to the winning count;
# `first('category')` beside a separate max aggregate would return an
# arbitrary category of the group.  Both aggregates use `expr`, so only
# names already imported in this file are needed.
# NOTE: `max_by` is a Spark SQL built-in available from Spark 3.0.
pop_age_group = (
    df_pin.join(df_user, 'ind', 'inner')
    .withColumn(
        "age_group",
        when((col("age") >= 18) & (col("age") < 25), "18-24")
        .when((col("age") > 24) & (col("age") < 36), "25-35")
        .when((col("age") > 35) & (col("age") < 50), "36-49")
        # NOTE(review): anything not matched above lands here, which also
        # covers ages below 18 and NULL ages - presumably the data holds
        # adults only; confirm upstream.
        .otherwise("50+"),
    )
    .groupBy('age_group', 'category')
    .count()
    .groupBy('age_group')
    .agg(
        # Category on the row where `count` is maximal within the bucket.
        expr("max_by(category, `count`)").alias('category'),
        expr("max(`count`)").alias('category_count'),
    )
    .orderBy('age_group')
)

Median follower count by age group:

In [None]:
# Approximate median follower_count per age group.
#
# Rows of the pin/user join are labelled with an age bucket; the median is
# taken as the 0.5 percentile via Spark SQL `percentile_approx`, and the
# result is sorted by age_group.
fol_age_group = (
    df_pin
    .join(df_user, on='ind', how='inner')
    .withColumn(
        'age_group',
        when((col('age') >= 18) & (col('age') < 25), "18-24")
        .when((col('age') > 24) & (col('age') < 36), "25-35")
        .when((col('age') > 35) & (col('age') < 50), "36-49")
        # Every remaining row is labelled "50+".
        .otherwise("50+"),
    )
    .groupBy('age_group')
    .agg(
        expr("percentile_approx(follower_count, 0.5)").alias('median_follower_count')
    )
    .sort(col('age_group').asc())
)

How many users joined in 2015-2020:


In [None]:
# Number of users per join year, for join years 2015 through 2020 inclusive.
# Output columns: year_joined, number_users_joined.
join_year = (
    df_user
    .withColumn('year_joined', year('date_joined'))
    # Inclusive range check on both ends.
    .where((col('year_joined') >= 2015) & (col('year_joined') <= 2020))
    .groupBy('year_joined')
    .count()
    .withColumnRenamed('count', 'number_users_joined')
)


Median follower count by join year:


In [None]:
# Approximate median follower_count per join year.
#
# The median is the 0.5 percentile from Spark SQL `percentile_approx`.
# No year filter is applied here: every join year present in the user/pin
# join gets a row.
fol_by_year = (
    df_user
    .join(df_pin, on='ind', how='inner')
    .withColumn('year_joined', year('date_joined'))
    .groupBy('year_joined')
    .agg(
        expr("percentile_approx(follower_count, 0.5)").alias('median_follower_count')
    )
)


Median follower count by age group and year joined:

In [None]:
# Approximate median follower_count per (age group, join year).
#
# Rows of the user/pin join get an age bucket and the year extracted from
# date_joined; the median is the 0.5 percentile from Spark SQL
# `percentile_approx`.  The result is sorted by age_group, then year_joined.
fol_age_year = (
    df_user
    .join(df_pin, on='ind', how='inner')
    .withColumn(
        'age_group',
        when((col('age') >= 18) & (col('age') < 25), "18-24")
        .when((col('age') > 24) & (col('age') < 36), "25-35")
        .when((col('age') > 35) & (col('age') < 50), "36-49")
        # Every remaining row is labelled "50+".
        .otherwise("50+"),
    )
    .withColumn('year_joined', year('date_joined'))
    .groupBy('age_group', 'year_joined')
    .agg(
        expr("percentile_approx(follower_count, 0.5)").alias('median_follower_count')
    )
    .sort('age_group', 'year_joined')
)