In [0]:
%pyspark

# вывести в виде
# +--------------------------------+----------+
# |category_code                   |view_count|
# +--------------------------------+----------+
# |null                            |13236458  |
# |electronics.smartphone          |10619448  |
# |electronics.clocks              |1272783   |
# |computers.notebook              |1106406   |

from pyspark.sql.functions import col
from pyspark.sql.functions import count

# Count the rows of each category_code and list the categories from the most
# to the least frequent one.
# NOTE(review): the result column is named view_count, but every row is
# counted regardless of what kind of event it is - confirm that the table
# holds only views, or that counting all rows is what the task expects.
market_events = spark.table("user_id.market_events")
aggregeted_df = (
    market_events
    .groupBy("category_code")
    .agg(count("*").alias("view_count"))
    .orderBy(col("view_count").desc())
)

# truncate=False prints the complete category_code values, left-aligned, as in
# the expected table in the task comment; a plain show() would cut every string
# longer than 20 characters (e.g. "electronics.smartphone").
aggregeted_df.show(truncate=False)

In [1]:
%pyspark

# результат визуализировать через z.show()

from pyspark.sql.functions import col, round, countDistinct

# Source table with the market events.
market_events = spark.table("user_id.market_events")

# Number of distinct products per price bin for the rows dated 2019-10-10.
# The bin is the price rounded with scale -1 (to the tens); `round` is the
# pyspark.sql.functions version imported in this cell, not the Python builtin.
daily_10 = (
    market_events
    .filter(col("event_date") == "2019-10-10")
    .withColumn("bin", round("price", -1))
    .groupBy("bin")
    .agg(countDistinct("product_id").alias("cnt"))
    .orderBy("bin")
)

# Display the result with z.show(), as the task requires.
z.show(daily_10)


In [2]:
%pyspark

# вывести через df.show() в виде
# +--------+------+
# |is_apple| count|
# +--------+------+
# |    true| *****|
# |   false| *****|
# +--------+------+

from pyspark.sql.functions import col, expr, when, count

# Count the rows of the period, split into Apple-branded rows and all others.
# - The time filter is the half-open range [2019-10-01, 2019-11-01): BETWEEN is
#   inclusive on both ends and would also admit rows stamped exactly
#   2019-11-01 00:00:00.
# - is_apple holds the strings "true"/"false"; a row whose brand is missing
#   falls into otherwise() and is counted as "false".
# - Sorting is_apple in descending order puts "true" ahead of "false", so no
#   helper ordering column is required.
apple_events = (
    spark.table("user_id.market_events")
    .where(expr("event_time >= '2019-10-01' and event_time < '2019-11-01'"))
    .withColumn("is_apple", when(col("brand") == "apple", "true").otherwise("false"))
    .groupBy("is_apple")
    .agg(count("*").alias("count"))
    .orderBy(col("is_apple").desc())
)

apple_events.show()


In [3]:
%pyspark

# z.show(), ключ -- часы, значения -- число продаж и сумма прибыли за этот час

from pyspark.sql.functions import date_format, count, sum, expr, round, date_trunc, lit, col

# Per-hour row count and total price for the seven days starting 2019-10-07.
# NOTE(review): every row inside the window is counted and its price summed;
# the task speaks of sales, so confirm whether the rows need to be restricted
# to sale events first.
# - The window is the half-open range [2019-10-07, 2019-10-14): BETWEEN is
#   inclusive on both ends, so rows stamped exactly 2019-10-14 00:00:00 would
#   add a stray extra hour at the end of the chart.
# - hour_id is event_time truncated to the hour.
# - hourly_count_graph is hourly_count multiplied by 100; it exists purely to
#   make the count readable when it is plotted.
# - `sum` is the pyspark.sql.functions version imported in this cell.
hourly_sales = (
    spark.table("user_id.market_events")
    .where(expr("event_time >= '2019-10-07' and event_time < '2019-10-14'"))
    .withColumn("hour_id", date_trunc("hour", "event_time"))
    .groupBy("hour_id")
    .agg(count("*").alias("hourly_count"), sum("price").alias("hourly_sum"))
    .withColumn("hourly_count_graph", col("hourly_count") * lit(100))
    .orderBy("hour_id")
)

z.show(hourly_sales)

In [4]:
%pyspark

# z.show(), ключ -- час в диапазоне от 0 до 23, значение -- усредненное за месяц число продаж в этот час на месячных данных

from pyspark.sql.functions import expr, hour, count, date_trunc, avg, sum, col, lit

# Average row count and average price total for each hour of the day (0-23),
# computed over one month of data.
# Step 1: for every calendar hour (event_time truncated to the hour) count the
#         rows and sum the prices.
# Step 2: group those per-hour figures by hour of day and average them.
# - The window is the half-open range [2019-10-01, 2019-11-01): BETWEEN is
#   inclusive on both ends, so rows stamped exactly 2019-11-01 00:00:00 would
#   form one more, nearly empty, calendar-hour bucket that drags the average
#   for hour 0 down.
# - Calendar hours without any rows produce no bucket, so they do not enter
#   the average as zeros.
# - avg_count_daily_graph is avg_count_daily multiplied by 100; it exists
#   purely to make the count readable when it is plotted.
# - `sum` is the pyspark.sql.functions version imported in this cell.
hourly_sales = (
    spark.table("user_id.market_events")
    .where(expr("event_time >= '2019-10-01' and event_time < '2019-11-01'"))
    .withColumn("hour_monthly", date_trunc("hour", "event_time"))
    .groupBy("hour_monthly")
    .agg(count("*").alias("count_monthly"), sum("price").alias("sum_monthly"))
    .withColumn("hour_daily", hour("hour_monthly"))
    .groupBy("hour_daily")
    .agg(avg("count_monthly").alias("avg_count_daily"), avg("sum_monthly").alias("avg_sum_t_daily"))
    .withColumn("avg_count_daily_graph", col("avg_count_daily") * lit(100))
    .orderBy("hour_daily")
)

z.show(hourly_sales)