In [0]:
import pyspark.sql.functions as f

# Load the events table, keeping only rows whose date is before 2019-11-02.
events_df = (
    spark.table("market.events")
    .filter("date < '2019-11-02'")
)

# Preview the first rows of the filtered events.
events_df.show()

In [2]:
import pyspark.sql.functions as f

# Split category_code on literal dots. The pattern is a regex, so the dot is
# escaped; a raw string is used because '\.' in a normal literal is an
# invalid escape sequence that newer Python versions warn about.
split_cols = f.split('category_code', r'\.')

# Expose the first three elements of the split as category_1 .. category_3.
arr_cols = [split_cols[i].alias('category_' + str(i + 1)) for i in range(3)]

# Keep every original column and append the three category-level columns.
category_df = events_df.select("*", *arr_cols)
category_df.show()


In [3]:
from pyspark.sql.window import Window
from pyspark.sql.functions import rank, col


# Rank brands by view count inside each (category_1, category_2) pair.
# The window is partitioned by the same category columns used in the groupby
# below, so equal category_2 values that sit under different category_1
# parents are ranked separately instead of competing with each other.
window = Window.partitionBy('category_1', 'category_2').orderBy(f.col('views').desc())

(
    category_df
    .filter("event_type = 'view'")
    .filter(col("brand").isNotNull())
    .groupby('category_1', 'category_2', 'brand')
    .agg(f.count('*').alias("views"))
    .withColumn("rank", rank().over(window))
    # rank() gives tied brands the same rank, so a group can show more than three rows.
    .filter(col('rank') <= 3)
    .show(100)
)




In [4]:
import pyspark.sql.functions as f
from pyspark.sql.types import *

# NOTE(review): `sch` is not referenced anywhere in the visible cells; kept in
# case another cell relies on it.
sch = ArrayType(StringType())

# Download the dataset from: http://37.139.43.86/tracks

# Read the CSV, extract the release year into its own column and turn the
# `artists` string into an array column.
# Important: the split delimiter is ', ' (comma followed by a space);
# otherwise the spaces would end up inside the artist values.
tracks = (
    spark.read
    .option("header", "true")
    .option("escape", '"')
    .option("InferSchema", "true")
    .csv("data/tracks.csv")
    # The first four characters of release_date, cast to an integer.
    .withColumn("release_year", f.substring("release_date", 1, 4).cast(IntegerType()))
    # Strip '[', ']' and single quotes, then split on ', '. A raw string is used
    # because '\]' and '\[' are invalid escapes in a normal Python literal.
    # NOTE(review): double quotes are not stripped, so any value wrapped in
    # double quotes would keep them - confirm against the data.
    .withColumn("array_artist", f.split(f.regexp_replace(f.col("artists"), r"[\]\[']", ""), ", "))
    .cache()
)

# Separate table with the artist array exploded: one row per element of
# array_artist, exposed as name_artist.
tracks_exp = tracks.select(
    "name",
    "popularity",
    "danceability",
    "energy",
    "speechiness",
    "acousticness",
    "liveness",
    "valence",
    "release_year",
    "artists",
    f.explode(f.col("array_artist")).alias("name_artist"),
)

tracks_exp.printSchema()

# IF NOT EXISTS keeps this cell re-runnable: a plain CREATE DATABASE fails
# once the database has been created by an earlier run.
spark.sql("create database if not exists hw_3")
tracks_exp.write.mode("overwrite").saveAsTable("hw_3.tracks")

In [5]:
# Read the saved table back; this rebinds `tracks` to the contents of hw_3.tracks.
tracks = spark.table("hw_3.tracks")

# Preview up to 100 rows.
tracks.show(n=100)

In [6]:
# Rank rows within each release year, highest popularity first.
window = Window.partitionBy("release_year").orderBy(f.desc("popularity"))

# Keep rows ranked in the top 100 of their year that also have positive popularity.
top_100_yearly = (
    tracks
    .select('name_artist', 'name', 'release_year', 'popularity')
    .withColumn("rank", rank().over(window))
    .where((col('popularity') > 0) & (col('rank') <= 100))
)

# Number of top-100 rows per artist, largest first.
(
    top_100_yearly
    .groupBy("name_artist")
    .count()
    .sort(f.desc('count'))
    .show()
)



In [7]:
# Preview the yearly top-100 rows (20 rows, long values truncated - the show() defaults).
top_100_yearly.show(n=20, truncate=True)

In [8]:
# Collapse to one row per (name_artist, release_year) pair, then count how
# many distinct years each artist has, largest first.
(
    top_100_yearly
    .dropDuplicates(subset=['name_artist', 'release_year'])
    .groupby('name_artist')
    .count()
    .sort(f.desc("count"))
    .show()
)


In [9]:
def max_consecutive(years_list):
    """Return the length of the longest run of consecutive years.

    Args:
        years_list: Collection of integer years in any order. It may contain
            duplicates or ``None`` entries, and may itself be empty or ``None``.

    Returns:
        int: Length of the longest streak in which every year is exactly one
        greater than the previous one, or ``0`` when there are no years.
    """
    if not years_list:
        return 0

    # Work on a sorted, de-duplicated copy: the caller's list is left untouched
    # and a repeated year can no longer break a streak.
    years = sorted({year for year in years_list if year is not None})
    if not years:
        return 0

    longest = current = 1
    for previous, following in zip(years, years[1:]):
        if previous + 1 == following:
            current += 1
            longest = max(longest, current)
        else:
            # Gap between years: start a new streak.
            current = 1
    return longest

In [10]:
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType

# Wrap the pure-Python streak helper as a Spark UDF with an integer result.
udf_years = udf(lambda years: max_consecutive(years), IntegerType())

# Collect each artist's distinct release years, apply the UDF to that list
# and show the artists ordered by the resulting value, largest first.
(
    top_100_yearly
    .groupby('name_artist')
    .agg(f.collect_set(col('release_year')).alias('years_list'))
    .select('name_artist', udf_years(col('years_list')).alias('count'))
    .orderBy(col('count').desc())
    .show()
)