# RDD

In [1]:
%pyspark

# Distribute a small list of (key, value) pairs and add up the values per key.
rdd = sc.parallelize([(1,2), (3,4), (3,6), (4,5), (3, 4), (1, 5), (4, 1)])

result_rdd = rdd.reduceByKey(lambda left, right: left + right)

# Bring at most ten of the reduced pairs back to the driver for display.
result_rdd.take(10)

In [2]:
%pyspark

lines = sc.parallelize(["a ab abc", "a ac abc", "b b ab abc"])

# Build an RDD of (word, frequency) pairs: split each line on single spaces,
# emit (word, 1) for every occurrence, then add the ones up per word.
counts = (
    lines
    .flatMap(lambda line: line.split(' '))
    .map(lambda token: (token, 1))
    .reduceByKey(lambda left, right: left + right)
)

output = counts.collect()

for (word, count) in output:
    print("%s: %i" % (word, count))

# market.events

In [4]:
%pyspark

from pyspark.sql.functions import regexp_extract

events_df = spark.table("market.events")

# category_code is presumably a dot-separated path of up to three levels
# (e.g. "level1.level2.level3") — TODO confirm against the table contents.
# One capture group per level. The separator is an escaped literal dot: an
# unescaped "." would match ANY character, not only the delimiter. The raw
# string keeps the backslashes intact for the regex engine.
category_pattern = r"([a-z_]*)\.?([a-z_]*)\.?([a-z_]*)"

# Add cat_1, cat_2 and cat_3, each holding the matching capture group.
events_cat_df = events_df
for level in (1, 2, 3):
    events_cat_df = events_cat_df.withColumn(
        "cat_%d" % level,
        regexp_extract("category_code", category_pattern, level)
    )

events_cat_df.show()

In [6]:
%pyspark

from pyspark.sql.functions import col, count, row_number
from pyspark.sql.window import Window

# Rows are ranked inside each cat_2 value, most viewed first.
# NOTE(review): the aggregation below groups by cat_1 AND cat_2 while the
# window partitions by cat_2 only — confirm that identical cat_2 values under
# different cat_1 parents are meant to compete in the same ranking.
windowSpec = Window.partitionBy("cat_2").orderBy(col("views").desc())

# Number of events per (cat_1, cat_2, brand), skipping rows whose brand is
# the literal string 'null' (rows with a NULL brand are dropped by the same
# comparison, since it evaluates to NULL for them).
brand_views_df = (
    events_cat_df
    .where(col("brand") != 'null')
    .groupBy("cat_1", "cat_2", "brand")
    .agg(count("*").alias("views"))
)

# Keep the three top-ranked brands per window partition.
events_rank_df = (
    brand_views_df
    .withColumn("rank", row_number().over(windowSpec))
    .where(col("rank") <= 3)
    .orderBy("cat_2", col("views").desc())
)

events_rank_df.show()


# Датасет с треками
### !!! Внесены небольшие изменения, в частности, сохранена промежуточная таблица (без explode)

In [8]:
%pyspark

import pyspark.sql.functions as f
from pyspark.sql.types import *

# Not referenced in this cell; kept in case other cells rely on it.
sch = ArrayType(StringType())

# Load the tracks CSV, extract the year into its own column (first four
# characters of release_date, cast to int) and turn the artists string into
# an array column.
# The split delimiter is ", " WITH the space on purpose: splitting on ","
# alone would leave the spaces inside the resulting values.
# The character class is a raw string so that "\]" and "\[" reach the regex
# engine verbatim instead of being parsed as invalid Python escape sequences.
tracks = (
    spark.read
    .option("header", "true")
    .option("escape", '"')
    .option("InferSchema", "true")
    .csv("/datasets/tracks.csv")
    .withColumn("release_year", f.substring("release_date", 1, 4).cast(IntegerType()))
    .withColumn("array_artist", f.split(f.regexp_replace(f.col("artists"), r"[\]\[']", ""), ", "))
    .cache()
)

# Intermediate table: one row per track, artists still packed in an array.
tracks.write.mode("overwrite").saveAsTable("user_id.tracks")

# Separate table with the artist array unrolled: one row per (track, artist).
tracks_exp = tracks.select(
    "name",
    "popularity",
    "danceability",
    "energy",
    "speechiness",
    "acousticness",
    "liveness",
    "valence",
    "release_year",
    "artists",
    f.explode(f.col("array_artist")).alias("name_artist")
)

tracks_exp.printSchema()

tracks_exp.write.mode("overwrite").saveAsTable("user_id.tracks_exp")

In [9]:
%pyspark

# Load the "hw_3.tracks" table and render it through the notebook's z.show helper.
# NOTE(review): the preceding cell saves its result as "user_id.tracks", while
# this one reads "hw_3.tracks" — confirm this is the intended source table.
tracks = spark.table("hw_3.tracks")
z.show(tracks)


In [10]:
%pyspark


# Imported in this cell because it uses the `F` alias, which is not bound
# anywhere earlier in the notebook (the functions module is only imported as
# lowercase `f`), and `Window`, which this cell did not import itself.
import pyspark.sql.functions as F
from pyspark.sql.window import Window

# Rank tracks inside each release year, most popular first.
window = Window.partitionBy('release_year').orderBy(F.col('popularity').desc())

# Tracks with positive popularity whose per-year rank is within the first 100.
# rank() assigns equal ranks to ties, so a year can contribute more than 100 rows.
top_100 = (tracks
        .where(F.col('popularity') > 0)
        .withColumn('rank', F.rank().over(window))
        .where(F.col('rank') < 101)
        )

# Number of top-100 rows per artist, largest first.
# NOTE(review): this needs a `name_artist` column in `tracks` — confirm the
# table loaded into `tracks` is the exploded (one row per artist) variant.
tracks_agg = (top_100
            .groupBy('name_artist')
            .agg(F.count('*').alias('count'))
            .orderBy(F.col('count').desc())
    )

tracks_agg.show()



In [11]:
%pyspark

from pyspark.sql.functions import col, row_number, explode
from pyspark.sql.window import Window

# Per release year: most popular first, ties broken by track name.
windowSpec = Window.partitionBy("release_year").orderBy(col("popularity").desc(), "name")

tracks = spark.table("user_id.tracks")

# De-duplicate, number the rows inside every year and keep the first 100.
top_100_tracks = (
    tracks
    .dropDuplicates()
    .withColumn("rn", row_number().over(windowSpec))
    .where(col("rn") <= 100)
    .orderBy("release_year", "rn")
)

# One row per (year, artist) occurrence; cached because later cells reuse it.
top_100_tracks_exp = (
    top_100_tracks
    .select(
        "release_year",
        "array_artist",
        explode(col("array_artist")).alias("name_artist")
    )
    .cache()
)

top_100_tracks_exp.count()

In [12]:
%pyspark

from pyspark.sql.functions import col

# How many top-100 rows each artist has, largest first.
top_100_tracks_by_artists_by_countity = (
    top_100_tracks_exp
    .groupBy("name_artist")
    .count()
    .orderBy(col("count").desc())
)

top_100_tracks_by_artists_by_countity.show()


In [13]:
%pyspark
   
from pyspark.sql.functions import col 

# In how many distinct release years each artist appears, largest first.
top_100_tracks_by_artists_by_rate = (
    top_100_tracks_exp
    .select("name_artist", "release_year")
    .distinct()
    .groupBy("name_artist")
    .count()
    .orderBy(col("count").desc())
)

top_100_tracks_by_artists_by_rate.show()


In [14]:
%pyspark

# We rely on the following fact: if the sorted years, grouped by artist, are numbered
# and that number is then subtracted from the year, the difference is the same for consecutive years, so they form exactly one group

from pyspark.sql.functions import col, row_number, count, max
from pyspark.sql.window import Window

# Row numbers restart for every artist and follow ascending release year.
windowRn = Window.partitionBy("name_artist").orderBy("release_year")

# Distinct (artist, year) pairs tagged with "year minus row number".
artist_year_islands = (
    top_100_tracks_exp
    .select("name_artist", "release_year")
    .distinct()
    .withColumn("rn", row_number().over(windowRn))
    .withColumn("dif", col("release_year") - col("rn"))
)

# Size of every (artist, dif) group, then the largest group per artist.
top_100_tracks_by_artists_by_year_1 = (
    artist_year_islands
    .groupBy("name_artist", "dif")
    .agg(count("*").alias("cnt"))
    .groupBy("name_artist")
    .agg(max("cnt").alias("mx"))
    .orderBy(col("mx").desc())
)

top_100_tracks_by_artists_by_year_1.show()

In [15]:
%pyspark

# The idea of this solution is to pass the release years of a single artist's tracks as a list,
# then sort the list and walk through it, treating every element that differs from the previous one by more than 1 as a boundary between chunks,
# and finally determine which of the resulting chunks is the longest

from pyspark.sql.functions import  col, collect_list, udf
from pyspark.sql.types import IntegerType

def getChainLen(years):
    """Return the length of the longest run of consecutive years.

    Args:
        years: a list of integer years in any order, or None. The argument
            is not modified.

    Returns:
        int: the size of the longest streak in which every year is exactly
        one greater than the previous one once the values are sorted.
        A repeated year starts a new streak. Returns 0 for None or an
        empty list.
    """
    if not years:
        return 0

    longest = 0
    current = 0
    previous = None

    # sorted() works on a copy, so the caller's list keeps its order.
    for year in sorted(years):
        if previous is not None and year == previous + 1:
            current += 1
        else:
            current = 1
        previous = year
        # Explicit comparison instead of max(): other cells of this notebook
        # import pyspark.sql.functions.max under the bare name `max`.
        if current > longest:
            longest = current

    return longest
    
# Expose the pure-Python helper to Spark as a UDF that returns an integer.
getChainLenUDF = udf(lambda year_list: getChainLen(year_list), IntegerType())

# Gather the distinct release years of every artist into a single list.
artist_year_lists = (
    top_100_tracks_exp
    .select("name_artist", "release_year")
    .distinct()
    .groupBy("name_artist")
    .agg(collect_list("release_year").alias("years"))
)

# Apply the UDF to each list and sort by its result, then by artist name.
top_100_tracks_by_artists_by_year_2 = (
    artist_year_lists
    .withColumn("cnt", getChainLenUDF(col("years")))
    .select("name_artist", "cnt")
    .orderBy(col("cnt").desc(), "name_artist")
)

top_100_tracks_by_artists_by_year_2.show()


In [16]:
%pyspark
# https://stackoverflow.com/questions/56384625/pyspark-cumulative-sum-with-reset-condition

from pyspark.sql.functions import col, lag, lit, sum, max, when
from pyspark.sql.window import Window

# Previous release year of the same artist.
windowLag = Window.partitionBy("name_artist").orderBy(col("release_year"))
# Running count of "streak start" rows, restarted for every artist.
# Partitioning by artist keeps the computation distributed; a window with an
# ORDER BY but no PARTITION BY pulls the whole dataset into a single
# partition. The resulting (name_artist, grp) groups are the same either way,
# because every artist's first row is itself a streak start.
windowGrp = Window.partitionBy("name_artist").orderBy("release_year")

top_100_tracks_by_artists_by_year_3 = (
    top_100_tracks_exp
    .select("name_artist", "release_year").distinct()
    .withColumn("lag", lag("release_year").over(windowLag))
    .withColumn("dif", col("release_year") - col("lag"))
    # tag is NULL when the row starts a new streak: either there is no
    # previous year (dif is NULL) or the gap to it is not exactly 1.
    .withColumn("tag", when(col("dif") != 1, lit(None)).otherwise(col("dif")))
    # Cumulative number of streak starts = id of the streak the row belongs to.
    .withColumn("grp", sum((col("tag").isNull()).cast("int")).over(windowGrp))
    .groupBy("name_artist", "grp").count()
    .groupBy("name_artist").agg(max(col("count")).alias("cnt"))
    .orderBy(col("cnt").desc(), "name_artist")
)

top_100_tracks_by_artists_by_year_3.show()


In [17]:
%pyspark

from pyspark.sql.functions import expr, col, avg, round

tracks = spark.table("user_id.tracks_exp").dropDuplicates()

# Audio features processed below. Every derived column name is generated
# from this list, so the labels in the stack expression and in
# `characteristics` cannot drift apart (the hand-written version spelled one
# of them "norm_energ_meany").
features = ["danceability", "energy", "speechiness", "acousticness", "liveness", "valence"]

##########################################################

# Mean of every feature per release year, rounded to two decimals.
year_means = (
    tracks
    .select("release_year", *features)
    .groupBy("release_year")
    .agg(*[round(avg(feature), 2).alias(feature + "_mean") for feature in features])
)

year_means.show()

##########################################################

# Each track's feature divided by the (rounded) mean of its release year.
tracks_advanced = tracks.join(year_means, "release_year")
for feature in features:
    tracks_advanced = tracks_advanced.withColumn(
        "norm_" + feature + "_mean",
        round(col(feature) / col(feature + "_mean"), 2)
    )

tracks_advanced.show()

##########################################################

characteristics = ["norm_" + feature + "_mean" for feature in features]

# stack(n, 'label1', col1, 'label2', col2, ...) unpivots the normalized
# columns into (characteristic, val) rows.
unpivot_expr = "stack(%d, %s) as (characteristic, val)" % (
    len(characteristics),
    ", ".join("'%s', %s" % (name, name) for name in characteristics)
)

tracks_advanced_unpivot = tracks_advanced \
    .select("name_artist", expr(unpivot_expr))

# For every characteristic show the three artists with the highest average value.
for item in characteristics:

    top_by_characteristic = (
        tracks_advanced_unpivot
        .where(col("characteristic") == item)
        .groupBy("name_artist").agg(avg("val").alias(item))
        .orderBy(col(item).desc())
    )

    top_by_characteristic.show(3)
