In [7]:
import os
from pyspark.sql import SparkSession, Window
from pyspark.sql.types import *
from pyspark.sql.functions import *

In [8]:
# Obtain a Spark session (an existing one is reused by getOrCreate) configured
# with Hive support and a "local" master.
sparkSession = (
    SparkSession.builder
    .enableHiveSupport()
    .master("local")
    .getOrCreate()
)

In [9]:
# Load the two parquet inputs used by this notebook:
#   data - read from /data/sample264 (grouped by userId / artistId further down)
#   meta - read from /data/meta
data, meta = (
    sparkSession.read.parquet("/data/sample264"),
    sparkSession.read.parquet("/data/meta"),
)

In [10]:
def normalization(data, key1, key2, field, threshold):
    """Keep the top rows per key and express ``field`` as a share of their total.

    For every distinct value of ``key1`` at most ``threshold`` rows are kept,
    chosen by descending ``field``. The ordering uses ``field`` only, so rows
    tied on ``field`` at the cutoff get no explicit tie-break. The kept values
    of ``field`` are then summed per ``key1`` and each row is divided by that
    sum.

    :param data: Spark DataFrame holding at least the ``key1`` and ``field`` columns.
    :param key1: name of the column to partition and group by.
    :param key2: not used by this function; accepted so existing calls keep working.
    :param field: name of the numeric column that is ranked and normalized.
    :param threshold: maximum number of rows retained per ``key1`` value.
    :return: cached DataFrame with the retained rows plus the columns
        ``"sum_" + field`` (per-key total) and ``"norm_" + field``
        (``field`` divided by that total).
    """
    # Number the rows inside each key1 partition, largest field value first.
    by_field_desc = Window.partitionBy(key1).orderBy(col(field).desc())

    top = data.withColumn("row_number", row_number().over(by_field_desc)) \
        .filter(col("row_number") <= threshold) \
        .drop("row_number")

    # groupBy already carries key1 into the aggregate, so only the sum is
    # requested here; listing col(key1) inside agg() as well emitted the key
    # column twice and made the join below depend on duplicate-column handling.
    totals = top.groupBy(key1).agg(sum(col(field)).alias("sum_" + field))

    # Joining on the column name keeps a single key1 column in the result.
    normalizedData = top.join(totals, key1, "inner") \
        .withColumn("norm_" + field, col(field) / col("sum_" + field)) \
        .cache()

    return normalizedData

In [12]:
# Number of rows in `data` for every (userId, artistId) pair.
GreenDay = data.groupBy(col("userId"), col("artistId")).count()

# Per user, keep at most the 1000 largest counts and turn each one into its
# share of that user's retained total (norm_count); expose the pair as id / id2.
GreenDayNorm = normalization(GreenDay, "userId", "artistId", "count", 1000) \
    .select(col("userId").alias("id"),
            col("artistId").alias("id2"),
            col("norm_count"))

# First 40 pairs by norm_count (descending), ties broken by id, then id2.
# No rank()-over-global-window pre-filter is applied: the k-th row of this
# ordering always has rank <= k, so a "rank < 100" filter can never drop one of
# the first 40 rows, while a window without partitionBy pulls every row into a
# single partition.
GreenDayList = GreenDayNorm \
    .sort(["norm_count", "id", "id2"], ascending=[0, 1, 1]) \
    .select(col("id"), col("id2")) \
    .take(40)

In [13]:
# Print one "id id2" line per collected row. Each row is used directly as the
# argument tuple of the format string. The single-argument print(...) form
# produces the same text under Python 2 and is valid syntax under Python 3.
for hit in GreenDayList:
    print("%s %s" % hit)

66 993426
116 974937
128 1003021
131 983068
195 997265
215 991696
235 990642
288 1000564
300 1003362
321 986172
328 967986
333 1000416
346 982037
356 974846
374 1003167
428 993161
431 969340
445 970387
488 970525
542 969751
612 987351
617 970240
649 973851
658 973232
662 975279
698 995788
708 968848
746 972032
747 972032
776 997265
784 969853
806 995126
811 996436
837 989262
901 988199
923 977066
934 990860
957 991171
989 975339
999 968823
