In [1]:
from pyspark.sql import SparkSession
# Create (or reuse) a Hive-enabled Spark session with master "local[2]",
# i.e. running locally with two worker threads.
sparkSession = (
    SparkSession.builder
    .enableHiveSupport()
    .master("local[2]")
    .getOrCreate()
)

In [2]:
# Locations of the two Parquet inputs.
DATA_PATH = "/data/sample264"
META_PATH = "/data/meta"

# `data` supplies the userId / trackId / timestamp columns read by the
# transition-counting cell; `meta` is loaded here but not referenced in the
# cells visible in this part of the notebook.
data = sparkSession.read.parquet(DATA_PATH)
meta = sparkSession.read.parquet(META_PATH)

In [3]:
from pyspark.sql import Window
from pyspark.sql.functions import *

def norm(df, key1, field, n):
    """Keep the top ``n`` rows per ``key1`` group and normalize ``field`` inside each group.

    Rows are numbered within every ``key1`` partition by ``field`` in
    descending order and only rows numbered ``1..n`` are kept. Because
    ``row_number`` is used, rows with equal ``field`` values still get
    distinct numbers, so at most ``n`` rows survive per group.

    Args:
        df: Spark DataFrame that contains the columns ``key1`` and ``field``.
        key1: Name of the column to group / partition by.
        field: Name of the numeric column to rank by and normalize.
        n: Maximum number of rows to keep for each ``key1`` value.

    Returns:
        A cached DataFrame holding the retained rows plus two extra columns:
        ``"sum_" + field`` (sum of ``field`` over the retained rows of the
        group) and ``"norm_" + field`` (``field`` divided by that sum).
    """
    sum_name = "sum_" + field
    norm_name = "norm_" + field

    window = Window.partitionBy(key1).orderBy(col(field).desc())

    topsDF = df.withColumn("row_number", row_number().over(window)) \
        .filter(col("row_number") <= n) \
        .drop("row_number")

    # groupBy() already keeps the grouping column in its result, so it is not
    # repeated inside agg(); repeating it only produced a duplicate column.
    # NOTE: `sum` here is pyspark.sql.functions.sum (star-imported), not the
    # Python builtin.
    tmpDF = topsDF.groupBy(col(key1)).agg(sum(col(field)).alias(sum_name))

    normalizedDF = topsDF.join(tmpDF, key1, "inner") \
        .withColumn(norm_name, col(field) / col(sum_name)) \
        .cache()

    return normalizedDF

In [6]:
data.cache()

# Largest allowed gap between two consecutive plays of one user for the pair
# to count as a transition (same unit as `timestamp`; 7*60 suggests seconds --
# assumption, confirm against the dataset).
MAX_GAP = 7 * 60
# Number of rows kept per track by norm() and number of pairs printed.
TOP_N = 40

# Per-user play history in chronological order.
track_win = Window.partitionBy("userId").orderBy(col("timestamp"))

# Count how often `trackId` directly follows `prev_trackId` within MAX_GAP.
track_to_track = data \
    .withColumn("prev_timestamp", lag("timestamp", 1).over(track_win)) \
    .withColumn("prev_trackId", lag("trackId", 1).over(track_win)) \
    .filter(col("prev_trackId").isNotNull()) \
    .withColumn("time_diff", col("timestamp") - col("prev_timestamp")) \
    .filter(col("time_diff") <= MAX_GAP) \
    .groupBy(col("prev_trackId"), col("trackId")).count()

# Normalize the counts per source track and expose the pair as (id, id2).
normalized = norm(track_to_track, "prev_trackId", "count", TOP_N) \
    .select(col("prev_trackId").alias("id"),
            col("trackId").alias("id2"),
            col("norm_count"))

# Global ranking by normalized weight. There is no partitionBy, so Spark
# evaluates this window over all rows in a single partition.
window = Window.orderBy(col("norm_count").desc())

# rank() gives tied rows the same position, so more than TOP_N rows can pass
# the filter; the orderBy + take then keeps the first TOP_N by (id, id2).
top = normalized.withColumn("pos", rank().over(window)) \
    .filter(col("pos") <= TOP_N) \
    .orderBy(col("id").asc(), col("id2").asc()) \
    .select(col("id"), col("id2")) \
    .take(TOP_N)

for item in top:
    # Parenthesized form prints the same text on Python 2 and is valid on Python 3.
    print("%s %s" % item)
    




798256 923706
798258 808254
798290 906999
798302 836228
798311 903496
798322 876562
798331 827364
798335 840741
798375 798375
798376 888871
798379 812055
798396 798396
798398 926302
798403 868805
798405 867217
798426 910880
798447 832635
798457 918918
798471 801831
798474 963162
798475 827475
798493 899168
798505 905671
798508 810743
798516 860347
798526 937573
798542 946408
798544 841232
798550 936295
798552 830267
798579 821762
798667 874844
798682 934393
798704 937570
798707 839389
798720 958333
798725 933147
798731 853117
798782 956938
798797 967698
