In [1]:
# Load IPython's autoreload extension.
# NOTE(review): no `%autoreload` mode is selected anywhere in this cell -- confirm
# that reloading is actually enabled, or drop the extension if it is not needed.
%load_ext autoreload
# Set SPARK_HOME for this kernel before findspark.init() is called.
%env SPARK_HOME=/usr/hdp/current/spark2-client

import findspark
# Presumably uses SPARK_HOME to locate the Spark installation and make `pyspark`
# importable for the imports that follow -- verify against the findspark docs.
findspark.init()

print('findspark initialized ...')

import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import expr, col, column, max, min

# Create (or reuse) the SparkSession for this notebook: application name 'mlonspark',
# six executor instances requested, and the project jar registered via spark.jars.
spark = (
    SparkSession.builder
    .appName('mlonspark')
    .config('spark.executor.instances', '6')
    .config('spark.jars', '/opt/dev/target/ml-on-spark-1.0.jar')
    .getOrCreate()
)

print('pyspark ready ...')

env: SPARK_HOME=/usr/hdp/current/spark2-client
findspark initialized ...
pyspark ready ...


In [2]:
# Parse the raw Last.fm play-count file: tab-separated, no header row, column types
# inferred by Spark, malformed rows dropped. The positional columns _c0.._c3 are then
# given descriptive names.
df = (
    spark.read.format('csv')
    .options(header="false", sep="\t", mode="DROPMALFORMED", inferSchema="true")
    .load("/data/lastfm-dataset-360K/usersha1-artmbid-artname-plays.tsv")
    .withColumnRenamed("_c0", "userHash")
    .withColumnRenamed("_c1", "artistMBID")
    .withColumnRenamed("_c2", "artistName")
    .withColumnRenamed("_c3", "listenCount")
)

# Persist the parsed rows, replacing any earlier copy. No explicit format is given,
# so the writer's default data source is used.
df.write.mode("overwrite").save("/data/lastfm-dataset-360K/data.parquet")

In [3]:
# Give every distinct userHash an integer id taken from its zipWithIndex position.
# After toDF() the original row sits in struct column `_1` and the index in `_2`.
# NOTE(review): distinct() imposes no ordering, so the ids are presumably not stable
# across reruns -- downstream data must be rebuilt together with this table.
users_indexed = df.select(df.userHash).distinct().rdd.zipWithIndex().toDF()
users = users_indexed.select(
    users_indexed["_1"]["userHash"].alias("userHash"),
    users_indexed["_2"].alias("userId").cast("integer")
)

# Persist the hash -> id lookup table, replacing any earlier copy.
users.write.mode("overwrite").save("/data/lastfm-dataset-360K/users.parquet")

In [4]:
# Give every distinct artistName an integer id taken from its zipWithIndex position.
# After toDF() the original row sits in struct column `_1` and the index in `_2`.
# NOTE(review): distinct() imposes no ordering, so the ids are presumably not stable
# across reruns -- downstream data must be rebuilt together with this table.
artists_indexed = df.select(df.artistName).distinct().rdd.zipWithIndex().toDF()
artists = artists_indexed.select(
    artists_indexed["_1"]["artistName"].alias("artistName"),
    artists_indexed["_2"].alias("artistId").cast("integer")
)

# Persist the name -> id lookup table, replacing any earlier copy.
artists.write.mode("overwrite").save("/data/lastfm-dataset-360K/artists.parquet")

In [5]:
# Re-read the three datasets from the files written by the previous cells, so the
# following cells work from the persisted copies. Order matters: the paths below are
# unpacked positionally into df, users and artists.
df, users, artists = (
    spark.read.load(stored_path)
    for stored_path in (
        "/data/lastfm-dataset-360K/data.parquet",
        "/data/lastfm-dataset-360K/users.parquet",
        "/data/lastfm-dataset-360K/artists.parquet"
    )
)

In [6]:
# Attach the integer ids to every play-count row by inner-joining the two lookup
# tables on their natural keys.
same_user = df.userHash == users.userHash
same_artist = df.artistName == artists.artistName
plays_with_ids = (
    df
    .join(users, same_user, 'inner')
    .join(artists, same_artist, 'inner')
)

# Keep the keys from both sides of each join (the lookup-side copies carry a
# 'users-' / 'artists-' prefix) and store listenCount as float.
dfFinal = plays_with_ids.select(
    df.userHash,
    df.artistName,
    users.userId,
    users.userHash.alias("users-userHash"),
    artists.artistId.alias("artists-artistId"),
    artists.artistName.alias("artists-artistName"),
    df.listenCount.cast("float")
)

# Persist the joined table, replacing any earlier copy.
dfFinal.write.mode("overwrite").save("/data/lastfm-dataset-360K/data-final.parquet")

In [7]:
# Split the play-count rows roughly 70/30 into train and test sets and persist both,
# replacing any earlier copies.
# A fixed seed keeps the split reproducible across kernel restarts; without it every
# run would write a different train/test partition.
# NOTE(review): this splits `df` (the rows reloaded from data.parquet, which carry no
# userId/artistId), not `dfFinal` -- confirm that this is the intended input.
train, test = df.randomSplit([0.7, 0.3], seed=42)
train.write.mode("overwrite").save("/data/lastfm-dataset-360K/data-train.parquet")
test.write.mode("overwrite").save("/data/lastfm-dataset-360K/data-test.parquet")

In [8]:
# Read the joined table back from disk and display its total row count.
dfx = spark.read.load("/data/lastfm-dataset-360K/data-final.parquet")
dfx.count()   

17559516

In [9]:
# Print the column names and types of the joined table.
dfx.printSchema()

root
 |-- userHash: string (nullable = true)
 |-- artistName: string (nullable = true)
 |-- userId: integer (nullable = true)
 |-- users-userHash: string (nullable = true)
 |-- artists-artistId: integer (nullable = true)
 |-- artists-artistName: string (nullable = true)
 |-- listenCount: float (nullable = true)



In [10]:
# Sanity check on the user join: count the rows whose userHash equals the userHash
# carried over from the users table. Compare the result with the total row count.
dfx.filter(col("userHash")==col('users-userHash')).count()   

17559516

In [11]:
# Spot check: display the joined rows belonging to one specific user hash.
dfx.filter(dfx.userHash=="f37640d943bf48a0ae42058733dd18e48d66f337").show()

+--------------------+--------------------+------+--------------------+----------------+--------------------+-----------+
|            userHash|          artistName|userId|      users-userHash|artists-artistId|  artists-artistName|listenCount|
+--------------------+--------------------+------+--------------------+----------------+--------------------+-----------+
|f37640d943bf48a0a...|   infected mushroom| 54113|f37640d943bf48a0a...|          239708|   infected mushroom|      330.0|
|f37640d943bf48a0a...|      chase & status| 54113|f37640d943bf48a0a...|           44033|      chase & status|      323.0|
|f37640d943bf48a0a...|       high contrast| 54113|f37640d943bf48a0a...|          260351|       high contrast|      298.0|
|f37640d943bf48a0a...|            syncopix| 54113|f37640d943bf48a0a...|           83493|            syncopix|      217.0|
|f37640d943bf48a0a...|           aphrodite| 54113|f37640d943bf48a0a...|          172615|           aphrodite|      214.0|
|f37640d943bf48a0a...|  

In [12]:
# Spot check of the artist ids: for every row of user 55177, print the stored artist
# id and show the matching entry of the artists lookup table.
artists = spark.read.load("/data/lastfm-dataset-360K/artists.parquet")
dfFiltered = dfx.filter(dfx.userId==55177).collect()
for play in dfFiltered:
    # Named access to the field that sits at position 4 of the dfx schema.
    artist_id = play["artists-artistId"]
    print(artist_id)
    # NOTE(review): each iteration runs its own Spark query; fine for a spot check,
    # but a single join would be cheaper for anything larger.
    artists.filter(artists.artistId == artist_id).show()

229453
+----------+--------+
|artistName|artistId|
+----------+--------+
|aphex twin|  229453|
+----------+--------+

184698
+----------+--------+
|artistName|artistId|
+----------+--------+
|       eod|  184698|
+----------+--------+

143520
+----------+--------+
|artistName|artistId|
+----------+--------+
|  autechre|  143520|
+----------+--------+

182764
+----------+--------+
|artistName|artistId|
+----------+--------+
|     björk|  182764|
+----------+--------+

264662
+----------+--------+
|artistName|artistId|
+----------+--------+
|  daedelus|  264662|
+----------+--------+

76296
+----------+--------+
|artistName|artistId|
+----------+--------+
|   under17|   76296|
+----------+--------+

83487
+----------+--------+
|artistName|artistId|
+----------+--------+
|prefuse 73|   83487|
+----------+--------+

283977
+----------+--------+
|artistName|artistId|
+----------+--------+
|    gescom|  283977|
+----------+--------+

201798
+----------+--------+
|artistName|artistId|
+------