In [12]:
%load_ext autoreload
%env SPARK_HOME=/usr/hdp/current/spark2-client

import findspark
findspark.init()

print('findspark initialized ...')

import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import expr, col, column, max, min

spark = SparkSession.builder.appName('mlonspark')\
    .config('spark.executor.instances', '6')\
    .config('spark.jars', '/opt/dev/target/ml-on-spark-1.0.jar')\
    .getOrCreate()

print('pyspark ready ...')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
env: SPARK_HOME=/usr/hdp/current/spark2-client
findspark initialized ...
pyspark ready ...


In [13]:
df = spark.read.format('csv')\
    .option("header", "false")\
    .option("sep", "\t")\
    .option("mode", "DROPMALFORMED")\
    .option("inferSchema", "true")\
    .load("/data/lastfm-dataset-360K/usersha1-artmbid-artname-plays.tsv")\
    .withColumnRenamed("_c0", "userHash")\
    .withColumnRenamed("_c1", "artistMBID")\
    .withColumnRenamed("_c2", "artistName")\
    .withColumnRenamed("_c3", "listenCount")
    
df.printSchema()
df.show(3)


root
 |-- userHash: string (nullable = true)
 |-- artistMBID: string (nullable = true)
 |-- artistName: string (nullable = true)
 |-- listenCount: double (nullable = true)

+--------------------+--------------------+-----------------+-----------+
|            userHash|          artistMBID|       artistName|listenCount|
+--------------------+--------------------+-----------------+-----------+
|00000c289a1829a80...|3bd73256-3905-4f3...|  betty blowtorch|     2137.0|
|00000c289a1829a80...|f2fb0ff0-5679-42e...|        die Ärzte|     1099.0|
|00000c289a1829a80...|b3ae82c2-e60b-455...|melissa etheridge|      897.0|
+--------------------+--------------------+-----------------+-----------+
only showing top 3 rows



In [28]:
users = df.select(df.userHash).distinct()
users = users.rdd.zipWithIndex().toDF()
users = users.select(\
    users._1.userHash.alias("userHash"),\
    users._2.alias("userId").cast("integer"))

users.printSchema()
users.show(3)

root
 |-- userHash: string (nullable = true)
 |-- userId: integer (nullable = true)

+--------------------+------+
|            userHash|userId|
+--------------------+------+
|68bddc5a95f7aee26...|     0|
|68bf3ef85f3d8bf5d...|     1|
|68c12efba3b696a12...|     2|
+--------------------+------+
only showing top 3 rows



In [27]:
artists = df.select(df.artistName).distinct()
artists = artists.rdd.zipWithIndex().toDF()
artists = artists.select(\
    artists._1.artistName.alias("artistName"),\
    artists._2.alias("artistId").cast("integer"))

artists.printSchema()
artists.show(3)

root
 |-- artistName: string (nullable = true)
 |-- artistId: integer (nullable = true)

+----------+--------+
|artistName|artistId|
+----------+--------+
|  godsmack|       0|
|   embrace|       1|
| kate bush|       2|
+----------+--------+
only showing top 3 rows



In [29]:
df = df\
    .join(users, df.userHash==users.userHash, 'inner')\
    .join(artists, df.artistName==artists.artistName, 'inner')\
    .select(users.userId, artists.artistId, df.listenCount.cast("float"))

df.printSchema()
df.show(3)

root
 |-- userId: integer (nullable = true)
 |-- artistId: integer (nullable = true)
 |-- listenCount: float (nullable = true)

+------+--------+-----------+
|userId|artistId|listenCount|
+------+--------+-----------+
|235047|    1180|       33.0|
|  4396|     386|      115.0|
|168567|    1208|       63.0|
+------+--------+-----------+
only showing top 3 rows



In [30]:
df.write.mode("overwrite").save("/data/lastfm-dataset-360K/data.parquet")
artists.write.mode("overwrite").save("/data/lastfm-dataset-360K/artists.parquet")
users.write.mode("overwrite").save("/data/lastfm-dataset-360K/users.parquet")

In [31]:
train, test = df.randomSplit([0.7, 0.3])

train.write.mode("overwrite").save("/data/lastfm-dataset-360K/data-train.parquet")
test.write.mode("overwrite").save("/data/lastfm-dataset-360K/data-test.parquet")