In [1]:
from pyspark.sql.functions import udf, col, monotonically_increasing_id
from pyspark.sql.functions import hour, dayofmonth, weekofyear, month, year, date_format
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, IntegerType, DateType, TimestampType
from datetime import datetime as dt

VBox()

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
1,application_1612381796611_0002,pyspark,idle,Link,Link,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [4]:
input_data = 's3a://udacity-dend/'
output_data = 's3a://adzugaiev-sparkify/output/'

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

#### Process Song Data
The files are partitioned by the first three letters of each song's track ID.

In [5]:
# get filepath to song data file
song_data = input_data + 'song_data/*/*/*/*.json'

# define schema for song data
song_schema = StructType([
    StructField('song_id', StringType()),
    StructField('artist_id', StringType()),
    StructField('artist_name', StringType()),
    StructField('artist_location', StringType()),
    StructField('artist_latitude', DoubleType()),
    StructField('artist_longitude', DoubleType()),
    StructField('duration', DoubleType()),
    StructField('num_songs', IntegerType()),
    StructField('title', StringType()),
    StructField('year', IntegerType())
])

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [6]:
df = spark.read.json(song_data, schema = song_schema)
df.cache()
print('Songs records: {}'.format(df.count()))
df.show(5)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Songs records: 14896
+------------------+------------------+--------------------+--------------------+---------------+----------------+---------+---------+--------------------+----+
|           song_id|         artist_id|         artist_name|     artist_location|artist_latitude|artist_longitude| duration|num_songs|               title|year|
+------------------+------------------+--------------------+--------------------+---------------+----------------+---------+---------+--------------------+----+
|SOVIYJY12AF72A4B00|AR4T2IF1187B9ADBB7|          Billy Idol|<a href="http://b...|       63.96027|        10.22442|233.22077|        1|The Dead Next Doo...|1983|
|SOVYXYL12AF72A3373|AR4T2IF1187B9ADBB7|          Billy Idol|<a href="http://b...|       63.96027|        10.22442|287.92118|        1|Rebel Yell (1999 ...|1983|
|SOEPTVC12A67ADD0DA|ARQ846I1187B9A7083|Yvonne S. Moriart...|                    |           null|            null|196.04853|        1|To Zucchabar ["Gl...|   0|
|SOLQYSZ12AB0

Songs Table, with files partitioned by year and then artist.

In [7]:
songs_cols = ['song_id', 'title', 'artist_id', 'year', 'duration']
songs_table = df.select(songs_cols).dropDuplicates(['song_id']).repartition('year', 'artist_id')
print('Songs count: {}'.format(songs_table.count()))
songs_table.show(5)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Songs count: 14896
+------------------+--------------------+------------------+----+---------+
|           song_id|               title|         artist_id|year| duration|
+------------------+--------------------+------------------+----+---------+
|SOBOYFA12A6701D128|Facepeeler (Album...|ARB9VHL1187B9AEC3F|2005|290.16771|
|SOKRZWV12A81C23FFB|         Ghali Alayi|AR35VZA1187B9ABCF1|   0|215.71873|
|SOOYQEG12AC468F2FC|Yes Love_ My Soul...|ARNCTJ91187B98D813|   0| 75.62404|
|SOUNJPE12A58A78393|        Just As I Am|ARTL0JQ1187FB4D190|1985|283.16689|
|SOKUOAK12A6D4FD5E9|           The Music|ARI29D81187B9B096F|2007|216.78975|
+------------------+--------------------+------------------+----+---------+
only showing top 5 rows

In [8]:
songs_table.limit(5).write.partitionBy('year', 'artist_id').parquet(output_data + 'songs/') #.limit(5)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Artists Table

In [9]:
artists_cols = ['artist_id',
                'artist_name as name',
                'artist_location as location',
                'artist_latitude as latitude',
                'artist_longitude as longitude']

# using selectExpr() because of 'as' column aliases present
artists_table = df.selectExpr(artists_cols).dropDuplicates(['artist_id', 'name'])
print('Artists count: {}'.format(artists_table.count()))
artists_table.show(5)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Artists count: 9993
+------------------+--------------------+--------+--------+---------+
|         artist_id|                name|location|latitude|longitude|
+------------------+--------------------+--------+--------+---------+
|AR1OGXT1187B9893EB|Lester Flatt / Ea...|        |    null|     null|
|AR2TN021187B998B29|            Declaime|        |    null|     null|
|AR2UXTA1187B98B0AF|   Theoretical Girls|        |    null|     null|
|AR43RZ01187FB5128D|         Lord Nelson|        |    null|     null|
|AR58CZ21187B9AF528|  WESTERNHAGEN (HCL)|        |    null|     null|
+------------------+--------------------+--------+--------+---------+
only showing top 5 rows

In [10]:
artists_table.limit(5).write.parquet(output_data + 'artists/') #.limit(5)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

#### Process Log Data
The log files in the dataset you'll be working with are partitioned by year and month.

In [11]:
log_data = input_data + 'log-data/*/*/*.json'
df = spark.read.json(log_data).where("page = 'NextSong'")
df.cache()
print("'NextSong' events count: {}".format(df.count()))
df.printSchema()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

'NextSong' events count: 6820
root
 |-- artist: string (nullable = true)
 |-- auth: string (nullable = true)
 |-- firstName: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- itemInSession: long (nullable = true)
 |-- lastName: string (nullable = true)
 |-- length: double (nullable = true)
 |-- level: string (nullable = true)
 |-- location: string (nullable = true)
 |-- method: string (nullable = true)
 |-- page: string (nullable = true)
 |-- registration: double (nullable = true)
 |-- sessionId: long (nullable = true)
 |-- song: string (nullable = true)
 |-- status: long (nullable = true)
 |-- ts: long (nullable = true)
 |-- userAgent: string (nullable = true)
 |-- userId: string (nullable = true)

Users Table

In [12]:
users_cols = ['userId as user_id', 'firstName as first_name', 'lastName as last_name', 'gender', 'level']
users_table = df.selectExpr(users_cols).dropDuplicates(['user_id'])
print('Users count: {}'.format(users_table.count()))
users_table.show(5)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Users count: 96
+-------+----------+---------+------+-----+
|user_id|first_name|last_name|gender|level|
+-------+----------+---------+------+-----+
|     51|      Maia|    Burke|     F| free|
|      7|    Adelyn|   Jordan|     F| free|
|     15|      Lily|     Koch|     F| paid|
|     54|     Kaleb|     Cook|     M| free|
|    101|    Jayden|      Fox|     M| free|
+-------+----------+---------+------+-----+
only showing top 5 rows

In [13]:
users_table.write.parquet(output_data + 'users/') #.limit(5)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Time Table, with files partitioned by year and month.

In [15]:
to_timestamp = udf(lambda x : dt.utcfromtimestamp(x / 1e3), TimestampType())
df = df.withColumn('start_time', to_timestamp('ts'))

time_table = df.select('start_time').dropDuplicates() \
    .withColumn('hour', hour('start_time')) \
    .withColumn('day',  dayofmonth('start_time')) \
    .withColumn('week', weekofyear('start_time')) \
    .withColumn('month', month('start_time')) \
    .withColumn('year', year('start_time')) \
    .withColumn('weekday', date_format('start_time', 'E')) \
    .repartition('year', 'month')
print('Timestamps count: {}'.format(time_table.count()))
time_table.show(5)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Timestamps count: 6813
+--------------------+----+---+----+-----+----+-------+
|          start_time|hour|day|week|month|year|weekday|
+--------------------+----+---+----+-----+----+-------+
|2018-11-05 12:15:...|  12|  5|  45|   11|2018|    Mon|
|2018-11-05 13:47:...|  13|  5|  45|   11|2018|    Mon|
|2018-11-05 14:52:...|  14|  5|  45|   11|2018|    Mon|
|2018-11-05 18:07:...|  18|  5|  45|   11|2018|    Mon|
|2018-11-05 18:18:...|  18|  5|  45|   11|2018|    Mon|
+--------------------+----+---+----+-----+----+-------+
only showing top 5 rows

In [16]:
time_table.limit(5).write.partitionBy('year', 'month').parquet(output_data + 'time/') #.limit(5)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Songplays Table, with files partitioned by year and month.

In [17]:
# select specific columns to skip ambiguous ones
df = df.join(songs_table.select('song_id', 'title'), (df.song == songs_table.title)) \
       .join(artists_table.select('artist_id', 'name'), (df.artist == artists_table.name))

df.printSchema()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

root
 |-- artist: string (nullable = true)
 |-- auth: string (nullable = true)
 |-- firstName: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- itemInSession: long (nullable = true)
 |-- lastName: string (nullable = true)
 |-- length: double (nullable = true)
 |-- level: string (nullable = true)
 |-- location: string (nullable = true)
 |-- method: string (nullable = true)
 |-- page: string (nullable = true)
 |-- registration: double (nullable = true)
 |-- sessionId: long (nullable = true)
 |-- song: string (nullable = true)
 |-- status: long (nullable = true)
 |-- ts: long (nullable = true)
 |-- userAgent: string (nullable = true)
 |-- userId: string (nullable = true)
 |-- start_time: timestamp (nullable = true)
 |-- song_id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- artist_id: string (nullable = true)
 |-- name: string (nullable = true)

In [18]:
# songplay_id will be an autoincrementing column
# year and month will be added based on start_time
songplays_cols = ['start_time',
                  'userId as user_id',
                  'level',
                  'song_id',
                  'artist_id',
                  'sessionId as session_id',
                  'location',
                  'userAgent as user_agent']

songplays_table = df.selectExpr(songplays_cols).dropDuplicates(['start_time', 'user_id', 'session_id']) \
                    .withColumn('songplay_id', monotonically_increasing_id()) \
                    .withColumn('month', month('start_time')) \
                    .withColumn('year', year('start_time')) \
                    .repartition('year', 'month')

print('Songplays count: {}'.format(songplays_table.count()))
songplays_table.show(5)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Songplays count: 638
+--------------------+-------+-----+------------------+------------------+----------+--------------------+--------------------+-----------+-----+----+
|          start_time|user_id|level|           song_id|         artist_id|session_id|            location|          user_agent|songplay_id|month|year|
+--------------------+-------+-----+------------------+------------------+----------+--------------------+--------------------+-----------+-----+----+
|2018-11-12 23:13:...|     80| paid|SOSQIHH12A8C13370B|ARH6W4X1187B99274F|       481|Portland-South Po...|"Mozilla/5.0 (Mac...| 8589934592|   11|2018|
|2018-11-14 12:18:...|     29| paid|SOFDYYA12A58A78E73|AR8W8P31187B9A4063|       559|Atlanta-Sandy Spr...|"Mozilla/5.0 (Mac...| 8589934593|   11|2018|
|2018-11-18 19:24:...|     29| paid|SOCOWCL12A8C1415F7|AR8K3HD1187B9B9CA9|       589|Atlanta-Sandy Spr...|"Mozilla/5.0 (Mac...| 8589934594|   11|2018|
|2018-11-23 16:13:...|     58| paid|SOJJYDE12AF729FC16|ARG72Q21187FB36243

In [19]:
songplays_table.write.partitionBy('year', 'month').parquet(output_data + 'songplays/') #.limit(5)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…