In [1]:
import importlib
import configparser
from pyspark.sql.functions import udf, monotonically_increasing_id
from pyspark.sql.functions import hour, dayofmonth, weekofyear, month, year, date_format
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, IntegerType, DateType, TimestampType
from datetime import datetime as dt

VBox()

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
0,application_1612863591590_0001,pyspark,idle,Link,Link,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

#### Pre-processing
Read the config file from S3, remove any previous output data from S3

In [None]:
# No module named 'boto3'?
if importlib.util.find_spec('boto3') is None:
    sc.install_pypi_package('boto3')

In [3]:
import boto3

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [4]:
bucket = boto3.resource('s3').Bucket('adzugaiev-sparkify')
dl_cfg = bucket.Object('input/dl.cfg').get()

cfg = configparser.ConfigParser() #https://stackoverflow.com/a/60232146
cfg.read_string(dl_cfg['Body'].read().decode())
data_song = cfg.get('DATA', 'data_song')
data_log = cfg.get('DATA', 'data_log')
data_output = cfg.get('DATA', 'data_output')
print(data_song, data_log, data_output)

key = {
    'songs'    : cfg.get('KEY', 'key_songs'),
    'artists'  : cfg.get('KEY', 'key_artists'),
    'users'    : cfg.get('KEY', 'key_users'),
    'time'     : cfg.get('KEY', 'key_time'),
    'songplays': cfg.get('KEY', 'key_songplays')
}

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

s3a://udacity-dend/song_data/*/*/*/*.json s3a://udacity-dend/log-data/*/*/*.json s3a://adzugaiev-sparkify/

Remove any previous output data from S3

In [5]:
obj_removed = 0
for obj in list(bucket.objects.filter(Prefix = key['songs'])) \
         + list(bucket.objects.filter(Prefix = key['artists'])) \
         + list(bucket.objects.filter(Prefix = key['users'])) \
         + list(bucket.objects.filter(Prefix = key['time'])) \
         + list(bucket.objects.filter(Prefix = key['songplays'])):
    _ = obj.delete()
    obj_removed += 1
print(f'Removed {obj_removed} files.')

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Removed 0 files.

#### Process Song Data
The files are partitioned by the first three letters of each song's track ID.

In [6]:
# define schema for song data
song_schema = StructType([
    StructField('song_id', StringType()),
    StructField('artist_id', StringType()),
    StructField('artist_name', StringType()),
    StructField('artist_location', StringType()),
    StructField('artist_latitude', DoubleType()),
    StructField('artist_longitude', DoubleType()),
    StructField('duration', DoubleType()),
    StructField('num_songs', IntegerType()),
    StructField('title', StringType()),
    StructField('year', IntegerType())
])

df = spark.read.json(data_song, schema = song_schema)
df.cache()
print('Songs records: {}'.format(df.count()))
df.select(['song_id', 'title', 'artist_id', 'year', 'duration']).show(5)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Songs records: 14896
+------------------+--------------------+------------------+----+---------+
|           song_id|               title|         artist_id|year| duration|
+------------------+--------------------+------------------+----+---------+
|SOVIYJY12AF72A4B00|The Dead Next Doo...|AR4T2IF1187B9ADBB7|1983|233.22077|
|SOVYXYL12AF72A3373|Rebel Yell (1999 ...|AR4T2IF1187B9ADBB7|1983|287.92118|
|SOEPTVC12A67ADD0DA|To Zucchabar ["Gl...|ARQ846I1187B9A7083|   0|196.04853|
|SOLQYSZ12AB0181F97|    Mony Mony (Live)|AR4T2IF1187B9ADBB7|1987|247.53587|
|SOVPFJK12A6701CB16|Barcelona - (Frie...|AR3TZ691187FB3DBB1|2000|273.44934|
+------------------+--------------------+------------------+----+---------+
only showing top 5 rows

Songs Table, with files partitioned by year and then artist.

In [7]:
songs_cols = ['song_id', 'title', 'artist_id', 'year', 'duration']
songs_table = df.select(songs_cols).dropDuplicates(['song_id']).repartition('year', 'artist_id')
print('Songs count: {}'.format(songs_table.count()))
songs_table.show(5)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Songs count: 14896
+------------------+--------------------+------------------+----+---------+
|           song_id|               title|         artist_id|year| duration|
+------------------+--------------------+------------------+----+---------+
|SOPNMRX12A6D4F9989|         Biggest Fan|ARRFHHE1187B98FE75|   0|238.52363|
|SOOYQEG12AC468F2FC|Yes Love_ My Soul...|ARNCTJ91187B98D813|   0| 75.62404|
|SOUNJPE12A58A78393|        Just As I Am|ARTL0JQ1187FB4D190|1985|283.16689|
|SOOHETF12A8C140411|    No Words No More|AROSPEU1187FB51179|2009|245.86404|
|SOVPOBV12AAF3B48FA|    Fallin Backwards|ARMDRLR1187FB3B0EA|   0|503.58812|
+------------------+--------------------+------------------+----+---------+
only showing top 5 rows

In [8]:
songs_table.limit(5).write.partitionBy('year', 'artist_id').parquet(data_output + key['songs']) #.limit(5)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Artists Table

In [9]:
artists_cols = ['artist_id',
                'artist_name as name',
                'artist_location as location',
                'artist_latitude as latitude',
                'artist_longitude as longitude']

# using selectExpr() because of 'as' column aliases present
artists_table = df.selectExpr(artists_cols).dropDuplicates(['artist_id', 'name'])
print('Artists count: {}'.format(artists_table.count()))
artists_table.show(5)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Artists count: 9993
+------------------+--------------------+--------+--------+---------+
|         artist_id|                name|location|latitude|longitude|
+------------------+--------------------+--------+--------+---------+
|AR1OGXT1187B9893EB|Lester Flatt / Ea...|        |    null|     null|
|AR2TN021187B998B29|            Declaime|        |    null|     null|
|AR2UXTA1187B98B0AF|   Theoretical Girls|        |    null|     null|
|AR43RZ01187FB5128D|         Lord Nelson|        |    null|     null|
|AR58CZ21187B9AF528|  WESTERNHAGEN (HCL)|        |    null|     null|
+------------------+--------------------+--------+--------+---------+
only showing top 5 rows

In [10]:
artists_table.limit(5).write.parquet(data_output + key['artists']) #.limit(5)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

#### Process Log Data
The log files in the dataset you'll be working with are partitioned by year and month.

In [11]:
to_timestamp = udf(lambda x : dt.utcfromtimestamp(x / 1e3), TimestampType())
df = spark.read.json(data_log).where("page = 'NextSong'").withColumn('start_time', to_timestamp('ts'))
df.cache()
print("'NextSong' events count: {}".format(df.count()))
df.printSchema()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

'NextSong' events count: 6820
root
 |-- artist: string (nullable = true)
 |-- auth: string (nullable = true)
 |-- firstName: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- itemInSession: long (nullable = true)
 |-- lastName: string (nullable = true)
 |-- length: double (nullable = true)
 |-- level: string (nullable = true)
 |-- location: string (nullable = true)
 |-- method: string (nullable = true)
 |-- page: string (nullable = true)
 |-- registration: double (nullable = true)
 |-- sessionId: long (nullable = true)
 |-- song: string (nullable = true)
 |-- status: long (nullable = true)
 |-- ts: long (nullable = true)
 |-- userAgent: string (nullable = true)
 |-- userId: string (nullable = true)
 |-- start_time: timestamp (nullable = true)

Users Table

In [12]:
users_cols = ['userId as user_id', 'firstName as first_name', 'lastName as last_name', 'gender', 'level']
users_table = df.selectExpr(users_cols).dropDuplicates(['user_id'])
print('Users count: {}'.format(users_table.count()))
users_table.show(5)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Users count: 96
+-------+----------+---------+------+-----+
|user_id|first_name|last_name|gender|level|
+-------+----------+---------+------+-----+
|     51|      Maia|    Burke|     F| free|
|      7|    Adelyn|   Jordan|     F| free|
|     15|      Lily|     Koch|     F| paid|
|     54|     Kaleb|     Cook|     M| free|
|    101|    Jayden|      Fox|     M| free|
+-------+----------+---------+------+-----+
only showing top 5 rows

In [13]:
users_table.limit(5).write.parquet(data_output + key['users']) #.limit(5)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Time Table, with files partitioned by year and month.

In [14]:
time_table = df.select('start_time').dropDuplicates() \
    .withColumn('hour', hour('start_time')) \
    .withColumn('day',  dayofmonth('start_time')) \
    .withColumn('week', weekofyear('start_time')) \
    .withColumn('month', month('start_time')) \
    .withColumn('year', year('start_time')) \
    .withColumn('weekday', date_format('start_time', 'E')) \
    .repartition('year', 'month')
print('Timestamps count: {}'.format(time_table.count()))
time_table.show(5)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Timestamps count: 6813
+--------------------+----+---+----+-----+----+-------+
|          start_time|hour|day|week|month|year|weekday|
+--------------------+----+---+----+-----+----+-------+
|2018-11-30 05:43:...|   5| 30|  48|   11|2018|    Fri|
|2018-11-30 17:53:...|  17| 30|  48|   11|2018|    Fri|
|2018-11-16 14:07:...|  14| 16|  46|   11|2018|    Fri|
|2018-11-20 02:23:...|   2| 20|  47|   11|2018|    Tue|
|2018-11-20 21:51:...|  21| 20|  47|   11|2018|    Tue|
+--------------------+----+---+----+-----+----+-------+
only showing top 5 rows

In [15]:
time_table.limit(5).write.partitionBy('year', 'month').parquet(data_output + key['time']) #.limit(5)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Songplays Table, with files partitioned by year and month.

In [16]:
# select specific columns to skip ambiguous ones
df = df.join(songs_table.select('song_id', 'title'), (df.song == songs_table.title)) \
       .join(artists_table.select('artist_id', 'name'), (df.artist == artists_table.name))

df.printSchema()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

root
 |-- artist: string (nullable = true)
 |-- auth: string (nullable = true)
 |-- firstName: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- itemInSession: long (nullable = true)
 |-- lastName: string (nullable = true)
 |-- length: double (nullable = true)
 |-- level: string (nullable = true)
 |-- location: string (nullable = true)
 |-- method: string (nullable = true)
 |-- page: string (nullable = true)
 |-- registration: double (nullable = true)
 |-- sessionId: long (nullable = true)
 |-- song: string (nullable = true)
 |-- status: long (nullable = true)
 |-- ts: long (nullable = true)
 |-- userAgent: string (nullable = true)
 |-- userId: string (nullable = true)
 |-- start_time: timestamp (nullable = true)
 |-- song_id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- artist_id: string (nullable = true)
 |-- name: string (nullable = true)

In [17]:
# songplay_id will be an autoincrementing column
# year and month will be added based on start_time
songplays_cols = ['start_time',
                  'userId as user_id',
                  'level',
                  'song_id',
                  'artist_id',
                  'sessionId as session_id',
                  'location',
                  'userAgent as user_agent']

songplays_table = df.selectExpr(songplays_cols).dropDuplicates(['start_time', 'user_id', 'session_id']) \
                    .withColumn('songplay_id', monotonically_increasing_id()) \
                    .withColumn('month', month('start_time')) \
                    .withColumn('year', year('start_time')) \
                    .repartition('year', 'month')

print('Songplays count: {}'.format(songplays_table.count()))
songplays_table.select(['start_time', 'user_id', 'song_id', 'session_id', 'location', 'user_agent']).show(5)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Songplays count: 638
+--------------------+-------+------------------+----------+--------------------+--------------------+
|          start_time|user_id|           song_id|session_id|            location|          user_agent|
+--------------------+-------+------------------+----------+--------------------+--------------------+
|2018-11-05 05:57:...|     57|SOCGOZK12A8151BD5D|        56|San Antonio-New B...|"Mozilla/5.0 (Mac...|
|2018-11-20 06:29:...|     15|SOKGKRW12A8C1451D0|       716|Chicago-Napervill...|"Mozilla/5.0 (X11...|
|2018-11-16 16:27:...|     90|SOMUJKC12AB01865AD|       148|Pensacola-Ferry P...|Mozilla/5.0 (X11;...|
|2018-11-28 16:51:...|     14|SOIBFHF12AAF3B5237|       929|       Red Bluff, CA|Mozilla/5.0 (Wind...|
|2018-11-05 14:39:...|     91|SOODHLO12AF72A1980|        90|Dallas-Fort Worth...|Mozilla/5.0 (comp...|
+--------------------+-------+------------------+----------+--------------------+--------------------+
only showing top 5 rows

In [18]:
songplays_table.limit(5).write.partitionBy('year', 'month').parquet(data_output + key['songplays']) #.limit(5)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…