In [1]:
import configparser
import os
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, col, count, monotonically_increasing_id
from pyspark.sql.functions import hour, dayofmonth, weekofyear, month, year, date_format
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, IntegerType, DateType, TimestampType
from datetime import datetime as dt

VBox()

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
2,application_1612034773106_0003,pyspark,idle,Link,Link,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

### Populate the staging tables
The log files are partitioned by year and month.

In [2]:
log_data = 's3a://udacity-dend/log-data/*/*/*.json'
staging_events = spark.read.json(log_data)
staging_events.cache()
print('Events records: {}'.format(staging_events.count()))
staging_events.printSchema()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Events records: 8056
root
 |-- artist: string (nullable = true)
 |-- auth: string (nullable = true)
 |-- firstName: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- itemInSession: long (nullable = true)
 |-- lastName: string (nullable = true)
 |-- length: double (nullable = true)
 |-- level: string (nullable = true)
 |-- location: string (nullable = true)
 |-- method: string (nullable = true)
 |-- page: string (nullable = true)
 |-- registration: double (nullable = true)
 |-- sessionId: long (nullable = true)
 |-- song: string (nullable = true)
 |-- status: long (nullable = true)
 |-- ts: long (nullable = true)
 |-- userAgent: string (nullable = true)
 |-- userId: string (nullable = true)

The songs files are partitioned by the first three letters of each song's track ID.

In [3]:
song_data = 's3a://udacity-dend/song_data/*/*/*/*.json'

# define schema for the song data
song_schema = StructType([
    StructField('song_id', StringType()),
    StructField('artist_id', StringType()),
    StructField('artist_name', StringType()),
    StructField('artist_location', StringType()),
    StructField('artist_latitude', DoubleType()),
    StructField('artist_longitude', DoubleType()),
    StructField('duration', DoubleType()),
    StructField('num_songs', IntegerType()),
    StructField('title', StringType()),
    StructField('year', IntegerType())
])

staging_songs = spark.read.json(song_data, schema = song_schema)
staging_songs.cache()
print('Songs records: {}'.format(staging_songs.count()))
staging_songs.show(5)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Songs records: 14896
+------------------+------------------+--------------------+--------------------+---------------+----------------+---------+---------+--------------------+----+
|           song_id|         artist_id|         artist_name|     artist_location|artist_latitude|artist_longitude| duration|num_songs|               title|year|
+------------------+------------------+--------------------+--------------------+---------------+----------------+---------+---------+--------------------+----+
|SOVIYJY12AF72A4B00|AR4T2IF1187B9ADBB7|          Billy Idol|<a href="http://b...|       63.96027|        10.22442|233.22077|        1|The Dead Next Doo...|1983|
|SOVYXYL12AF72A3373|AR4T2IF1187B9ADBB7|          Billy Idol|<a href="http://b...|       63.96027|        10.22442|287.92118|        1|Rebel Yell (1999 ...|1983|
|SOEPTVC12A67ADD0DA|ARQ846I1187B9A7083|Yvonne S. Moriart...|                    |           null|            null|196.04853|        1|To Zucchabar ["Gl...|   0|
|SOLQYSZ12AB0

### Investigate the staging tables
How many fact records can we potentially have in table `songplays`?

In [4]:
songplays = staging_events.where('''
    page = 'NextSong'
    AND ts is NOT NULL
    AND userId is NOT NULL
    AND level  is NOT NULL
''').dropDuplicates(['ts', 'userID'])
songplays.cache()
songplays.count()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

6820

Are there `songplays` for which no song can be matched from `staging_songs` by the artist name and song title?

In [5]:
songplays.join(staging_songs,
               (songplays.song == staging_songs.title) &
               (songplays.artist == staging_songs.artist_name),
               how = 'left_anti' # https://stackoverflow.com/a/54554055
              ).count()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

6487

How many potential records in `time` dimension? The number of actual records will not exceed the number of facts in `songplays`.

In [6]:
songplays.dropDuplicates(['ts']).count()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

6813

So are there multiple `NextSong` events at the same time?

In [7]:
songplays.groupBy(songplays.ts) \
         .agg(count(songplays.userId).alias('users')) \
         .where(col('users') > 1) \
         .show()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-------------+-----+
|           ts|users|
+-------------+-----+
|1542308104796|    2|
|1543339730796|    2|
|1542171216796|    2|
|1543435163796|    2|
|1543069787796|    2|
|1542984111796|    2|
|1543422975796|    2|
+-------------+-----+

How many users are there in `users` dimension?

In [8]:
songplays.dropDuplicates(['userId', 'firstName', 'lastName', 'gender']).count()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

96

For how many users at least one song can be matched from `staging_songs` by the artist name and song title??

In [9]:
songplays.dropDuplicates(['userId', 'firstName', 'lastName', 'gender']) \
         .join(staging_songs,
               (songplays.song == staging_songs.title) &
               (songplays.artist == staging_songs.artist_name),
               how = 'left_semi' # https://stackoverflow.com/a/54554055
              ) \
         .select(['userId', 'firstName', 'lastName', 'gender']).show()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+------+---------+--------+------+
|userId|firstName|lastName|gender|
+------+---------+--------+------+
|    78|    Chloe|    Roth|     F|
|    33|  Bronson|  Harris|     M|
|    94|     Noah|  Chavez|     M|
|    40|   Tucker|Garrison|     M|
|    54|    Kaleb|    Cook|     M|
|    23|   Morris| Gilmore|     M|
+------+---------+--------+------+

How many songs are there in `songs` dimension?

In [10]:
staging_songs.dropDuplicates(['song_id']).where("song_id is NOT NULL").count()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

14896

How many of these songs will be mentioned in `songplays`?

In [11]:
staging_songs.join(songplays,
                   (songplays.song == staging_songs.title) &
                   (songplays.artist == staging_songs.artist_name),
                   how = 'left_semi' # https://stackoverflow.com/a/54554055
                  ) \
             .dropDuplicates(['song_id']) \
             .count()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

217

How many artists are there in `artists` dimension?

In [12]:
staging_songs.where("artist_id is NOT NULL").dropDuplicates(['artist_id', 'artist_name']).count()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

9993

Are there artists with different names under the same id?

In [13]:
artist_multinames = staging_songs \
    .dropDuplicates(['artist_id', 'artist_name']) \
    .groupBy('artist_id') \
    .agg(count('artist_name').alias('names')) \
    .where(col('names') > 1) \
    .sort(col('names').desc())
print('Artists with different names under the same id: {}'.format(artist_multinames.count()))
artist_multinames.show(10)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Artists with different names under the same id: 396
+------------------+-----+
|         artist_id|names|
+------------------+-----+
|ARMD3XX1187B9ACF84|    4|
|ARV481W1187FB38CD9|    4|
|ARTE9CG1187B99B1AF|    4|
|ARZ5H0P1187B98A1DD|    4|
|ARCBD0U1187FB466EF|    4|
|ARS26BQ1187B99466D|    4|
|AR5LTOU1187B98CAD9|    3|
|ARSWHKY1187B9B40B2|    3|
|AR3THYK1187B999F1F|    3|
|AR1OGXT1187B9893EB|    3|
+------------------+-----+
only showing top 10 rows

What do artists with multi names look like?

In [14]:
staging_songs.dropDuplicates(['artist_id', 'artist_name']) \
    .join(artist_multinames.select('artist_id', 'names'), on = 'artist_id', how = 'left_semi') \
    .select('artist_id', 'artist_name') \
    .sort('artist_id') \
    .show(truncate = False)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+------------------+----------------------------------------+
|artist_id         |artist_name                             |
+------------------+----------------------------------------+
|AR03BDP1187FB5B324|Britney Spears feat. Pharrell Williams  |
|AR03BDP1187FB5B324|Britney Spears                          |
|AR040M31187B98CA41|The Bug Featuring Spaceape              |
|AR040M31187B98CA41|The Bug Featuring Ricky Ranking         |
|AR04S8J1187FB48358|Clifford Brown                          |
|AR04S8J1187FB48358|Clifford Brown / Max Roach Quintet      |
|AR065TW1187FB4C3A5|Tricky                                  |
|AR065TW1187FB4C3A5|Nearly God                              |
|AR065TW1187FB4C3A5|Tricky / The Mad Dog Reflex             |
|AR07SOR1187FB46179|The Goo Goo Dolls                       |
|AR07SOR1187FB46179|Goo Goo Dolls                           |
|AR0CANF1187B9AF35F|Dilated Peoples Featuring Devin The Dude|
|AR0CANF1187B9AF35F|Dilated Peoples                         |
|AR0CANF

How many artists will be mentioned in `songplays`?

In [15]:
staging_songs.join(songplays,
                   (songplays.song == staging_songs.title) &
                   (songplays.artist == staging_songs.artist_name),
                   how = 'left_semi' # https://stackoverflow.com/a/54554055
                  ) \
             .dropDuplicates(['artist_id']) \
             .count()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

200