In [8]:
import os

from pyspark.sql.functions import to_timestamp

# Base S3 locations: where the raw JSON is read from and where results go.
INPUT_DATA = 's3a://udacity-dend/'
OUTPUT_DATA = 's3://dend-results/'

# Glob patterns for the raw JSON files under INPUT_DATA: song files sit
# three directory levels below song_data/, log files two below log-data/.
SONG_DATA_PATTERN = f'{INPUT_DATA}song_data/*/*/*/*.json'
LOG_DATA_PATTERN = f'{INPUT_DATA}log-data/*/*/*.json'

# Lower-case aliases of the two base locations.
input_data, output_data = INPUT_DATA, OUTPUT_DATA

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [2]:
# Names under which the song and log DataFrames are registered as Spark
# temp views; the SQL query strings interpolate these names.
song_temp_view, log_temp_view = 'songs', 'logs'

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [3]:
# SQL for the artists table: distinct combinations of the five artist
# columns from the song view, skipping rows whose artist_id is NULL.
# NOTE(review): DISTINCT applies to the whole row, so an artist_id whose
# name/location/coordinates differ between songs produces several rows —
# confirm whether exactly one row per artist_id is required.
select_artists = f"""
    SELECT DISTINCT artist_id,
        artist_name,
        artist_location,
        artist_latitude,
        artist_longitude
    FROM {song_temp_view}
    WHERE artist_id IS NOT NULL
    """

# SQL for the songs table: distinct (song_id, title, artist_id, year,
# duration) rows from the song view, skipping rows whose song_id is NULL.
# The result is later written partitioned by year and artist_id.
select_songs = f"""
    SELECT DISTINCT song_id,
        title,
        artist_id,
        year,
        duration
    FROM {song_temp_view}
    WHERE song_id IS NOT NULL
"""

# SQL for the songplays fact table: every row of the log view, enriched with
# song_id / artist_id when a song with the same title, artist name and
# duration exists in the song view (LEFT OUTER JOIN, so unmatched plays are
# kept with NULL song_id / artist_id).
#
# Both views are given explicit aliases (l, s) so every column reference is
# qualified and the query keeps working if the temp view names are changed.
# month and year are derived from start_time and are used as partition
# columns when the table is written.
select_songplays = f"""
    SELECT l.start_time,
        l.userId AS user_id,
        l.level,
        s.song_id,
        s.artist_id,
        l.sessionId AS session_id,
        l.location,
        l.userAgent AS user_agent,
        EXTRACT(
            month
            FROM l.start_time
        ) AS month,
        EXTRACT(
            year
            FROM l.start_time
        ) AS year
    FROM {log_temp_view} AS l
    LEFT OUTER JOIN {song_temp_view} AS s
        ON l.song = s.title
            AND l.artist = s.artist_name
            AND l.length = s.duration
"""

# SQL for the time table: one row per distinct non-NULL start_time found in
# the log view, broken down into hour, day, week, month, year and dayofweek
# via EXTRACT. year and month are later used as partition columns.
select_time = f"""
    SELECT DISTINCT start_time,
        EXTRACT(
            hour
            FROM start_time
        ) AS hour,
        EXTRACT(
            day
            FROM start_time
        ) AS day,
        EXTRACT(
            week
            FROM start_time
        ) AS week,
        EXTRACT(
            month
            FROM start_time
        ) AS month,
        EXTRACT(
            year
            FROM start_time
        ) AS year,
        EXTRACT(
            dayofweek
            FROM start_time
        ) AS dayofweek
    FROM {log_temp_view}
    WHERE start_time IS NOT NULL
"""

# SQL for the users table: for each userId, the user attributes taken from
# that user's most recent log row (latest start_time), so `level` reflects
# the latest recorded value.
#
# The CTE reads from the interpolated log view name (not a hard-coded table)
# and the outer query uses explicit aliases, so the statement stays valid if
# the temp view is renamed. DISTINCT collapses identical rows that arise
# when several events share a user's latest start_time.
select_users = f"""
    WITH last_song_play_times AS (
        SELECT userId, MAX(start_time) AS last_song_play
        FROM {log_temp_view}
        GROUP BY userId
    )
    SELECT DISTINCT l.userId,
        l.firstName,
        l.lastName,
        l.gender,
        l.level
    FROM {log_temp_view} AS l
    INNER JOIN last_song_play_times AS t
        ON l.userId = t.userId
        AND l.start_time = t.last_song_play
"""

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [13]:
# Read every song JSON file matched by the glob into one DataFrame, then
# register it as a temp view so the SQL queries can refer to it by name.
song_df = spark.read.json(path=SONG_DATA_PATTERN)
song_df.createOrReplaceTempView(name=song_temp_view)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [14]:
# Build the songs table from the song view and persist it as parquet,
# partitioned by year and artist_id.
songs_table = spark.sql(select_songs)
(
    songs_table.write
    # Explicit overwrite: the default save mode raises if the target path
    # already exists, which makes the cell fail on every re-run.
    .mode('overwrite')
    .partitionBy('year', 'artist_id')
    .parquet(os.path.join(output_data, 'songs.parquet'))
)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [15]:
# Build the artists table from the song view and persist it as parquet.
artists_table = spark.sql(select_artists)
(
    artists_table.write
    # Explicit overwrite: the default save mode raises if the target path
    # already exists, which makes the cell fail on every re-run.
    .mode('overwrite')
    .parquet(os.path.join(output_data, 'artists.parquet'))
)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [5]:
# Load the raw event logs and keep only the rows that represent song plays.
df = spark.read.json(LOG_DATA_PATTERN).filter("page == 'NextSong'")
# Derive start_time from ts; ts is divided by 1000 before conversion
# (presumably epoch milliseconds -> seconds; confirm against the log schema).
df = df.withColumn('start_time', to_timestamp(df['ts'] / 1000))
# Expose the prepared logs to the SQL queries under the log view name.
df.createOrReplaceTempView(log_temp_view)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [9]:
# Build the users table from the log view and persist it as parquet.
users_table = spark.sql(select_users)
(
    users_table.write
    # Explicit overwrite: the default save mode raises if the target path
    # already exists, which makes the cell fail on every re-run.
    .mode('overwrite')
    .parquet(os.path.join(output_data, 'users.parquet'))
)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [12]:
# Build the time table from the log view and persist it as parquet,
# partitioned by year and month.
time_table = spark.sql(select_time)
(
    time_table.write
    # Explicit overwrite: the default save mode raises if the target path
    # already exists, which makes the cell fail on every re-run.
    .mode('overwrite')
    .partitionBy('year', 'month')
    .parquet(os.path.join(output_data, 'time.parquet'))
)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [16]:
# Build the songplays fact table (logs joined to songs) and persist it as
# parquet, partitioned by year and month.
songplays_table = spark.sql(select_songplays)
(
    songplays_table.write
    # Explicit overwrite: the default save mode raises if the target path
    # already exists, which makes the cell fail on every re-run.
    .mode('overwrite')
    .partitionBy('year', 'month')
    .parquet(os.path.join(output_data, 'songplays.parquet'))
)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…