#### Importing the Python Libraries

In [1]:
import configparser
import os
import glob
import zipfile
import pyspark.sql.functions as F
from pyspark.sql.functions import udf, col, to_timestamp
from pyspark.sql.functions import year, month, dayofmonth, hour, weekofyear, date_format

from datetime import datetime
from pyspark.sql import SparkSession
from pyspark.sql.window import Window

#### Defining the function for creating Spark Session

In [2]:
def create_spark_session():
    spark = SparkSession \
        .builder \
        .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:2.7.0") \
        .getOrCreate()
    return spark

spark = create_spark_session()

#### Printing the current working directory so that data files can be extracted there

In [3]:
cwd = os.getcwd()
print("Current working directory:", cwd)

Current working directory: /workspace/home


#### Defining the variables for the log data and song data zip file

In [4]:
# Defining the variables files1 and files2 for log data and song data respectively
files1 = glob.glob('/workspace/home/data/log-data.zip')
files2 = glob.glob('/workspace/home/data/song-data.zip')

#Printing full directory path for log data
print("Log Data------>",files1)

#Printing full directory path for song data
print("Song Data----->",files2)

Log Data------> ['/workspace/home/data/log-data.zip']
Song Data-----> ['/workspace/home/data/song-data.zip']


#### Extracting the Log Data Zip file

In [5]:
# Extracting the Log Data Zip file
for file in files1:
    print('Unzipping Log Data Zip file:',file)

    with zipfile.ZipFile(file, 'r') as zip_ref:
        zip_ref.extractall('/workspace/home/data/extracted_data_folder/log-data/')

Unzipping Log Data Zip file: /workspace/home/data/log-data.zip


#### Extracting the Song Data Zip file

In [6]:
for file in files2:
    print('Unzipping Song Data Zip file:',file)

    with zipfile.ZipFile(file, 'r') as zip_ref:
        zip_ref.extractall('/workspace/home/data/extracted_data_folder/song-data/')

Unzipping Song Data Zip file: /workspace/home/data/song-data.zip


#### Defining the variables for the input & output data directory

In [7]:
input_data = "data/extracted_data_folder/"
output_data = "output/"

#### Setting the song data variable

In [8]:
# With only partial dataset as suggested in the Knowledge support ticket
song_data = input_data + "/song-data/song_data/A/A/*/"

#### Reading the JSON file into the Spark Data Frame

In [9]:
# Reading the JSON file into the Data Frame
song_data_df = spark.read.json(song_data)

# Total count of records in Song Data Frame
print("Total count of records in Song Data Frame----->",song_data_df.count())

Total count of records in Song Data Frame-----> 36


#### Creating Songs Data Table with smallert set of the columns from above data frame

In [10]:

songs_data_table = song_data_df.select(["song_id", "title", "artist_id", "year", "duration"]).distinct()


#### Displaying the sample records from the Songs Data Table

In [11]:
songs_data_table.show(15, True)

+------------------+--------------------+------------------+----+---------+
|           song_id|               title|         artist_id|year| duration|
+------------------+--------------------+------------------+----+---------+
|SOGOSOV12AF72A285E|   ¿Dónde va Chichi?|ARGUVEV1187B98BA17|1997|313.12934|
|SOBBUGU12A8C13E95D|Setting Fire to S...|ARMAC4T1187FB3FA4C|2004|207.77751|
|SOIAZJW12AB01853F1|          Pink World|AR8ZCNI1187B9A069B|1984|269.81832|
|SONYPOM12A8C13B2D7|I Think My Wife I...|ARDNS031187B9924F0|2005|186.48771|
|SOYMRWW12A6D4FAB14|The Moon And I (O...|ARKFYS91187B98E58F|   0| 267.7024|
|SOWTBJW12AC468AC6E|Broken-Down Merry...|ARQGYP71187FB44566|   0|151.84934|
|SOQHXMF12AB0182363|     Young Boy Blues|ARGSJW91187B9B1D6B|   0|218.77506|
|SOZVMJI12AB01808AF|     Synthetic Dream|ARNPAGP1241B9C7FD4|   0|165.69424|
|SOHKNRJ12A6701D1F8|        Drop of Rain|AR10USD1187B99F3F1|   0|189.57016|
|SOMJBYD12A6D4F8557|Keepin It Real (S...|ARD0S291187B9B7BF5|   0|114.78159|
|SOLLHMX12AB

#### Creating the Spark Data Frame for the Log Data

In [12]:
log_data = input_data + "/log-data/"
log_data_df = spark.read.json(log_data)
log_data_df.show(5, True)

+-----------+---------+---------+------+-------------+--------+---------+-----+--------------------+------+--------+-----------------+---------+---------------+------+-------------+--------------------+------+
|     artist|     auth|firstName|gender|itemInSession|lastName|   length|level|            location|method|    page|     registration|sessionId|           song|status|           ts|           userAgent|userId|
+-----------+---------+---------+------+-------------+--------+---------+-----+--------------------+------+--------+-----------------+---------+---------------+------+-------------+--------------------+------+
|   Harmonia|Logged In|     Ryan|     M|            0|   Smith|655.77751| free|San Jose-Sunnyval...|   PUT|NextSong|1.541016707796E12|      583|  Sehr kosmisch|   200|1542241826796|"Mozilla/5.0 (X11...|    26|
|The Prodigy|Logged In|     Ryan|     M|            1|   Smith|260.07465| free|San Jose-Sunnyval...|   PUT|NextSong|1.541016707796E12|      583|The Big Gundown|

In [13]:
# Getting filepath to song data file
song_data = input_data + "/song-data/song_data/A/A/*/"

# Reading song data file
df = spark.read.json(song_data)

# Extracting columns to create songs table with distinct
songs_table = df.select(["song_id", "title", "artist_id", "year", "duration"]).distinct()

# Writing songs table to parquet files partitioned by year and artist
songs_table.write.parquet(os.path.join(output_data, 'songs/songs.parquet'), partitionBy=['year', 'artist_id'], mode='overwrite')

# Extracting columns to create artists table
artists_table =  artists_table = df['artist_id', 'artist_name', 'artist_location', 'artist_latitude', 'artist_longitude']

# Dropping duplicates from the artists table
artists_table = artists_table.drop_duplicates(subset=['artist_id'])

# Writing artists table to parquet files
artists_table.write.parquet(os.path.join(output_data, 'artists/artists.parquet'), mode='overwrite')

#Displating 5 Rows from Artists Table
artists_table.show(5, True)

+------------------+--------------------+-----------------+---------------+----------------+
|         artist_id|         artist_name|  artist_location|artist_latitude|artist_longitude|
+------------------+--------------------+-----------------+---------------+----------------+
|AR0RCMP1187FB3F427|    Billie Jo Spears|     Beaumont, TX|       30.08615|       -94.10158|
|ARI3BMM1187FB4255E|        Alice Stuart|       Washington|        38.8991|         -77.029|
|ARMAC4T1187FB3FA4C|The Dillinger Esc...|Morris Plains, NJ|       40.82624|       -74.47995|
|ARNTLGG11E2835DDB9|                 Clp|                 |           null|            null|
|ARKRRTF1187B9984DA|    Sonora Santanera|                 |           null|            null|
+------------------+--------------------+-----------------+---------------+----------------+
only showing top 5 rows



In [16]:
# Getting filepath to log data file
log_data = input_data + "/log-data/"

# Reading log data file
log_data_df = spark.read.json(log_data)

# Filtering by actions for song plays
log_data_df = log_data_df.where('page="NextSong"')

# Extracting columns for users table
users_table = log_data_df['userId', 'firstName', 'lastName', 'gender', 'level']

# Removing duplicates from the users tables
#users_table = users_table.drop_duplicates(subset='userId')
users_table = users_table.drop_duplicates(subset=['userId'])

# Writing the users table to parquet files
users_table.write.parquet(os.path.join(output_data, 'users/users.parquet'), mode='overwrite')

#printing rows for the users table
print("printing the users table")

# Creating timestamp column from original timestamp column with UDF lambda function
get_timestamp = udf(lambda x: str(int(int(x)/1000)))

print("get_timestamp", get_timestamp)

# Adding the timestamp column into the get timestamp data frame
log_data_df = log_data_df.withColumn('timestamp', get_timestamp('ts'))

# create datetime column from original timestamp column
get_datetime = udf(lambda x: str(datetime.fromtimestamp(int(x) / 1000.0)))

print("get_datetime", get_datetime)

# Adding the datetime column into the get timestamp data frame
log_data_df = log_data_df.withColumn('datetime', get_datetime('ts')) 

# Extracting columns to create time table
time_table = log_data_df.select('ts')

# Adding the columns to the time table from the start_time column basically parsing the hour, day, week, month, year etc.
time_table = time_table.withColumn('hour', hour('ts'))
time_table = time_table.withColumn('day', dayofmonth('ts'))
time_table = time_table.withColumn('week', weekofyear('ts'))
time_table = time_table.withColumn('month', month('ts'))
time_table = time_table.withColumn('year', year('ts'))
time_table = time_table.withColumn('weekday', dayofweek('ts'))

# Writing time table to parquet files partitioned by year and month
time_table.write.parquet(os.path.join(output_data, 'time/time.parquet'), partitionBy=['year', 'month'], mode='overwrite')

# Reading in song data to use for songplays table
song_df_1 = spark.read.json(input_data+'/song-data/song_data/A/A/*/*.json')

# Extracting the columns from the Song
#song_df_1 = song_df_1['datetime', 'userId', 'level', 'song', 'artist', 'sessionId', 'location', 'userAgent'].distinct()

# Extracting columns from joined song and log datasets to create songplays table 
songplays_table = log_data_df.join(song_df_1, (log_data_df.song == song_df_1.title) & 
  (log_data_df.artist == song_df_1.artist_name) & 
  (log_data_df.length ==  song_df_1.duration), how='inner')

# Writing songplays table to parquet files partitioned by year and month
songplays_table = song_log_joined_table.distinct() \
.select("userId", "timestamp", "song_id", "artist_id", "level", "sessionId", "location", "userAgent" ) \
.withColumn("songplay_id", F.row_number().over( Window.partitionBy('timestamp').orderBy("timestamp"))) \
.withColumnRenamed("userId","user_id")\
.withColumnRenamed("timestamp","start_time")  \
.withColumnRenamed("sessionId","session_id")  \
.withColumnRenamed("userAgent", "user_agent") \

# Writing songplays table to parquet files partitioned by year and month
songplays_table.write.parquet(output_data + 'songplays/' + 'songplays.parquet',partitionBy=['start_time', 'user_id'])

songplays_table.show(5)


printing the users table
get_timestamp <function <lambda> at 0x7f1d44d349d8>
get_datetime <function <lambda> at 0x7f1d44d34378>


AnalysisException: "cannot resolve 'hour(`ts`)' due to data type mismatch: argument 1 requires timestamp type, however, '`ts`' is of bigint type.;;\n'Project [ts#566L, hour(ts#566L, Some(Etc/UTC)) AS hour#667]\n+- Project [ts#566L]\n   +- Project [artist#551, auth#552, firstName#553, gender#554, itemInSession#555L, lastName#556, length#557, level#558, location#559, method#560, page#561, registration#562, sessionId#563L, song#564, status#565L, ts#566L, userAgent#567, userId#568, timestamp#624, <lambda>(ts#566L) AS datetime#645]\n      +- Project [artist#551, auth#552, firstName#553, gender#554, itemInSession#555L, lastName#556, length#557, level#558, location#559, method#560, page#561, registration#562, sessionId#563L, song#564, status#565L, ts#566L, userAgent#567, userId#568, <lambda>(ts#566L) AS timestamp#624]\n         +- Filter (page#561 = NextSong)\n            +- Relation[artist#551,auth#552,firstName#553,gender#554,itemInSession#555L,lastName#556,length#557,level#558,location#559,method#560,page#561,registration#562,sessionId#563L,song#564,status#565L,ts#566L,userAgent#567,userId#568] json\n"