In [2]:
import configparser
from datetime import datetime
import os
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, col
from pyspark.sql.functions import year, month, dayofmonth, hour, weekofyear, date_format

In [3]:
from pyspark.sql.types import TimestampType
from pyspark.sql.functions import monotonically_increasing_id
from pyspark.sql.functions import from_unixtime

In [4]:
## Create a Spark Session
## Since we are running the Spark Session Locally thus we dont need to configure it to connect to S3.

spark = SparkSession.builder.appName("DataLake").getOrCreate()

#### SONG Data

In [5]:
# read song data file

song_data = spark.read.json('data_local/song_data/A/*/*/*.json')   ## Which format is the spark reading the data??

In [6]:
song_data.take(1)                                                  ## is take()/show() method expensive ??

[Row(artist_id='ARDR4AC1187FB371A1', artist_latitude=None, artist_location='', artist_longitude=None, artist_name='Montserrat Caballé;Placido Domingo;Vicente Sardinero;Judith Blegen;Sherrill Milnes;Georg Solti', duration=511.16363, num_songs=1, song_id='SOBAYLL12A8C138AF9', title='Sono andati? Fingevo di dormire', year=0)]

In [7]:
song_data.printSchema()

root
 |-- artist_id: string (nullable = true)
 |-- artist_latitude: double (nullable = true)
 |-- artist_location: string (nullable = true)
 |-- artist_longitude: double (nullable = true)
 |-- artist_name: string (nullable = true)
 |-- duration: double (nullable = true)
 |-- num_songs: long (nullable = true)
 |-- song_id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- year: long (nullable = true)



In [8]:
# extract columns to create songs table
# artists - artists in music database
# artist_id, name, location, lattitude, longitude

artist_data_df = song_data.select('artist_id','artist_name','artist_location','artist_latitude','artist_longitude').dropDuplicates()\
                 .where(song_data.artist_id.isNotNull())

In [9]:
## Let us use sql for the same stuff
song_data.createOrReplaceTempView("song")

artist_data_sql = spark.sql("SELECT DISTINCT artist_id, artist_name AS name, artist_location AS location ,artist_latitude AS latitude, \
artist_longitude AS longitude FROM song WHERE artist_id IS NOT NULL")

In [10]:
artist_data_df.show(5)

+------------------+---------------+---------------+---------------+----------------+
|         artist_id|    artist_name|artist_location|artist_latitude|artist_longitude|
+------------------+---------------+---------------+---------------+----------------+
|AR3JMC51187B9AE49D|Backstreet Boys|    Orlando, FL|       28.53823|       -81.37739|
|AR0IAWL1187B9A96D0|   Danilo Perez|         Panama|         8.4177|       -80.11278|
|ARWB3G61187FB49404|    Steve Morse| Hamilton, Ohio|           null|            null|
|AR47JEX1187B995D81|   SUE THOMPSON|     Nevada, MO|       37.83721|       -94.35868|
|ARHHO3O1187B989413|      Bob Azzam|               |           null|            null|
+------------------+---------------+---------------+---------------+----------------+
only showing top 5 rows



In [11]:
artist_data_df.printSchema()

root
 |-- artist_id: string (nullable = true)
 |-- artist_name: string (nullable = true)
 |-- artist_location: string (nullable = true)
 |-- artist_latitude: double (nullable = true)
 |-- artist_longitude: double (nullable = true)



In [12]:
artist_data_df.toPandas().info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 69 entries, 0 to 68
Data columns (total 5 columns):
artist_id           69 non-null object
artist_name         69 non-null object
artist_location     69 non-null object
artist_latitude     31 non-null float64
artist_longitude    31 non-null float64
dtypes: float64(2), object(3)
memory usage: 2.8+ KB


In [13]:
artist_data_sql.show(5)

+------------------+------------+---------------+--------+----------+
|         artist_id|        name|       location|latitude| longitude|
+------------------+------------+---------------+--------+----------+
|ARPBNLO1187FB3D52F|    Tiny Tim|   New York, NY|40.71455| -74.00712|
|ARBEBBY1187B9B43DB|   Tom Petty|Gainesville, FL|    null|      null|
|AR0IAWL1187B9A96D0|Danilo Perez|         Panama|  8.4177| -80.11278|
|ARMBR4Y1187B9990EB|David Martin|California - SF|37.77916|-122.42005|
|ARD0S291187B9B7BF5|     Rated R|           Ohio|    null|      null|
+------------------+------------+---------------+--------+----------+
only showing top 5 rows



In [14]:
artist_data_sql.count()

69

In [15]:
artist_data_sql.printSchema()

root
 |-- artist_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- location: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)



In [16]:
artist_data_sql.toPandas().info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 69 entries, 0 to 68
Data columns (total 5 columns):
artist_id    69 non-null object
name         69 non-null object
location     69 non-null object
latitude     31 non-null float64
longitude    31 non-null float64
dtypes: float64(2), object(3)
memory usage: 2.8+ KB


In [17]:
## Extract Songs Data
## songs - songs in music database
## song_id, title, artist_id, year, duration

songsTable_data_df = song_data.select('song_id','title','artist_id','year','duration').dropDuplicates().where(song_data.song_id.isNotNull())

In [18]:
## Let us use sql for the same stuff
songsTable_data_sql = spark.sql("SELECT DISTINCT song_id, title, artist_id, year, duration FROM song WHERE song_id IS NOT NULL ")

In [19]:
songsTable_data_df.show(5)

+------------------+--------------------+------------------+----+---------+
|           song_id|               title|         artist_id|year| duration|
+------------------+--------------------+------------------+----+---------+
|SOGOSOV12AF72A285E|   ¿Dónde va Chichi?|ARGUVEV1187B98BA17|1997|313.12934|
|SOTTDKS12AB018D69B|It Wont Be Christmas|ARMBR4Y1187B9990EB|   0|241.47546|
|SOBBUGU12A8C13E95D|Setting Fire to S...|ARMAC4T1187FB3FA4C|2004|207.77751|
|SOIAZJW12AB01853F1|          Pink World|AR8ZCNI1187B9A069B|1984|269.81832|
|SONYPOM12A8C13B2D7|I Think My Wife I...|ARDNS031187B9924F0|2005|186.48771|
+------------------+--------------------+------------------+----+---------+
only showing top 5 rows



In [20]:
songsTable_data_sql.show(5)

+------------------+--------------------+------------------+----+---------+
|           song_id|               title|         artist_id|year| duration|
+------------------+--------------------+------------------+----+---------+
|SOGNCJP12A58A80271|Do You Finally Ne...|ARB29H41187B98F0EF|1972|342.56934|
|SOOJPRH12A8C141995|   Loaded Like A Gun|ARBGXIG122988F409D|   0|173.19138|
|SOFCHDR12AB01866EF|         Living Hell|AREVWGE1187B9B890A|   0|282.43546|
|SOWTBJW12AC468AC6E|Broken-Down Merry...|ARQGYP71187FB44566|   0|151.84934|
|SOGOSOV12AF72A285E|   ¿Dónde va Chichi?|ARGUVEV1187B98BA17|1997|313.12934|
+------------------+--------------------+------------------+----+---------+
only showing top 5 rows



##### --------------------------------------------------------------------------------------

### Log Data

In [21]:
# get filepath to log data file
# read the log data
log_data = spark.read.json('data_local/log_data/*.json')

In [22]:
log_data.printSchema()

root
 |-- artist: string (nullable = true)
 |-- auth: string (nullable = true)
 |-- firstName: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- itemInSession: long (nullable = true)
 |-- lastName: string (nullable = true)
 |-- length: double (nullable = true)
 |-- level: string (nullable = true)
 |-- location: string (nullable = true)
 |-- method: string (nullable = true)
 |-- page: string (nullable = true)
 |-- registration: double (nullable = true)
 |-- sessionId: long (nullable = true)
 |-- song: string (nullable = true)
 |-- status: long (nullable = true)
 |-- ts: long (nullable = true)
 |-- userAgent: string (nullable = true)
 |-- userId: string (nullable = true)



In [23]:
log_data.show(5)

+-----------+---------+---------+------+-------------+--------+---------+-----+--------------------+------+--------+-----------------+---------+---------------+------+-------------+--------------------+------+
|     artist|     auth|firstName|gender|itemInSession|lastName|   length|level|            location|method|    page|     registration|sessionId|           song|status|           ts|           userAgent|userId|
+-----------+---------+---------+------+-------------+--------+---------+-----+--------------------+------+--------+-----------------+---------+---------------+------+-------------+--------------------+------+
|   Harmonia|Logged In|     Ryan|     M|            0|   Smith|655.77751| free|San Jose-Sunnyval...|   PUT|NextSong|1.541016707796E12|      583|  Sehr kosmisch|   200|1542241826796|"Mozilla/5.0 (X11...|    26|
|The Prodigy|Logged In|     Ryan|     M|            1|   Smith|260.07465| free|San Jose-Sunnyval...|   PUT|NextSong|1.541016707796E12|      583|The Big Gundown|

In [24]:
log_data.limit(5).toPandas()

Unnamed: 0,artist,auth,firstName,gender,itemInSession,lastName,length,level,location,method,page,registration,sessionId,song,status,ts,userAgent,userId
0,Harmonia,Logged In,Ryan,M,0,Smith,655.77751,free,"San Jose-Sunnyvale-Santa Clara, CA",PUT,NextSong,1541017000000.0,583,Sehr kosmisch,200,1542241826796,"""Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/5...",26
1,The Prodigy,Logged In,Ryan,M,1,Smith,260.07465,free,"San Jose-Sunnyvale-Santa Clara, CA",PUT,NextSong,1541017000000.0,583,The Big Gundown,200,1542242481796,"""Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/5...",26
2,Train,Logged In,Ryan,M,2,Smith,205.45261,free,"San Jose-Sunnyvale-Santa Clara, CA",PUT,NextSong,1541017000000.0,583,Marry Me,200,1542242741796,"""Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/5...",26
3,,Logged In,Wyatt,M,0,Scott,,free,"Eureka-Arcata-Fortuna, CA",GET,Home,1540872000000.0,563,,200,1542247071796,Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7....,9
4,,Logged In,Austin,M,0,Rosales,,free,"New York-Newark-Jersey City, NY-NJ-PA",GET,Home,1541060000000.0,521,,200,1542252577796,Mozilla/5.0 (Windows NT 6.1; rv:31.0) Gecko/20...,12


In [25]:
# Create view for un-filtered log data
log_data.createOrReplaceTempView("log_data_Un_filtered_view")

In [26]:
## Filter log_data
log_data_filter =  log_data.filter(log_data.page == 'NextSong')

In [27]:
log_data_filter.limit(5).toPandas()

Unnamed: 0,artist,auth,firstName,gender,itemInSession,lastName,length,level,location,method,page,registration,sessionId,song,status,ts,userAgent,userId
0,Harmonia,Logged In,Ryan,M,0,Smith,655.77751,free,"San Jose-Sunnyvale-Santa Clara, CA",PUT,NextSong,1541017000000.0,583,Sehr kosmisch,200,1542241826796,"""Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/5...",26
1,The Prodigy,Logged In,Ryan,M,1,Smith,260.07465,free,"San Jose-Sunnyvale-Santa Clara, CA",PUT,NextSong,1541017000000.0,583,The Big Gundown,200,1542242481796,"""Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/5...",26
2,Train,Logged In,Ryan,M,2,Smith,205.45261,free,"San Jose-Sunnyvale-Santa Clara, CA",PUT,NextSong,1541017000000.0,583,Marry Me,200,1542242741796,"""Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/5...",26
3,Sony Wonder,Logged In,Samuel,M,0,Gonzalez,218.06975,free,"Houston-The Woodlands-Sugar Land, TX",PUT,NextSong,1540493000000.0,597,Blackbird,200,1542253449796,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4...",61
4,Van Halen,Logged In,Tegan,F,2,Levine,289.38404,paid,"Portland-South Portland, ME",PUT,NextSong,1540794000000.0,602,Best Of Both Worlds (Remastered Album Version),200,1542260935796,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4...",80


In [28]:
# filter by actions for song plays
log_data_filter.createOrReplaceTempView("log_data_filtered_view")

In [29]:
# extract columns for users table
# users - users in the app
# user_id, first_name, last_name, gender, level

user_table_df = log_data.select('userId','firstName', 'lastname', 'gender', 'level').dropDuplicates()\
.where(log_data.userId.isNotNull())

In [30]:
# SQl query for the same
user_table_sql = spark.sql("SELECT DISTINCT userId,firstname, lastname, gender , level\
                             FROM log_data_Un_filtered_view WHERE userId IS NOT NULL")

In [31]:
user_table_df.printSchema()

root
 |-- userId: string (nullable = true)
 |-- firstName: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- level: string (nullable = true)



In [32]:
user_table_sql.printSchema()

root
 |-- userId: string (nullable = true)
 |-- firstname: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- level: string (nullable = true)



In [33]:
user_table_df.toPandas().info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 107 entries, 0 to 106
Data columns (total 5 columns):
userId       107 non-null object
firstName    105 non-null object
lastname     105 non-null object
gender       105 non-null object
level        107 non-null object
dtypes: object(5)
memory usage: 4.3+ KB


In [34]:
user_table_sql.toPandas().info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 107 entries, 0 to 106
Data columns (total 5 columns):
userId       107 non-null object
firstname    105 non-null object
lastname     105 non-null object
gender       105 non-null object
level        107 non-null object
dtypes: object(5)
memory usage: 4.3+ KB


In [35]:
## Create time table
## time - timestamps of records in songplays broken down into specific units
## start_time, hour, day, week, month, year, weekday

In [36]:
## First Convert the millisecond to time stamp in the ts column of the log_data.
## Create a UDF to convert that

## Note : datetime.fromtimestamp(ms/1000.0) is syntax for converting ms to time stamp.
##        since we are converting a python function to a UDF of pyspark. -
##        - So we need to assign the return type @udf(TimestampType()); else it woud return a string.

@udf(TimestampType())
def conv_timestamp(ms):
    return datetime.fromtimestamp(ms/1000.0)

In [37]:
## Lets add one more column with correct usable time stamp format

log_data_filtered_timeFormat = log_data_filter.withColumn("start_time", conv_timestamp('ts'))

In [44]:
log_data_filtered_timeFormat.limit(2).toPandas()

Unnamed: 0,artist,auth,firstName,gender,itemInSession,lastName,length,level,location,method,page,registration,sessionId,song,status,ts,userAgent,userId,start_time
0,Harmonia,Logged In,Ryan,M,0,Smith,655.77751,free,"San Jose-Sunnyvale-Santa Clara, CA",PUT,NextSong,1541017000000.0,583,Sehr kosmisch,200,1542241826796,"""Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/5...",26,2018-11-15 00:30:26.796
1,The Prodigy,Logged In,Ryan,M,1,Smith,260.07465,free,"San Jose-Sunnyvale-Santa Clara, CA",PUT,NextSong,1541017000000.0,583,The Big Gundown,200,1542242481796,"""Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/5...",26,2018-11-15 00:41:21.796


In [45]:
#Select only time Stamp column from the log_data_filtered_timeFormat dataframe.

log_time_data = log_data_filtered_timeFormat.select('start_time').dropDuplicates()\
.where(log_data_filtered_timeFormat.start_time.isNotNull())

In [46]:
log_time_data.printSchema()      ## check data Type

root
 |-- start_time: timestamp (nullable = true)



In [47]:
log_time_data.limit(3).toPandas()

Unnamed: 0,start_time
0,2018-11-21 06:18:12.796
1,2018-11-21 18:49:23.796
2,2018-11-14 15:20:15.796


In [80]:
## Lets create the time related table

time_table_df = log_time_data.withColumn('hour',hour('start_time'))\
.withColumn('day',dayofmonth('start_time'))\
.withColumn('week', weekofyear('start_time'))\
.withColumn('month', month('start_time'))\
.withColumn('year',year('start_time'))\
.withColumn("weekday", date_format("start_time", 'E'))

In [81]:
time_table_df.limit(5).toPandas()

Unnamed: 0,start_time,hour,day,week,month,year,weekday
0,2018-11-21 06:18:12.796,6,21,47,11,2018,Wed
1,2018-11-21 18:49:23.796,18,21,47,11,2018,Wed
2,2018-11-14 15:20:15.796,15,14,46,11,2018,Wed
3,2018-11-05 16:31:59.796,16,5,45,11,2018,Mon
4,2018-11-13 18:00:26.796,18,13,46,11,2018,Tue


In [58]:
## Lets create the same table using SQl:

# First : create a table with a start_time in timestamp format

time_format = spark.sql("SELECT to_timeStamp(ts/1000.0) AS start_time\
                          FROM log_data_Un_filtered_view  WHERE page = 'NextSong'")

In [74]:
time_format.show(3)

+--------------------+
|          start_time|
+--------------------+
|2018-11-15 00:30:...|
|2018-11-15 00:41:...|
|2018-11-15 00:45:...|
+--------------------+
only showing top 3 rows



In [76]:
## Lets create a temp view to run sql queries.
time_format.createOrReplaceTempView("time_table_view")

##### NOTE: It can be seen that the the functions which were imported are acting in almost similar way whether its a dataframe operations or we use it in form of sql query. The only diff is that in case of dataframe operations we are refrencing columns within quotes.
##### NOTE: I reckon, this indicates that the sql functions of spark were originally created to write  SQL queries; however, with the advent of dataframe operations in python these functions were also made compatible with dataframe operations. It is also evident that the dataframe methods such as select, withColumn, where, selectWith etc are more of a sql type operation.
##### NOTE: Thus, in Spark data wrangling process we need to think more in SQL or HQL way along with dataframe. The thought process is a bit different.

In [96]:
# year, month, dayofmonth, hour, weekofyear, date_format
time_table_sql = spark.sql("SELECT  DISTINCT start_time, \
                                    hour(start_time) AS hour,\
                                    dayofmonth(start_time) AS day,\
                                    weekofyear(start_time) AS week,\
                                    month(start_time) AS month,\
                                    year(start_time) AS year,\
                                    date_format(start_time, 'E') AS weekday\
                            FROM time_table_view\
                            WHERE start_time IS NOT NULL")

In [97]:
time_table_sql.show(3)

+--------------------+----+---+----+-----+----+-------+
|          start_time|hour|day|week|month|year|weekday|
+--------------------+----+---+----+-----+----+-------+
|2018-11-15 17:33:...|  17| 15|  46|   11|2018|    Thu|
|2018-11-15 18:39:...|  18| 15|  46|   11|2018|    Thu|
|2018-11-21 09:26:...|   9| 21|  47|   11|2018|    Wed|
+--------------------+----+---+----+-----+----+-------+
only showing top 3 rows



## SONG PLAY TABLE : Combine log data and song data 

##### As mentioned in the requirements that the songplay table will only have the data with the page filtered on the basis of 'NextSong'. Thus we will be using the filtered log data view with start_time column. 

In [100]:
## create a log_data view
log_data_filtered_timeFormat.createOrReplaceTempView("log_data_filtered_timeformatted")

In [101]:
## create a song data view
song_data.createOrReplaceTempView("song_data_")

In [None]:
# songplays - records in log data associated with song plays i.e. records with page NextSong
# songplay_id, start_time, user_id, level, song_id, artist_id, session_id, location, user_agent

In [108]:
## apply SQl JOIN on both views
song_play_sql = spark.sql("""SELECT monotonically_increasing_id() AS songplay_id,
                                  start_time,
                                  userId AS user_id,
                                  level,
                                  song_id,
                                  artist_id,
                                  sessionId AS session_id,
                                  location,
                                  userAgent AS user_agent
                           FROM  log_data_filtered_timeformatted 
                           JOIN song_data_ 
                           ON artist = artist_name AND song = title """)

In [110]:
song_play_sql.printSchema()

root
 |-- songplay_id: long (nullable = false)
 |-- start_time: timestamp (nullable = true)
 |-- user_id: string (nullable = true)
 |-- level: string (nullable = true)
 |-- song_id: string (nullable = true)
 |-- artist_id: string (nullable = true)
 |-- session_id: long (nullable = true)
 |-- location: string (nullable = true)
 |-- user_agent: string (nullable = true)



In [117]:
## Lets do the same thing using dataframe operations.

song_play_df = log_data_filtered_timeFormat.join(song_data, ((log_data_filtered_timeFormat.artist == song_data.artist_name) & (log_data_filtered_timeFormat.song == song_data.title))).withColumn("songplay_id", monotonically_increasing_id())                                              

In [120]:
song_play_df.toPandas().info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 30 columns):
artist              1 non-null object
auth                1 non-null object
firstName           1 non-null object
gender              1 non-null object
itemInSession       1 non-null int64
lastName            1 non-null object
length              1 non-null float64
level               1 non-null object
location            1 non-null object
method              1 non-null object
page                1 non-null object
registration        1 non-null float64
sessionId           1 non-null int64
song                1 non-null object
status              1 non-null int64
ts                  1 non-null int64
userAgent           1 non-null object
userId              1 non-null object
start_time          1 non-null datetime64[ns]
artist_id           1 non-null object
artist_latitude     1 non-null float64
artist_location     1 non-null object
artist_longitude    1 non-null float64
artist_name      

In [121]:
song_play_df2 = song_play_df.select('songplay_id',\
                                    "start_time",\
                                    col("userId").alias("user_id"),\
                                    "level",\
                                    "song_id",\
                                    "artist_id",\
                                    col("sessionId").alias("session_id"),\
                                    "location",\
                                    col("userAgent").alias("user_agent"))

In [122]:
song_play_df2.toPandas().info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 9 columns):
songplay_id    1 non-null int64
start_time     1 non-null datetime64[ns]
user_id        1 non-null object
level          1 non-null object
song_id        1 non-null object
artist_id      1 non-null object
session_id     1 non-null int64
location       1 non-null object
user_agent     1 non-null object
dtypes: datetime64[ns](1), int64(2), object(6)
memory usage: 152.0+ bytes
