In [9]:
from pyspark.sql import SparkSession

import datetime

In [2]:
spark = SparkSession.builder.appName("Wrangling Data SQL").getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/08/26 17:44:22 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/08/26 17:44:22 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [3]:
user_log_df = spark.read.json("sparkify_log_small.json")

In [4]:
user_log_df.createOrReplaceTempView("user_log_table") # temporary view to run SQL queries

In [5]:
spark.sql("SELECT * FROM user_log_table LIMIT 2").show()

+-------------+---------+---------+------+-------------+--------+---------+-----+--------------------+------+--------+-------------+---------+--------------------+------+-------------+--------------------+------+
|       artist|     auth|firstName|gender|itemInSession|lastName|   length|level|            location|method|    page| registration|sessionId|                song|status|           ts|           userAgent|userId|
+-------------+---------+---------+------+-------------+--------+---------+-----+--------------------+------+--------+-------------+---------+--------------------+------+-------------+--------------------+------+
|Showaddywaddy|Logged In|  Kenneth|     M|          112|Matthews|232.93342| paid|Charlotte-Concord...|   PUT|NextSong|1509380319284|     5132|Christmas Tears W...|   200|1513720872284|"Mozilla/5.0 (Win...|  1046|
|   Lily Allen|Logged In|Elizabeth|     F|            7|   Chase|195.23873| free|Shreveport-Bossie...|   PUT|NextSong|1512718541284|     5027|      

In [6]:
spark.sql("SELECT count(*) FROM user_log_table").show()

+--------+
|count(1)|
+--------+
|   10000|
+--------+



In [7]:
spark.sql("SELECT userID, firstname, page, song FROM user_log_table WHERE userID == '1046'").show()

+------+---------+--------+--------------------+
|userID|firstname|    page|                song|
+------+---------+--------+--------------------+
|  1046|  Kenneth|NextSong|Christmas Tears W...|
|  1046|  Kenneth|NextSong|  Be Wary Of A Woman|
|  1046|  Kenneth|NextSong|   Public Enemy No.1|
|  1046|  Kenneth|NextSong|Reign Of The Tyrants|
|  1046|  Kenneth|NextSong|      Father And Son|
|  1046|  Kenneth|NextSong|               No. 5|
|  1046|  Kenneth|NextSong|           Seventeen|
|  1046|  Kenneth|    Home|                NULL|
|  1046|  Kenneth|NextSong|          War on war|
|  1046|  Kenneth|NextSong|   Killermont Street|
|  1046|  Kenneth|NextSong|        Black & Blue|
|  1046|  Kenneth|  Logout|                NULL|
|  1046|  Kenneth|    Home|                NULL|
|  1046|  Kenneth|NextSong|     Heads Will Roll|
|  1046|  Kenneth|NextSong|Bleed It Out [Liv...|
|  1046|  Kenneth|NextSong|              Clocks|
|  1046|  Kenneth|NextSong|           Love Rain|
|  1046|  Kenneth|Ne

In [8]:
spark.sql("SELECT DISTINCT page FROM user_log_table ORDER BY page ASC").show()

+----------------+
|            page|
+----------------+
|           About|
|       Downgrade|
|           Error|
|            Help|
|            Home|
|           Login|
|          Logout|
|        NextSong|
|   Save Settings|
|        Settings|
|Submit Downgrade|
|  Submit Upgrade|
|         Upgrade|
+----------------+



In [10]:
spark.udf.register("get_hour", lambda x: int(datetime.datetime.fromtimestamp(x / 1000.0).hour))

spark.sql("SELECT *, get_hour(ts) AS hour FROM user_log_table LIMIT 1").show()

+-------------+---------+---------+------+-------------+--------+---------+-----+--------------------+------+--------+-------------+---------+--------------------+------+-------------+--------------------+------+----+
|       artist|     auth|firstName|gender|itemInSession|lastName|   length|level|            location|method|    page| registration|sessionId|                song|status|           ts|           userAgent|userId|hour|
+-------------+---------+---------+------+-------------+--------+---------+-----+--------------------+------+--------+-------------+---------+--------------------+------+-------------+--------------------+------+----+
|Showaddywaddy|Logged In|  Kenneth|     M|          112|Matthews|232.93342| paid|Charlotte-Concord...|   PUT|NextSong|1509380319284|     5132|Christmas Tears W...|   200|1513720872284|"Mozilla/5.0 (Win...|  1046|   1|
+-------------+---------+---------+------+-------------+--------+---------+-----+--------------------+------+--------+----------

In [11]:
songs_in_hour_df = spark.sql('''
          SELECT get_hour(ts) AS hour, COUNT(*) as plays_per_hour
          FROM user_log_table
          WHERE page = "NextSong"
          GROUP BY hour
          ORDER BY cast(hour as int) ASC
          '''
          )

songs_in_hour_df.show()



+----+--------------+
|hour|plays_per_hour|
+----+--------------+
|   0|           248|
|   1|           369|
|   2|           375|
|   3|           456|
|   4|           454|
|   5|           382|
|   6|           302|
|   7|           352|
|   8|           276|
|   9|           348|
|  10|           358|
|  11|           375|
|  12|           249|
|  13|           216|
|  14|           228|
|  15|           251|
|  16|           339|
|  17|           462|
|  18|           479|
|  19|           484|
+----+--------------+
only showing top 20 rows



                                                                                

Which page did user id "" (empty string) NOT visit?

In [14]:
spark.sql("SELECT DISTINCT page FROM user_log_table WHERE userId == ''").show() # visited pages

+-----+
| page|
+-----+
| Home|
|About|
|Login|
| Help|
+-----+



How many female users do we have in the data set?

In [20]:
spark.sql("SELECT count(distinct userId) FROM user_log_table WHERE gender = 'F'").show()

+----------------------+
|count(DISTINCT userId)|
+----------------------+
|                   462|
+----------------------+



How many songs were played from the most played artist?

In [29]:
spark.sql('''
          SELECT COUNT(*) AS songs_played, artist 
          FROM user_log_table 
          WHERE page = 'NextSong'
          GROUP BY artist 
          ORDER BY songs_played DESC
          LIMIT 1
          '''
          ).show()

+------------+--------+
|songs_played|  artist|
+------------+--------+
|          83|Coldplay|
+------------+--------+

