In [1]:
import configparser
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

### This Document

This notebook is only for validating the created analytical tables. We load every newly created table and show its schema and a couple of rows as a data check.

### Create Spark Session

In [2]:
# Build (or reuse) the Spark session, requesting the hadoop-aws package.
spark = (
    SparkSession.builder
    .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:2.7.0")
    .getOrCreate()
)

### Import all five tables and run a quality check by printing their schema and a few rows

In [3]:
# Artists table: read the parquet output, then print its schema and first 5 rows.
artists_path = 'output/analytics/artists/'
df = spark.read.parquet(artists_path)
df.printSchema()
df.show(n=5)

root
 |-- artist_id: string (nullable = true)
 |-- artist_name: string (nullable = true)
 |-- artist_location: string (nullable = true)
 |-- artist_latitude: double (nullable = true)
 |-- artist_longitude: double (nullable = true)

+------------------+--------------------+--------------------+---------------+----------------+
|         artist_id|         artist_name|     artist_location|artist_latitude|artist_longitude|
+------------------+--------------------+--------------------+---------------+----------------+
|ARNF6401187FB57032|   Sophie B. Hawkins|New York, NY [Man...|       40.79086|       -73.96644|
|AROUOZZ1187B9ABE51|         Willie Bobo|New York, NY [Spa...|       40.79195|       -73.94512|
|AREBBGV1187FB523D2|Mike Jones (Featu...|         Houston, TX|           null|            null|
|ARD842G1187B997376|          Blue Rodeo|Toronto, Ontario,...|       43.64856|       -79.38533|
|ARDR4AC1187FB371A1|Montserrat Caball...|                    |           null|            null|


In [4]:
# Songs table: read the parquet output, then print its schema and first 5 rows.
songs_path = 'output/analytics/songs/'
df = spark.read.parquet(songs_path)
df.printSchema()
df.show(n=5)

root
 |-- song_id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- duration: double (nullable = true)
 |-- year: integer (nullable = true)
 |-- artist_id: string (nullable = true)

+------------------+--------------------+---------+----+------------------+
|           song_id|               title| duration|year|         artist_id|
+------------------+--------------------+---------+----+------------------+
|SOAOIBZ12AB01815BE|I Hold Your Hand ...| 43.36281|2000|ARPBNLO1187FB3D52F|
|SONYPOM12A8C13B2D7|I Think My Wife I...|186.48771|2005|ARDNS031187B9924F0|
|SODREIN12A58A7F2E5|A Whiter Shade Of...|326.00771|   0|ARLTWXK1187FB5A3F8|
|SOYMRWW12A6D4FAB14|The Moon And I (O...| 267.7024|   0|ARKFYS91187B98E58F|
|SOWQTQZ12A58A7B63E|Streets On Fire (...|279.97995|   0|ARPFHN61187FB575F6|
+------------------+--------------------+---------+----+------------------+
only showing top 5 rows



In [5]:
# Users table: read the parquet output, then print its schema and first 5 rows.
users_path = 'output/analytics/users/'
df = spark.read.parquet(users_path)
df.printSchema()
df.show(n=5)

root
 |-- userId: string (nullable = true)
 |-- firstName: string (nullable = true)
 |-- lastName: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- level: string (nullable = true)

+------+---------+---------+------+-----+
|userId|firstName| lastName|gender|level|
+------+---------+---------+------+-----+
|    88| Mohammad|Rodriguez|     M| free|
|    88| Mohammad|Rodriguez|     M| paid|
|    11|Christian|   Porter|     F| free|
|    69| Anabelle|  Simpson|     F| free|
|    53|  Celeste| Williams|     F| free|
+------+---------+---------+------+-----+
only showing top 5 rows



In [6]:
# Time table: read the parquet output, then print its schema and first 5 rows.
time_path = 'output/analytics/time/'
df_time = spark.read.parquet(time_path)
df_time.printSchema()
df_time.show(n=5)

root
 |-- start_time: long (nullable = true)
 |-- hour: integer (nullable = true)
 |-- day: integer (nullable = true)
 |-- week: integer (nullable = true)
 |-- weekday: integer (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)

+-------------+----+---+----+-------+----+-----+
|   start_time|hour|day|week|weekday|year|month|
+-------------+----+---+----+-------+----+-----+
|1542253449796|   3| 15|  46|      5|2018|   11|
|1542313967796|  20| 15|  46|      5|2018|   11|
|1542776860796|   5| 21|  47|      4|2018|   11|
|1542785123796|   7| 21|  47|      4|2018|   11|
|1542804369796|  12| 21|  47|      4|2018|   11|
+-------------+----+---+----+-------+----+-----+
only showing top 5 rows



In [9]:
# Songplays table: read the parquet output and print its schema.
songplays_path = 'output/analytics/songplays/'
df = spark.read.parquet(songplays_path)
df.printSchema()

# Show a single row, restricted to rows whose artist_id is not null.
has_artist = col('artist_id').isNotNull()
df.filter(has_artist).show(n=1)

root
 |-- songplays_id: long (nullable = true)
 |-- start_time: long (nullable = true)
 |-- userId: string (nullable = true)
 |-- level: string (nullable = true)
 |-- song_id: string (nullable = true)
 |-- artist_id: string (nullable = true)
 |-- sessionId: long (nullable = true)
 |-- location: string (nullable = true)
 |-- userAgent: string (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)

+------------+-------------+------+-----+------------------+------------------+---------+--------------------+--------------------+----+-----+
|songplays_id|   start_time|userId|level|           song_id|         artist_id|sessionId|            location|           userAgent|year|month|
+------------+-------------+------+-----+------------------+------------------+---------+--------------------+--------------------+----+-----+
|         882|1542837407796|    15| paid|SOZCTXZ12AB0182364|AR5KOSW1187FB35FF4|      818|Chicago-Napervill...|"Mozilla/5.0 (X11...|20