# Checking results - Data Lake on S3

In [7]:
from pyspark.sql import SparkSession
import os
import configparser

# Make sure that your AWS credentials are loaded as env vars

In [8]:
# Parse the local config file that holds the AWS credentials and bucket name.
config = configparser.ConfigParser()

# Normally this file should be in ~/.aws/credentials
# Open it in a context manager so the file handle is closed after parsing.
with open('dl.cfg') as config_file:
    config.read_file(config_file)

# Expose the credentials as environment variables for this process.
os.environ["AWS_ACCESS_KEY_ID"] = config['AWS']['AWS_ACCESS_KEY_ID']
os.environ["AWS_SECRET_ACCESS_KEY"] = config['AWS']['AWS_SECRET_ACCESS_KEY']

# Base s3a:// URI under which the output tables are read below.
S3_BUCKET_OUTPUT = config.get("S3", "S3_BUCKET_OUTPUT")
output_data = "s3a://{}/".format(S3_BUCKET_OUTPUT)

# Create spark session with hadoop-aws package

In [9]:
# Configure the builder with the hadoop-aws package, then get or create the session.
session_builder = SparkSession.builder.config(
    "spark.jars.packages", "org.apache.hadoop:hadoop-aws:2.7.0"
)
spark = session_builder.getOrCreate()

# Load data from S3

## songs_table

In [18]:
# Read the songs table (parquet) back from the output location.
songs_path = output_data + 'songs_table/'
song_df = spark.read.parquet(songs_path)

In [19]:
# Print the column names/types, then preview the first five rows.
song_df.printSchema()
song_df.show(n=5)

root
 |-- song_id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- duration: double (nullable = true)
 |-- year: integer (nullable = true)
 |-- artist_id: string (nullable = true)

+------------------+--------------------+---------+----+------------------+
|           song_id|               title| duration|year|         artist_id|
+------------------+--------------------+---------+----+------------------+
|SOKTJDS12AF72A25E5|Drown In My Own T...|  192.522|   0|ARA23XO1187B9AF18F|
|SOEKAZG12AB018837E|I'll Slap Your Fa...|129.85424|2001|ARSVTNL1187B992A91|
|SOAFBCP12A8C13CC7D|King Of Scurf (20...|301.40036|1972|ARTC1LV1187B9A4858|
|SORRNOC12AB017F52B|The Last Beat Of ...|337.81506|2004|ARSZ7L31187FB4E610|
|SOQPWCR12A6D4FB2A3|A Poor Recipe For...|118.07302|2005|AR73AIO1187B9AD57B|
+------------------+--------------------+---------+----+------------------+
only showing top 5 rows



## songplays_table

In [16]:
# Read the songplays table (parquet) back from the output location.
songplays_path = output_data + 'songplays_table/'
songplay_df = spark.read.parquet(songplays_path)

In [17]:
# Print the column names/types, then preview the first five rows.
songplay_df.printSchema()
songplay_df.show(n=5)

root
 |-- start_time: timestamp (nullable = true)
 |-- user_id: string (nullable = true)
 |-- level: string (nullable = true)
 |-- song_id: string (nullable = true)
 |-- artist_id: string (nullable = true)
 |-- session_id: long (nullable = true)
 |-- location: string (nullable = true)
 |-- user_agent: string (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)

+--------------------+-------+-----+------------------+------------------+----------+--------------------+--------------------+----+-----+
|          start_time|user_id|level|           song_id|         artist_id|session_id|            location|          user_agent|year|month|
+--------------------+-------+-----+------------------+------------------+----------+--------------------+--------------------+----+-----+
|2018-11-15 16:19:...|     97| paid|SOBLFFE12AF72AA5BA|ARJNIUY12298900C91|       605|Lansing-East Lans...|"Mozilla/5.0 (X11...|2018|   11|
+--------------------+-------+-----+------------------+------------------+----------+--------------------+--------------------+----+-----+

## users_table

In [14]:
# Read the users table (parquet) back from the output location.
users_path = output_data + 'users_table/'
user_df = spark.read.parquet(users_path)

In [15]:
# Print the column names/types, then preview the first five rows.
user_df.printSchema()
user_df.show(n=5)

root
 |-- user_id: string (nullable = true)
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- level: string (nullable = true)

+-------+----------+---------+------+-----+
|user_id|first_name|last_name|gender|level|
+-------+----------+---------+------+-----+
|     88|  Mohammad|Rodriguez|     M| free|
|     75|    Joseph|Gutierrez|     M| free|
|     69|  Anabelle|  Simpson|     F| free|
|     29|Jacqueline|    Lynch|     F| free|
|     68|    Jordan|Rodriguez|     F| free|
+-------+----------+---------+------+-----+
only showing top 5 rows



## artists_table

In [12]:
# Read the artists table (parquet) back from the output location.
artists_path = output_data + 'artists_table/'
artist_df = spark.read.parquet(artists_path)

In [13]:
# Print the column names/types, then preview the first five rows.
artist_df.printSchema()
artist_df.show(n=5)

root
 |-- artist_id: string (nullable = true)
 |-- artist_name: string (nullable = true)
 |-- location: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)

+------------------+--------------------+--------------------+--------+---------+
|         artist_id|         artist_name|            location|latitude|longitude|
+------------------+--------------------+--------------------+--------+---------+
|ARTC1LV1187B9A4858|  The Bonzo Dog Band|Goldsmith's Colle...| 51.4536| -0.01802|
|ARA23XO1187B9AF18F|     The Smithereens|Carteret, New Jersey|40.57885|-74.21956|
|ARSVTNL1187B992A91|       Jonathan King|     London, England|51.50632| -0.12714|
|ARZ5H0P1187B98A1DD|          Snoop Dogg|      Long Beach, CA|33.76672|-118.1924|
|ARXQBR11187B98A2CC|Frankie Goes To H...|  Liverpool, England|    null|     null|
+------------------+--------------------+--------------------+--------+---------+
only showing top 5 rows



## time_table

In [10]:
# Read the time table (parquet) back from the output location.
time_path = output_data + 'time_table/'
time_df = spark.read.parquet(time_path)

In [11]:
# Print the column names/types, then preview the first five rows.
time_df.printSchema()
time_df.show(n=5)

root
 |-- ts: long (nullable = true)
 |-- start_time: timestamp (nullable = true)
 |-- hour: integer (nullable = true)
 |-- day: integer (nullable = true)
 |-- week: integer (nullable = true)
 |-- weekday: integer (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)

+-------------+--------------------+----+---+----+-------+----+-----+
|           ts|          start_time|hour|day|week|weekday|year|month|
+-------------+--------------------+----+---+----+-------+----+-----+
|1542299400796|2018-11-15 16:30:...|  16| 15|  46|      5|2018|   11|
|1542302304796|2018-11-15 17:18:...|  17| 15|  46|      5|2018|   11|
|1542313156796|2018-11-15 20:19:...|  20| 15|  46|      5|2018|   11|
|1542318492796|2018-11-15 21:48:...|  21| 15|  46|      5|2018|   11|
|1542828783796|2018-11-21 19:33:...|  19| 21|  47|      4|2018|   11|
+-------------+--------------------+----+---+----+-------+----+-----+
only showing top 5 rows

