In [1]:
import configparser
from datetime import datetime
import os
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

In [2]:
# Load AWS credentials from dl.cfg and export them as environment variables
# for this process.
config = configparser.ConfigParser()
# read_file() on an explicitly opened handle raises FileNotFoundError when
# dl.cfg is missing. ConfigParser.read() would silently skip the file and the
# problem would only surface below as an opaque KeyError on 'AWS'.
with open('dl.cfg') as cfg_file:
    config.read_file(cfg_file)

os.environ['AWS_ACCESS_KEY_ID'] = config['AWS']['AWS_ACCESS_KEY_ID']
os.environ['AWS_SECRET_ACCESS_KEY'] = config['AWS']['AWS_SECRET_ACCESS_KEY']

In [3]:
# Create (or reuse) the Spark session, requesting the hadoop-aws package
# through the spark.jars.packages setting.
# NOTE(review): the hadoop-aws version presumably has to match the Hadoop
# build that ships with the installed Spark — confirm.
spark = (
    SparkSession.builder
    .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:2.7.0")
    .getOrCreate()
)

In [4]:
# Root location of the Parquet tables that the cells below read back.
output_data = "s3a://babak-udacity-project4/"

# Input location; the alternative noted by the author is "s3a://udacity-dend/".
input_data = "data"

In [5]:
# Load the songs table from its Parquet directory under output_data.
# os.path.join follows host-OS path rules; it yields a valid URI here because
# output_data already ends with '/'.
songs_path = os.path.join(output_data, 'songs')
songs = spark.read.parquet(songs_path)

In [6]:
# Inspect the songs table: column schema first, then a five-row preview.
songs.printSchema()
songs.show(n=5)

root
 |-- song_id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- duration: double (nullable = true)
 |-- year: integer (nullable = true)
 |-- artist_id: string (nullable = true)

+------------------+--------------------+---------+----+------------------+
|           song_id|               title| duration|year|         artist_id|
+------------------+--------------------+---------+----+------------------+
|SOAOIBZ12AB01815BE|I Hold Your Hand ...| 43.36281|2000|ARPBNLO1187FB3D52F|
|SONYPOM12A8C13B2D7|I Think My Wife I...|186.48771|2005|ARDNS031187B9924F0|
|SODREIN12A58A7F2E5|A Whiter Shade Of...|326.00771|   0|ARLTWXK1187FB5A3F8|
|SOYMRWW12A6D4FAB14|The Moon And I (O...| 267.7024|   0|ARKFYS91187B98E58F|
|SOWQTQZ12A58A7B63E|Streets On Fire (...|279.97995|   0|ARPFHN61187FB575F6|
+------------------+--------------------+---------+----+------------------+
only showing top 5 rows



In [7]:
# Number of rows in the songs table (displayed as the cell result).
songs.count()

71

In [8]:
# Load the artists table from its Parquet directory under output_data.
artists_path = os.path.join(output_data, 'artists')
artists = spark.read.parquet(artists_path)

In [10]:
# Inspect the artists table: column schema first, then a five-row preview.
artists.printSchema()
artists.show(n=5)

root
 |-- artist_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- location: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)

+------------------+--------------------+--------------------+--------+---------+
|         artist_id|                name|            location|latitude|longitude|
+------------------+--------------------+--------------------+--------+---------+
|ARDR4AC1187FB371A1|Montserrat Caball...|                    |    null|     null|
|ARMAC4T1187FB3FA4C|The Dillinger Esc...|   Morris Plains, NJ|40.82624|-74.47995|
|ARNF6401187FB57032|   Sophie B. Hawkins|New York, NY [Man...|40.79086|-73.96644|
|AROUOZZ1187B9ABE51|         Willie Bobo|New York, NY [Spa...|40.79195|-73.94512|
|ARI2JSK1187FB496EF|Nick Ingman;Gavyn...|     London, England|51.50632| -0.12714|
+------------------+--------------------+--------------------+--------+---------+
only showing top 5 rows



In [11]:
# Number of rows in the artists table (displayed as the cell result).
artists.count()

69

In [12]:
# Load the users table from its Parquet directory under output_data.
users_path = os.path.join(output_data, 'users')
users = spark.read.parquet(users_path)

In [13]:
# Inspect the users table: column schema first, then a five-row preview.
# NOTE(review): the preview prints first/last names — confirm the data is
# synthetic before sharing the rendered notebook.
users.printSchema()
users.show(n=5)

root
 |-- user_id: string (nullable = true)
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- level: string (nullable = true)

+-------+----------+---------+------+-----+
|user_id|first_name|last_name|gender|level|
+-------+----------+---------+------+-----+
|     88|  Mohammad|Rodriguez|     M| free|
|     75|    Joseph|Gutierrez|     M| free|
|     69|  Anabelle|  Simpson|     F| free|
|     29|Jacqueline|    Lynch|     F| free|
|     68|    Jordan|Rodriguez|     F| free|
+-------+----------+---------+------+-----+
only showing top 5 rows



In [14]:
# Number of rows in the users table (displayed as the cell result).
users.count()

104

In [15]:
# Load the time table from its Parquet directory under output_data.
# NOTE(review): the variable name `time` matches the stdlib module name; it
# would shadow that module if it were ever imported in this notebook.
time_path = os.path.join(output_data, 'time')
time = spark.read.parquet(time_path)

In [16]:
# Inspect the time table: column schema first, then a five-row preview.
time.printSchema()
time.show(n=5)

root
 |-- start_time: long (nullable = true)
 |-- hour: integer (nullable = true)
 |-- day: integer (nullable = true)
 |-- week: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- year: integer (nullable = true)
 |-- weekday: string (nullable = true)

+----------+----+---+----+-----+----+-------+
|start_time|hour|day|week|month|year|weekday|
+----------+----+---+----+-----+----+-------+
|1542282991|  11| 15|  46|   11|2018|    Thu|
|1542288870|  13| 15|  46|   11|2018|    Thu|
|1542298725|  16| 15|  46|   11|2018|    Thu|
|1542307358|  18| 15|  46|   11|2018|    Thu|
|1542307903|  18| 15|  46|   11|2018|    Thu|
+----------+----+---+----+-----+----+-------+
only showing top 5 rows



In [17]:
# Number of rows in the time table (displayed as the cell result).
time.count()

6813

In [19]:
# Load the songplays table from its Parquet directory under output_data.
songplays_path = os.path.join(output_data, 'songplays')
songplays = spark.read.parquet(songplays_path)

In [20]:
# Inspect the songplays table: column schema first, then a five-row preview.
songplays.printSchema()
songplays.show(n=5)

root
 |-- start_time: long (nullable = true)
 |-- user_id: string (nullable = true)
 |-- level: string (nullable = true)
 |-- song_id: string (nullable = true)
 |-- artist_id: string (nullable = true)
 |-- session_id: long (nullable = true)
 |-- location: string (nullable = true)
 |-- user_agent: string (nullable = true)
 |-- songplays_id: long (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)

+----------+-------+-----+------------------+------------------+----------+--------------------+--------------------+------------+----+-----+
|start_time|user_id|level|           song_id|         artist_id|session_id|            location|          user_agent|songplays_id|year|month|
+----------+-------+-----+------------------+------------------+----------+--------------------+--------------------+------------+----+-----+
|1542837407|     15| paid|SOZCTXZ12AB0182364|AR5KOSW1187FB35FF4|       818|Chicago-Napervill...|"Mozilla/5.0 (X11...|  8589934592|201

In [21]:
# Number of rows in the songplays table (displayed as the cell result).
# NOTE(review): the recorded result is 1 row, while the time table holds 6813
# rows — presumably very few log events match a song in the songs table;
# confirm this is expected and not a join problem upstream.
songplays.count()

1