In [1]:
import configparser
from datetime import datetime
import os
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.functions import udf, col
from pyspark.sql.functions import year, month, dayofmonth, hour, weekofyear, date_format
from pyspark.sql.types import StructType as R, StructField as Fld, DoubleType as Dbl, StringType as Str, IntegerType as Int, DateType as Date

In [2]:
# Read the AWS credentials from the local dl.cfg file and export them as
# environment variables for the rest of the notebook.
config = configparser.ConfigParser()
# Open the file in a context manager so the handle is closed as soon as parsing
# finishes; a missing dl.cfg still raises instead of being silently skipped.
with open('dl.cfg') as cfg_file:
    config.read_file(cfg_file)

# Both keys are expected under the [AWS] section; a missing section or key
# raises KeyError here.
os.environ["AWS_ACCESS_KEY_ID"] = config['AWS']['AWS_ACCESS_KEY_ID']
os.environ["AWS_SECRET_ACCESS_KEY"] = config['AWS']['AWS_SECRET_ACCESS_KEY']

In [3]:
# Build the Spark session, requesting the hadoop-aws package via
# spark.jars.packages.
spark = (
    SparkSession.builder
    .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:2.7.0")
    .getOrCreate()
)

In [4]:
# Explicit schema for the song JSON files, declared as (column name, Spark type)
# pairs in the order the columns should appear. Every field keeps the default
# nullability.
songSchema = R([
    Fld(column_name, spark_type())
    for column_name, spark_type in (
        ("num_songs", Int),
        ("artist_id", Str),
        ("artist_latitude", Dbl),
        ("artist_longitude", Dbl),
        ("artist_location", Str),
        ("artist_name", Str),
        ("title", Str),
        ("duration", Dbl),
        ("year", Int),
    )
])

In [5]:
# Load the A/A/* subset of the song JSON files from S3, applying the explicit
# songSchema rather than letting Spark infer one.
df = (
    spark.read
    .schema(songSchema)
    .json("s3a://udacity-dend/song_data/A/A/*/*.json")
)

In [6]:
# Print the column names, types and nullability of the loaded DataFrame to
# confirm the explicit schema was applied.
df.printSchema()

root
 |-- num_songs: integer (nullable = true)
 |-- artist_id: string (nullable = true)
 |-- artist_latitude: double (nullable = true)
 |-- artist_longitude: double (nullable = true)
 |-- artist_location: string (nullable = true)
 |-- artist_name: string (nullable = true)
 |-- title: string (nullable = true)
 |-- duration: double (nullable = true)
 |-- year: integer (nullable = true)



In [7]:
# Preview up to 5 rows of the loaded song data as a text table.
df.show(5)

+---------+------------------+---------------+----------------+--------------------+--------------------+--------------------+----------+----+
|num_songs|         artist_id|artist_latitude|artist_longitude|     artist_location|         artist_name|               title|  duration|year|
+---------+------------------+---------------+----------------+--------------------+--------------------+--------------------+----------+----+
|        1|ARSUVLW12454A4C8B8|       35.83073|       -85.97874|           Tennessee|Royal Philharmoni...|Faust: Ballet Mus...|  94.56281|   0|
|        1|ARXQC081187FB4AD42|       54.31407|        -2.23001|                  UK|William Shatner_ ...|Exodus: Part I: M...|1047.71873|   0|
|        1|ARWUNH81187FB4A3E0|           null|            null|     Miami , Florida|         Trick Daddy|Take It To Da Hou...| 227.10812|2001|
|        1|ARTC1LV1187B9A4858|        51.4536|        -0.01802|Goldsmith's Colle...|  The Bonzo Dog Band|King Of Scurf (20...| 301.40036|1972|

In [8]:
# Number of rows loaded from the matched song files.
df.count()

604