In [2]:
import configparser
from datetime import datetime
import os
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, col
from pyspark.sql.functions import year, month, dayofmonth, hour, weekofyear, date_format
from pyspark.sql.types import StructType, StructField, DoubleType, StringType, IntegerType, DateType

In [3]:
# Load AWS credentials from the local 'dl.cfg' file and export them as
# environment variables for the rest of the notebook.
config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed. Fail fast here instead of letting the lookups
# below raise a misleading NoSectionError when 'dl.cfg' is missing.
if not config.read('dl.cfg'):
    raise FileNotFoundError("Could not read 'dl.cfg' from the current working directory")

os.environ['AWS_ACCESS_KEY_ID'] = config.get('IAM_USER', 'AWS_ACCESS_KEY_ID')
os.environ['AWS_SECRET_ACCESS_KEY'] = config.get('IAM_USER', 'AWS_SECRET_ACCESS_KEY')

In [4]:
def create_spark_session():
    """Return a SparkSession configured with the hadoop-aws package.

    The session is obtained through ``getOrCreate``, so an already running
    session is reused instead of starting a second one.
    """
    builder = SparkSession.builder.config(
        "spark.jars.packages", "org.apache.hadoop:hadoop-aws:2.7.0"
    )
    return builder.getOrCreate()


# Notebook-wide session handle used by every cell below.
spark = create_spark_session()

In [5]:
# Bare expression as the cell's last line: shows the session object's repr as cell output.
spark

In [6]:
# Glob matching every song JSON file three directory levels below data/song_data.
song_data = 'data/song_data/*/*/*/*.json'

# Explicit schema: column names and types are fixed up front instead of being
# inferred from the JSON files.
song_schema = StructType([
    StructField(field_name, field_type)
    for field_name, field_type in (
        ('num_songs', IntegerType()),
        ('artist_id', StringType()),
        ('artist_latitude', DoubleType()),
        ('artist_longitude', DoubleType()),
        ('artist_location', StringType()),
        ('artist_name', StringType()),
        ('song_id', StringType()),
        ('title', StringType()),
        ('duration', DoubleType()),
        ('year', IntegerType()),
    )
])

df = spark.read.schema(song_schema).json(song_data)
df.show(5)

+---------+------------------+---------------+----------------+--------------------+--------------------+------------------+--------------------+---------+----+
|num_songs|         artist_id|artist_latitude|artist_longitude|     artist_location|         artist_name|           song_id|               title| duration|year|
+---------+------------------+---------------+----------------+--------------------+--------------------+------------------+--------------------+---------+----+
|        1|ARDR4AC1187FB371A1|           null|            null|                    |Montserrat Caball...|SOBAYLL12A8C138AF9|Sono andati? Fing...|511.16363|   0|
|        1|AREBBGV1187FB523D2|           null|            null|         Houston, TX|Mike Jones (Featu...|SOOLYAZ12A6701F4A6|Laws Patrolling (...|173.66159|   0|
|        1|ARMAC4T1187FB3FA4C|       40.82624|       -74.47995|   Morris Plains, NJ|The Dillinger Esc...|SOBBUGU12A8C13E95D|Setting Fire to S...|207.77751|2004|
|        1|ARPBNLO1187FB3D52F|    

In [11]:
# Number of rows loaded from the song JSON files.
df.count()

71

Schema for Song Play Analysis

Using the song and log datasets, you'll need to create a star schema optimized for queries on song play analysis. This includes the following tables.
Fact Table

    songplays - records in log data associated with song plays i.e. records with page NextSong
        songplay_id, start_time, user_id, level, song_id, artist_id, session_id, location, user_agent

Dimension Tables

    users - users in the app
        user_id, first_name, last_name, gender, level
    songs - songs in music database
        song_id, title, artist_id, year, duration
    artists - artists in music database
        artist_id, name, location, latitude, longitude
    time - timestamps of records in songplays broken down into specific units
        start_time, hour, day, week, month, year, weekday



In [10]:
# Columns of the `songs` dimension table, in the order given by the schema notes.
songs_columns = ['song_id', 'title', 'artist_id', 'year', 'duration']

# Project the raw song DataFrame down to the songs table.
df_songs = df.select(*songs_columns)

# Row count first, then a five-row preview.
print(df_songs.count())
df_songs.show(5)

71
+------------------+--------------------+------------------+----+---------+
|           song_id|               title|         artist_id|year| duration|
+------------------+--------------------+------------------+----+---------+
|SOBAYLL12A8C138AF9|Sono andati? Fing...|ARDR4AC1187FB371A1|   0|511.16363|
|SOOLYAZ12A6701F4A6|Laws Patrolling (...|AREBBGV1187FB523D2|   0|173.66159|
|SOBBUGU12A8C13E95D|Setting Fire to S...|ARMAC4T1187FB3FA4C|2004|207.77751|
|SOAOIBZ12AB01815BE|I Hold Your Hand ...|ARPBNLO1187FB3D52F|2000| 43.36281|
|SONWXQJ12A8C134D94|The Ballad Of Sle...|ARNF6401187FB57032|1994|  305.162|
+------------------+--------------------+------------------+----+---------+
only showing top 5 rows



In [13]:
# Row count after removing exact duplicate rows; compare with the plain count
# of df_songs to see whether the table contains duplicates.
df_songs.dropDuplicates().count()

71

In [18]:
# Register df_songs as the temporary view 'songs_sql' so it can be queried through spark.sql.
df_songs.createOrReplaceTempView('songs_sql')

In [21]:
# Count the songs whose `year` is 0.
# DataFrame.show() prints and returns None, so the query result is bound to the
# name first and displayed in a separate statement; chaining .show() onto the
# assignment left `songs_with_year_zero` set to None.
# The row filter lives in WHERE because it does not depend on an aggregate.
songs_with_year_zero = spark.sql('''
SELECT year, count(year) FROM songs_sql
WHERE year = 0
GROUP BY year
''')
songs_with_year_zero.show()

+----+-----------+
|year|count(year)|
+----+-----------+
|   0|         43|
+----+-----------+



In [14]:
# Persist the songs table as parquet, with one sub-directory per distinct `year` value.
# mode('overwrite') replaces any previous output: with the default save mode the
# write raises once 'parquet/songs/' exists, so the cell failed on every re-run.
df_songs.write.mode('overwrite').partitionBy('year').parquet('parquet/songs/')

In [17]:
# Read the songs table back from disk to check what was written.
parquet_songs = spark.read.format('parquet').load('parquet/songs')

# Five-row preview followed by the total row count.
parquet_songs.show(5)
print(parquet_songs.count())

+------------------+--------------------+------------------+---------+----+
|           song_id|               title|         artist_id| duration|year|
+------------------+--------------------+------------------+---------+----+
|SOUQQEA12A8C134B1B|           High Tide|ARIG6O41187B988BDD| 228.5971|   0|
|SOLLHMX12AB01846DC|   The Emperor Falls|AR1Y2PT1187FB5B9CE|484.62322|   0|
|SOPEGZN12AB0181B3D|Get Your Head Stu...|AREDL271187FB40F44| 45.66159|   0|
|SOBBXLX12A58A79DDA|Erica (2005 Digit...|AREDBBQ1187B98AFF5|138.63138|   0|
|SOTCKKY12AB018A141|Sonnerie lalaleul...|ARGSAFR1269FB35070| 29.54404|   0|
+------------------+--------------------+------------------+---------+----+
only showing top 5 rows

71


In [7]:

# Expose the raw song DataFrame to Spark SQL under the view name 'parquet_db'.
# NOTE(review): despite the name, `df` was read from the JSON song files, not from parquet.
df.createOrReplaceTempView('parquet_db')
# Build the artists dimension: rename the artist_* columns to the names used in
# the schema notes and keep only distinct rows.
# DISTINCT applies to the whole row, so an artist_id can still appear more than
# once if its other columns differ between source records.
artists_data = spark.sql('''
    SELECT DISTINCT artist_id, artist_name as name, artist_location as location, 
           artist_latitude as latitude, artist_longitude as longitude
    FROM parquet_db
''')
artists_data.show(5)
# A DataFrame-API version of this query (selectExpr + dropDuplicates) is in a later cell.

+------------------+------------+---------------+--------+----------+
|         artist_id|        name|       location|latitude| longitude|
+------------------+------------+---------------+--------+----------+
|ARPBNLO1187FB3D52F|    Tiny Tim|   New York, NY|40.71455| -74.00712|
|ARBEBBY1187B9B43DB|   Tom Petty|Gainesville, FL|    null|      null|
|AR0IAWL1187B9A96D0|Danilo Perez|         Panama|  8.4177| -80.11278|
|ARMBR4Y1187B9990EB|David Martin|California - SF|37.77916|-122.42005|
|ARD0S291187B9B7BF5|     Rated R|           Ohio|    null|      null|
+------------------+------------+---------------+--------+----------+
only showing top 5 rows



In [10]:
# SQL expressions that project and rename the artist_* columns to the artists
# table's column names.
artist_columns = [
    'artist_id',
    'artist_name as name',
    'artist_location as location',
    'artist_latitude as latitude',
    'artist_longitude as longitude',
]

# DataFrame-API version of the artists query: project, de-duplicate, preview.
df.selectExpr(*artist_columns).dropDuplicates().show(5)

+------------------+---------------+---------------+--------+----------+
|         artist_id|           name|       location|latitude| longitude|
+------------------+---------------+---------------+--------+----------+
|ARPBNLO1187FB3D52F|       Tiny Tim|   New York, NY|40.71455| -74.00712|
|ARXR32B1187FB57099|            Gob|               |    null|      null|
|AROGWRA122988FEE45|Christos Dantis|               |    null|      null|
|ARBGXIG122988F409D|     Steel Rain|California - SF|37.77916|-122.42005|
|AREVWGE1187B9B890A|     Bitter End|      Noci (BA)| -13.442|  -41.9952|
+------------------+---------------+---------------+--------+----------+
only showing top 5 rows



In [11]:
# Persist the artists table as parquet.
# mode('overwrite') replaces any previous output: with the default save mode the
# write raises once 'parquet/artists/' exists, so the cell failed on every re-run.
artists_data.write.mode('overwrite').parquet('parquet/artists/')

LOG data

In [13]:
# Glob matching every local log JSON file.
log_data = 'data/logs_data/*.json'

# The S3 copy of this data has two partition levels (year and month);
# the local copy read here has none.
df_log = spark.read.json(log_data)

# Peek at the first three rows as Row objects.
df_log.head(3)


[Row(artist='Harmonia', auth='Logged In', firstName='Ryan', gender='M', itemInSession=0, lastName='Smith', length=655.77751, level='free', location='San Jose-Sunnyvale-Santa Clara, CA', method='PUT', page='NextSong', registration=1541016707796.0, sessionId=583, song='Sehr kosmisch', status=200, ts=1542241826796, userAgent='"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/36.0.1985.125 Chrome/36.0.1985.125 Safari/537.36"', userId='26'),
 Row(artist='The Prodigy', auth='Logged In', firstName='Ryan', gender='M', itemInSession=1, lastName='Smith', length=260.07465, level='free', location='San Jose-Sunnyvale-Santa Clara, CA', method='PUT', page='NextSong', registration=1541016707796.0, sessionId=583, song='The Big Gundown', status=200, ts=1542242481796, userAgent='"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/36.0.1985.125 Chrome/36.0.1985.125 Safari/537.36"', userId='26'),
 Row(artist='Train', auth='Logged In'

In [None]:
# Get only NextSong events