In [1]:
# Pinned install executed through the shell.
# NOTE(review): no cell visible in this notebook imports spark-nlp — confirm the install is still needed.
!pip install spark-nlp==1.7.3



In [2]:
# Record the Java and Python versions of the runtime.
!java -version
!python --version

openjdk version "1.8.0_242"
OpenJDK Runtime Environment (build 1.8.0_242-8u242-b08-0ubuntu3~18.04-b08)
OpenJDK 64-Bit Server VM (build 25.242-b08, mixed mode)
Python 3.7.6


In [3]:
from pyspark.sql.types import TimestampType, StructType, StructField, FloatType, IntegerType, LongType, StringType, DataType
from pyspark.sql.functions import year, month, dayofmonth, hour, weekofyear
from pyspark.sql.functions import monotonically_increasing_id
from pyspark import SparkContext, SparkConf
from pyspark.sql.functions import udf,col
from pyspark.sql import functions as F
from pyspark.sql import SparkSession
from pyspark.sql import types as T
from datetime import datetime
import configparser
import pandas as pd
import os

# MY FUNCTIONS

In [4]:
def check_parquet(parquet_path):
    ! ls 2>&1 -lh $parquet_path | head -10
    ! echo 'Parquet Files:' $(ls | wc -l)
    table_parquet = spark.read.parquet(parquet_path)
    print('DataFrame rows: %d' % table_parquet.count())
    print('DataFrame schema: %s' % table_parquet)
    table_parquet.show(10, False)
    return table_parquet

In [5]:
def clean_timestamp(df):
    """
    Add a `timestamp` column computed from the epoch-milliseconds column `ts`.

    df -- DataFrame with a numeric `ts` column (milliseconds since the epoch)
    return -- `df` plus a `timestamp` column, without the rows whose
              timestamp is null
    """
    def to_datetime(epoch_ms):
        # A null `ts` must become a null timestamp (removed below) rather than
        # raising a TypeError on `None / 1000` inside the UDF.
        if epoch_ms is None:
            return None
        return datetime.fromtimestamp(epoch_ms / 1000)

    get_timestamp = F.udf(to_datetime, T.TimestampType())
    # add the new column `timestamp` to the dataframe
    df_with_ts = df.withColumn("timestamp", get_timestamp(df.ts))
    # remove rows with empty ts value
    return df_with_ts.dropna(subset='timestamp')

In [6]:
def write_parquet_song(table, parquet_path):
    """
    Save `table` as parquet under `parquet_path`, partitioned by `year` and
    `artist_id`. Existing output at that path is overwritten.
    """
    partitioned_writer = table.write.partitionBy("year", "artist_id")
    partitioned_writer.parquet(parquet_path, mode='overwrite')

In [7]:
def write_parquet(table, parquet_path):
    """
    Save `table` as an unpartitioned parquet dataset under `parquet_path`.
    Existing output at that path is overwritten.
    """
    writer = table.write
    writer.parquet(parquet_path, mode='overwrite')

In [8]:
def write_parquet_time(table, parquet_path):
    """
    Save `table` as parquet under `parquet_path`, partitioned by `year` and
    `month`. Existing output at that path is overwritten.
    """
    partition_columns = ['year', 'month']
    partitioned_writer = table.write.partitionBy(partition_columns)
    partitioned_writer.parquet(parquet_path, mode='overwrite')

# GET AWS KEYS

In [9]:
def get_credentials(config_path='dl.cfg'):
    """
    Load the AWS keys from a config file into the process environment.

    Reads `KEY` and `SECRET` from the `[AWS]` section and exports them as
    `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY`.

    config_path -- path of the config file (default: 'dl.cfg')
    raises FileNotFoundError -- if `config_path` could not be read
    raises KeyError -- if the `[AWS]` section or one of its keys is missing
    """
    # parse file
    config = configparser.ConfigParser()
    # ConfigParser.read() skips unreadable files without raising and returns
    # the list of files it parsed, so an empty list means nothing was loaded.
    if not config.read(config_path):
        raise FileNotFoundError('AWS config file not found: %s' % config_path)
    # set AWS variables
    os.environ['AWS_ACCESS_KEY_ID'] = config['AWS']['KEY']
    os.environ['AWS_SECRET_ACCESS_KEY'] = config['AWS']['SECRET']
    
# Export the AWS keys to the environment; this runs before the Spark session cell below.
get_credentials()

# INITIATE SPARK SESSION

In [10]:
def create_spark_session():
    """
    Return the current Spark session, creating it when none exists yet.

    The builder is configured to pull the `hadoop-aws` package.
    """
    builder = SparkSession.builder
    builder = builder.config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:2.7.0")
    return builder.getOrCreate()

# Notebook-level session shared by every cell below; the bare name displays it.
spark = create_spark_session()
spark

In [11]:
# Source bucket prefix and local folder that receives the parquet output
input_data = "s3a://udacity-dend/"
output_data = "./output/"

# PROCESS SONG DATA

In [12]:

# Relative glob of the song files; only referenced by the commented-out line below
song_files = "song_data/A/A/A/*.json"
#input_song = input_data + song_files
# Full glob used to read the song data (same value as input_data + song_files)
input_song = "s3a://udacity-dend/song_data/A/A/A/*.json"

In [13]:
def process_song_data(spark, input_data):
    '''
    Read the song JSON files into a DataFrame using an explicit schema,
    then print that schema.

    spark -- active SparkSession
    input_data -- path or glob of the song JSON files to read
    return -- df_song, the song DataFrame
    '''
    # explicit schema of the song files
    song_schema = StructType([
        StructField("num_songs", IntegerType()),
        StructField("artist_id", StringType()),
        StructField("artist_latitude", FloatType()),
        StructField("artist_longitude", FloatType()),
        StructField("artist_location", StringType()),
        StructField("artist_name", StringType()),
        StructField("song_id", StringType()),
        StructField("title", StringType()),
        StructField("duration", FloatType()),
        StructField("year", IntegerType())
    ])
    # Read from the path passed by the caller, not from the notebook-level
    # `input_song` variable, so the function works for any input.
    df_song = spark.read.json(input_data, schema = song_schema)
    df_song.printSchema()
    print('DataFrame schema: %s' % df_song)
    return df_song

In [14]:
# Load the song data from the glob defined in `input_song`
df_song = process_song_data(spark, input_song)

root
 |-- num_songs: integer (nullable = true)
 |-- artist_id: string (nullable = true)
 |-- artist_latitude: float (nullable = true)
 |-- artist_longitude: float (nullable = true)
 |-- artist_location: string (nullable = true)
 |-- artist_name: string (nullable = true)
 |-- song_id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- duration: float (nullable = true)
 |-- year: integer (nullable = true)

DataFrame schema: DataFrame[num_songs: int, artist_id: string, artist_latitude: float, artist_longitude: float, artist_location: string, artist_name: string, song_id: string, title: string, duration: float, year: int]


### Create songs Table

* varchar : song_id, title, artist_id
* float : duration
* int: year
* NOT NULL : song_id, title, artist_id

In [15]:
def create_songs_table(df):
    """
    Build the songs table from the song data.

    Keeps one row per song_id, selects the song columns, drops rows with an
    empty song_id, title or artist_id, and sorts by song_id.

    df -- song DataFrame
    return -- DataFrame(song_id, title, artist_id, year, duration)
    """
    # Work on the DataFrame that was passed in rather than on the
    # notebook-level `df_song`, which the original read by mistake.
    table = (
        df
        .drop_duplicates(['song_id'])
        .select("song_id", "title", "artist_id", "year", "duration")
        .filter('song_id != "" and title != "" and artist_id != ""')
        .sort("song_id")
    )
    return table

In [16]:
# process and check: build the songs table and preview its first 2 rows
songs_table = create_songs_table(df_song)
songs_table.show(2)

+------------------+--------------------+------------------+----+---------+
|           song_id|               title|         artist_id|year| duration|
+------------------+--------------------+------------------+----+---------+
|SOABWAP12A8C13F82A|           Take Time|AR5LMPY1187FB573FE|1978|258.89914|
|SOAFBCP12A8C13CC7D|King Of Scurf (20...|ARTC1LV1187B9A4858|1972|301.40036|
+------------------+--------------------+------------------+----+---------+
only showing top 2 rows



In [17]:
# Write the songs table as parquet, partitioned by year and artist_id.
# No collect() here: its result was discarded, so it only pulled every row
# into driver memory before the write.
parquet_path = output_data + 'songs_table'
write_parquet_song(songs_table, parquet_path)
# check_parquet(parquet_path)

### Create Artists Table

* varchar : artist_id, name, location
* float : latitude, longitude
* NOT NULL : artist_id, name

In [18]:
def create_artists_table(df):
    """
    Build the artists table from the song data.

    Keeps one row per artist_id, renames the `artist_*` columns to their short
    names, drops rows with an empty artist_id or name, and sorts by artist_id.
    """
    unique_artists = df.drop_duplicates(['artist_id'])
    renamed = unique_artists.selectExpr(
        "artist_id",
        "artist_name as name",
        "artist_location as location",
        "artist_latitude as latitude",
        "artist_longitude as longitude",
    )
    identified = renamed.filter('artist_id != "" and name != ""')
    return identified.sort("artist_id")

In [19]:
# process and check: build the artists table and preview its first 2 rows
artists_table = create_artists_table(df_song)
artists_table.show(2)

+------------------+--------------------+--------------------+--------+---------+
|         artist_id|                name|            location|latitude|longitude|
+------------------+--------------------+--------------------+--------+---------+
|AR0MWD61187B9B2B12|International Noi...|                    |    null|     null|
|AR10USD1187B99F3F1|Tweeterfriendly M...|Burlington, Ontar...|    null|     null|
+------------------+--------------------+--------------------+--------+---------+
only showing top 2 rows



In [20]:
# Write the artists table as parquet.
# No collect() here: its result was discarded, so it only pulled every row
# into driver memory before the write.
# `output_data` already ends with '/', so the table name is appended without a
# leading slash (the original produced './output//artists_table').
parquet_path = output_data + 'artists_table'
write_parquet(artists_table, parquet_path)
# check_parquet(parquet_path)

In [21]:
# Show the path the artists table was just written to
print(parquet_path)

./output//artists_table


# LOG_DATA

In [22]:
# Glob matching every log JSON file in the source bucket
input_log = "s3a://udacity-dend/log_data/*/*/*.json"
print(input_log)

s3a://udacity-dend/log_data/*/*/*.json


In [23]:
# NOTE(review): this cell re-assigns `input_data` and `input_log` to the same
# values they already hold from earlier cells — it looks redundant; confirm.
input_data = "s3a://udacity-dend/"
#log_data = "log_data/*/*/*.json"
#input_log = input_data + log_data
input_log = "s3a://udacity-dend/log_data/*/*/*.json"

In [24]:
def process_log_data(spark, input_log):
    '''
    Read the log JSON files, keep the `NextSong` events and add a `timestamp`
    column derived from `ts`.

    Prints the raw and filtered row counts and the schema of the result.

    spark -- active SparkSession
    input_log -- path or glob of the log JSON files to read
    return -- the filtered log DataFrame with its `timestamp` column
    '''
    # explicit schema of the log files
    # NOTE(review): FloatType is single precision; if `registration` holds
    # epoch milliseconds its values get rounded — confirm whether a wider
    # type is needed.
    log_schema = StructType([
        StructField("artist", StringType()),
        StructField("auth", StringType()),
        StructField("firstName", StringType()),
        StructField("gender", StringType()),
        StructField("itemInSession", IntegerType()),
        StructField("lastName", StringType()),
        StructField("length", FloatType()),
        StructField("level", StringType()),
        StructField("location", StringType()),
        StructField("method", StringType()),
        StructField("page", StringType()),
        StructField("registration", FloatType()),
        StructField("sessionId", StringType()),
        StructField("song", StringType()),
        StructField("status", IntegerType()),
        StructField("ts", LongType()),
        StructField("userAgent", StringType()),
        StructField("userId", StringType())
    ])

    df_log_raw = spark.read.json(input_log, schema = log_schema)
    print('DataFrame raw: %d' % df_log_raw.count())
    # only song plays are relevant downstream
    df_log = df_log_raw.filter("page='NextSong'")
    print('DataFrame next: %d' % df_log.count())
    df_ts = clean_timestamp(df_log)
    df_ts.printSchema()
    # Print the DataFrame that is returned (with `timestamp`), so this line
    # agrees with printSchema() above; the original printed `df_log`.
    print('DataFrame schema: %s' % df_ts)
    return df_ts

In [25]:
# Load the log data (NextSong events only, with the added `timestamp` column)
df_log= process_log_data(spark, input_log)

DataFrame raw: 8056
DataFrame next: 6820
root
 |-- artist: string (nullable = true)
 |-- auth: string (nullable = true)
 |-- firstName: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- itemInSession: integer (nullable = true)
 |-- lastName: string (nullable = true)
 |-- length: float (nullable = true)
 |-- level: string (nullable = true)
 |-- location: string (nullable = true)
 |-- method: string (nullable = true)
 |-- page: string (nullable = true)
 |-- registration: float (nullable = true)
 |-- sessionId: string (nullable = true)
 |-- song: string (nullable = true)
 |-- status: integer (nullable = true)
 |-- ts: long (nullable = true)
 |-- userAgent: string (nullable = true)
 |-- userId: string (nullable = true)
 |-- timestamp: timestamp (nullable = true)

DataFrame schema: DataFrame[artist: string, auth: string, firstName: string, gender: string, itemInSession: int, lastName: string, length: float, level: string, location: string, method: string, page: string, reg

### Create users table

* bigint: user_id
* varchar: first_name, last_name, gender, level
* NOT NULL : user_id, level

In [26]:
def create_users_table(df):
    """
    Build the users table from the log data: one row per user, carrying the
    attributes of that user's most recent event (largest `ts`).

    df -- log DataFrame with userId, firstName, lastName, gender, level and ts
    return -- DataFrame(user_id, first_name, last_name, gender, level),
              sorted by user_id
    """
    # Local import: `Window` is not among the notebook's top-level imports.
    from pyspark.sql import Window

    # Rank each user's events from newest to oldest, so rank 1 is the latest.
    # Sorting after drop_duplicates (as before) could not choose which row of
    # a user survived the de-duplication.
    latest_first = Window.partitionBy("userId").orderBy(F.col("ts").desc())

    users_table = (
        df
        # One SQL predicate: the Python expression `'a' and 'b'` evaluates to
        # 'b' alone, which silently dropped the check on `level`.
        .filter('level != "" and userId != ""')
        .withColumn("event_rank", F.row_number().over(latest_first))
        .filter("event_rank = 1")
        .coalesce(1)
        .selectExpr("cast(userId as Long) user_id", "firstName as first_name", "lastName as last_name", "gender", "level")
        .sort('user_id')
    )
    return users_table

In [27]:
# process and check: build the users table and preview its first 5 rows
users_table = create_users_table(df_log)
users_table.show(5)

+-------+----------+---------+------+-----+
|user_id|first_name|last_name|gender|level|
+-------+----------+---------+------+-----+
|      2|   Jizelle| Benjamin|     F| free|
|      3|     Isaac|   Valdez|     M| free|
|      4|    Alivia|  Terrell|     F| free|
|      5|    Elijah|    Davis|     M| free|
|      6|   Cecilia|    Owens|     F| free|
+-------+----------+---------+------+-----+
only showing top 5 rows



In [28]:
# Write the users table as parquet.
# No collect() here: its result was discarded, so it only pulled every row
# into driver memory before the write.
parquet_path = 'output/users_table'
write_parquet(users_table, parquet_path)
# check_parquet(parquet_path)

### Create Time table

* timestamp: start_time
* int: hour, day, week, month, year
* varchar:  weekday
* NOT NULL : start_time

In [29]:
def create_time_table(df):
    """
    Build the time table from the distinct `timestamp` values of the log data.

    Each row holds the timestamp as `start_time` together with its hour, day
    of month, week of year, month, year and day of week (`weekday`, as a
    string), sorted by `start_time`.
    """
    moment = col('timestamp')
    time_table = (
        df
        .drop_duplicates(['timestamp'])
        .select(
            moment.alias('start_time'),
            hour(moment).alias('hour'),
            dayofmonth(moment).alias('day'),
            weekofyear(moment).alias('week'),
            month(moment).alias('month'),
            year(moment).alias('year'),
            F.dayofweek(moment).cast("string").alias('weekday'),
        )
        .sort('start_time')
    )
    return time_table

In [30]:
# process and check: build the time table and preview its first 20 rows
time_table = create_time_table(df_log)
time_table.show(20)

+--------------------+----+---+----+-----+----+-------+
|          start_time|hour|day|week|month|year|weekday|
+--------------------+----+---+----+-----+----+-------+
|2018-11-01 21:01:...|  21|  1|  44|   11|2018|      5|
|2018-11-01 21:05:...|  21|  1|  44|   11|2018|      5|
|2018-11-01 21:08:...|  21|  1|  44|   11|2018|      5|
|2018-11-01 21:11:...|  21|  1|  44|   11|2018|      5|
|2018-11-01 21:17:...|  21|  1|  44|   11|2018|      5|
|2018-11-01 21:24:...|  21|  1|  44|   11|2018|      5|
|2018-11-01 21:28:...|  21|  1|  44|   11|2018|      5|
|2018-11-01 21:42:...|  21|  1|  44|   11|2018|      5|
|2018-11-01 21:52:...|  21|  1|  44|   11|2018|      5|
|2018-11-01 21:55:...|  21|  1|  44|   11|2018|      5|
|2018-11-01 22:23:...|  22|  1|  44|   11|2018|      5|
|2018-11-02 01:25:...|   1|  2|  44|   11|2018|      6|
|2018-11-02 01:30:...|   1|  2|  44|   11|2018|      6|
|2018-11-02 01:34:...|   1|  2|  44|   11|2018|      6|
|2018-11-02 02:42:...|   2|  2|  44|   11|2018| 

In [31]:
# Write the time table as parquet, partitioned by year and month.
# No collect() here: its result was discarded, so it only pulled every row
# into driver memory before the write.
parquet_path = 'output/time_table'
write_parquet_time(time_table, parquet_path)
# check_parquet(parquet_path)

### Create the songplays fact table

* bigint: songplay_id, user_id
* timestamp : start_time
* varchar : level, song_id, artist_id, session_id, location, user_agent
* int: month, year
* NOT NULL : start_time, user_id, level, session_id



In [32]:
def create_songplays_table(df, ts):
    """
    Build the songplays fact table by matching log events to songs.

    df -- log DataFrame (uses artist, song, timestamp, userId, level,
          sessionId, location, userAgent)
    ts -- song DataFrame (uses title, artist_name, song_id, artist_id)
    return -- DataFrame(songplay_id, start_time, user_id, level, song_id,
              artist_id, session_id, location, user_agent, year, month)
    """
    # Use the DataFrames that were passed in; the original ignored both
    # parameters and read the notebook-level `df_log` / `df_song`.
    log_events = df.alias('tl')
    songs = ts.alias('sg')

    # A play matches a song on BOTH the song title and the artist name; the
    # original condition repeated the artist comparison twice, pairing every
    # play of an artist with every song of that artist.
    matched = log_events.join(
        songs,
        (log_events.song == songs.title) & (log_events.artist == songs.artist_name),
        how='inner')

    # One SQL predicate: chaining Python strings with `and` keeps only the
    # last one. The timestamp column is tested for null rather than compared
    # with an empty string.
    songplays = matched \
            .withColumn("songplay_id", monotonically_increasing_id()) \
            .filter('timestamp is not null and userId != "" and level != "" and sessionId != ""')

    songplays_table = songplays \
                    .selectExpr("songplay_id",
                                    "timestamp as start_time",
                                    "cast(userId as Long) user_id",
                                    "level",
                                    "song_id",
                                    "artist_id",
                                    "sessionId as session_id",
                                    "location",
                                    "userAgent as user_agent") \
                    .sort('songplay_id') \
                    .withColumn('year', F.year('start_time')) \
                    .withColumn('month', F.month('start_time'))

    return songplays_table

In [33]:
# process and check: build the songplays table and preview its first 5 rows
songplays_table = create_songplays_table(df_log, df_song)
songplays_table.show(5)

+-----------+--------------------+-------+-----+------------------+------------------+----------+--------------------+--------------------+----+-----+
|songplay_id|          start_time|user_id|level|           song_id|         artist_id|session_id|            location|          user_agent|year|month|
+-----------+--------------------+-------+-----+------------------+------------------+----------+--------------------+--------------------+----+-----+
|          0|2018-11-15 16:55:...|     42| paid|SONRWUU12AF72A4283|ARGE7G11187FB37E05|       404|New York-Newark-J...|"Mozilla/5.0 (Win...|2018|   11|
|          1|2018-11-21 05:30:...|     97| paid|SONRWUU12AF72A4283|ARGE7G11187FB37E05|       797|Lansing-East Lans...|"Mozilla/5.0 (X11...|2018|   11|
|          2|2018-11-28 16:54:...|     14| free|SOIGHOD12A8C13B5A1|ARY589G1187B9A9F4E|       929|       Red Bluff, CA|Mozilla/5.0 (Wind...|2018|   11|
|          3|2018-11-05 02:21:...|     44| paid|SONRWUU12AF72A4283|ARGE7G11187FB37E05|       2

In [34]:
# Rebuild the songplays table and write it as parquet partitioned by year and month.
# NOTE(review): the table is rebuilt with the same call as in the previous cell — confirm the rebuild is intended.
songplays_table = create_songplays_table(df_log, df_song)
parquet_path = 'output/songplays_table'
write_parquet_time(songplays_table, parquet_path)
# check_parquet(parquet_path)

In [37]:
# Check the data in each table and the parquet files.
# `-i` runs the script inside this notebook's namespace, so it can use the variables defined above.
%run -i '04_checkData.py'

+------------------+-----+
|song_id           |count|
+------------------+-----+
|SOABWAP12A8C13F82A|1    |
|SOAFBCP12A8C13CC7D|1    |
+------------------+-----+
only showing top 2 rows

Not Null with filter: 24 
Null with filter: 0 
+--------------------------------+-----+
|title                           |count|
+--------------------------------+-----+
|A Poor Recipe For Civic Cohesion|1    |
|Burn My Body (Album Version)    |1    |
+--------------------------------+-----+
only showing top 2 rows

Not Null with filter: 24 
Null with filter: 0 
+------------------+-----+
|artist_id         |count|
+------------------+-----+
|AR0MWD61187B9B2B12|1    |
|AR10USD1187B99F3F1|1    |
+------------------+-----+
only showing top 2 rows

Not Null with filter: 24 
Null with filter: 0 
Numbers of rows in songs_table : 24
Not Null song_id with filter: 24 
Null with song_id filter: 0 
Not Null title with filter: 24 
Null with title filter: 0 
Not Null artist_id with filter: 24 
Null with artist_id 

+------+-----+
|userId|count|
+------+-----+
|49    |689  |
|80    |665  |
|97    |557  |
|15    |463  |
|44    |397  |
+------+-----+
only showing top 5 rows

Not Null userId with filter: 6820 
Null userId with filter: 0 
+-----+-----+
|level|count|
+-----+-----+
|paid |5591 |
|free |1229 |
+-----+-----+

Not Null level with filter: 6820 
Null level with filter: 0 
Numbers of rows in users_table : 96
Not Null user_idwith filter: 96 
Null with user_id filter: 0 
total 4.0K
-rw-r--r-- 1 anthelix users    0 Mar 27 14:04 _SUCCESS
drwxr-xr-x 3 anthelix users 4.0K Mar 27 14:04 year=2018
Parquet Files: 16
DataFrame rows: 10
DataFrame schema: DataFrame[songplay_id: bigint, start_time: timestamp, user_id: bigint, level: string, song_id: string, artist_id: string, session_id: string, location: string, user_agent: string, year: int, month: int]
+-----------+-----------------------+-------+-----+------------------+------------------+----------+-------------------------------------+---------------

In [36]:
# Read the time table back from parquet and show 10 rows ordered by its partition columns
df9 = spark.read.parquet('output/time_table')
print('DataFrame rows: %d' % df9.count())
print('DataFrame schema: %s' % df9)
df9.select('start_time', 'hour', 'day', 'week', 'weekday', 'year', 'month') \
    .sort('year', 'month') \
    .show(10, False)

DataFrame rows: 6813
DataFrame schema: DataFrame[start_time: timestamp, hour: int, day: int, week: int, weekday: string, year: int, month: int]
+-----------------------+----+---+----+-------+----+-----+
|start_time             |hour|day|week|weekday|year|month|
+-----------------------+----+---+----+-------+----+-----+
|2018-11-10 17:50:52.796|17  |10 |45  |7      |2018|11   |
|2018-11-10 18:15:27.796|18  |10 |45  |7      |2018|11   |
|2018-11-10 19:08:52.796|19  |10 |45  |7      |2018|11   |
|2018-11-10 19:10:48.796|19  |10 |45  |7      |2018|11   |
|2018-11-10 19:14:26.796|19  |10 |45  |7      |2018|11   |
|2018-11-10 19:30:33.796|19  |10 |45  |7      |2018|11   |
|2018-11-10 19:35:14.796|19  |10 |45  |7      |2018|11   |
|2018-11-10 19:39:08.796|19  |10 |45  |7      |2018|11   |
|2018-11-10 19:43:48.796|19  |10 |45  |7      |2018|11   |
|2018-11-10 19:47:29.796|19  |10 |45  |7      |2018|11   |
+-----------------------+----+---+----+-------+----+-----+
only showing top 10 rows

