In [1]:
!pip install spark-nlp==1.7.3



In [2]:
!java -version
!python --version

openjdk version "1.8.0_242"
OpenJDK Runtime Environment (build 1.8.0_242-8u242-b08-0ubuntu3~18.04-b08)
OpenJDK 64-Bit Server VM (build 25.242-b08, mixed mode)
Python 3.7.6


In [3]:
from pyspark.sql.types import TimestampType, StructType, StructField, StringType, FloatType, IntegerType, LongType
from pyspark.sql import types as T
from datetime import datetime
from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkConf
from pyspark.sql.types import StringType, DataType
import pandas as pd
from pyspark.sql.functions import udf,col
from pyspark.sql import functions as F
from pyspark.sql.functions import year, month, dayofmonth, hour, weekofyear
from pyspark.sql.functions import monotonically_increasing_id
import configparser
from pyspark.sql import SparkSession
import os

In [4]:
# MY FUNCTIONS

In [5]:
def check_parquet(parquet_path):
    ! ls 2>&1 -lh $parquet_path | head -10
    ! echo 'Parquet Files:' $(ls | wc -l)
    table_parquet = spark.read.parquet(parquet_path)
    print('DataFrame rows: %d' % table_parquet.count())
    print('DataFrame schema: %s' % table_parquet)
    table_parquet.show(10, False)
    return table_parquet

In [6]:
def clean_timestamp(df):
    # convert timestamps to date time from epoch time so we can get hour of the day
    get_timestamp = F.udf(lambda x: datetime.fromtimestamp(x/1000), T.TimestampType())
    # add a new column `formated_ts` in our dataframe
    df_log_copy = df.withColumn("formated_ts", get_timestamp(df.ts))
    df_formated = df_log_copy.dropna(subset='ts')
    return df_formated

In [7]:
def create_songs_table(df):
    table = df_song \
        .select("song_id", "title", "artist_id", "year", "duration") \
        .filter('song_id != "" and title != "" and artist_id != ""') \
        .sort("song_id") \
        .drop_duplicates(['song_id'])
    return(table) 

In [8]:
def write_parquet_song(table, parquet_path):
    table.write.partitionBy("year", "artist_id").parquet(parquet_path, mode = 'overwrite')

In [35]:
def write_parquet(table, parquet_path):
    table.write.parquet(parquet_path, mode = 'overwrite')

In [36]:
def write_parquet_time(table, parquet_path):
    table.write.partitionBy(['year', 'month']).parquet(parquet_path, mode = 'overwrite')

# GET AWS KEYS

In [9]:
config = configparser.ConfigParser()
config.read('dl.cfg')
os.environ['AWS_ACCESS_KEY_ID'] = config['AWS']['KEY']
os.environ['AWS_SECRET_ACCESS_KEY'] = config['AWS']['SECRET']

# INITIATE SPARK SESSION

In [10]:
def create_spark_session():
    """
        Create or load a Spark session
    """
    spark = SparkSession \
        .builder \
        .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:2.7.0") \
        .getOrCreate()
    return spark

spark = create_spark_session() 

# PROCESS SONG DATA

In [11]:
input_song = "s3a://udacity-dend/song_data/A/A/A/*.json"
#df =spark.read.format("json").load(song_data_path)

In [12]:
def process_song_data(spark, input_data):
    '''
    process song data
    
    return df_song
    '''
    # read  data file
    song_schema = StructType([
        StructField("num_songs", IntegerType()),
        StructField("artist_id", StringType()),
        StructField("artist_latitude", FloatType()),
        StructField("artist_longitude", FloatType()),
        StructField("artist_location", StringType()),
        StructField("artist_name", StringType()),
        StructField("song_id", StringType()),
        StructField("title", StringType()),
        StructField("duration", FloatType()),
        StructField("year", IntegerType())
    ])
    
    df_song = spark.read.json(input_data, schema = song_schema)
    print('DataFrame rows: %d' % df_song.count())
    df_song.printSchema()
    print('DataFrame schema: %s' % df_song)
    return df_song

In [13]:
df_song = process_song_data(spark, input_song)

DataFrame rows: 24
root
 |-- num_songs: integer (nullable = true)
 |-- artist_id: string (nullable = true)
 |-- artist_latitude: float (nullable = true)
 |-- artist_longitude: float (nullable = true)
 |-- artist_location: string (nullable = true)
 |-- artist_name: string (nullable = true)
 |-- song_id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- duration: float (nullable = true)
 |-- year: integer (nullable = true)

DataFrame schema: DataFrame[num_songs: int, artist_id: string, artist_latitude: float, artist_longitude: float, artist_location: string, artist_name: string, song_id: string, title: string, duration: float, year: int]


### Create songs Table

In [14]:
def create_songs_table(df):
    table = df_song \
        .select("song_id", "title", "artist_id", "year", "duration") \
        .filter('song_id != "" and title != "" and artist_id != ""') \
        .sort("song_id") \
        .drop_duplicates(['song_id'])
    return(table)   

In [34]:
# process and check
songs_table = create_songs_table(df_song)
parquet_path = 'output/songs_table'
write_parquet_song(songs_table, parquet_path)
check_parquet(parquet_path)

total 56K
-rw-r--r--  1 anthelix users    0 Mar 21 18:12 _SUCCESS
drwxr-xr-x 11 anthelix users 4.0K Mar 21 18:12 year=0
drwxr-xr-x  3 anthelix users 4.0K Mar 21 18:12 year=1969
drwxr-xr-x  3 anthelix users 4.0K Mar 21 18:12 year=1972
drwxr-xr-x  3 anthelix users 4.0K Mar 21 18:12 year=1978
drwxr-xr-x  3 anthelix users 4.0K Mar 21 18:12 year=1985
drwxr-xr-x  3 anthelix users 4.0K Mar 21 18:12 year=1989
drwxr-xr-x  3 anthelix users 4.0K Mar 21 18:12 year=2000
drwxr-xr-x  3 anthelix users 4.0K Mar 21 18:12 year=2001
Parquet Files: 20
DataFrame rows: 24
DataFrame schema: DataFrame[song_id: string, title: string, duration: float, year: int, artist_id: string]
+------------------+------------------------------------------------------+---------+----+------------------+
|song_id           |title                                                 |duration |year|artist_id         |
+------------------+------------------------------------------------------+---------+----+------------------+
|SOKTJD

DataFrame[song_id: string, title: string, duration: float, year: int, artist_id: string]

### Create Artists Table

In [17]:
def create_artists_table(df):
    table = df \
        .selectExpr("artist_id", "artist_name as name", "artist_location as location", "artist_latitude as latitude", "artist_longitude as longitude") \
        .filter('artist_id != "" and name != ""') \
        .sort("artist_id") \
        .drop_duplicates(['artist_id'])
    return table

In [19]:
# process and check
artists_table = create_artists_table(df_song)
parquet_path = 'output/artists_table'
write_parquet(artists_table, parquet_path)
check_parquet(parquet_path)

total 96K
-rw-r--r-- 1 anthelix users 1.5K Mar 21 18:01 part-00000-ad8067ba-35b4-4b07-90d3-07a73191c217-c000.snappy.parquet
-rw-r--r-- 1 anthelix users 1.7K Mar 21 18:01 part-00001-ad8067ba-35b4-4b07-90d3-07a73191c217-c000.snappy.parquet
-rw-r--r-- 1 anthelix users 1.4K Mar 21 18:01 part-00002-ad8067ba-35b4-4b07-90d3-07a73191c217-c000.snappy.parquet
-rw-r--r-- 1 anthelix users 1.6K Mar 21 18:01 part-00003-ad8067ba-35b4-4b07-90d3-07a73191c217-c000.snappy.parquet
-rw-r--r-- 1 anthelix users 1.6K Mar 21 18:01 part-00004-ad8067ba-35b4-4b07-90d3-07a73191c217-c000.snappy.parquet
-rw-r--r-- 1 anthelix users 1.7K Mar 21 18:01 part-00005-ad8067ba-35b4-4b07-90d3-07a73191c217-c000.snappy.parquet
-rw-r--r-- 1 anthelix users 1.5K Mar 21 18:01 part-00006-ad8067ba-35b4-4b07-90d3-07a73191c217-c000.snappy.parquet
-rw-r--r-- 1 anthelix users 1.7K Mar 21 18:01 part-00007-ad8067ba-35b4-4b07-90d3-07a73191c217-c000.snappy.parquet
-rw-r--r-- 1 anthelix users 1.5K Mar 21 18:01 part-00008-ad8067ba-35b4-4b07-90

DataFrame[artist_id: string, name: string, location: string, latitude: float, longitude: float]

# LOG_DATA

In [20]:
input_log = "s3a://udacity-dend/log_data/*/*/*.json"

In [21]:
def process_log_data(spark, input_log):
    '''
    process log data
    
    return df_log
    '''
    # read log data file
    log_schema = StructType([
        StructField("artist", StringType()),
        StructField("auth", StringType()),
        StructField("firstName", StringType()),
        StructField("gender", StringType()),
        StructField("itemInSession", IntegerType()),
        StructField("lastName", StringType()),
        StructField("length", FloatType()),    
        StructField("level", StringType()),
        StructField("location", StringType()),
        StructField("method", StringType()),
        StructField("page", StringType()),
        StructField("registration", FloatType()),
        StructField("sessionId", StringType()),
        StructField("song", StringType()),
        StructField("status", IntegerType()),
        StructField("ts", LongType()),
        StructField("userAgent", StringType()),
        StructField("userId", StringType())
    ])
    
    df_log_raw = spark.read.json(input_log, schema = log_schema)
    df_log_next = df_log_raw.filter("page='NextSong'")
    df_log_clean=clean_timestamp(df_log_next)
    
    print('DataFrame rows: %d' % df_log_clean.count())
    df_log_clean.printSchema()
    print('DataFrame schema: %s' % df_log_clean)
    return df_log_clean

In [22]:
df_log_clean = process_log_data(spark, input_log)

DataFrame rows: 6820
root
 |-- artist: string (nullable = true)
 |-- auth: string (nullable = true)
 |-- firstName: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- itemInSession: integer (nullable = true)
 |-- lastName: string (nullable = true)
 |-- length: float (nullable = true)
 |-- level: string (nullable = true)
 |-- location: string (nullable = true)
 |-- method: string (nullable = true)
 |-- page: string (nullable = true)
 |-- registration: float (nullable = true)
 |-- sessionId: string (nullable = true)
 |-- song: string (nullable = true)
 |-- status: integer (nullable = true)
 |-- ts: long (nullable = true)
 |-- userAgent: string (nullable = true)
 |-- userId: string (nullable = true)
 |-- formated_ts: timestamp (nullable = true)

DataFrame schema: DataFrame[artist: string, auth: string, firstName: string, gender: string, itemInSession: int, lastName: string, length: float, level: string, location: string, method: string, page: string, registration: float, 

### Create users table

In [24]:
def create_users_table(df):
    users_table = df \
        .filter('level != ""') \
        .orderBy("ts", ascending = False) \
        .coalesce(1)\
        .selectExpr("cast(userId as Long) user_id", "firstName as first_name", "lastName as last_name", "gender", "level")\
        .drop_duplicates(subset = ['user_id'])
    return users_table

In [26]:
# process and check
users_table = create_users_table(df_log_clean)
parquet_path = 'output/users_table'
write_parquet(users_table, parquet_path)
check_parquet(parquet_path)

total 4.0K
-rw-r--r-- 1 anthelix users 3.5K Mar 21 18:01 part-00000-8cae0843-69cd-4e54-ae27-29cbc1c121d8-c000.snappy.parquet
-rw-r--r-- 1 anthelix users    0 Mar 21 18:01 _SUCCESS
Parquet Files: 20
DataFrame rows: 96
DataFrame schema: DataFrame[user_id: bigint, first_name: string, last_name: string, gender: string, level: string]
+-------+----------+---------+------+-----+
|user_id|first_name|last_name|gender|level|
+-------+----------+---------+------+-----+
|2      |Jizelle   |Benjamin |F     |free |
|3      |Isaac     |Valdez   |M     |free |
|4      |Alivia    |Terrell  |F     |free |
|5      |Elijah    |Davis    |M     |free |
|6      |Cecilia   |Owens    |F     |free |
|7      |Adelyn    |Jordan   |F     |free |
|8      |Kaylee    |Summers  |F     |free |
|9      |Wyatt     |Scott    |M     |free |
|10     |Sylvie    |Cruz     |F     |free |
|11     |Christian |Porter   |F     |free |
+-------+----------+---------+------+-----+
only showing top 10 rows



DataFrame[user_id: bigint, first_name: string, last_name: string, gender: string, level: string]

### Create Time table

In [27]:
def create_time_table(df):
    time_df = df.select(
        col('formated_ts').alias("start_time"),
        hour(col('formated_ts')).alias('hour'),
        dayofmonth(col('formated_ts')).alias('day'),
        weekofyear(col('formated_ts')).alias('week'),
        month(col('formated_ts')).alias('month'),
        year(col('formated_ts')).alias('year')    
    ).drop_duplicates(['start_time'])
    print(time_df.show(2))
    print("ici1")
    time_table = time_df.withColumn('hour', F.hour('start_time')) \
                    .withColumn('day', F.dayofmonth('start_time')) \
                    .withColumn('year', F.year('start_time')) \
                    .withColumn('week', F.weekofyear('start_time')) \
                    .withColumn('month', F.month('start_time')) \
                    .withColumn('weekday', F.dayofweek('start_time').cast("string"))
    return time_table

In [28]:
#df_log_clean.show(2)
time_table = create_time_table(df_log_clean)
time_table.show(2)

+--------------------+----+---+----+-----+----+
|          start_time|hour|day|week|month|year|
+--------------------+----+---+----+-----+----+
|2018-11-21 06:18:...|   6| 21|  47|   11|2018|
|2018-11-14 15:20:...|  15| 14|  46|   11|2018|
+--------------------+----+---+----+-----+----+
only showing top 2 rows

None
ici1
+--------------------+----+---+----+-----+----+-------+
|          start_time|hour|day|week|month|year|weekday|
+--------------------+----+---+----+-----+----+-------+
|2018-11-21 06:18:...|   6| 21|  47|   11|2018|      4|
|2018-11-21 18:49:...|  18| 21|  47|   11|2018|      4|
+--------------------+----+---+----+-----+----+-------+
only showing top 2 rows



In [30]:
# process and check
time_table = create_time_table(df_log_clean)
parquet_path = 'output/time_table'
write_parquet_time(time_table, parquet_path)
check_parquet(parquet_path)

+--------------------+----+---+----+-----+----+
|          start_time|hour|day|week|month|year|
+--------------------+----+---+----+-----+----+
|2018-11-21 06:18:...|   6| 21|  47|   11|2018|
|2018-11-14 15:20:...|  15| 14|  46|   11|2018|
+--------------------+----+---+----+-----+----+
only showing top 2 rows

None
ici1
total 4.0K
-rw-r--r-- 1 anthelix users    0 Mar 21 18:02 _SUCCESS
drwxr-xr-x 3 anthelix users 4.0K Mar 21 18:02 year=2018
Parquet Files: 20
DataFrame rows: 6813
DataFrame schema: DataFrame[start_time: timestamp, hour: int, day: int, week: int, weekday: string, year: int, month: int]
+-----------------------+----+---+----+-------+----+-----+
|start_time             |hour|day|week|weekday|year|month|
+-----------------------+----+---+----+-------+----+-----+
|2018-11-15 16:36:45.796|16  |15 |46  |5      |2018|11   |
|2018-11-15 19:02:26.796|19  |15 |46  |5      |2018|11   |
|2018-11-21 15:26:35.796|15  |21 |47  |4      |2018|11   |
|2018-11-21 17:55:11.796|17  |21 |47  |

DataFrame[start_time: timestamp, hour: int, day: int, week: int, weekday: string, year: int, month: int]

### Create the songplays fact table

In [31]:
def create_songplays_table(tl, ts,tt):
    tl = df_log_clean.alias('tl')
    ts = df_song.alias('ts')
    
    inner_join = tl.join(ts, ((tl.artist == ts.artist_name) & (tl.artist == ts.artist_name)), how='inner')    
    songplays = inner_join.withColumn("songplay_id", monotonically_increasing_id())
    
    songplays_table = songplays.selectExpr("songplay_id",
                                    "formated_ts as start_time",
                                    "cast(userId as Long) user_id",
                                    "level",
                                    "song_id",
                                    "artist_id",
                                    "sessionId as session_id",
                                    "location",
                                    "userAgent as user_agent") \
                                .withColumn('year', F.year('start_time')) \
                                .withColumn('month', F.month('start_time'))

    return songplays_table

In [37]:
# process and check
songplays_table = create_songplays_table(df_log_clean, df_song, time_table)
parquet_path = 'output/songplays_table'
write_parquet_time(songplays_table, parquet_path)
check_parquet(parquet_path)

total 4.0K
-rw-r--r-- 1 anthelix users    0 Mar 21 18:15 _SUCCESS
drwxr-xr-x 3 anthelix users 4.0K Mar 21 18:15 year=2018
Parquet Files: 20
DataFrame rows: 10
DataFrame schema: DataFrame[songplay_id: bigint, start_time: timestamp, user_id: bigint, level: string, song_id: string, artist_id: string, session_id: string, location: string, user_agent: string, year: int, month: int]
+-----------+-----------------------+-------+-----+------------------+------------------+----------+-------------------------------------+---------------------------------------------------------------------------------------------------------------+----+-----+
|songplay_id|start_time             |user_id|level|song_id           |artist_id         |session_id|location                             |user_agent                                                                                                     |year|month|
+-----------+-----------------------+-------+-----+------------------+------------------+-------

DataFrame[songplay_id: bigint, start_time: timestamp, user_id: bigint, level: string, song_id: string, artist_id: string, session_id: string, location: string, user_agent: string, year: int, month: int]