In [1]:
!pip install spark-nlp==1.7.3



In [2]:
from pyspark import SparkContext, SparkConf
from pyspark.sql.types import StringType, DataType
import pandas as pd
from pyspark.sql.functions import udf,col
from pyspark.sql import functions as F
from pyspark.sql.functions import year, month, dayofmonth, hour, weekofyear
from pyspark.sql.functions import monotonically_increasing_id
from pyspark.sql import SparkSession
pd.set_option('max_colwidth', 800)
import matplotlib
from pyspark.sql.types import TimestampType, StructType, StructField, StringType, FloatType, IntegerType, LongType
from pyspark.sql import types as T
from datetime import datetime

In [3]:
def spark_session():
    spark = SparkSession.builder \
        .appName(" Sparkify Localy") \
        .config("spark.jars.packages", "JohnSnowLabs:spark-nlp:1.8.2") \
        .getOrCreate()
    return spark

In [4]:
spark = spark_session()
# get parameters
spark.sparkContext.getConf()

<pyspark.conf.SparkConf at 0x7f23e51162d0>

## Exploration Song_data

In [5]:
#input_data = "./song_data/A/A/A/*.json"
input_song = "./song_data/*/*/*/*.json"

In [6]:
def process_song_data(spark, input_data):
    '''
    process song data
    
    return df_song
    '''
    # read  data file
    song_schema = StructType([
        StructField("num_songs", IntegerType()),
        StructField("artist_id", StringType()),
        StructField("artist_latitude", FloatType()),
        StructField("artist_longitude", FloatType()),
        StructField("artist_location", StringType()),
        StructField("artist_name", StringType()),
        StructField("song_id", StringType()),
        StructField("title", StringType()),
        StructField("duration", FloatType()),
        StructField("year", IntegerType())
    ])
    
    df_song = spark.read.json(input_data, schema = song_schema)
    print('DataFrame rows: %d' % df_song.count())
    df_song.printSchema()
    print('DataFrame schema: %s' % df_song)
    return df_song

In [7]:
df_song = process_song_data(spark, input_song)

DataFrame rows: 71
root
 |-- num_songs: integer (nullable = true)
 |-- artist_id: string (nullable = true)
 |-- artist_latitude: float (nullable = true)
 |-- artist_longitude: float (nullable = true)
 |-- artist_location: string (nullable = true)
 |-- artist_name: string (nullable = true)
 |-- song_id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- duration: float (nullable = true)
 |-- year: integer (nullable = true)

DataFrame schema: DataFrame[num_songs: int, artist_id: string, artist_latitude: float, artist_longitude: float, artist_location: string, artist_name: string, song_id: string, title: string, duration: float, year: int]


In [8]:
df_song.show(3, False)

+---------+------------------+---------------+----------------+-----------------+----------------------------------------------------------------------------------------------+------------------+-------------------------------+---------+----+
|num_songs|artist_id         |artist_latitude|artist_longitude|artist_location  |artist_name                                                                                   |song_id           |title                          |duration |year|
+---------+------------------+---------------+----------------+-----------------+----------------------------------------------------------------------------------------------+------------------+-------------------------------+---------+----+
|1        |ARDR4AC1187FB371A1|null           |null            |                 |Montserrat Caballé;Placido Domingo;Vicente Sardinero;Judith Blegen;Sherrill Milnes;Georg Solti|SOBAYLL12A8C138AF9|Sono andati? Fingevo di dormire|511.16364|0   |
|1        |AREBBGV1187FB523D

# Functions use in all the programm

In [9]:
def check_parquet(parquet_path):
    ! ls 2>&1 -lh $parquet_path | head -10
    ! echo 'Parquet Files:' $(ls | wc -l)
    table_parquet = spark.read.parquet(parquet_path)
    print('DataFrame rows: %d' % table_parquet.count())
    print('DataFrame schema: %s' % table_parquet)
    table_parquet.show(10, False)
    return table_parquet

In [10]:
def clean_timestamp(df):
    # convert timestamps to date time from epoch time so we can get hour of the day
    get_timestamp = F.udf(lambda x: datetime.fromtimestamp(x/1000), T.TimestampType())
    # add a new column `formated_ts` in our dataframe
    df_log_copy = df.withColumn("formated_ts", get_timestamp(df.ts))
    df_formated = df_log_copy.dropna(subset='ts')
    return df_formated

### Create songs Table

In [11]:
def create_songs_table(df):
    table = df_song \
        .select("song_id", "title", "artist_id", "year", "duration") \
        .filter('song_id != "" and title != "" and artist_id != ""') \
        .sort("song_id") \
        .drop_duplicates(['song_id'])
    return(table)   

In [12]:
def write_parquet(table, parquet_path):
    table.write.partitionBy("year", "artist_id").parquet(parquet_path, mode = 'overwrite')

In [13]:
# process and check
songs_table = create_songs_table(df_song)
parquet_path = 'output/songs_table'
write_parquet(songs_table, parquet_path)
check_parquet(parquet_path)

total 84K
-rw-r--r--  1 anthelix users    0 Mar 22 02:31 _SUCCESS
drwxr-xr-x 43 anthelix users 4.0K Mar 22 02:31 year=0
drwxr-xr-x  3 anthelix users 4.0K Mar 22 02:31 year=1961
drwxr-xr-x  3 anthelix users 4.0K Mar 22 02:31 year=1964
drwxr-xr-x  3 anthelix users 4.0K Mar 22 02:31 year=1969
drwxr-xr-x  3 anthelix users 4.0K Mar 22 02:31 year=1972
drwxr-xr-x  3 anthelix users 4.0K Mar 22 02:31 year=1982
drwxr-xr-x  3 anthelix users 4.0K Mar 22 02:31 year=1984
drwxr-xr-x  3 anthelix users 4.0K Mar 22 02:31 year=1985
Parquet Files: 17
DataFrame rows: 71
DataFrame schema: DataFrame[song_id: string, title: string, duration: float, year: int, artist_id: string]
+------------------+----------------------------------------------------+---------+----+------------------+
|song_id           |title                                               |duration |year|artist_id         |
+------------------+----------------------------------------------------+---------+----+------------------+
|SOAOIBZ12AB0

DataFrame[song_id: string, title: string, duration: float, year: int, artist_id: string]

### Create Artists Table

In [14]:
def create_artists_table(df):
    table = df \
        .selectExpr("artist_id", "artist_name as name", "artist_location as location", "artist_latitude as latitude", "artist_longitude as longitude") \
        .filter('artist_id != "" and name != ""') \
        .sort("artist_id") \
        .drop_duplicates(['artist_id'])
    return table

In [15]:
def write_parquet(table, parquet_path):
    table.write.parquet(parquet_path, mode = 'overwrite')

In [16]:
# process and check
artists_table = create_artists_table(df_song)
parquet_path = 'output/artists_table'
write_parquet(artists_table, parquet_path)
check_parquet(parquet_path)

total 276K
-rw-r--r-- 1 anthelix users 1.3K Mar 22 02:31 part-00000-1fb9c16c-87ae-4cf5-8717-cc9324c6ae77-c000.snappy.parquet
-rw-r--r-- 1 anthelix users 1.5K Mar 22 02:31 part-00001-1fb9c16c-87ae-4cf5-8717-cc9324c6ae77-c000.snappy.parquet
-rw-r--r-- 1 anthelix users 1.6K Mar 22 02:31 part-00002-1fb9c16c-87ae-4cf5-8717-cc9324c6ae77-c000.snappy.parquet
-rw-r--r-- 1 anthelix users 1.7K Mar 22 02:31 part-00003-1fb9c16c-87ae-4cf5-8717-cc9324c6ae77-c000.snappy.parquet
-rw-r--r-- 1 anthelix users 1.5K Mar 22 02:31 part-00004-1fb9c16c-87ae-4cf5-8717-cc9324c6ae77-c000.snappy.parquet
-rw-r--r-- 1 anthelix users 1.4K Mar 22 02:31 part-00005-1fb9c16c-87ae-4cf5-8717-cc9324c6ae77-c000.snappy.parquet
-rw-r--r-- 1 anthelix users 1.5K Mar 22 02:31 part-00006-1fb9c16c-87ae-4cf5-8717-cc9324c6ae77-c000.snappy.parquet
-rw-r--r-- 1 anthelix users 1.6K Mar 22 02:31 part-00007-1fb9c16c-87ae-4cf5-8717-cc9324c6ae77-c000.snappy.parquet
-rw-r--r-- 1 anthelix users 1.5K Mar 22 02:31 part-00008-1fb9c16c-87ae-4cf5-8

DataFrame[artist_id: string, name: string, location: string, latitude: float, longitude: float]

## Exploration Log_data

In [17]:
input_log = "./log_data/*.json"

In [18]:
def process_log_data(spark, input_log):
    '''
    process log data
    
    return df_log, users_table, time_table, songplays_table
    '''
    # read log data file
    log_schema = StructType([
        StructField("artist", StringType()),
        StructField("auth", StringType()),
        StructField("firstName", StringType()),
        StructField("gender", StringType()),
        StructField("itemInSession", IntegerType()),
        StructField("lastName", StringType()),
        StructField("length", FloatType()),    
        StructField("level", StringType()),
        StructField("location", StringType()),
        StructField("method", StringType()),
        StructField("page", StringType()),
        StructField("registration", FloatType()),
        StructField("sessionId", StringType()),
        StructField("song", StringType()),
        StructField("status", IntegerType()),
        StructField("ts", LongType()),
        StructField("userAgent", StringType()),
        StructField("userId", StringType())
    ])
    
    df_log_raw = spark.read.json(input_log, schema = log_schema)
    df_log_next = df_log_raw.filter("page='NextSong'")
    df_log_clean=clean_timestamp(df_log_next)
    
    print('DataFrame rows: %d' % df_log_clean.count())
    df_log_clean.printSchema()
    print('DataFrame schema: %s' % df_log_clean)
    return df_log_clean

In [19]:
df_log_clean = process_log_data(spark, input_log)

DataFrame rows: 6820
root
 |-- artist: string (nullable = true)
 |-- auth: string (nullable = true)
 |-- firstName: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- itemInSession: integer (nullable = true)
 |-- lastName: string (nullable = true)
 |-- length: float (nullable = true)
 |-- level: string (nullable = true)
 |-- location: string (nullable = true)
 |-- method: string (nullable = true)
 |-- page: string (nullable = true)
 |-- registration: float (nullable = true)
 |-- sessionId: string (nullable = true)
 |-- song: string (nullable = true)
 |-- status: integer (nullable = true)
 |-- ts: long (nullable = true)
 |-- userAgent: string (nullable = true)
 |-- userId: string (nullable = true)
 |-- formated_ts: timestamp (nullable = true)

DataFrame schema: DataFrame[artist: string, auth: string, firstName: string, gender: string, itemInSession: int, lastName: string, length: float, level: string, location: string, method: string, page: string, registration: float, 

In [20]:
df_log_clean.show(1, False)

+--------+---------+---------+------+-------------+--------+--------+-----+----------------------------------+------+--------+-------------+---------+-------------+------+-------------+-----------------------------------------------------------------------------------------------------------------------------------------+------+-----------------------+
|artist  |auth     |firstName|gender|itemInSession|lastName|length  |level|location                          |method|page    |registration |sessionId|song         |status|ts           |userAgent                                                                                                                                |userId|formated_ts            |
+--------+---------+---------+------+-------------+--------+--------+-----+----------------------------------+------+--------+-------------+---------+-------------+------+-------------+---------------------------------------------------------------------------------------------------------

In [21]:
df_log_clean.take(2)

[Row(artist='Harmonia', auth='Logged In', firstName='Ryan', gender='M', itemInSession=0, lastName='Smith', length=655.7775268554688, level='free', location='San Jose-Sunnyvale-Santa Clara, CA', method='PUT', page='NextSong', registration=1541016649728.0, sessionId='583', song='Sehr kosmisch', status=200, ts=1542241826796, userAgent='"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/36.0.1985.125 Chrome/36.0.1985.125 Safari/537.36"', userId='26', formated_ts=datetime.datetime(2018, 11, 15, 0, 30, 26, 796000)),
 Row(artist='The Prodigy', auth='Logged In', firstName='Ryan', gender='M', itemInSession=1, lastName='Smith', length=260.07464599609375, level='free', location='San Jose-Sunnyvale-Santa Clara, CA', method='PUT', page='NextSong', registration=1541016649728.0, sessionId='583', song='The Big Gundown', status=200, ts=1542242481796, userAgent='"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/36.0.1985.125 Chro

### Create users table

In [22]:
def create_users_table(df):
    users_table = df \
        .filter('level != ""') \
        .orderBy("ts", ascending = False) \
        .coalesce(1)\
        .selectExpr("cast(userId as Long) user_id", "firstName as first_name", "lastName as last_name", "gender", "level")\
        .drop_duplicates(subset = ['user_id'])
    return users_table

In [23]:
def write_parquet(table, parquet_path):
    table.write.parquet(parquet_path, mode = 'overwrite')

In [24]:
# process and check
users_table = create_users_table(df_log_clean)
parquet_path = 'output/users_table'
write_parquet(users_table, parquet_path)
check_parquet(parquet_path)

total 4.0K
-rw-r--r-- 1 anthelix users 3.5K Mar 22 02:31 part-00000-0868f3d1-8577-4618-9626-3bbb5198df65-c000.snappy.parquet
-rw-r--r-- 1 anthelix users    0 Mar 22 02:31 _SUCCESS
Parquet Files: 17
DataFrame rows: 96
DataFrame schema: DataFrame[user_id: bigint, first_name: string, last_name: string, gender: string, level: string]
+-------+----------+---------+------+-----+
|user_id|first_name|last_name|gender|level|
+-------+----------+---------+------+-----+
|2      |Jizelle   |Benjamin |F     |free |
|3      |Isaac     |Valdez   |M     |free |
|4      |Alivia    |Terrell  |F     |free |
|5      |Elijah    |Davis    |M     |free |
|6      |Cecilia   |Owens    |F     |free |
|7      |Adelyn    |Jordan   |F     |free |
|8      |Kaylee    |Summers  |F     |free |
|9      |Wyatt     |Scott    |M     |free |
|10     |Sylvie    |Cruz     |F     |free |
|11     |Christian |Porter   |F     |free |
+-------+----------+---------+------+-----+
only showing top 10 rows



DataFrame[user_id: bigint, first_name: string, last_name: string, gender: string, level: string]

### Create Time table

In [25]:
def create_time_table(df):
    time_df = df.select(
        col('formated_ts').alias("start_time"),
        hour(col('formated_ts')).alias('hour'),
        dayofmonth(col('formated_ts')).alias('day'),
        weekofyear(col('formated_ts')).alias('week'),
        month(col('formated_ts')).alias('month'),
        year(col('formated_ts')).alias('year')    
    ).drop_duplicates(['start_time'])
    time_table = time_df.withColumn('hour', F.hour('start_time')) \
                    .withColumn('day', F.dayofmonth('start_time')) \
                    .withColumn('year', F.year('start_time')) \
                    .withColumn('week', F.weekofyear('start_time')) \
                    .withColumn('month', F.month('start_time')) \
                    .withColumn('weekday', F.dayofweek('start_time').cast("string"))
    return time_table
    

In [26]:
def write_parquet(table, parquet_path):
    table.write.partitionBy(['year', 'month']).parquet(parquet_path, mode = 'overwrite')

In [27]:
# process and check
time_table = create_time_table(df_log_clean)
parquet_path = 'output/time_table'
write_parquet(time_table, parquet_path)
check_parquet(parquet_path)

total 4.0K
-rw-r--r-- 1 anthelix users    0 Mar 22 02:31 _SUCCESS
drwxr-xr-x 3 anthelix users 4.0K Mar 22 02:31 year=2018
Parquet Files: 17
DataFrame rows: 6813
DataFrame schema: DataFrame[start_time: timestamp, hour: int, day: int, week: int, weekday: string, year: int, month: int]
+-----------------------+----+---+----+-------+----+-----+
|start_time             |hour|day|week|weekday|year|month|
+-----------------------+----+---+----+-------+----+-----+
|2018-11-15 16:36:45.796|16  |15 |46  |5      |2018|11   |
|2018-11-15 19:02:26.796|19  |15 |46  |5      |2018|11   |
|2018-11-21 15:26:35.796|15  |21 |47  |4      |2018|11   |
|2018-11-21 17:55:11.796|17  |21 |47  |4      |2018|11   |
|2018-11-21 18:49:29.796|18  |21 |47  |4      |2018|11   |
|2018-11-14 09:29:50.796|9   |14 |46  |4      |2018|11   |
|2018-11-28 18:39:53.796|18  |28 |48  |4      |2018|11   |
|2018-11-28 22:46:07.796|22  |28 |48  |4      |2018|11   |
|2018-11-05 12:26:13.796|12  |5  |45  |2      |2018|11   |
|2018-11

DataFrame[start_time: timestamp, hour: int, day: int, week: int, weekday: string, year: int, month: int]

### Create the songplays fact table

In [28]:
#time_table.createOrReplaceTempView('tmp_time')
print('Time schema: %s' % time_table)
#songs_table.createOrReplaceTempView('tmp_songs')
#print('Songs schema: %s' % songs_table)
#artists_table.createOrReplaceTempView('tmp_artists')
#print('Artists schema: %s' % artists_table)
#users_table.createOrReplaceTempView('tmp_users')
print('Users schema: %s' % users_table)
df_log_clean.createOrReplaceTempView('tmp_log')
print('LOG_CLEAN schema: %s' % df_log_clean)
df_song.createOrReplaceTempView('tmp_song')
print('DF_SONG: %s' % df_song)

Time schema: DataFrame[start_time: timestamp, hour: int, day: int, week: int, month: int, year: int, weekday: string]
Users schema: DataFrame[user_id: bigint, first_name: string, last_name: string, gender: string, level: string]
LOG_CLEAN schema: DataFrame[artist: string, auth: string, firstName: string, gender: string, itemInSession: int, lastName: string, length: float, level: string, location: string, method: string, page: string, registration: float, sessionId: string, song: string, status: int, ts: bigint, userAgent: string, userId: string, formated_ts: timestamp]
DF_SONG: DataFrame[num_songs: int, artist_id: string, artist_latitude: float, artist_longitude: float, artist_location: string, artist_name: string, song_id: string, title: string, duration: float, year: int]


In [29]:
def create_songplays_table(tl, ts,tt):
    tl = df_log_clean.alias('tl')
    ts = df_song.alias('ts')
    
    inner_join = tl.join(ts, ((tl.artist == ts.artist_name) & (tl.artist == ts.artist_name)), how='inner')    
    songplays = inner_join.withColumn("songplay_id", monotonically_increasing_id())
    
    songplays_table = songplays.selectExpr("songplay_id",
                                    "formated_ts as start_time",
                                    "cast(userId as Long) user_id",
                                    "level",
                                    "song_id",
                                    "artist_id",
                                    "sessionId as session_id",
                                    "location",
                                    "userAgent as user_agent",
                                    "year('formated_ts') as year",
                                    "month('formated_ts') as month")
    return songplays_table


In [30]:
def write_parquet(table, parquet_path):
    table.write.partitionBy(['year', 'month']).parquet(parquet_path, mode = 'overwrite')

In [31]:
# process and check
songplays_table = create_songplays_table(df_log_clean, df_song, time_table)
parquet_path = 'output/songplays_table'
write_parquet(songplays_table, parquet_path)
check_parquet(parquet_path)

total 4.0K
-rw-r--r-- 1 anthelix users    0 Mar 22 02:31 _SUCCESS
drwxr-xr-x 3 anthelix users 4.0K Mar 22 02:31 year=__HIVE_DEFAULT_PARTITION__
Parquet Files: 17
DataFrame rows: 21
DataFrame schema: DataFrame[songplay_id: bigint, start_time: timestamp, user_id: bigint, level: string, song_id: string, artist_id: string, session_id: string, location: string, user_agent: string, year: null, month: null]
+-----------+-----------------------+-------+-----+------------------+------------------+----------+---------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------+----+-----+
|songplay_id|start_time             |user_id|level|song_id           |artist_id         |session_id|location                               |user_agent                                                                                                                               |year|month|
+-----------+---

DataFrame[songplay_id: bigint, start_time: timestamp, user_id: bigint, level: string, song_id: string, artist_id: string, session_id: string, location: string, user_agent: string, year: null, month: null]