In [1]:
!pip install spark-nlp==1.7.3
!java -version
!python --version

Collecting spark-nlp==1.7.3
  Downloading spark_nlp-1.7.3-py2.py3-none-any.whl (72.8 MB)
[K     |████████████████████████████████| 72.8 MB 78 kB/s s eta 0:00:01
[?25hInstalling collected packages: spark-nlp
Successfully installed spark-nlp-1.7.3
openjdk version "1.8.0_242"
OpenJDK Runtime Environment (build 1.8.0_242-8u242-b08-0ubuntu3~18.04-b08)
OpenJDK 64-Bit Server VM (build 25.242-b08, mixed mode)
Python 3.7.6


In [2]:
import psycopg2
from pyspark.sql import SparkSession
from pyspark.sql.types import StringType


In [3]:
def create_spark_session():
    """
        Return the active Spark session, building a new one if none exists.

        The builder sets ``spark.jars.packages`` to
        ``org.apache.hadoop:hadoop-aws:2.7.0`` before calling ``getOrCreate()``.
    """
    builder = SparkSession.builder.config(
        "spark.jars.packages", "org.apache.hadoop:hadoop-aws:2.7.0"
    )
    return builder.getOrCreate()


# Session shared by every cell below.
spark = create_spark_session()

In [4]:
def check_parquet(parquet_path):
    ! ls 2>&1 -lh $parquet_path | head -10
    ! echo 'Parquet Files:' $(ls | wc -l)
    table_parquet = spark.read.parquet(parquet_path)
    print('DataFrame rows: %d' % table_parquet.count())
    print('DataFrame schema: %s' % table_parquet)
    table_parquet.show(5, False)
    return table_parquet

In [5]:
# Inspect the songplays Parquet output and keep the loaded DataFrame.
parquet_path = 'output/songplays_table'
songplays_table = check_parquet(parquet_path)

total 4.0K
-rw-r--r-- 1 anthelix users    0 Mar 27 15:35 _SUCCESS
drwxr-xr-x 3 anthelix users 4.0K Mar 27 15:35 year=2018
Parquet Files: 15
DataFrame rows: 21
DataFrame schema: DataFrame[songplay_id: bigint, start_time: timestamp, user_id: bigint, level: string, song_id: string, artist_id: string, session_id: string, location: string, user_agent: string, year: int, month: int]
+-----------+-----------------------+-------+-----+------------------+------------------+----------+---------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------+----+-----+
|songplay_id|start_time             |user_id|level|song_id           |artist_id         |session_id|location                               |user_agent                                                                                                                               |year|month|
+-----------+-----------------------+---

In [6]:
# Inspect the time Parquet output and keep the loaded DataFrame.
parquet_path = 'output/time_table'
time_table = check_parquet(parquet_path)

total 4.0K
-rw-r--r-- 1 anthelix users    0 Mar 27 15:35 _SUCCESS
drwxr-xr-x 3 anthelix users 4.0K Mar 27 15:35 year=2018
Parquet Files: 15
DataFrame rows: 6813
DataFrame schema: DataFrame[start_time: timestamp, hour: int, day: int, week: int, weekday: string, year: int, month: int]
+-----------------------+----+---+----+-------+----+-----+
|start_time             |hour|day|week|weekday|year|month|
+-----------------------+----+---+----+-------+----+-----+
|2018-11-10 17:50:52.796|17  |10 |45  |7      |2018|11   |
|2018-11-10 18:15:27.796|18  |10 |45  |7      |2018|11   |
|2018-11-10 19:08:52.796|19  |10 |45  |7      |2018|11   |
|2018-11-10 19:10:48.796|19  |10 |45  |7      |2018|11   |
|2018-11-10 19:14:26.796|19  |10 |45  |7      |2018|11   |
+-----------------------+----+---+----+-------+----+-----+
only showing top 5 rows



In [7]:
# Inspect the songs Parquet output and keep the loaded DataFrame.
parquet_path = 'output/songs_table'
songs_table = check_parquet(parquet_path)

total 84K
-rw-r--r--  1 anthelix users    0 Mar 27 15:34 _SUCCESS
drwxr-xr-x 43 anthelix users 4.0K Mar 27 15:34 year=0
drwxr-xr-x  3 anthelix users 4.0K Mar 27 15:34 year=1961
drwxr-xr-x  3 anthelix users 4.0K Mar 27 15:34 year=1964
drwxr-xr-x  3 anthelix users 4.0K Mar 27 15:34 year=1969
drwxr-xr-x  3 anthelix users 4.0K Mar 27 15:34 year=1972
drwxr-xr-x  3 anthelix users 4.0K Mar 27 15:34 year=1982
drwxr-xr-x  3 anthelix users 4.0K Mar 27 15:34 year=1984
drwxr-xr-x  3 anthelix users 4.0K Mar 27 15:34 year=1985
Parquet Files: 15
DataFrame rows: 71
DataFrame schema: DataFrame[song_id: string, title: string, duration: float, year: int, artist_id: string]
+------------------+----------------------------------------------------+---------+----+------------------+
|song_id           |title                                               |duration |year|artist_id         |
+------------------+----------------------------------------------------+---------+----+------------------+
|SOAOIBZ12AB0

In [8]:
# Inspect the users Parquet output and keep the loaded DataFrame.
parquet_path = 'output/users_table'
users_table = check_parquet(parquet_path)

total 4.0K
-rw-r--r-- 1 anthelix users 3.5K Mar 27 15:35 part-00000-2b27c1a1-41ea-4026-9bff-e2c83f8e34a5-c000.snappy.parquet
-rw-r--r-- 1 anthelix users    0 Mar 27 15:35 _SUCCESS
Parquet Files: 15
DataFrame rows: 96
DataFrame schema: DataFrame[user_id: bigint, first_name: string, last_name: string, gender: string, level: string]
+-------+----------+---------+------+-----+
|user_id|first_name|last_name|gender|level|
+-------+----------+---------+------+-----+
|2      |Jizelle   |Benjamin |F     |free |
|3      |Isaac     |Valdez   |M     |free |
|4      |Alivia    |Terrell  |F     |free |
|5      |Elijah    |Davis    |M     |free |
|6      |Cecilia   |Owens    |F     |free |
+-------+----------+---------+------+-----+
only showing top 5 rows



In [9]:
# Inspect the artists Parquet output and keep the loaded DataFrame.
parquet_path = 'output/artists_table'
artists_table = check_parquet(parquet_path)

total 276K
-rw-r--r-- 1 anthelix users 1.3K Mar 27 15:34 part-00000-794f61a5-648a-4b82-8c20-0837d0471a1f-c000.snappy.parquet
-rw-r--r-- 1 anthelix users 1.5K Mar 27 15:34 part-00001-794f61a5-648a-4b82-8c20-0837d0471a1f-c000.snappy.parquet
-rw-r--r-- 1 anthelix users 1.6K Mar 27 15:34 part-00002-794f61a5-648a-4b82-8c20-0837d0471a1f-c000.snappy.parquet
-rw-r--r-- 1 anthelix users 1.7K Mar 27 15:34 part-00003-794f61a5-648a-4b82-8c20-0837d0471a1f-c000.snappy.parquet
-rw-r--r-- 1 anthelix users 1.5K Mar 27 15:34 part-00004-794f61a5-648a-4b82-8c20-0837d0471a1f-c000.snappy.parquet
-rw-r--r-- 1 anthelix users 1.4K Mar 27 15:34 part-00005-794f61a5-648a-4b82-8c20-0837d0471a1f-c000.snappy.parquet
-rw-r--r-- 1 anthelix users 1.5K Mar 27 15:34 part-00006-794f61a5-648a-4b82-8c20-0837d0471a1f-c000.snappy.parquet
-rw-r--r-- 1 anthelix users 1.6K Mar 27 15:34 part-00007-794f61a5-648a-4b82-8c20-0837d0471a1f-c000.snappy.parquet
-rw-r--r-- 1 anthelix users 1.5K Mar 27 15:34 part-00008-794f61a5-648a-4b82-8

## Query the Parquet files directly

In [10]:
# Query the time table straight from its Parquet files through a temporary SQL view.
parquet_path = 'output/time_table'
# Named after the table it actually holds (this is the time table, not songplays).
parq_time = spark.read.parquet(parquet_path)
parq_time.createOrReplaceTempView("tmp_time")
# year is an integer column, so compare it with an integer literal rather than a string.
df = spark.sql("select * from tmp_time where year = 2018 and month >= 10")
df.show(5)

+--------------------+----+---+----+-------+----+-----+
|          start_time|hour|day|week|weekday|year|month|
+--------------------+----+---+----+-------+----+-----+
|2018-11-10 17:50:...|  17| 10|  45|      7|2018|   11|
|2018-11-10 18:15:...|  18| 10|  45|      7|2018|   11|
|2018-11-10 19:08:...|  19| 10|  45|      7|2018|   11|
|2018-11-10 19:10:...|  19| 10|  45|      7|2018|   11|
|2018-11-10 19:14:...|  19| 10|  45|      7|2018|   11|
+--------------------+----+---+----+-------+----+-----+
only showing top 5 rows



In [11]:
# NOTE(review): this cell repeats the preceding time-table query; consider
# removing it or pointing it at a different table.
parquet_path = 'output/time_table'
# Named after the table it actually holds (this is the time table, not songplays).
parq_time = spark.read.parquet(parquet_path)
parq_time.createOrReplaceTempView("tmp_time")
# year is an integer column, so compare it with an integer literal rather than a string.
df = spark.sql("select * from tmp_time where year = 2018 and month >= 10")
df.show(5)

+--------------------+----+---+----+-------+----+-----+
|          start_time|hour|day|week|weekday|year|month|
+--------------------+----+---+----+-------+----+-----+
|2018-11-10 17:50:...|  17| 10|  45|      7|2018|   11|
|2018-11-10 18:15:...|  18| 10|  45|      7|2018|   11|
|2018-11-10 19:08:...|  19| 10|  45|      7|2018|   11|
|2018-11-10 19:10:...|  19| 10|  45|      7|2018|   11|
|2018-11-10 19:14:...|  19| 10|  45|      7|2018|   11|
+--------------------+----+---+----+-------+----+-----+
only showing top 5 rows



In [12]:
# Query the songplays table straight from its Parquet files through a temporary SQL view.
parquet_path = 'output/songplays_table'
parq_songplays = spark.read.parquet(parquet_path)
parq_songplays.createOrReplaceTempView("tmp_songplay")
# year is an integer column, so compare it with an integer literal rather than a string.
df = spark.sql("select * from tmp_songplay where year = 2018 and month >= 10")
df.show(5)

+-----------+--------------------+-------+-----+------------------+------------------+----------+--------------------+--------------------+----+-----+
|songplay_id|          start_time|user_id|level|           song_id|         artist_id|session_id|            location|          user_agent|year|month|
+-----------+--------------------+-------+-----+------------------+------------------+----------+--------------------+--------------------+----+-----+
|          1|2018-11-21 21:56:...|     15| paid|SOZCTXZ12AB0182364|AR5KOSW1187FB35FF4|       818|Chicago-Napervill...|"Mozilla/5.0 (X11...|2018|   11|
| 8589934592|2018-11-24 03:48:...|     88| paid|SONWXQJ12A8C134D94|ARNF6401187FB57032|       888|Sacramento--Rosev...|"Mozilla/5.0 (Mac...|2018|   11|
|17179869187|2018-11-17 15:37:...|     73| paid|SOBONFF12A6D4F84D8|ARIK43K1187B9AE54C|       518|Tampa-St. Petersb...|"Mozilla/5.0 (Mac...|2018|   11|
| 8589934598|2018-11-26 17:57:...|     29| paid|SOQVMXR12A81C21483|ARKULSX1187FB45F84|       9

## Business Questions

* What are the busiest days of the week?
* What are the busiest times of the day?
* What are the top songs?

In [13]:
from math import pi
from bokeh.io import output_notebook, show
from bokeh.plotting import figure
from bokeh.models import ColumnDataSource
from bokeh.transform import factor_cmap, cumsum
from bokeh.palettes import Paired12

# Direct Bokeh output to the notebook so that show() renders plots inline.
output_notebook()

### What are the busiest days of the week?

In [14]:
# Expose the songplays Parquet data to Spark SQL as the view 'tmp_songplays'.
parquet_path = 'output/songplays_table'
parq_songplays = spark.read.parquet(parquet_path)
parq_songplays.createOrReplaceTempView('tmp_songplays')

# Expose the songs Parquet data to Spark SQL as the view 'tmp_song'.
parquet_path = 'output/songs_table'
parq_song = spark.read.parquet(parquet_path)
parq_song.createOrReplaceTempView('tmp_song')

In [15]:

# Count song plays per weekday name, most frequent first.
sql_query = (
    "SELECT date_format(start_time, 'EEEE') as day, count(*) as count "
    "FROM tmp_songplays "
    "WHERE session_id NOT LIKE 'NONE' AND session_id NOT LIKE 'Adjustment' "
    "GROUP BY day "
    "ORDER BY count DESC "
    "LIMIT 10"
)
df1 = spark.sql(sql_query)
df1.show(10, False)

+---------+-----+
|day      |count|
+---------+-----+
|Saturday |6    |
|Wednesday|4    |
|Monday   |4    |
|Tuesday  |3    |
|Thursday |3    |
|Sunday   |1    |
+---------+-----+



In [16]:
# Pie chart of song plays per weekday, built from the df1 query result.
data = df1.toPandas()
tooltips = [('day', '@day'), ('count', '@{count}{,}')]
days = data['day'].tolist()
color_map = factor_cmap(field_name='day', palette=Paired12, factors=days)

# Each wedge spans a fraction of the full circle equal to that day's share of plays.
data['angle'] = data['count'] / data['count'].sum() * 2 * pi
# The title previously read 'Items Sold/Day', which does not describe this data.
plot = figure(plot_height=450,
              plot_width=700,
              title='Song Plays per Day of the Week',
              tooltips=tooltips,
              x_range=(-0.5, 1.0))
plot.wedge(x=0,
           y=1,
           radius=0.4,
           start_angle=cumsum('angle', include_zero=True),
           end_angle=cumsum('angle'),
           line_color='white',
           fill_color=color_map,
           legend_field='day',
           source=data)
# Axes and grid carry no meaning on a pie chart, so hide them.
plot.axis.axis_label = None
plot.axis.visible = False
plot.grid.grid_line_color = None

show(plot)

### What are the busiest times of the day?

In [17]:
def time_increment(h, m):
    """
    Map a clock time onto the start of its 30-minute bucket.

    Parameters
    ----------
    h : int or str
        Hour of the day; it is zero-padded to two characters in the result.
    m : int or str
        Minute of the hour; converted with ``int()``.

    Returns
    -------
    str or None
        ``'HH:00'`` for minutes 0-29 and ``'HH:30'`` for minutes 30-59,
        e.g. ``'07:30'`` or ``'23:00'``. ``None`` when either input is None.
    """
    # A missing timestamp component yields a missing bucket instead of raising.
    if h is None or m is None:
        return None
    # Floor to the half hour with integer arithmetic. The previous
    # round(minutes / 60) put minute 30 into the ':00' bucket, because
    # round(0.5) rounds half to even and returns 0.
    half_hour = (int(m) // 30) * 30
    return str(h).rjust(2, '0') + ':' + str(half_hour).rjust(2, '0')


# Make time_increment callable from Spark SQL as udfTimeIncrement (returns a string).
spark.udf.register("udfTimeIncrement", time_increment, StringType())
# Count song plays per 30-minute period between 05:00 and 23:59.
# The ORDER BY belongs to the outer query: ordering applied inside a CTE is not
# guaranteed to be preserved by the SELECT that reads from it.
sql_query = (
    "WITH tmp_table AS ("
    "  SELECT udfTimeIncrement(date_format(start_time, 'HH'), date_format(start_time, 'mm')) as period, count(*) as count "
    "  FROM tmp_songplays "
    "  WHERE user_id NOT LIKE 'NONE' AND user_id NOT LIKE 'Adjustment' "
    "  GROUP BY period"
    ") "
    "SELECT period, count "
    "FROM tmp_table "
    "WHERE period BETWEEN '05:00' AND '23:59' "
    "ORDER BY period ASC"
)

df2 = spark.sql(sql_query)
df2.show(10, False)

+------+-----+
|period|count|
+------+-----+
|07:30 |2    |
|09:30 |1    |
|12:30 |1    |
|13:00 |1    |
|15:30 |3    |
|16:00 |1    |
|17:30 |1    |
|18:30 |1    |
|20:00 |4    |
|20:30 |2    |
+------+-----+
only showing top 10 rows



In [18]:
# Vertical bar chart of the per-period counts held in df2.
source = ColumnDataSource(data=df2.toPandas())
periods = source.data['period'].tolist()
tooltips = [('period', '@period'), ('count', '@{count}{,}')]

plot = figure(
    title='Users Connection/Hour',
    x_range=periods,
    x_axis_label='Hour of the Day',
    y_axis_label='Total Users Connection',
    plot_width=900,
    plot_height=450,
    min_border=0,
    tooltips=tooltips,
)
plot.vbar(x='period', top='count', bottom=0, width=0.9, source=source)

show(plot)

### What are the top songs?

In [19]:
# Ten most played song titles: songplays joined to songs on song_id.
sql_query = (
    "SELECT tsg.title, count(*) as count "
    "FROM tmp_songplays AS tss "
    "JOIN tmp_song AS tsg "
    "ON tsg.song_id=tss.song_id "
    "WHERE tss.song_id NOT LIKE 'NONE' AND tss.song_id NOT LIKE 'Adjustment' "
    "GROUP BY tsg.title "
    "ORDER BY count DESC "
    "LIMIT 10"
)

df3 = spark.sql(sql_query)
df3.show(10, False)

+----------------------------------------+-----+
|title                                   |count|
+----------------------------------------+-----+
|Streets On Fire (Explicit Album Version)|5    |
|Tonight Will Be Alright                 |4    |
|A Higher Place (Album Version)          |4    |
|Broken-Down Merry-Go-Round              |1    |
|Salt In NYC                             |1    |
|Setanta matins                          |1    |
|Face the Ashes                          |1    |
|The Ballad Of Sleeping Beauty           |1    |
|Harajuku Girls                          |1    |
|Der Kleine Dompfaff                     |1    |
+----------------------------------------+-----+



In [20]:
# Horizontal bar chart of the per-title counts held in df3.
source = ColumnDataSource(data=df3.toPandas())
tooltips = [('title', '@title'), ('count', '@{count}{,}')]
# Reverse the query order so the first title sits at the top of the y axis.
items = source.data['title'].tolist()[::-1]

plot = figure(
    title='Top title',
    y_range=items,
    x_axis_label='Total title listen',
    y_axis_label='title',
    plot_width=750,
    plot_height=375,
    min_border=0,
    tooltips=tooltips,
)
plot.hbar(y='title', right='count', height=.9, source=source)

show(plot)