<h1>Data exploration with spark</h1>
This is just to get taste of how the data looks and what formats it uses.


In [1]:
print('test')

test


<h2> Imports </h2>

In [47]:
import os
import configparser
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType
from pyspark.sql.functions import col, row_number, last
from pyspark.sql.window import Window

In [3]:
def create_spark_session():
    return SparkSession \
        .builder \
        .getOrCreate()

In [4]:
# config = configparser.ConfigParser()
# config.read_file(open('../src/dl.cfg'))
# os.environ['AWS_ACCESS_KEY_ID'] = config['AWS']['AWS_ACCESS_KEY_ID']
# os.environ['AWS_SECRET_ACCESS_KEY'] = config['AWS']['AWS_SECRET_ACCESS_KEY']
spark = create_spark_session()

In [6]:
log_df = spark.read.json('../data/log_data')

In [7]:
log_df.cache()
log_df.show()



+--------------------+----------+---------+------+-------------+--------+---------+-----+--------------------+------+--------+-----------------+---------+--------------------+------+-------------+--------------------+------+
|              artist|      auth|firstName|gender|itemInSession|lastName|   length|level|            location|method|    page|     registration|sessionId|                song|status|           ts|           userAgent|userId|
+--------------------+----------+---------+------+-------------+--------+---------+-----+--------------------+------+--------+-----------------+---------+--------------------+------+-------------+--------------------+------+
|            Harmonia| Logged In|     Ryan|     M|            0|   Smith|655.77751| free|San Jose-Sunnyval...|   PUT|NextSong|1.541016707796E12|      583|       Sehr kosmisch|   200|1542241826796|"Mozilla/5.0 (X11...|    26|
|         The Prodigy| Logged In|     Ryan|     M|            1|   Smith|260.07465| free|San Jose-Su

In [8]:
log_df.printSchema()


root
 |-- artist: string (nullable = true)
 |-- auth: string (nullable = true)
 |-- firstName: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- itemInSession: long (nullable = true)
 |-- lastName: string (nullable = true)
 |-- length: double (nullable = true)
 |-- level: string (nullable = true)
 |-- location: string (nullable = true)
 |-- method: string (nullable = true)
 |-- page: string (nullable = true)
 |-- registration: double (nullable = true)
 |-- sessionId: long (nullable = true)
 |-- song: string (nullable = true)
 |-- status: long (nullable = true)
 |-- ts: long (nullable = true)
 |-- userAgent: string (nullable = true)
 |-- userId: string (nullable = true)



In [9]:
schema = StructType([
    StructField("artist_id", StringType()),
    StructField("artist_latitude", FloatType()),
    StructField("artist_location", StringType()),
    StructField("artist_longitude", FloatType()),
    StructField("artist_name", StringType()),
    StructField("duration", FloatType()),
    StructField("num_songs", IntegerType()),
    StructField("song_id", StringType()),
    StructField("title", StringType()),
    StructField("year", IntegerType())
])
song_df = spark.read.json('../data/song_data/*/*/*/*.json', schema=schema)
song_df.cache()

DataFrame[artist_id: string, artist_latitude: float, artist_location: string, artist_longitude: float, artist_name: string, duration: float, num_songs: int, song_id: string, title: string, year: int]

In [17]:
song_df.show()
song_df.printSchema()

+------------------+---------------+--------------------+----------------+--------------------+---------+---------+------------------+--------------------+----+
|         artist_id|artist_latitude|     artist_location|artist_longitude|         artist_name| duration|num_songs|           song_id|               title|year|
+------------------+---------------+--------------------+----------------+--------------------+---------+---------+------------------+--------------------+----+
|ARDR4AC1187FB371A1|           null|                    |            null|Montserrat Caball...|511.16364|        1|SOBAYLL12A8C138AF9|Sono andati? Fing...|   0|
|AREBBGV1187FB523D2|           null|         Houston, TX|            null|Mike Jones (Featu...|173.66159|        1|SOOLYAZ12A6701F4A6|Laws Patrolling (...|   0|
|ARMAC4T1187FB3FA4C|       40.82624|   Morris Plains, NJ|       -74.47995|The Dillinger Esc...|207.77751|        1|SOBBUGU12A8C13E95D|Setting Fire to S...|2004|
|ARPBNLO1187FB3D52F|       40.7145

<h3>Processing song data</h3>

Song table
Requirements:

* song_id, title, artist_id, year, duration

In [15]:
song_table_df = song_df.select(
    song_df.song_id,
    song_df.title,
    song_df.artist_id,
    song_df.year,
    song_df.duration
)
song_table_df.show()

Artists table
Requirements:

* artist_id, name, location, lattitude, longitude

In [19]:
artists_table_df = song_df.select(
    song_df.artist_id,
    song_df.artist_name.alias('name'),
    song_df.artist_location.alias('location'),
    song_df.artist_latitude.alias('latitude'),
    song_df.artist_longitude.alias('longitude')
)
artists_table_df.show()

+------------------+--------------------+--------------------+--------+----------+
|         artist_id|                name|            location|latitude| longitude|
+------------------+--------------------+--------------------+--------+----------+
|ARDR4AC1187FB371A1|Montserrat Caball...|                    |    null|      null|
|AREBBGV1187FB523D2|Mike Jones (Featu...|         Houston, TX|    null|      null|
|ARMAC4T1187FB3FA4C|The Dillinger Esc...|   Morris Plains, NJ|40.82624| -74.47995|
|ARPBNLO1187FB3D52F|            Tiny Tim|        New York, NY|40.71455| -74.00712|
|ARDNS031187B9924F0|          Tim Wilson|             Georgia|32.67828| -83.22295|
|ARNF6401187FB57032|   Sophie B. Hawkins|New York, NY [Man...|40.79086| -73.96644|
|ARLTWXK1187FB5A3F8|         King Curtis|      Fort Worth, TX|32.74863| -97.32925|
|ARPFHN61187FB575F6|         Lupe Fiasco|         Chicago, IL|41.88415| -87.63241|
|ARI2JSK1187FB496EF|Nick Ingman;Gavyn...|     London, England|51.50632|  -0.12714|
|ARO

<h2>Logs</h2>
song plays

In [51]:
user_id_by_ts_window = Window.partitionBy(
    col('userId'))\
    .orderBy(col('ts'))
user_id_by_ts_window_ranged = Window.partitionBy(
    col('userId'))\
    .orderBy(col('ts'))\
    .rangeBetween(Window.unboundedPreceding, Window.currentRow)
next_song_log_df = log_df.filter(col('page') == 'NextSong')
next_song_log_df = next_song_log_df\
    .withColumn('user_row_num', row_number().over(user_id_by_ts_window))\
    .withColumn('firstName', last('firstName').over(user_id_by_ts_window_ranged))\
    .withColumn('lastName', last('lastName').over(user_id_by_ts_window_ranged))\
    .withColumn('gender', last('gender').over(user_id_by_ts_window_ranged))\
    .withColumn('level', last('level').over(user_id_by_ts_window_ranged))\
    .select('lastName', 'gender', 'level', 'userid','user_row_num')\
    .where(col('user_row_num') == 1)
next_song_log_df.show(1000, False)

+---------+------+-----+------+------------+
|lastName |gender|level|userid|user_row_num|
+---------+------+-----+------+------------+
|Burke    |F     |free |51    |1           |
|Jordan   |F     |free |7     |1           |
|Koch     |F     |paid |15    |1           |
|Cook     |M     |free |54    |1           |
|Fox      |M     |free |101   |1           |
|Porter   |F     |free |11    |1           |
|Lynch    |F     |free |29    |1           |
|Simpson  |F     |free |69    |1           |
|Barrett  |M     |paid |42    |1           |
|Klein    |M     |paid |73    |1           |
|Lee      |M     |free |87    |1           |
|Calhoun  |F     |free |64    |1           |
|Valdez   |M     |free |3     |1           |
|Watkins  |F     |paid |30    |1           |
|Ayala    |F     |free |34    |1           |
|Cooper   |F     |free |59    |1           |
|Summers  |F     |free |8     |1           |
|Wilson   |F     |free |22    |1           |
|West     |M     |free |28    |1           |
|Young    

In [33]:
# next_song_log_df.where(col('auth') != 'Logged In').show()


+------+----+---------+------+-------------+--------+------+-----+--------+------+----+------------+---------+----+------+---+---------+------+
|artist|auth|firstName|gender|itemInSession|lastName|length|level|location|method|page|registration|sessionId|song|status| ts|userAgent|userId|
+------+----+---------+------+-------------+--------+------+-----+--------+------+----+------------+---------+----+------+---+---------+------+
+------+----+---------+------+-------------+--------+------+-----+--------+------+----+------------+---------+----+------+---+---------+------+

