# Set up

In [1]:
!git clone https://github.com/aliswh/lastfm
!cd lastfm; pip install -r requirements.txt

Cloning into 'lastfm'...
remote: Enumerating objects: 440, done.[K
remote: Counting objects: 100% (440/440), done.[K
remote: Compressing objects: 100% (298/298), done.[K
remote: Total 440 (delta 277), reused 289 (delta 139), pack-reused 0[K
Receiving objects: 100% (440/440), 3.13 MiB | 8.95 MiB/s, done.
Resolving deltas: 100% (277/277), done.
Collecting pylast
  Downloading pylast-4.5.0-py3-none-any.whl (25 kB)
Collecting pyspark
  Downloading pyspark-3.2.1.tar.gz (281.4 MB)
[K     |████████████████████████████████| 281.4 MB 36 kB/s 
Collecting py4j==0.10.9.3
  Downloading py4j-0.10.9.3-py2.py3-none-any.whl (198 kB)
[K     |████████████████████████████████| 198 kB 56.9 MB/s 
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.2.1-py2.py3-none-any.whl size=281853642 sha256=b69cbbee46efc0e3604044c8982c435ca695b5fb63d866544f5cc7c6a3f66e41
  Stored in directory: /root/.cache/pip/w

In [2]:
!wget https://storage.googleapis.com/hadoop-lib/gcs/gcs-connector-hadoop3-latest.jar
!cp gcs-connector-hadoop3-latest.jar /usr/local/lib/python3.7/dist-packages/pyspark/jars

--2022-03-02 07:24:01--  https://storage.googleapis.com/hadoop-lib/gcs/gcs-connector-hadoop3-latest.jar
Resolving storage.googleapis.com (storage.googleapis.com)... 108.177.120.128, 142.250.128.128, 142.251.6.128, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|108.177.120.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 31607894 (30M) [application/java-archive]
Saving to: ‘gcs-connector-hadoop3-latest.jar’


2022-03-02 07:24:02 (127 MB/s) - ‘gcs-connector-hadoop3-latest.jar’ saved [31607894/31607894]



In [3]:
import json
import datetime

import pyspark
from pyspark.sql import SparkSession
from py4j.java_gateway import UserHelpAutoCompletion
from lastfm.src.ingestion_layer.googlestorage import *
from lastfm.src.ingestion_layer.pylastsource import *
from lastfm.src.ingestion_layer.config import *
from lastfm.src.ingestion_layer.batchwriter import *
from lastfm.src.ingestion_layer.pysparkreader import *

# PySpark Context onto Google Cloud Storage

In [4]:
spark = SparkSession.builder.appName('A4BD Project').getOrCreate()
sc = pyspark.SparkContext.getOrCreate()
reader = PySparkReader(sc)
source = PyLastSource(AUTH_DATA) # for tests

In [5]:
storage = GoogleStorageJSON('./lastfm/src/ingestion_layer/creds.json',BUCKET_NAME)
writer = BatchWriter(source, storage)
# writer.write('test', SEED_USER, 3, 30, debug=True)

# Read from storage

In [6]:
recent_tracks_rdd = reader.read('recent_tracks',dir=True)
recent_tracks_rdd.count()

10

In [7]:
tracks_rdd = reader.read('tracks',dir=True)
tracks_rdd.count()

2132

In [8]:
artists_rdd = reader.read('artists',dir=True)
artists_rdd.count()

1246

# User features

1. Extract listening sessions from users:
  * list of concatenated songs with a play events +-10 sec after the actual song completed
2. Know statistics about these sessions:
  * average number of tracks
  * average session per user
  * session lengths

## Extract listening sessions

Preprocessing

In [9]:
def to_datetime(x):
  timestamp_date = lambda ts: datetime.fromtimestamp(int(ts))
  date_date = lambda ts: datetime.strptime(ts, '%d %b %Y, %H:%M')
  x[1]['date'] = date_date(x[1]['date'])
  x[1]['timestamp'] = timestamp_date(x[1]['timestamp'])
  return x

sessions = recent_tracks_rdd.map(lambda x: (x['user'], x['recent_tracks'])) \
                            .flatMap(lambda x: map(lambda val: (x[0], val), x[1])) \
                            .map(lambda x: to_datetime(x))
                            
sessions.take(1)

[('kurtphyre',
  {'album': "She Ain't Here: A Tribute to R.L. Burnside",
   'artist': 'The Juke Joint Highball',
   'date': datetime.datetime(2022, 2, 21, 18, 20),
   'id': '7605616a40ba65675bc43e4c3eb6954eaecfc420',
   'timestamp': datetime.datetime(2022, 2, 21, 18, 20, 35),
   'title': "Goin' Down South"})]

We keep only the `timestamp` because it gives more information w.r.t. `date`, which doesn't include the seconds.

Compute the elapsed time between each song on the sessions as the difference between timestamps (in milliseconds). Keep only the minimal value and the title and artist of the first song: this way, we get the track and the *time it passed from its starting time till the next track*.



In [10]:
def timestamp_to_delta(ts1, ts2):
  return int((ts2 - ts1).total_seconds() * 1000)

sessions = sessions.join(sessions) \
                   .filter(lambda x: x[1][0]['date'] < x[1][1]['date']) \
                   .map(lambda x: ((x[0], x[1][0]['title'], x[1][0]['artist'], x[1][0]['timestamp']), timestamp_to_delta(x[1][0]['timestamp'], x[1][1]['timestamp']))) \
                   .reduceByKey(lambda x,y: min(x,y))

sessions.take(1)  

[(('abernes',
   'Turandot: Nessun dorma',
   'Thomas Harper',
   datetime.datetime(2021, 2, 22, 23, 8, 48)),
  135000)]

In [11]:
sessions = sessions.map(lambda x: ((x[0][1], x[0][2]), (x[0][0], x[0][3], x[1])))
sessions.take(1)

[(('Turandot: Nessun dorma', 'Thomas Harper'),
  ('abernes', datetime.datetime(2021, 2, 22, 23, 8, 48), 135000))]

Now take the duration of each track to get, in the end, the difference between the elapsed time between two tracks and the duration of the first: 
* if the difference is `0`, it means the track has been played and another track started immediately after
* if the difference is `< 0`, it means the track has not been played in full
* if the difference is `> 0`, it means that some time passed after the track has been played in full, before getting to the next song. *This is what we are looking for.* If this value is bigger than some threshold, we consider the session finished. 

In [12]:
tr_rdd = tracks_rdd.map(lambda x: ((x['title'],x['artist']),x['duration']))
tr_rdd.take(10)

[(('Nightlight', 'Illenium'), 222000),
 (('Let Em Know', "ill' j"), 154000),
 (('Dlaczego nic', 'IRA'), 237000),
 (('Lida Rose / Will I Ever Tell You?', 'Shirley Jones'), 256000),
 (('Mr DG', 'Colman Brothers'), 256000),
 (('Senbonzakura', 'WagakkiBand'), 0),
 (('CZY TU JEST NASZ DOM?', 'blurred pink'), 277000),
 (('Que alegría', 'NVSCVR'), 152000),
 (('Mobbin', 'Grandtheft'), 186000),
 (('Ekans', 'Molife'), 141000)]

In [13]:
sessions_tracks = sessions.join(tr_rdd)\
                          .mapValues(lambda x: (x[0][0], x[0][1], x[1], x[0][2]))
sessions_tracks.take(10) # sx from listening session, dx from track

[(('Elegia', 'New Order'),
  ('stephiesama', datetime.datetime(2017, 8, 27, 20, 21, 50), 294000, 297000)),
 (('Te Desheredo', 'Los Tres'),
  ('Frax777', datetime.datetime(2022, 2, 9, 2, 20, 10), 288000, 207000)),
 (('Krywaniu', 'Gooral'),
  ('Gunthar666', datetime.datetime(2022, 2, 11, 9, 46, 24), 240000, 242000)),
 (('tempo futuro', 'Nicola Conte'),
  ('jumd', datetime.datetime(2022, 2, 17, 12, 46, 41), 277000, 148000)),
 (("Don't Matter Now", 'Abby Anderson'),
  ('abernes', datetime.datetime(2021, 2, 1, 5, 26, 24), 223000, 159000)),
 (('solo', 'Nicola Conte'),
  ('jumd', datetime.datetime(2022, 2, 16, 18, 27, 8), 147000, 147000)),
 (('Gospel (with Eminem)', 'Dr. Dre'),
  ('Gunthar666', datetime.datetime(2022, 2, 15, 11, 36, 46), 0, 211000)),
 (('Gospel (with Eminem)', 'Dr. Dre'),
  ('Gunthar666', datetime.datetime(2022, 2, 13, 7, 43, 53), 0, 211000)),
 (('Wild', 'Missy Lancaster'),
  ('abernes', datetime.datetime(2021, 2, 1, 6, 11, 12), 208000, 218000)),
 (("Tarantino's Tango", 'Gare

In [14]:
tr_rdd.filter(lambda x: x[1] == 0).take(3)

[(('Senbonzakura', 'WagakkiBand'), 0),
 (('Chopin: Piano Concerto No. 2 in F Minor, Op. 21 - III. Allegro vivace',
   'Benjamin Grosvenor, Royal Scottish National Orchestra, Elim Chan'),
  0),
 (('Often', 'Adas'), 0)]

**Problem**: missing duration for some tracks

**Fix**: take max duration from the tracks by that artist in the listening session to fill missing values.

This could still produce some imprecision, but we need to put a patch.

In [15]:
artist_max = sessions_tracks.map(lambda x: (x[0][1], x[1][2])) \
                            .reduceByKey(lambda x,y: max(x,y))
artist_max.take(10)

[('Volbeat', 374000),
 ('Carpenter Brut', 403000),
 ('The Hellacopters', 0),
 ('senio', 214000),
 ('Pedropiedra', 232000),
 ('Brandon Ray', 186000),
 ('Tommy Guerrero & Gadget', 207000),
 ('Bethel', 217000),
 ('Alice et Moi', 0),
 ('Anne-Sophie Mutter, Yo-Yo Ma, Daniel Barenboim, West-Eastern Divan Orchestra',
  316000)]

Some artist still don't have a duration value, but if they don't have it in the recent tracks, maybe they have in their top tracks! 

This could still not work, so we put a default value, like the average of all values for the tracks listened by that user in the recent tracks, because we presume that a user usually listens to songs of similar length.

In [16]:
artist_avg = artist_max.filter(lambda x: x[1] != 0).map(lambda x: (x[1],1)).reduce(lambda x,y: ((x[0]+y[0]),(x[1]+y[1])) )
artist_avg = artist_avg[0] // artist_avg[1]
artist_avg

248854

In [17]:
artist_max = artists_rdd.map(lambda x: (x['artist'], max([int(track['duration']) for track in x['top_tracks'] ]) )) \
                        .join(artist_max) \
                        .mapValues(lambda x: x[1] if x[1] != 0 else artist_avg)
artist_max.take(10)

[('Gianluca', 190000),
 ('Elöhim', 248854),
 ('Tim McGraw, Tyler Hubbard', 176000),
 ('Kamil Kowalski', 253000),
 ('Stevie Ray Vaughan and Double Trouble', 199000),
 ('DJ Antoine', 248854),
 ('Kid Francescoli', 391000),
 ('Stubby Kaye', 135000),
 ('Jerome The Prince', 150000),
 ('Snavs', 195000)]

Now merge duration per track in `tr_rdd`.

In [18]:
fixed_tr_rdd = tr_rdd.map(lambda x: (x[0][1], (x[0][0], x[1]))) \
                     .join(artist_max) \
                     .map(lambda x: ((x[1][0][0],x[0]), (x[1][0][1], x[1][1]))) 
fixed_tr_rdd.take(10)

[(('La tête dans les étoiles', 'Doctor Flake'), (257000, 257000)),
 (('Feeling Better', 'Gabriel Vitel'), (380000, 380000)),
 (('Feeling Better - Radio Version', 'Gabriel Vitel'), (0, 380000)),
 (("Don't Stop", 'Going Deeper'), (165000, 165000)),
 (('With the Moon Behind', 'Sleepy Clouds'), (213000, 213000)),
 (('Hear My Call (feat. Tyler Graves)', 'Trivecta'), (0, 248854)),
 (('Symphony No. 5: IV. Adagietto: Sehr langsam',
   'Polish National Radio Symphony Orchestra'),
  (0, 248854)),
 (('Two Hundred Grand', 'Sunshine'), (255000, 255000)),
 (('Always', 'Gary McPhail'), (0, 248854)),
 (('Levi Denim', 'Chase Martin'), (151000, 151000))]

In [19]:
fixed_tr_rdd = fixed_tr_rdd.mapValues(lambda x: x[0] if x[0] != 0 else x[1])
fixed_tr_rdd.take(10)

[(('La tête dans les étoiles', 'Doctor Flake'), 257000),
 (('Feeling Better', 'Gabriel Vitel'), 380000),
 (('Feeling Better - Radio Version', 'Gabriel Vitel'), 380000),
 (("Don't Stop", 'Going Deeper'), 165000),
 (('With the Moon Behind', 'Sleepy Clouds'), 213000),
 (('Hear My Call (feat. Tyler Graves)', 'Trivecta'), 248854),
 (('Symphony No. 5: IV. Adagietto: Sehr langsam',
   'Polish National Radio Symphony Orchestra'),
  248854),
 (('Two Hundred Grand', 'Sunshine'), 255000),
 (('Always', 'Gary McPhail'), 248854),
 (('Levi Denim', 'Chase Martin'), 151000)]

Finally compute the listening sessions.

In [20]:
sessions_tracks = sessions.join(fixed_tr_rdd)\
                          .mapValues(lambda x: (x[0][0], x[0][1], x[1], x[0][2]))
sessions_tracks.take(10) # sx from listening session, dx from track

[(('Silence (Feat. Sarah McLachlan) [Alyx Ander vs. Delerium] [Radio Edit]',
   'Delerium'),
  ('23kulpamens', datetime.datetime(2020, 1, 27, 9, 23, 13), 219000, 203000)),
 (('Silence (Feat. Sarah McLachlan) [Alyx Ander vs. Delerium] [Radio Edit]',
   'Delerium'),
  ('23kulpamens', datetime.datetime(2020, 1, 25, 9, 47, 5), 219000, 399000)),
 (('Silence (Feat. Sarah McLachlan) [Alyx Ander vs. Delerium] [Radio Edit]',
   'Delerium'),
  ('23kulpamens', datetime.datetime(2020, 1, 25, 9, 46, 32), 219000, 33000)),
 (('Vertical', 'Adrian Stresow'),
  ('junjindua', datetime.datetime(2022, 2, 14, 16, 23, 39), 192000, 194000)),
 (('Vertical', 'Adrian Stresow'),
  ('junjindua', datetime.datetime(2022, 2, 17, 11, 38, 55), 192000, 194000)),
 (('Vertical', 'Adrian Stresow'),
  ('junjindua', datetime.datetime(2022, 2, 17, 11, 42, 9), 192000, 192000)),
 (('Vertical', 'Adrian Stresow'),
  ('junjindua', datetime.datetime(2022, 2, 13, 17, 50, 10), 192000, 193000)),
 (('Vertical', 'Adrian Stresow'),
  ('j

In [21]:
to_min = lambda x: divmod(x//1000, 60)
to_date_string = lambda x: x.strftime("%Y-%m-%d, %H:%M:%S")

sessions_tracks = sessions_tracks.mapValues(lambda x: (x[0], (x[2]-x[3])//1000, to_min(x[2]), to_date_string(x[1]))) \
                                 .groupBy(lambda x: x[1][0]) \
                                 .map(lambda x : (x[0], list(x[1])))
sessions_tracks.take(1)

[('Frax777',
  [(('Stadium Arcadium', 'Red Hot Chili Peppers'),
    ('Frax777', 0, (5, 15), '2022-02-09, 22:29:34')),
   (('Dilo', 'Bronko Yotte'),
    ('Frax777', -597, (4, 24), '2022-02-11, 00:17:25')),
   (('Dilo', 'Bronko Yotte'), ('Frax777', 0, (4, 24), '2022-02-11, 00:13:00')),
   (('OK', 'Bronko Yotte'), ('Frax777', 60, (3, 38), '2022-02-11, 00:03:51')),
   (('OK', 'Bronko Yotte'), ('Frax777', -1, (3, 38), '2022-02-11, 00:00:11')),
   (('Este Año No Hay Cosecha', 'Pikette23'),
    ('Frax777', 0, (2, 50), '2022-02-04, 00:04:51')),
   (('Este Año No Hay Cosecha', 'Pikette23'),
    ('Frax777', -7, (2, 50), '2022-02-09, 23:18:23')),
   (('Este Año No Hay Cosecha', 'Pikette23'),
    ('Frax777', -59, (2, 50), '2022-02-04, 00:07:41')),
   (('Spoiler', 'Diego Lorenzini'),
    ('Frax777', 1, (3, 21), '2022-02-10, 14:06:51')),
   (('Spoiler', 'Diego Lorenzini'),
    ('Frax777', 0, (3, 21), '2022-02-10, 13:33:25')),
   (('Dejando Libre el Amor', 'Chancho En Piedra'),
    ('Frax777', -1, (4

if the difference is > 0, it means that some time passed after the track has been played in full, before getting to the next song. This is what we are looking for. If this value is bigger than some threshold, we consider the session finished.

If the song has a value `> 0 + threshold` (let's say 15 seconds), it means that it is the last song of the listening session, so from the next song we create a new listening session.

In [22]:
threshold = 15 # seconds

def get_sessions_per_user(x):
  #all_tracks = sorted(x[1], key=lambda x: x[1][3])
  all_tracks = x[1]
  sessions = []
  temp = []
  for track in all_tracks:
    temp.append(track)
    if track[1][1] > threshold:
      sessions.append(temp)
      temp = []
  return sessions

user_sessions = sessions_tracks.map(lambda x: (x[0],get_sessions_per_user(x)))
user_sessions.take(20)

[('Frax777',
  [[(('Stadium Arcadium', 'Red Hot Chili Peppers'),
     ('Frax777', 0, (5, 15), '2022-02-09, 22:29:34')),
    (('Dilo', 'Bronko Yotte'),
     ('Frax777', -597, (4, 24), '2022-02-11, 00:17:25')),
    (('Dilo', 'Bronko Yotte'),
     ('Frax777', 0, (4, 24), '2022-02-11, 00:13:00')),
    (('OK', 'Bronko Yotte'),
     ('Frax777', 60, (3, 38), '2022-02-11, 00:03:51'))],
   [(('OK', 'Bronko Yotte'), ('Frax777', -1, (3, 38), '2022-02-11, 00:00:11')),
    (('Este Año No Hay Cosecha', 'Pikette23'),
     ('Frax777', 0, (2, 50), '2022-02-04, 00:04:51')),
    (('Este Año No Hay Cosecha', 'Pikette23'),
     ('Frax777', -7, (2, 50), '2022-02-09, 23:18:23')),
    (('Este Año No Hay Cosecha', 'Pikette23'),
     ('Frax777', -59, (2, 50), '2022-02-04, 00:07:41')),
    (('Spoiler', 'Diego Lorenzini'),
     ('Frax777', 1, (3, 21), '2022-02-10, 14:06:51')),
    (('Spoiler', 'Diego Lorenzini'),
     ('Frax777', 0, (3, 21), '2022-02-10, 13:33:25')),
    (('Dejando Libre el Amor', 'Chancho En Pie

## Statistics about listening sessions

average number of tracks

average session per user

session lengths

In [23]:
def avg_tracks(x):
  r,c = 0, 1
  for y in x:
    r += len(y)
    c += 1
  return round(r/c, 3)

user_avg_tracks = user_sessions.map(lambda x: (x[0], avg_tracks(x[1])))
user_avg_tracks.take(10)

[('Frax777', 5.246),
 ('WhiteLegioN', 3.082),
 ('jumd', 3.773),
 ('kurtphyre', 2.22),
 ('stephiesama', 2.931),
 ('abernes', 2.136),
 ('Gunthar666', 6.614),
 ('junjindua', 2.838),
 ('jreisner', 5.259),
 ('23kulpamens', 2.56)]

In [24]:
def avg_session(x):
  r,c = 0, 1
  for y in x:
    r += len(x)
    c += 1
  return round(r/c, 3)

user_avg_session = user_sessions.map(lambda x: (x[0], avg_session(x[1])))
user_avg_session.take(10)

[('Frax777', 55.018),
 ('WhiteLegioN', 95.01),
 ('jumd', 73.013),
 ('kurtphyre', 130.008),
 ('stephiesama', 100.01),
 ('abernes', 138.007),
 ('Gunthar666', 42.023),
 ('junjindua', 103.01),
 ('jreisner', 52.019),
 ('23kulpamens', 114.009)]

In [25]:
to_sec = lambda x: x[0]*60+x[1]

def avg_session_len(x):
  r,c = 0, 1
  for y in x:
    r += to_sec(y[0][1][2])
    c += 1
  return round(r/c, 3)

user_avg_session_len = user_sessions.map(lambda x: (x[0], avg_session_len(x[1])))
user_avg_session_len.take(10)

[('Frax777', 245.421),
 ('WhiteLegioN', 220.866),
 ('jumd', 264.173),
 ('kurtphyre', 284.167),
 ('stephiesama', 288.794),
 ('abernes', 234.35),
 ('Gunthar666', 238.705),
 ('junjindua', 206.514),
 ('jreisner', 225.426),
 ('23kulpamens', 347.259)]

# Save results to Data Lake

In [28]:
user_sessions_collect = user_sessions.collect()

user_sessions_stats = {
    'user_avg_tracks': user_avg_tracks.collect(),
    'user_avg_session': user_avg_session.collect(),
    'user_avg_session_len' : user_avg_session_len.collect()
    }

In [29]:
dest_path = 'listening_sessions' + '/'
ls_path = dest_path + 'listening_sessions' + '/'
ls_stats_path = dest_path + 'listening_sessions_statistics' + '/'

for key,value in user_sessions_stats.items():
  storage.write(value, ls_stats_path+f"listening_sessions_{key}")

for user in user_sessions_collect:
  storage.write(user, ls_path+f"listening_sessions_{user[0]}")