# Setup

In [None]:
# Fetch the project sources and install the packages listed in its requirements.txt.
!git clone https://github.com/aliswh/lastfm
!cd lastfm; pip install -r requirements.txt
# Download the gcs-connector jar and copy it into PySpark's jars directory.
!wget https://storage.googleapis.com/hadoop-lib/gcs/gcs-connector-hadoop3-latest.jar
# NOTE(review): the destination hard-codes a Python 3.7 dist-packages path — confirm it matches the runtime's interpreter.
!cp gcs-connector-hadoop3-latest.jar /usr/local/lib/python3.7/dist-packages/pyspark/jars

In [None]:
# Refresh the existing lastfm checkout with the latest remote changes.
!cd lastfm; git pull

In [None]:
import pyspark
from pyspark.sql import SparkSession

from lastfm.src.ingestion_layer.googlestorage import *
from lastfm.src.ingestion_layer.pylastsource import *
from lastfm.src.ingestion_layer.batchwriter import *
from lastfm.src.ingestion_layer.pysparkreader import *
from lastfm.src.ingestion_layer.config import *

from functools import reduce
from operator import add
import datetime

In [None]:
# Create (or reuse) the Spark session first, then fetch the Spark context used below.
spark = SparkSession.builder.appName('A4BD Project').getOrCreate()
sc = pyspark.SparkContext.getOrCreate()
# NOTE(review): PySparkReader, GoogleStorageJSON and BUCKET_NAME presumably come from the
# star-imported lastfm ingestion-layer modules — confirm against the project sources.
reader = PySparkReader(sc)
storage = GoogleStorageJSON('./lastfm/src/ingestion_layer/creds.json',BUCKET_NAME)

# Code

In [1]:
#@markdown Helper functions
def to_date(x):
  """Parse a timestamp string such as '21 Feb 2022, 10:30' and return its calendar date.

  Args:
    x: string in the '%d %b %Y, %H:%M' format.

  Returns:
    The ``datetime.date`` part of the parsed timestamp (the time of day is dropped).
  """
  parsed = datetime.datetime.strptime(x, '%d %b %Y, %H:%M')
  return parsed.date()

def filter_date(rdd, date, today=None):
  """Keep only the records of ``rdd`` whose ``'date'`` falls inside a named period.

  Args:
    rdd: object exposing ``filter``; every record must be indexable by ``'date'``
      with a string that ``to_date`` can parse.
    date: period name, one of ``'today'``, ``'yesterday'``, ``'last week'``
      (the 7 days ending on the reference day) or ``'last month'`` (the 31 days
      ending on the reference day).
    today: ``datetime.date`` used as the reference day. Defaults to 21 Feb 2022
      so that results stay consistent between runs.

  Returns:
    The result of ``rdd.filter`` restricted to the requested period.

  Raises:
    KeyError: if ``date`` is not one of the supported period names.
  """
  if today is None:
    #today = datetime.date.today()
    today = datetime.date(2022, 2, 21) # To have consistent results
  yesterday = today - datetime.timedelta(days=1)
  # Every period is an inclusive (first day, last day) window. The ranges are
  # bounded above by the reference day so that records dated after it cannot
  # leak into 'last week' / 'last month'.
  windows = {'today': (today, today),
             'yesterday': (yesterday, yesterday),
             'last week': (today - datetime.timedelta(days=6), today),
             'last month': (today - datetime.timedelta(days=30), today)}
  first_day, last_day = windows[date]
  return rdd.filter(lambda x: first_day <= to_date(x['date']) <= last_day)

def topN(x, y, idx, reverse, top):
  """Merge two hits (or lists of hits) and keep the first ``top`` after sorting.

  Args:
    x, y: either a single record or a ``list`` of records; anything that is
      not exactly a ``list`` is treated as one record.
    idx: position inside each record of the value to sort by.
    reverse: sort descending when true, ascending otherwise.
    top: maximum number of records to keep.

  Returns:
    A new list with at most ``top`` records, ordered by ``record[idx]``.
  """
  def as_list(item):
    return item if type(item) == list else [item]

  merged = as_list(x) + as_list(y)
  merged.sort(key=lambda record: record[idx], reverse=reverse)
  return merged[:top]

def top_hits(rdd, top):
  """Return the ``top`` most frequent tracks of ``rdd``.

  Args:
    rdd: RDD of records indexable by ``'id'``, ``'title'`` and ``'artist'``;
      every record counts as one play of that track.
    top: maximum number of hits to return.

  Returns:
    A list of ``(title, artist, count)`` tuples ordered by decreasing count,
    holding at most ``top`` entries. The list is empty when there is nothing
    to rank or when the Spark action fails (the error is printed).
  """
  # reduceByKey sums the counts per track before the shuffle, instead of
  # shipping every single play across the cluster as groupByKey does.
  play_counts = rdd.map(lambda x: ((x['id'],x['title'],x['artist']),1))\
                   .reduceByKey(lambda a, b: a + b)\
                   .map(lambda x: (x[0][1],x[0][2],x[1]))
  try:
    # takeOrdered always yields a list (also for empty or single-record input)
    # and never returns more than `top` entries; negating the count sorts
    # descending.
    return play_counts.takeOrdered(top, key=lambda hit: -hit[2])
  except Exception as e:
    # Best effort, as before: report the failure and return no hits.
    print(e)
    return []

def join_tracks_info(rdd, tracks_rdd):
  """Join records with the track information that shares their ``'id'``.

  Args:
    rdd: RDD of dict records indexable by ``'id'``.
    tracks_rdd: RDD of dict records indexable by ``'id'``.

  Returns:
    A pair RDD ``(id, merged)`` where ``merged`` is a new dict holding the
    items of the ``rdd`` record followed by those of the matching
    ``tracks_rdd`` record; on duplicate keys the ``tracks_rdd`` value wins.
  """
  def by_id(record):
    return (record['id'], record)

  def merge(pair):
    return {**pair[0], **pair[1]}

  return rdd.map(by_id).join(tracks_rdd.map(by_id)).mapValues(merge)

def print_hit(hit):
  """Print a hit: first the raw record, then its title, artist and count, one per line.

  Args:
    hit: sequence whose first three items are the title, the artist and the count.
  """
  title, artist, count = hit[0], hit[1], hit[2]
  print(hit,
        f'Title: {title}',
        f'Artist: {artist}',
        f'Count: {count}',
        sep='\n')

def top_hits_date_genre(rdd, top, date='', genre='', tracks_rdd=None, bool_print=False):
  """Compute the top hits, optionally restricted to a period and/or a genre.

  Args:
    rdd: RDD of records, passed to ``filter_date``, ``join_tracks_info`` and
      ``top_hits``.
    top: maximum number of hits to return.
    date: period name understood by ``filter_date``; ``''`` keeps every record.
    genre: tag that must be contained in the joined record's ``'tags'``;
      ``''`` disables the genre filter.
    tracks_rdd: RDD with the track information; required when ``genre`` is set.
    bool_print: when true, print a header and every hit.

  Returns:
    The list produced by ``top_hits``. When ``genre`` is set but ``tracks_rdd``
    is missing, a message is printed and an empty list is returned, so every
    path returns a list.
  """
  if date != '':
    rdd = filter_date(rdd, date)
  if bool_print:
    print(f'[{date} greatest hits]' if date != '' else '[all time greatest hits]')

  if genre != '':
    if tracks_rdd is None:
      print('No tracks info provided')
      print()
      return []
    if bool_print:
      print(f'[{genre}]')
    # The join yields (id, merged dict); keep the dicts whose tags contain the
    # genre. .get() treats a record without a 'tags' key like one with empty tags.
    rdd = join_tracks_info(rdd, tracks_rdd)\
            .map(lambda x: x[1])\
            .filter(lambda x: x.get('tags') and genre in x['tags'])

  hits = top_hits(rdd, top)
  if bool_print:
    for hit in hits:
      print_hit(hit)
    print()
  return hits

def top_hits_all_dates(rdd, top=3, genres=[''], tracks_rdd=None, bool_print=False):
  """Run ``top_hits_date_genre`` for every period and every requested genre.

  Args:
    rdd: RDD of records forwarded to ``top_hits_date_genre``.
    top: maximum number of hits per result.
    genres: genres to evaluate; the default ``['']`` means "no genre filter".
    tracks_rdd: track information forwarded to ``top_hits_date_genre``.
    bool_print: forwarded to ``top_hits_date_genre``.

  Returns:
    ``{period: hits}`` when ``genres`` is ``['']``, otherwise
    ``{period: {genre: hits}}``, for the periods 'today', 'yesterday',
    'last week' and 'last month'.
  """
  per_genre = genres != ['']
  # Placeholder for one period; each period receives its own shallow copy.
  template = {name: {} for name in genres} if per_genre else {}
  periods = ['today','yesterday','last week','last month']
  results = {period: dict(template) for period in periods}
  for period in periods:
    for name in genres:
      hits = top_hits_date_genre(rdd, top, period, name, tracks_rdd, bool_print)
      if per_genre:
        results[period][name] = hits
      else:
        results[period] = hits
  return results

In [None]:
# Data gathering: read the three datasets and cache them, since they are reused below.
# Each 'recent_tracks' record is flattened into its individual entries.
recent_tracks_rdd = (
    reader.read('recent_tracks', dir=True)
          .flatMap(lambda record: record['recent_tracks'])
          .cache()
)
tracks_rdd = reader.read('tracks', dir=True).cache()
global_top_tags_rdd = reader.read('global_top_tags', dir=False).cache()

In [None]:
#Local speed-up (doesn't work for large datasets)
# Each RDD is collected to the driver and re-distributed with sc.parallelize under the
# same name, so collect() has to be able to return the whole dataset at once.
recent_tracks_rdd = sc.parallelize(recent_tracks_rdd.collect())
tracks_rdd = sc.parallelize(tracks_rdd.collect())
global_top_tags_rdd = sc.parallelize(global_top_tags_rdd.collect())

In [None]:
top = 3       # number of hits kept per result
n_genres = 5  # number of tags used as genres

# The first record of global_top_tags_rdd maps each tag to a value; the genres are the
# n_genres tags with the largest values.
genres = {
    tag_value[0]
    for tag_value in sorted(
        global_top_tags_rdd.collect()[0].items(),
        key=lambda tag_value: tag_value[1],
        reverse=True,
    )[:n_genres]
}

# Overall hits per period, then hits per period and genre (both printed while computed).
all_hits_dict = top_hits_all_dates(recent_tracks_rdd, top, bool_print=True)
genre_hits_dict = top_hits_all_dates(recent_tracks_rdd, top, genres, tracks_rdd, True)

[today greatest hits]
('Tin Pan Alley (AKA Roughest Place in Town)', 'Stevie Ray Vaughan', 5)
Title: Tin Pan Alley (AKA Roughest Place in Town)
Artist: Stevie Ray Vaughan
Count: 5
("Goin' Down South", 'Kenny Brown', 2)
Title: Goin' Down South
Artist: Kenny Brown
Count: 2
('While My Guitar Gently Weeps', 'The Beatles', 2)
Title: While My Guitar Gently Weeps
Artist: The Beatles
Count: 2

[yesterday greatest hits]
('Souled Out On You', 'Robert Finley', 6)
Title: Souled Out On You
Artist: Robert Finley
Count: 6
('Make Me Feel Alright', 'Robert Finley', 3)
Title: Make Me Feel Alright
Artist: Robert Finley
Count: 3
('Better Than I Treat Myself', 'Robert Finley', 3)
Title: Better Than I Treat Myself
Artist: Robert Finley
Count: 3

[last week greatest hits]
('Tin Pan Alley (AKA Roughest Place in Town)', 'Stevie Ray Vaughan', 6)
Title: Tin Pan Alley (AKA Roughest Place in Town)
Artist: Stevie Ray Vaughan
Count: 6
('Souled Out On You', 'Robert Finley', 6)
Title: Souled Out On You
Artist: Robert 

In [None]:
# Show both result dictionaries, separated by one blank line.
print(all_hits_dict, genre_hits_dict, sep='\n\n')

{'today': [('Tin Pan Alley (AKA Roughest Place in Town)', 'Stevie Ray Vaughan', 5), ("Goin' Down South", 'Kenny Brown', 2), ('While My Guitar Gently Weeps', 'The Beatles', 2)], 'yesterday': [('Souled Out On You', 'Robert Finley', 6), ('Make Me Feel Alright', 'Robert Finley', 3), ('Better Than I Treat Myself', 'Robert Finley', 3)], 'last week': [('Tin Pan Alley (AKA Roughest Place in Town)', 'Stevie Ray Vaughan', 6), ('Souled Out On You', 'Robert Finley', 6), ('I Dreamed A Dream', 'Randy Graff', 6)], 'last month': [("I Don't Want to Set the World on Fire", 'The Ink Spots', 9), ('Bacán tu casa', 'Gepe', 9), ('Goodnight My Someone', 'Shirley Jones', 9)]}

{'today': {'rock': [('Tin Pan Alley (AKA Roughest Place in Town)', 'Stevie Ray Vaughan', 5), ('While My Guitar Gently Weeps', 'The Beatles', 2), ('I Me Mine', 'The Beatles', 1)], 'indie': [('Colorvision', 'Com Truise', 1), ('Hey Now', 'London Grammar', 1)], 'seen live': [('Little Martha', 'The Allman Brothers Band', 1)], 'alternative': [

In [None]:
# Persist both result dictionaries through the storage helper created during setup.
storage.write(all_hits_dict, 'all_greatest_hits')
# Last expression of the cell, so its return value is what the notebook displays.
storage.write(genre_hits_dict, 'genre_greatest_hits')

'https://storage.googleapis.com/a4bd-project-2.appspot.com/genre_greatest_hits'