In [None]:
import os
import json
import pandas as pd
import logging
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta
from dateutil import parser
import requests
from typing import Literal
import itertools
from pyspark.sql import SparkSession
from pyspark.sql.dataframe import DataFrame as pyspark_df

In [None]:
class ExtractInfo:

    def __init__(self, endpoint: Literal['history', 'charts'] = 'history'):
        self.endpoint = endpoint
        self.process_date = datetime.now()
        self.interval = None
        self.period = None
        self.start_date = None
        self.end_date = None
        self.chart_id = itertools.count(start=1)
        self.spark = spark

        with open('/tmp/datalake/deezer/deezer_credentials.json') as f:
            deezer_credentials = json.load(f)

        self.user_id = deezer_credentials['user_id']
        self.access_token = deezer_credentials['access_token']

    def date_limit(self, interval: Literal['days', 'weeks', 'month'] = 'days', period: int = 1, start_date: str | None = None, end_date: str | None = None):

        self.interval = interval
        self.period = period
        self.start_date = start_date
        self.end_date = end_date

        kwargs_relativedelta = {interval: period}
        diff_dt = relativedelta(**kwargs_relativedelta)

        if start_date is None and end_date is None:
            return self.process_date - diff_dt, self.process_date
        elif start_date is not None and end_date is None:
            return parser.parse(start_date) + diff_dt, self.process_date
        elif start_date is None and end_date is not None:
            return parser.parse('2000-01-01'), parser.parse(end_date) - diff_dt
        elif start_date is not None and end_date is not None:
            if start_date <= end_date:
                return parser.parse(start_date), parser.parse(end_date)
            else:
                print('Data inicial maior que a data final, filtro padrão será aplicado')
                return self.process_date - diff_dt, self.process_date
            
    def _filter_date(self, page_data: list):

        lower_limit, upper_limit = self.date_limit()

        return [*filter(lambda x: datetime.fromtimestamp(x['timestamp']) >= lower_limit 
                                  and datetime.fromtimestamp(x['timestamp']) <= upper_limit, 
                        page_data)]
    
    def _add_data(self, page_data: list):
        return [*map(lambda x: {**x, **{'chart': next(self.chart_id), 'timestamp': int(self.process_date.timestamp())}}, page_data)]
    
    def _process_payload(self, payload: list):
        if self.endpoint == 'history':
            return self._filter_date(page_data=payload.get('data'))
        elif self.endpoint == 'charts':
            return self._add_data(page_data=payload.get('data'))
        else:
            return payload.get('data')

    def _fetch_page(self, next_page: str | None = None):

        if next_page is None:
            api_url = f'https://api.deezer.com/user/{self.user_id}/{self.endpoint}'
            response = requests.get(
                api_url, 
                params = {'access_token': self.access_token})
        else:
            response = requests.get(next_page)

        payload = response.json()

        return payload
    
    def _fetch_endpoint(self, next_page: str | None = None):

        payload = self._fetch_page(next_page=next_page)
        payload_processed = self._process_payload(payload=payload)

        yield from payload_processed

        next_page = payload.get('next')

        if next_page:
            yield from self._fetch_endpoint(next_page)
    
    def endpoint_data(self) -> list:

        return [page_data for page_data in self._fetch_endpoint()]
    
    def summarize(self, data: list | None = None) -> pyspark_df:
        if data is None:
            data = self.endpoint_data()

        json_strings = [*map(lambda x: json.dumps(x), data)]
        dataRDD = spark.sparkContext.parallelize(json_strings)
            
        return spark.read.json(dataRDD)
    
    def save(self, sdf: pyspark_df | None = None, file_path: str = 'datalake/deezer/user', file_format: str = 'parquet') -> None:
        file_path = f'{file_path}/{self.endpoint}/{file_format}'
        file_suffix = self.process_date.strftime('%Y_%m_%d_%H_%M_%S')
        file_name = f'user_{self.endpoint}_{file_suffix}.{file_format}'

        if sdf is None:
            sdf = self.summarize()

        (
            sdf
            .coalesce(1)
            .write
            .format(file_format)
            .mode('overwrite')
            .option("overwriteSchema", "true")
            .save(f'dbfs:/{file_path}/{file_name}')
        )

        # the read must have the option recursiseFileLookup as true because a name for the parquet file was set
        # (
        #     spark
        #     .read
        #     .format('parquet')
        #     .option("recursiveFileLookup","true")
        #     .load('dbfs:/{file_path}')
        # )

        # Another option is use the mode = 'append' and not specify the a file_name
        # (
        #     sdf
        #     .coalesce(1)
        #     .write
        #     .format(file_format)
        #     .mode('append')
        #     .option("overwriteSchema", "true")
        #     .save(f'dbfs:/{file_path}')
        # )

        # so the read can be the following 
        # (
        #     spark
        #     .read
        #     .format('parquet')
        #     .load('dbfs:/{file_path}')
        # )

In [None]:
# Stage the Deezer credentials on the driver's local disk: the JSON file is copied
# from DBFS into /tmp/datalake/deezer/ under the same file name.
dbfs_path = 'dbfs:/datalake/deezer/deezer_credentials.json'
credentials_file = dbfs_path.rsplit("/", 1)[-1]
local_path = f'file:/tmp/datalake/deezer/{credentials_file}'
dbutils.fs.cp(dbfs_path, local_path)

# Endpoint used by the ad-hoc inspection cells.
endpoint = 'charts'

In [None]:
# Show what is currently stored for the selected endpoint.
parquet_dir = f'dbfs:/datalake/deezer/user/{endpoint}/parquet'
dbutils.fs.ls(parquet_dir)

[FileInfo(path='dbfs:/datalake/deezer/user/charts/parquet/user_charts_2023_09_29_02_29_58.parquet/', name='user_charts_2023_09_29_02_29_58.parquet/', size=0, modificationTime=0)]

In [None]:
# dbutils.fs.rm(f'dbfs:/datalake/deezer/user/{endpoint}/parquet/', recurse=True)

In [None]:
def _count_dbfs_entries(path: str) -> int:
    """Return how many entries ``dbutils.fs.ls`` reports for ``path`` (0 if it does not exist yet)."""
    try:
        return len(dbutils.fs.ls(path))
    except Exception as exc:
        # A missing directory (first run for an endpoint) is reported by dbutils as a
        # generic exception wrapping java.io.FileNotFoundException; anything else is re-raised.
        if 'java.io.FileNotFoundException' in str(exc):
            return 0
        raise


endpoints = ['history', 'charts']
# option for automation
# NOTE(review): a widget value is presumably a single string, so it would need to be
# split into a list before looping - confirm before enabling.
# endpoints = dbutils.widgets.get("endpoints")

# Extract and persist every endpoint, reporting the number of stored runs before and after.
for endpoint in endpoints:
    endpoint_dir = f'dbfs:/datalake/deezer/user/{endpoint}/parquet/'
    print(f'Number of files for {endpoint} endpoint before script: {_count_dbfs_entries(endpoint_dir)}')

    info = ExtractInfo(endpoint=endpoint)
    info.save()

    print(f'Number of files for {endpoint} endpoint after script: {_count_dbfs_entries(endpoint_dir)}')

In [None]:
# dbutils.fs.ls(f'dbfs:/datalake/deezer/user/{endpoint}/parquet/')

[FileInfo(path='dbfs:/datalake/deezer/user/charts/parquet/user_charts_2023_09_29_02_29_58.parquet/', name='user_charts_2023_09_29_02_29_58.parquet/', size=0, modificationTime=0),
 FileInfo(path='dbfs:/datalake/deezer/user/charts/parquet/user_charts_2023_09_29_03_54_45.parquet/', name='user_charts_2023_09_29_03_54_45.parquet/', size=0, modificationTime=0)]

In [None]:
# reader = (
#     spark
#     .read
#     .format('parquet')
#     .option("recursiveFileLookup","true")
#     )
# 
# path_charts_files = f'/datalake/deezer/user/{endpoint}/parquet/'
# 
# sdf = reader.load(f'{path_charts_files}')
# sdf.createOrReplaceTempView(f'temp_df_{endpoint}')
# 
# config_sql = {
#     'history': {
#         'id_fields': ['`timestamp`'],
#         'order_by': '`timestamp` desc'
#     },
#     'charts': {
#         'id_fields': ['id', 'chart'],
#         'order_by': 'chart asc, `timestamp` desc'
#     }
# }
# 
# ids = ','.join(config_sql[endpoint]['id_fields'])
# order_by = config_sql[endpoint]['order_by']
# (spark.sql(f'''
#                SELECT 
#                 title, album.title, artist.name, chart,
#                 row_number() OVER (PARTITION BY {ids} ORDER BY `timestamp` DESC) as r_number
#                FROM temp_df_{endpoint}
#                QUALIFY row_number() OVER (PARTITION BY {ids} ORDER BY `timestamp` DESC) = 1
#                order by {order_by}
#                ''')
#  .display())

title,title.1,name,chart,r_number
Home,Barbie The Album,HAIM,1,1
Seasons,It’s The End Of The World But It’s A Beautiful Day,Thirty Seconds to Mars,2,1
Ahay,FEVER DREAM,Of Monsters And Men,3,1
Get Up Kid,It’s The End Of The World But It’s A Beautiful Day,Thirty Seconds to Mars,4,1
Stuck,It’s The End Of The World But It’s A Beautiful Day,Thirty Seconds to Mars,5,1
Life Is Beautiful,It’s The End Of The World But It’s A Beautiful Day,Thirty Seconds to Mars,6,1
Down,22 Break,Oh Wonder,7,1
22 Break,22 Break,Oh Wonder,8,1
Free,22 Break,Oh Wonder,9,1
Baby,22 Break,Oh Wonder,10,1


In [None]:
# to copy the whole folder you need to use recurse=True
# dbutils.fs.cp('file:/dbfs/datalake/deezer/user/history/parquet', 'dbfs:/datalake/deezer/user/history/parquet', recurse=True)

True