## Importando bibliotecas

In [22]:
import os
import zipfile
import pandas as pd
import kaggle
import duckdb

## Download do dataset

* Download do arquivo em zip
* Lendo arquivos csv sem extração
* Criando dataframe a partir do csv
* Configuração da coluna de data para formato datetime

In [29]:
dataset_folder = 'datasets'
dataset_owner = 'asaniczka'
dataset_name = 'top-spotify-songs-in-73-countries-daily-updated'
dataset_download_command = f"{dataset_owner}/{dataset_name}"
dataset_type = '.zip'
dataset_path = os.path.join(dataset_folder, dataset_name + dataset_type)

if not os.path.exists(dataset_folder):
    os.makedirs(dataset_folder)

if os.path.exists(dataset_path):
    os.remove(dataset_path)

!kaggle datasets download -d {dataset_download} -p {dataset_folder}

if os.path.exists(dataset_path):
    print(f"success: {dataset_path}")
else:
    print("error")

with zipfile.ZipFile(dataset_path, 'r') as dataset_ref:
    dataset_ref.printdir()

    for dataset_name in dataset_ref.namelist():
        if dataset_name.endswith('.csv'):
            print(f"Filename: {dataset_name}")            
            with dataset_ref.open(dataset_name) as dataset:
                df = pd.read_csv(dataset)

df['snapshot_date'] = pd.to_datetime(df['snapshot_date'], errors='coerce')


  0%|          | 0.00/113M [00:00<?, ?B/s]
  1%|          | 1.00M/113M [00:00<01:21, 1.45MB/s]
  2%|▏         | 2.00M/113M [00:00<00:40, 2.85MB/s]
  4%|▎         | 4.00M/113M [00:00<00:19, 6.00MB/s]
  6%|▌         | 7.00M/113M [00:01<00:10, 11.1MB/s]
 10%|▉         | 11.0M/113M [00:01<00:06, 17.2MB/s]
 12%|█▏        | 14.0M/113M [00:01<00:05, 20.1MB/s]
 16%|█▌        | 18.0M/113M [00:01<00:04, 24.8MB/s]
 19%|█▉        | 22.0M/113M [00:01<00:03, 27.7MB/s]
 23%|██▎       | 26.0M/113M [00:01<00:03, 29.4MB/s]
 27%|██▋       | 30.0M/113M [00:01<00:02, 31.1MB/s]
 30%|███       | 34.0M/113M [00:01<00:02, 31.7MB/s]
 34%|███▎      | 38.0M/113M [00:02<00:02, 32.7MB/s]
 37%|███▋      | 42.0M/113M [00:02<00:02, 32.6MB/s]
 41%|████      | 46.0M/113M [00:02<00:02, 32.8MB/s]
 44%|████▍     | 50.0M/113M [00:02<00:01, 33.5MB/s]
 48%|████▊     | 54.0M/113M [00:02<00:01, 33.4MB/s]
 51%|█████▏    | 58.0M/113M [00:02<00:01, 33.7MB/s]
 55%|█████▍    | 62.0M/113M [00:02<00:01, 32.5MB/s]
 58%|█████▊    | 66.

Dataset URL: https://www.kaggle.com/datasets/asaniczka/top-spotify-songs-in-73-countries-daily-updated
License(s): ODC Attribution License (ODC-By)
Downloading top-spotify-songs-in-73-countries-daily-updated.zip to datasets

success: datasets\top-spotify-songs-in-73-countries-daily-updated.zip
File Name                                             Modified             Size
universal_top_spotify_songs.csv                2024-11-28 15:10:04    346764771
Filename: universal_top_spotify_songs.csv


In [30]:
# Preview the first five rows of the loaded dataframe.
df.head(n=5)

Unnamed: 0,spotify_id,name,artists,daily_rank,daily_movement,weekly_movement,country,snapshot_date,popularity,is_explicit,...,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,5vNRhkKd0yEAg8suGBpjeY,APT.,"ROSÉ, Bruno Mars",1,0,1,,2024-11-27,98,False,...,0,-4.477,0,0.26,0.0283,0.0,0.355,0.939,149.027,4
1,2plbrEY59IikOBgBGLjaoe,Die With A Smile,"Lady Gaga, Bruno Mars",2,0,-1,,2024-11-27,100,False,...,6,-7.777,0,0.0304,0.308,0.0,0.122,0.535,157.969,3
2,2CGNAOSuO1MEFCbBRgUzjd,luther (with sza),"Kendrick Lamar, SZA",3,0,47,,2024-11-27,64,False,...,2,-7.546,1,0.125,0.251,0.0,0.248,0.576,138.008,4
3,0nj9Bq5sHDiTxSHunhgkFb,squabble up,Kendrick Lamar,4,0,46,,2024-11-27,86,True,...,0,-5.568,1,0.198,0.0206,0.0,0.0783,0.711,103.921,4
4,0aB0v4027ukVziUGwVGYpG,tv off (feat. lefty gunplay),"Kendrick Lamar, Lefty Gunplay",5,1,45,,2024-11-27,85,True,...,6,-6.679,0,0.263,0.0837,0.0,0.423,0.548,100.036,4


In [31]:
# Show the column labels of the loaded dataframe.
column_labels = df.columns
print('Colunas do dataframe: ', column_labels)

Colunas do dataframe:  Index(['spotify_id', 'name', 'artists', 'daily_rank', 'daily_movement',
       'weekly_movement', 'country', 'snapshot_date', 'popularity',
       'is_explicit', 'duration_ms', 'album_name', 'album_release_date',
       'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
       'time_signature'],
      dtype='object')


In [33]:
# Report the dataframe's dimensions and the most recent snapshot date it contains.
row_count, column_count = df.shape
print('Número de colunas: ', column_count)
print('Número de registros: ', row_count)

latest_snapshot = df['snapshot_date'].max().date()
print('Última data de atualização: ', latest_snapshot)

Número de colunas:  25
Número de registros:  1468130
Última data de atualização:  2024-11-27


## Explorando o dataset

- Selecionando os registros do ranking dos Estados Unidos em 2024 referentes aos álbuns indicados a Álbum do Ano;
- Criando coluna de artista principal (`main_artist`).

In [34]:
# Album titles to keep (the Album of the Year nominees selected for this analysis).
aoty_albums = [
    "New Blue Sun",
    "COWBOY CARTER",
    "Short n' Sweet",
    "BRAT",
    "Djesse Vol.4",
    "HIT ME HARD AND SOFT",
    "The Rise and Fall of a Midwest Princess",
    "THE TORTURED POETS DEPARTMENT",
]

# One named mask per condition so each filter can be inspected on its own.
is_aoty_album = df['album_name'].isin(aoty_albums)
is_us_chart = df['country'] == 'US'
is_2024 = df['snapshot_date'].dt.year == 2024

# .copy() makes an independent frame, so adding a column below does not touch `df`.
aoty_data = df[is_aoty_album & is_us_chart & is_2024].copy()

# Main artist = text before the first comma of `artists`.
# The .str accessor leaves missing values as NaN, where a Python-level
# `x.split(',')` would raise AttributeError on a non-string value.
aoty_data['main_artist'] = aoty_data['artists'].str.split(',', n=1).str[0]

In [55]:
# Alphabetically sorted list of the distinct main artists.

unique_main_artists = set(aoty_data['main_artist'])
aoty_artists = sorted(unique_main_artists)
aoty_artists

['Beyoncé',
 'Billie Eilish',
 'Chappell Roan',
 'Charli xcx',
 'Sabrina Carpenter',
 'Taylor Swift']

In [57]:
# Computed outside the f-string: reusing the enclosing quote character inside a
# replacement field is a SyntaxError on Python versions before 3.12.
last_update = aoty_data['snapshot_date'].max().date()
print(f'Número de aparições no Top50 em 2024 (última atualização: {last_update})\n')

# Count rows per main artist once instead of re-filtering the frame for every artist.
appearances = aoty_data['main_artist'].value_counts()

for artist in aoty_artists:
    count = int(appearances.get(artist, 0))
    print(f'{artist}: {count}')

Número de aparições no Top50 em 2024 (última atualização: 2024-11-27)

Beyoncé: 136
Billie Eilish: 593
Chappell Roan: 570
Charli xcx: 92
Sabrina Carpenter: 567
Taylor Swift: 537
