In [None]:
import ast
import os

import numpy as np
import pandas as pd
import requests
from tqdm import tqdm


In [None]:
# # load the data
# df_combined = pd.read_csv('spotify_combined.csv')

In [None]:
# Read the Last.fm API key from the environment instead of committing it to the
# notebook. The value is None when the variable is not set, in which case any
# Last.fm request made with it will be rejected by the API.
LASTFM_API_KEY = os.environ.get('LASTFM_API_KEY')
if not LASTFM_API_KEY:
    print("Warning: LASTFM_API_KEY is not set; Last.fm lookups will not return genres.")

def get_genres_from_lastfm(song_data: pd.DataFrame) -> dict:
    """
    Fetch Last.fm tags for every track in a DataFrame.

    One ``track.getInfo`` request is issued per row. A track whose lookup
    fails, or whose response carries no usable tags, is mapped to
    ``["Unknown"]`` so every processed track name ends up with an entry.

    Parameters:
        song_data (DataFrame): Must contain 'track_name' and 'artist_name' columns.

    Returns:
        dict: Maps each track name to a list of tag names. The key is the
            track name alone, so rows sharing a track name overwrite one
            another (the last row processed wins).
    """
    # HTTPS keeps the API key out of plain-text traffic.
    url = "https://ws.audioscrobbler.com/2.0/"
    # Seconds to wait for a response; without a timeout a single stalled
    # connection would block the whole loop indefinitely.
    request_timeout = 10
    genres_dict = {}

    # A shared session reuses the underlying connection across requests
    # instead of opening a new one for every row.
    with requests.Session() as session:
        for _, row in tqdm(song_data.iterrows(), total=len(song_data), desc="Fetching genres from Last.fm"):
            track_name = row['track_name']
            artist_name = row['artist_name']
            try:
                # Call the Last.fm API
                params = {
                    'method': 'track.getInfo',
                    'api_key': LASTFM_API_KEY,
                    'track': track_name,
                    'artist': artist_name,
                    'format': 'json'
                }
                response = session.get(url, params=params, timeout=request_timeout)
                data = response.json()

                # Extract the tag list defensively: any level may be missing
                # or have an unexpected type (e.g. an error payload).
                track_info = data.get('track') if isinstance(data, dict) else None
                toptags = track_info.get('toptags') if isinstance(track_info, dict) else None
                tags = toptags.get('tag', []) if isinstance(toptags, dict) else []
                # NOTE(review): the JSON API may return a lone tag as a bare
                # object rather than a one-element list - normalise it.
                if isinstance(tags, dict):
                    tags = [tags]

                genres = [tag['name'] for tag in tags if isinstance(tag, dict) and tag.get('name')]
                genres_dict[track_name] = genres if genres else ["Unknown"]
            except Exception as e:
                # Best effort: report the failure and keep going so one bad
                # request does not abort the whole batch.
                print(f"Error fetching genre for {track_name}: {e}")
                genres_dict[track_name] = ["Unknown"]

    return genres_dict


In [None]:
# # Example: call the function to fetch genres
# track_genres_lastfm = get_genres_from_lastfm(df_combined[['track_name', 'artist_name']])
#
# # Add the result to the DataFrame
# df_combined['genres'] = df_combined['track_name'].map(track_genres_lastfm)

Fetching genres from Last.fm:  23%|██▎       | 414/1820 [00:54<05:33,  4.21it/s]

Error fetching genre for boy's a liar: ('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer'))


Fetching genres from Last.fm: 100%|██████████| 1820/1820 [04:14<00:00,  7.14it/s]


In [None]:
# df_combined.dropna()
# df_combined.to_csv('spotify_combined_with_genre.csv', index=False)

In [None]:
# Load the dataset that already has the 'genres' column attached.
# NOTE(review): if 'genres' was written from Python lists, CSV round-tripping
# reads it back as the lists' string representation - confirm before use.
df_combined = pd.read_csv(filepath_or_buffer='spotify_combined_with_genre.csv')

In [None]:
# Genre vocabulary used for the binary indicator columns (36 entries).
all_genres = [
    'electronic', 'hip hop', 'pop', 'rock', 'jazz', 'classical',
    'folk', 'metal', 'blues', 'country', 'reggae', 'latin',
    'rap', 'indie', 'punk', 'funk', 'soul', 'dance',
    'r&b', 'disco', 'k-pop', 'house', 'experimental', 'afrobeats',
    'synthpop', 'alternative', 'soundtrack', 'power pop', 'dream pop', 'brazilian',
    'emo', 'acoustic', 'british', 'american', 'party', 'urban',
]

In [None]:
# Default genre vocabulary used when the caller does not supply one.
_DEFAULT_GENRES = (
    'electronic', 'hip hop', 'pop', 'rock', 'jazz', 'classical',
    'folk', 'metal', 'blues', 'country', 'reggae', 'latin',
    'rap', 'indie', 'punk', 'funk', 'soul', 'dance',
    'r&b', 'disco', 'k-pop', 'house', 'experimental', 'afrobeats',
    'synthpop', 'alternative', 'soundtrack', 'power pop', 'dream pop', 'brazilian',
    'emo', 'acoustic', 'british', 'american', 'party', 'urban',
)


def _parse_genre_cell(cell) -> frozenset:
    """Normalise one cell of the genres column to a set of tag strings."""
    if isinstance(cell, str):
        # A list stored in a CSV comes back as its string repr, e.g.
        # "['pop', 'rock']"; recover the real list before testing membership.
        try:
            parsed = ast.literal_eval(cell)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            parsed = None
        if isinstance(parsed, (list, tuple, set, frozenset)):
            cell = parsed
        elif isinstance(parsed, str):
            cell = [parsed]
        else:
            # Not a Python literal: fall back to a comma-separated reading.
            cell = [part.strip() for part in cell.split(',')]
    if isinstance(cell, (list, tuple, set, frozenset)):
        return frozenset(tag for tag in cell if isinstance(tag, str))
    # Missing values (NaN/None) and unsupported types carry no genres.
    return frozenset()


def add_top_genres(df: pd.DataFrame, genres_column: str, genres=None) -> pd.DataFrame:
    """
    Add one binary (0/1) indicator column per genre to the DataFrame.

    A row gets 1 for a genre when that genre is exactly one of the row's tags.
    Cells may hold a real list of tags or the string form of such a list (as
    produced by a CSV round trip); missing cells count as having no tags.
    Matching is case-sensitive.

    Parameters:
        df (pd.DataFrame): Input DataFrame. It is modified in place: the
            indicator columns are added to (or overwritten on) this object.
        genres_column (str): Name of the column containing the genre lists.
        genres (iterable of str, optional): Genres to create columns for.
            Defaults to the built-in vocabulary of 36 genres.

    Returns:
        pd.DataFrame: The same DataFrame object, with the genre columns added.
    """
    genre_names = list(_DEFAULT_GENRES) if genres is None else list(genres)

    # Parse every cell once instead of once per genre.
    tag_sets = df[genres_column].map(_parse_genre_cell)

    # Add a binary column for each genre
    for genre in genre_names:
        df[genre] = tag_sets.map(lambda tags, genre=genre: int(genre in tags)).astype(int)

    return df


In [None]:
# add_top_genres adds the columns to the frame it is given and returns that
# same frame, so df_genres and df_combined refer to one object.
df_genres = add_top_genres(df=df_combined, genres_column='genres')
df_genres

Unnamed: 0,id,artist_name,track_name,daily_streams,streams,acousticness,danceability,duration_ms,energy,instrumentalness,...,soundtrack,power pop,dream pop,brazilian,emo,acoustic,british,american,party,urban
0,4Dvkj6JhhA12EX05fT7y2e,harry styles,as it was,1618590.0,3633339736,0.34200,0.520,167303.0,0.731,0.001010,...,0,0,0,0,0,0,1,0,0,0
1,1Qrg8KqiBpW07V7PNxwwwL,sza,kill bill,1918330.0,2186838653,0.05210,0.644,153947.0,0.735,0.144000,...,0,0,0,0,0,0,0,0,0,0
2,3tt9i3Hhzq84dPS8H7iSiJ,manuel turizo,la bachata,1040570.0,1980184177,0.58300,0.835,162638.0,0.679,0.000002,...,0,0,0,0,0,0,0,0,0,0
3,6Sq7ltF9Qa7SNFBsV5Cogx,bad bunny,me porto bonito,730921.0,1943322630,0.09010,0.911,178567.0,0.712,0.000027,...,0,0,0,0,0,0,0,0,0,0
4,4uUG5RXrOk84mYEfFvj3cK,david guetta,i’m good (blue),1387697.0,1889030676,0.00383,0.561,175238.0,0.965,0.000007,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1815,5WV6br0BPUzhJwyBty80s0,juliette,tengo,17825.0,101046577,0.25000,0.729,129692.0,0.855,0.000000,...,0,0,0,0,0,0,0,0,0,0
1816,6xbi77Vq9RALnKc59kSXIS,guyon waton,sanes,122809.0,100985019,0.17600,0.610,301846.0,0.635,0.000002,...,0,0,0,0,0,0,0,0,0,0
1817,5S8zFPQ3KIlF2y9Kt8mRhg,lucky brown,¿cuál es su nombre?,81567.0,100794660,0.04880,0.815,183400.0,0.733,0.000002,...,0,0,0,0,0,0,0,0,0,0
1818,0b83L1ToHRiwlpmUAjiCHc,daniel caesar,valentina,204546.0,100490736,0.62200,0.754,154532.0,0.311,0.336000,...,0,0,0,0,0,0,0,0,0,0


36