In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# !pip install spotipy

In [None]:
import os

import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

# Spotify API
# Credentials are read from the environment instead of being committed in the
# notebook. Any key pair that was ever stored in this file should be treated
# as leaked and rotated in the Spotify developer dashboard.
# SPOTIPY_CLIENT_ID / SPOTIPY_CLIENT_SECRET are the variable names spotipy
# itself looks up, so the same environment works with other spotipy tooling.
CLIENT_ID = os.environ.get("SPOTIPY_CLIENT_ID")
CLIENT_SECRET = os.environ.get("SPOTIPY_CLIENT_SECRET")

if not CLIENT_ID or not CLIENT_SECRET:
    raise RuntimeError(
        "Set the SPOTIPY_CLIENT_ID and SPOTIPY_CLIENT_SECRET environment "
        "variables before running this notebook."
    )

auth_manager = SpotifyClientCredentials(client_id=CLIENT_ID, client_secret=CLIENT_SECRET)
sp = spotipy.Spotify(auth_manager=auth_manager)

def get_song_features(song_name, artist_name):
    """Look up one track on Spotify and return its audio features.

    Uses the module-level ``sp`` client to search for the best match of
    ``song_name`` by ``artist_name`` (one result requested), prints the
    matched title and first artist, then requests that track's audio features.

    Returns:
        The first entry of ``sp.audio_features([track_id])``. When that entry
        is truthy, a ``'popularity'`` key copied from the search hit is added
        to it before returning. If the search yields no items, a
        "Did not find songs" message string is returned instead. Any exception
        raised along the way is caught and returned as an "Error" message
        string, so callers must check the type of the result.
    """
    try:
        matches = sp.search(
            q=f"track:{song_name} artist:{artist_name}", type="track", limit=1
        )['tracks']['items']

        # Nothing matched the query
        if not matches:
            return f"Did not find songs：{song_name} by {artist_name}"

        # Best (and only) hit: keep what is needed before the second API call
        top_hit = matches[0]
        track_id, popularity = top_hit['id'], top_hit['popularity']
        print(f"Find Songs：{top_hit['name']} by {top_hit['artists'][0]['name']}")

        # Audio features for that track; falsy entries are passed through untouched
        audio = sp.audio_features([track_id])[0]
        if not audio:
            return audio
        audio['popularity'] = popularity
        return audio
    except Exception as err:
        return f"Error：{err}"


In [None]:
import requests
from bs4 import BeautifulSoup

def get_top_songs(year, timeout=10):
    """Scrape the kworb.net Spotify song chart for ``year`` into a DataFrame.

    Args:
        year: Year used to build the page URL
            (``https://kworb.net/spotify/songs_<year>.html``).
        timeout: Seconds to wait for the HTTP request before giving up, so a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        A DataFrame with the string columns ``'Artist and Title'``,
        ``'Streams'`` and ``'Daily Streams'`` (first three cells of each table
        row), or ``None`` when the page could not be retrieved or contains no
        table.
    """
    url = "https://kworb.net/spotify/songs_" + f"{year}.html"
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Network failures follow the same "print and return None" path as a
        # non-200 response instead of surfacing a raw traceback.
        print(f"Failed to retrieve data: {exc}")
        return None

    if response.status_code != 200:
        print("Failed to retrieve data")
        return None

    soup = BeautifulSoup(response.content, 'html.parser')

    table = soup.find('table')
    if table is None:
        print("Failed to retrieve data: no table found on page")
        return None

    # html.parser does not synthesise a <tbody>; fall back to the table itself
    # when the page markup omits it.
    body = table.find('tbody')
    if body is None:
        body = table

    song_list = []
    for row in body.find_all('tr'):
        columns = row.find_all('td')
        # Header rows (<th> cells) and malformed rows have fewer than 3 cells.
        if len(columns) < 3:
            continue

        artist_title = columns[0].text.strip()
        streams = columns[1].text.strip()
        daily = columns[2].text.strip()

        song_list.append((artist_title, streams, daily))

    spotify_data = pd.DataFrame(song_list, columns=['Artist and Title', 'Streams', 'Daily Streams'])

    return spotify_data

In [None]:
def extract_features(row):
    """Spread the dict stored under ``row['feature']`` into entries of ``row``.

    When ``row['feature']`` is a dict, each of its key/value pairs is written
    onto ``row`` under the same key. Any other value (for example a message
    string) leaves ``row`` untouched. The same ``row`` object is returned in
    both cases.
    """
    payload = row['feature']
    if not isinstance(payload, dict):
        return row

    # one new entry per feature name
    for name in payload:
        row[name] = payload[name]
    return row

In [None]:
# replaced 2021 with years 2018-2023
# to generate all of our data files
spotify_2021 = get_top_songs(2021)
if spotify_2021 is None:
    # get_top_songs already printed the reason; stop here with a clear error
    # instead of failing below with "'NoneType' object is not subscriptable".
    raise RuntimeError("Could not download the 2021 chart from kworb.net")

# Split on the FIRST " - " only: titles such as "Volando - Remix" contain the
# separator themselves, and an unlimited split would cut the song name down
# to "Volando".
spotify_2021['Artist and Title'] = spotify_2021['Artist and Title'].str.split(' - ', n=1)

# Vectorised element access + lower-casing. A row with no " - " separator
# gets NaN as its song_name rather than raising IndexError.
spotify_2021['artist_name'] = spotify_2021['Artist and Title'].str[0].str.lower()
spotify_2021['song_name'] = spotify_2021['Artist and Title'].str[1].str.lower()
spotify_2021

Unnamed: 0,Artist and Title,Streams,Daily Streams,artist_name,song_name
0,"[The Kid LAROI, STAY]",3347184330,1473805,the kid laroi,stay
1,"[Olivia Rodrigo, drivers license]",2325657574,929699,olivia rodrigo,drivers license
2,"[Olivia Rodrigo, good 4 u]",2298333269,835756,olivia rodrigo,good 4 u
3,"[Maroon 5, Memories]",2126410268,954444,maroon 5,memories
4,"[Lil Nas X, INDUSTRY BABY]",2088217488,471972,lil nas x,industry baby
...,...,...,...,...,...
995,"[OneRepublic, Wanted]",108557612,11980,onerepublic,wanted
996,"[Lovejoy, Sex Sells]",108539295,13061,lovejoy,sex sells
997,"[Electric Callboy, We Got the Moves]",108394895,89951,electric callboy,we got the moves
998,"[Morat, No Hay Más Que Hablar]",108239542,100634,morat,no hay más que hablar


In [None]:
# get features for each song
import time

batch_size = 10  # number of songs looked up between pauses
song_features = []
for i in range(0, len(spotify_2021), batch_size):
    batch = spotify_2021.iloc[i:i+batch_size]
    # Only read from the slice. Writing a column onto it (batch['feature'] = ...)
    # is chained assignment and triggers pandas' SettingWithCopyWarning.
    song_features.extend(
        get_song_features(song, artist)
        for song, artist in zip(batch['song_name'], batch['artist_name'])
    )
    time.sleep(1)  # pause between batches to avoid API rate limiting

# One result per row, in row order, so a single column assignment updates the
# main data frame.
spotify_2021['feature'] = song_features

# divide features into different columns
spotify_2021 = spotify_2021.apply(extract_features, axis=1).drop(columns=['feature'])

# data
spotify_2021

Find Songs：STAY (with Justin Bieber) by The Kid LAROI
Find Songs：drivers license by Olivia Rodrigo
Find Songs：good 4 u by Olivia Rodrigo
Find Songs：Memories by Maroon 5
Find Songs：INDUSTRY BABY (feat. Jack Harlow) by Lil Nas X
Find Songs：MONTERO (Call Me By Your Name) by Lil Nas X
Find Songs：Mood (feat. iann dior) by 24kGoldn
Find Songs：Kiss Me More (feat. SZA) by Doja Cat
Find Songs：Easy On Me by Adele
Find Songs：Bad Habits by Ed Sheeran


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  batch['feature'] = batch[['song_name', 'artist_name']].apply(


Find Songs：Ghost by Justin Bieber
Find Songs：deja vu by Olivia Rodrigo
Find Songs：Woman by Doja Cat
Find Songs：traitor by Olivia Rodrigo
Find Songs：Shivers by Ed Sheeran
Find Songs：Peaches (feat. Daniel Caesar & Giveon) by Justin Bieber
Find Songs：Enemy (with JID) - from the series Arcane League of Legends by Imagine Dragons
Find Songs：Pepas by Farruko
Find Songs：Todo De Ti by Rauw Alejandro
Find Songs：Yonaguni by Bad Bunny


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  batch['feature'] = batch[['song_name', 'artist_name']].apply(


Find Songs：Happier Than Ever by Billie Eilish
Find Songs：Leave The Door Open by Bruno Mars
Find Songs：Where Are You Now by Lost Frequencies
Find Songs：My Universe by Coldplay
Find Songs：Butter by BTS
Find Songs：happier by Olivia Rodrigo
Find Songs：MONEY by LISA
Find Songs：Freaks by Surf Curse
Find Songs：Need to Know by Doja Cat
Find Songs：favorite crime by Olivia Rodrigo


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  batch['feature'] = batch[['song_name', 'artist_name']].apply(


Find Songs：I WANNA BE YOUR SLAVE by Måneskin
Find Songs：Desesperados by Rauw Alejandro
Find Songs：abcdefu by GAYLE
Find Songs：Until I Found You by Stephen Sanchez
Find Songs：THATS WHAT I WANT by Lil Nas X
Find Songs：Friday (feat. Mufasa & Hypeman) - Dopamine Re-Edit by Riton
Find Songs：jealousy, jealousy by Olivia Rodrigo
Find Songs：Pray For Me by The Weeknd
Find Songs：RAPSTAR by Polo G
Find Songs：Lo Siento BB:/ by Tainy


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  batch['feature'] = batch[['song_name', 'artist_name']].apply(


Find Songs：The Color Violet by Tory Lanez
Find Songs：WITHOUT YOU by The Kid LAROI
Find Songs：All Too Well (10 Minute Version) (Taylor's Version) (From The Vault) by Taylor Swift
Find Songs：2055 by Sleepy Hallow
Find Songs：Leave Before You Love Me (with Jonas Brothers) by Marshmello
Find Songs：You Right by Doja Cat
Find Songs：Fair Trade (with Travis Scott) by Drake
Find Songs：family ties (with Kendrick Lamar) by Baby Keem
Find Songs：The Motto by Tiësto
Find Songs：Love Story (Taylor’s Version) by Taylor Swift


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  batch['feature'] = batch[['song_name', 'artist_name']].apply(


Find Songs：Pareja Del Año by Sebastian Yatra
Find Songs：Do It To It by ACRAZE
Find Songs：Qué Más Pues? by J Balvin
Find Songs：Wants and Needs (feat. Lil Baby) by Drake
Find Songs：Volví by Aventura
Find Songs：Calling My Phone by Lil Tjay
Find Songs：Wasted On You by Morgan Wallen
Find Songs：Wildest Dreams (Taylor's Version) by Taylor Swift
Find Songs：Miénteme by TINI
Find Songs：goosebumps by Travis Scott


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  batch['feature'] = batch[['song_name', 'artist_name']].apply(


Find Songs：Hold On by Justin Bieber
Find Songs：Shout Out to My Ex by Little Mix
Find Songs：Knife Talk (with 21 Savage ft. Project Pat) by Drake
Find Songs：You Belong With Me (Taylor’s Version) by Taylor Swift
Find Songs：Beautiful Mistakes (feat. Megan Thee Stallion) by Maroon 5
Find Songs：Your Love (9PM) by ATB
Find Songs：Paradise by MEDUZA
Find Songs：Volando - Remix by Mora
Find Songs：Tiroteo - Remix by Marc Seguí
Find Songs：Pain by PinkPantheress


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  batch['feature'] = batch[['song_name', 'artist_name']].apply(


Find Songs：Mon Amour - Remix by zzoilo
Find Songs：Toxic by BoyWithUke
Find Songs：Don't Be Shy by Tiësto
Find Songs：Cúrame by Rauw Alejandro
Find Songs：Anyone by Justin Bieber
Find Songs：Permission to Dance by BTS
Find Songs：Botella Tras Botella by Gera MX
Find Songs：METAMORPHOSIS by INTERWORLD
Find Songs：brutal by Olivia Rodrigo
Find Songs：Tacones Rojos by Sebastian Yatra


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  batch['feature'] = batch[['song_name', 'artist_name']].apply(


Find Songs：Notion by The Rare Occasions
Find Songs：Up by Cardi B
Find Songs：Envolver by Anitta
Find Songs：One Right Now (with The Weeknd) by Post Malone
Find Songs：Get Into It (Yuh) by Doja Cat
Find Songs：Sobrio by Maluma
Find Songs：love nwantiti (ah ah ah) by CKay
Find Songs：Run by OneRepublic
Find Songs：Thunder by Gabry Ponte
Find Songs：Best Friend (feat. Doja Cat) by Saweetie


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  batch['feature'] = batch[['song_name', 'artist_name']].apply(


Find Songs：Give It To Me - Full Vocal Mix by Matt Sassari
Find Songs：EL MAKINON by KAROL G
Find Songs：Fiel by Los Legendarios
Find Songs：Life Goes On by Oliver Tree
Find Songs：Oh My God by Adele
Find Songs：AM Remix by Nio Garcia
Find Songs：Way 2 Sexy (with Future & Young Thug) by Drake
Find Songs：Angel Baby by Troye Sivan
Find Songs：Por Las Noches by Peso Pluma
Find Songs：Miss The Rage by Trippie Redd


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  batch['feature'] = batch[['song_name', 'artist_name']].apply(


Find Songs：Something About You by Eyedress
Find Songs：Reckless by Madison Beer
Find Songs：2/Catorce by Rauw Alejandro
Find Songs：LALISA by LISA
Find Songs：1 step forward, 3 steps back by Olivia Rodrigo
Find Songs：OUT OUT (feat. Charli XCX & Saweetie) by Joel Corry
Find Songs：París by Ingratax
Find Songs：Buy Dirt by Jordan Davis
Find Songs：We're Good by Dua Lipa
Find Songs：Nostálgico by Rvssian


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  batch['feature'] = batch[['song_name', 'artist_name']].apply(


Find Songs：Tell Em by Cochise
Find Songs：Close Eyes by DVRST
Find Songs：this is what falling in love feels like by JVKE
Find Songs：512 by Mora
Find Songs：Don't Go Yet by Camila Cabello
Find Songs：Happier Than Ever by Billie Eilish
Find Songs：Rasputin by Majestic
Find Songs：Better Days (NEIKED x Mae Muller x Polo G) by NEIKED
Find Songs：edamame by bbno$
Find Songs：Heartbreak Anthem (with David Guetta & Little Mix) by Galantis


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  batch['feature'] = batch[['song_name', 'artist_name']].apply(


Find Songs：In Da Getto by J Balvin
Find Songs：ZITTI E BUONI by Måneskin
Find Songs：Entre Nosotros by Tiago PZK
Find Songs：Smokin Out The Window by Bruno Mars
Find Songs：enough for you by Olivia Rodrigo
Find Songs：BED by Joel Corry
Find Songs：Mónaco by LAGOS
Find Songs：Love Tonight (David Guetta Remix Edit) by Shouse
Find Songs：Remember (with David Guetta) by Becky Hill
Find Songs：Baila Conmigo (with Rauw Alejandro) by Selena Gomez


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  batch['feature'] = batch[['song_name', 'artist_name']].apply(


Find Songs：Girls Want Girls (with Lil Baby) by Drake
Find Songs：I Hate U by SZA
Find Songs：Runtuh by Feby Putri
Find Songs：Praise God by Kanye West
Find Songs：On The Ground by ROSÉ
Find Songs：Tiago PZK: Bzrp Music Sessions, Vol. 48 by Bizarrap
Find Songs：Sunshine by OneRepublic
Find Songs：Take My Breath by The Weeknd
Find Songs：Medallo by Blessd
Find Songs：Acapulco by Jason Derulo


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  batch['feature'] = batch[['song_name', 'artist_name']].apply(


Find Songs：Build a Bitch by Bella Poarch
Find Songs：m y . l i f e (with 21 Savage & Morray) by J. Cole
Find Songs：A La Antigüita by Calibre 50
Find Songs：Cover Me In Sunshine by P!nk
Find Songs：Wellerman - Sea Shanty by Nathan Evans
Find Songs：SAD GIRLZ LUV MONEY Remix (feat. Kali Uchis and Moliy) by Amaarae
Find Songs：The Feels by TWICE
Find Songs：The Perfect Girl by Mareux
Find Songs：Here's Your Perfect by Jamie Miller
Find Songs：The Way I Loved You (Taylor’s Version) by Taylor Swift


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  batch['feature'] = batch[['song_name', 'artist_name']].apply(


Find Songs：Ghost Town by Benson Boone
Find Songs：Bang! by AJR
Find Songs：Your Power by Billie Eilish
Find Songs：I GUESS I'M IN LOVE by Clinton Kane
Find Songs：SO DONE by The Kid LAROI
Find Songs：Follow You by Imagine Dragons
Find Songs：Wellerman - Sea Shanty by Nathan Evans
Find Songs：Ley Seca by JHAYCO
Find Songs：People Watching by Conan Gray
Find Songs：Loco by Justin Quiles


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  batch['feature'] = batch[['song_name', 'artist_name']].apply(


Find Songs：Broadway Girls (feat. Morgan Wallen) by Lil Durk
Find Songs：Every Summertime by NIKI
Find Songs：Hurricane by Kanye West
Find Songs：Fall in Love with You. by Montell Fish
Find Songs：Sand In My Boots by Morgan Wallen
Find Songs：Run It Up (feat. Offset & Moneybagg Yo) by Lil Tjay
Find Songs：Motley Crew by Post Malone
Find Songs：Wasted Love (feat. Lagique) by Ofenbach
Find Songs：Monëy so big by Yeat
Find Songs：AOK by Tai Verdes


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  batch['feature'] = batch[['song_name', 'artist_name']].apply(


Find Songs：Summer of Love (Shawn Mendes & Tainy) by Shawn Mendes
Find Songs：I Knew You Were Trouble (Taylor's Version) by Taylor Swift
Find Songs：Little Bit of Love by Tom Grennan
Find Songs：Yebba’s Heartbreak by Drake
Find Songs：Touch by Little Mix
Find Songs：Move Your Body by Öwnboss
Find Songs：Agua (with J Balvin) by Tainy
Find Songs：Mr. Perfectly Fine (Taylor’s Version) (From The Vault) by Taylor Swift
Find Songs：What’s Next by Drake
Find Songs：LOT OF ME by Lil Tecca


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  batch['feature'] = batch[['song_name', 'artist_name']].apply(


Find Songs：Marea (we’ve lost dancing) by Fred again..
Find Songs：Royalty by Egzod
Find Songs：We Are Never Ever Getting Back Together (Taylor's Version) by Taylor Swift
Find Songs：ONLY by LeeHi
Find Songs：p r i d e . i s . t h e . d e v i l (with Lil Baby) by J. Cole


Max Retries reached


Find Songs：200 COPAS by KAROL G


Max Retries reached


Find Songs：Higher Power by Coldplay


Max Retries reached


Find Songs：Ella No Es Tuya - Remix by Rochy RD


Max Retries reached


Find Songs：Ya Supérame (En Vivo) by Grupo Firme


Max Retries reached


Find Songs：EVERY CHANCE I GET (feat. Lil Baby & Lil Durk) by DJ Khaled


Max Retries reached
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  batch['feature'] = batch[['song_name', 'artist_name']].apply(


Find Songs：Arranhão - Ao Vivo by Henrique & Juliano


Max Retries reached


Find Songs：NDA by Billie Eilish


Max Retries reached


Find Songs：right where you left me - bonus track by Taylor Swift


Max Retries reached


Find Songs：World's Smallest Violin by AJR


Max Retries reached


Find Songs：Slumber Party (feat. Princess Nokia) by Ashnikko


Max Retries reached


Find Songs：WUSYANAME (feat. Youngboy Never Broke Again & Ty Dolla $ign) by Tyler, The Creator


KeyboardInterrupt: 

In [None]:
# Quick sanity check of the assembled table: row count, then per-column
# non-null counts and dtypes.
n_songs = len(spotify_2021)
print(n_songs)
spotify_2021.info()

# Persist the result; with default options the index is written as the first,
# unnamed CSV column.
spotify_2021.to_csv('spotify_2021.csv')

941
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 941 entries, 0 to 940
Data columns (total 24 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Artist and Title  941 non-null    object 
 1   Daily Streams     941 non-null    object 
 2   Streams           941 non-null    object 
 3   acousticness      939 non-null    float64
 4   analysis_url      939 non-null    object 
 5   artist_name       941 non-null    object 
 6   danceability      939 non-null    float64
 7   duration_ms       939 non-null    float64
 8   energy            939 non-null    float64
 9   id                939 non-null    object 
 10  instrumentalness  939 non-null    float64
 11  key               939 non-null    float64
 12  liveness          939 non-null    float64
 13  loudness          939 non-null    float64
 14  mode              939 non-null    float64
 15  popularity        939 non-null    float64
 16  song_name         941 non-null    object