In [1]:
import os
import pandas as pd
import numpy as np 
import datetime as dt
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from api_credentials import client_id, client_secret, redirect_uri
from playlist_membership_check import get_tracks_positions_in_playlists
import json

# Read the playlist definitions that the membership lookup at the end of the
# notebook receives as `playlists_dict`.
with open('playlists.json') as playlists_file:
    playlists_dict = json.load(playlists_file)

# Build a Spotify client authenticated with the app's client credentials,
# which is enough for the public data this notebook requests.
client_credentials_manager = SpotifyClientCredentials(
    client_id=client_id,
    client_secret=client_secret,
)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

Read in the AU Daily 200 Spotify Chart  
Manually downloaded CSV from https://charts.spotify.com/charts/view/regional-au-daily/latest

In [2]:
# Location of the manually downloaded chart export
file_path = 'raw_data/regional-au-daily-2024-02-09.csv'

# The chart date is embedded in the file name as its last three
# hyphen-separated pieces, e.g. ['2024', '02', '09.csv'].
file_name = os.path.basename(file_path)
date_str = '-'.join(file_name.split('-')[-3:])  # -> '2024-02-09.csv'
date = date_str.partition('.')[0]               # -> '2024-02-09'

# Parse the date, then render it as 'DD-MM-YYYY' for the output column
date_datetime = pd.to_datetime(date)
formatted_date = date_datetime.strftime('%d-%m-%Y')

# Read the chart and stamp every row with the formatted chart date
df = pd.read_csv(file_path)
df['date'] = formatted_date

Extract each track's unique ID for the API call

In [3]:
# The chart's `uri` values carry the track ID after a prefix (the original
# code skipped a fixed 14 characters, the length of 'spotify:track:').
# Keep whatever follows the final colon instead of relying on that offset.
df['track_id'] = df['uri'].str.rsplit(':', n=1).str[-1]

# Independent copy of the IDs, used as the input of the API lookup
track_ids = df['track_id'].copy()

# The raw URI is no longer needed once the ID has been extracted
df = df.drop(columns='uri')

Call Spotify API to retrieve `popularity_score` for each of the top 200 tracks

In [4]:
# Fetch artist, title and popularity for every charted track.
#
# The results are later copied into `df` by row position, so this cell must
# produce exactly one entry per requested track ID, in request order.

# Convert the Series of track IDs to a plain list so it can be sliced into batches
track_ids_list = track_ids.tolist()

# Spotify's `tracks` endpoint accepts several IDs per call; requesting them in
# batches of 50 keeps each request small.
batch_size = 50

# One list per output column; all three grow in lockstep
artists = []
track_names = []
popularity_scores = []

for i in range(0, len(track_ids_list), batch_size):
    batch = track_ids_list[i:i+batch_size]
    tracks_info = sp.tracks(batch)
    for track in tracks_info['tracks']:
        if track:
            # Primary (first-listed) artist, or 'N/A' when the artist list is empty
            artist_name = track['artists'][0]['name'] if track['artists'] else 'N/A'
            artists.append(artist_name)
            track_names.append(track['name'])
            popularity_scores.append(track['popularity'])
        else:
            # Unavailable tracks come back as `None`. Record a placeholder
            # instead of skipping them; skipping would shorten the lists and
            # shift every following row against `df`.
            artists.append(None)
            track_names.append(None)
            popularity_scores.append(None)

# Assemble the collected values into a DataFrame (one row per requested ID)
popularity_df = pd.DataFrame({
    'artist_name': artists,
    'track_name': track_names,
    'popularity_score': popularity_scores
})

# Show the DataFrame to verify the results
popularity_df.head()


Unnamed: 0,artist_name,track_name,popularity_score
0,Noah Kahan,Stick Season,98
1,Benson Boone,Beautiful Things,95
2,Jack Harlow,Lovin On Me,99
3,Taylor Swift,Cruel Summer,99
4,cassö,Prada,94


In [5]:
# Attach the fetched popularity scores to the chart rows (values are matched
# on the row index of the two frames).
df = df.assign(popularity_score=popularity_df['popularity_score'])

In [6]:
# Arrange the chart columns in working order and, in the same step, give
# 'source' and 'streams' their more descriptive names.
ordered_columns = [
    'date', 'rank', 'artist_names', 'track_name', 'track_id', 'source',
    'peak_rank', 'previous_rank', 'days_on_chart', 'streams',
    'popularity_score',
]
df = df[ordered_columns].rename(
    columns={'source': 'label', 'streams': 'daily_streams'}
)

In [7]:
# Preview the first rows to confirm the new column order and names
df.head()

Unnamed: 0,date,rank,artist_names,track_name,track_id,label,peak_rank,previous_rank,days_on_chart,daily_streams,popularity_score
0,09-02-2024,1,Noah Kahan,Stick Season,0mflMxspEfB0VbI1kyLiAv,Mercury Records/Republic Records,1,1,127,338146,98
1,09-02-2024,2,Benson Boone,Beautiful Things,6tNQ70jh4OwmPGpYy6R2o9,"Night Street Records, Inc./Warner Records Inc.",2,3,22,291959,95
2,09-02-2024,3,Jack Harlow,Lovin On Me,4xhsWYTOGcal8zt0J161CU,Generation Now/Atlantic,1,2,92,285470,99
3,09-02-2024,4,Taylor Swift,Cruel Summer,1BxfuPKGuaTgP7aM0Bbdwr,Taylor Swift,1,5,370,246666,99
4,09-02-2024,5,"cassö, RAYE, D-Block Europe",Prada,59NraMJsLaMCVtwXTSia8i,Ministry of Sound Recordings,4,6,181,227080,94


In [8]:
# List the distinct previous_rank values to spot placeholders; the output
# includes -1, presumably marking tracks with no rank on the previous day
# -- TODO confirm against the chart export's documentation
df['previous_rank'].unique()

array([  1,   3,   2,   5,   6,   4,   7,   8,   9,  11,  10,  14,  12,
        19,  18,  17,  13,  15,  20,  21,  22,  25,  23,  16,  24,  29,
        27,  28,  31,  26,  38,  32,  35,  33,  34,  30,  36,  45,  43,
        40,  42,  44,  47,  39,  54,  41,  37,  49,  51,  56,  53,  50,
        60,  55,  -1,  48,  57,  58,  59,  62,  52,  65,  46,  61,  67,
        83,  64,  72,  69,  70,  79,  77,  80,  63,  86,  76,  82,  88,
       105,  66,  94,  90,  73,  75,  89,  81,  97,  78,  68,  71,  91,
        92,  96,  99,  85, 101,  95, 114, 100, 102, 117,  84,  93,  98,
       113,  87, 106, 131, 122, 111, 128, 109, 115, 118, 137, 124, 130,
       133, 112, 147, 119, 110, 163, 120, 108, 107, 126, 148, 156,  74,
       127, 134, 135, 159, 140, 103, 121, 136, 142, 183, 162, 150, 132,
       149, 125, 155, 145, 154, 144, 153, 179, 174, 138, 186, 146, 157,
       181, 151, 171, 139, 141, 143, 167, 178, 104, 165, 200, 173, 184,
       175, 158, 187, 129, 193, 176, 164, 189, 152, 161, 185, 19

Retrieve additional data from Kworb.net
This will give us additional features/columns like:  
 - `days_at_peak`
 - `rank_change`
 - `daily_streams_change`
 - `7Day_streams`
 - `7Day_streams_change`
 - `total_streams`

In [9]:
# Parse every table on Kworb's AU daily chart page and keep the first one
kworb_tables = pd.read_html('https://kworb.net/spotify/country/au_daily.html')
kworb_df = kworb_tables[0]

# Stamp each row with the date two days before today, to allow for data lag
current_date_minus_2 = dt.date.today() - dt.timedelta(days=2)
kworb_df['DATE'] = current_date_minus_2

In [10]:
# Kworb's column headers -> the snake_case names used in the rest of the notebook
kworb_column_names = {
    'Pos': 'rank',
    'P+': 'rank_change',
    'Artist and Title': 'artist_title',
    'Days': 'days_on_chart',
    'Pk': 'peak_rank',
    '(x?)': 'days_at_peak',
    'Streams': 'daily_streams',
    'Streams+': 'daily_streams_change',
    '7Day': '7Day_streams',
    '7Day+': '7Day_streams_change',
    'Total': 'total_streams',
}
kworb_df = kworb_df.rename(columns=kworb_column_names)

In [11]:
# Narrow the Kworb frame to the columns that will be added to the chart data
selected_columns_df = kworb_df.loc[:, [
    'artist_title',
    'rank_change',
    'days_at_peak',
    'daily_streams_change',
    '7Day_streams',
    '7Day_streams_change',
    'total_streams',
]]

# Inspect which distinct rank_change markers need cleaning
selected_columns_df['rank_change'].unique()


array(['=', '+1', '-1', '-2', '+2', '+5', '+3', '-4', '-3', '-8', '+7',
       '-6', '+4', '-5', '+9', '-10', '+6', 'NEW', '-9', '-17', '+17',
       '+8', '-11', '+11', '+10', '+26', '-14', '+13', '-22', '-20',
       '+15', '-19', '-7', '+22', '+12', '+16', '+21', '+14', '-13',
       '+39', '-18', '+19', '-57', '+24', '-34', '+42', '+20', '-12',
       '-21', 'RE', '+30', '-24', '-23', '-64', '-16', '-47', '-31',
       '-30'], dtype=object)

Need to change "=" to "0", remove "+", and handle the non-numeric "NEW" and "RE" markers

In [12]:
# Normalise Kworb's rank-change notation: drop the leading "+" and turn "="
# into "0". Both replacements are literal; "+" is also a regex quantifier, so
# regex=False is passed explicitly rather than depending on the default of
# the installed pandas version.
kworb_df['rank_change'] = (
    kworb_df['rank_change']
    .str.replace('+', '', regex=False)
    .str.replace('=', '0', regex=False)
)

# Try a strict numeric conversion. If any value cannot be parsed the column is
# left unchanged (still strings) and the parser's message is printed, which
# names the first offending value.
try:
    kworb_df['rank_change'] = pd.to_numeric(kworb_df['rank_change'])
    print("Conversion successful. No NaN values detected.")
except ValueError as e:
    print("Conversion failed due to non-numeric values:", e)

# Verification only (does not modify the DataFrame): list the rows whose
# rank_change cannot be read as a number.
non_numeric = kworb_df[pd.to_numeric(kworb_df['rank_change'], errors='coerce').isna()]
if not non_numeric.empty:
    print("Non-numeric values found:\n", non_numeric)
else:
    print("No non-numeric values found.")

Conversion failed due to non-numeric values: Unable to parse string "NEW" at position 54
Non-numeric values found:
      rank rank_change                                       artist_title  \
54     55         NEW                               Noah Kahan - Forever   
82     83         NEW  ¥$ - Talking / Once Again (w/ Kanye West, Ty D...   
148   149          RE                            P!nk - Raise Your Glass   
156   157          RE                   Carly Rae Jepsen - Call Me Maybe   
178   179          RE                          3 Doors Down - Kryptonite   
181   182          RE                     Van Morrison - Brown Eyed Girl   
185   186          RE                               50 Cent - In Da Club   
187   188          RE            JAY-Z - Ni**as In Paris (w/ Kanye West)   
188   189          RE                          Olivia Rodrigo - good 4 u   
190   191          RE                        Taylor Swift - Shake It Off   
193   194          RE                           

  kworb_df['rank_change'] = kworb_df['rank_change'].str.replace('+', '').str.replace('=', '0', regex=False)


In [13]:
# Keep only the first run of digits found in each 'days_at_peak' value; rows
# without any digits become NaN. The pattern is a raw string so that `\d`
# reaches the regex engine without being treated as a string escape.
kworb_df['days_at_peak'] = kworb_df['days_at_peak'].str.extract(r'(\d+)', expand=False)

# Convert the extracted digit strings to numbers, leaving the NaNs in place
kworb_df['days_at_peak'] = pd.to_numeric(kworb_df['days_at_peak'], errors='coerce')

In [14]:
# Sub-frame holding the Kworb columns that get added to the chart data
# (despite its name, `columns_to_keep` is a DataFrame, not a list of names)
kworb_feature_names = [
    'artist_title',
    'rank_change',
    'days_at_peak',
    'daily_streams_change',
    '7Day_streams',
    '7Day_streams_change',
    'total_streams',
]
columns_to_keep = kworb_df[kworb_feature_names]
columns_to_keep.head(3)

Unnamed: 0,artist_title,rank_change,days_at_peak,daily_streams_change,7Day_streams,7Day_streams_change,total_streams
0,Noah Kahan - Stick Season,0,14.0,-986.0,2337327,-4852,22368505
1,Benson Boone - Beautiful Things,1,3.0,23745.0,1872625,52885,4150846
2,Jack Harlow - Lovin On Me,-1,72.0,796.0,1973085,-18635,29349292


In [15]:
# Show both column sets side by side before combining the two sources
print("Spotify Daily 200 columns:", list(df.columns))
print("Kworb Daily 200 columns:", list(kworb_df.columns))

Spotify Daily 200 columns: ['date', 'rank', 'artist_names', 'track_name', 'track_id', 'label', 'peak_rank', 'previous_rank', 'days_on_chart', 'daily_streams', 'popularity_score']
Kworb Daily 200 columns: ['rank', 'rank_change', 'artist_title', 'days_on_chart', 'peak_rank', 'days_at_peak', 'daily_streams', 'daily_streams_change', '7Day_streams', '7Day_streams_change', 'total_streams', 'DATE']


In [16]:
# Copy the Kworb-only features onto the Spotify chart rows.
#
# Rows are matched on chart `rank` rather than on row position, so the result
# no longer depends on both frames happening to share the same row order.
# (`Series.map` needs the Kworb ranks to be unique.)
kworb_feature_columns = [
    'rank_change',
    'days_at_peak',
    'daily_streams_change',
    '7Day_streams',
    '7Day_streams_change',
    'total_streams',
]
kworb_by_rank = kworb_df.set_index('rank')

# Sanity check: both sources report daily streams per rank. If they disagree,
# the Kworb page probably shows a different chart day than the CSV, and the
# copied features would describe the wrong tracks. Warn instead of failing.
kworb_streams = df['rank'].map(kworb_by_rank['daily_streams'])
stream_mismatch = kworb_streams.ne(df['daily_streams'])
if stream_mismatch.any():
    print(
        f"WARNING: daily_streams differ between the Spotify CSV and Kworb for "
        f"{int(stream_mismatch.sum())} of {len(df)} ranks; the two sources may "
        f"cover different chart dates."
    )

for column in kworb_feature_columns:
    df[column] = df['rank'].map(kworb_by_rank[column])


In [17]:
# Final column layout, grouped by theme
columns_order = [
    # identification
    'date', 'artist_names', 'track_name',
    # chart position
    'rank', 'previous_rank', 'rank_change',
    'peak_rank', 'days_at_peak', 'days_on_chart',
    # popularity and streams
    'popularity_score',
    'daily_streams', 'daily_streams_change',
    '7Day_streams', '7Day_streams_change',
    'total_streams',
    # metadata
    'label', 'track_id',
]

# Select exactly these columns, in this order
df = df.loc[:, columns_order]

In [18]:
# Preview the combined frame in its final column order
df.head()

Unnamed: 0,date,artist_names,track_name,rank,previous_rank,rank_change,peak_rank,days_at_peak,days_on_chart,popularity_score,daily_streams,daily_streams_change,7Day_streams,7Day_streams_change,total_streams,label,track_id
0,09-02-2024,Noah Kahan,Stick Season,1,1,0,1,14.0,127,98,338146,-986.0,2337327,-4852,22368505,Mercury Records/Republic Records,0mflMxspEfB0VbI1kyLiAv
1,09-02-2024,Benson Boone,Beautiful Things,2,3,1,2,3.0,22,95,291959,23745.0,1872625,52885,4150846,"Night Street Records, Inc./Warner Records Inc.",6tNQ70jh4OwmPGpYy6R2o9
2,09-02-2024,Jack Harlow,Lovin On Me,3,2,-1,1,72.0,92,99,285470,796.0,1973085,-18635,29349292,Generation Now/Atlantic,4xhsWYTOGcal8zt0J161CU
3,09-02-2024,Taylor Swift,Cruel Summer,4,5,1,1,1.0,370,99,246666,10564.0,1626122,21243,52373112,Taylor Swift,1BxfuPKGuaTgP7aM0Bbdwr
4,09-02-2024,"cassö, RAYE, D-Block Europe",Prada,5,6,1,4,,181,94,227080,14223.0,1490515,-3643,29137187,Ministry of Sound Recordings,59NraMJsLaMCVtwXTSia8i


In [19]:
# Check dtypes and non-null counts of the final frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 17 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   date                  200 non-null    object 
 1   artist_names          200 non-null    object 
 2   track_name            200 non-null    object 
 3   rank                  200 non-null    int64  
 4   previous_rank         200 non-null    int64  
 5   rank_change           200 non-null    object 
 6   peak_rank             200 non-null    int64  
 7   days_at_peak          44 non-null     float64
 8   days_on_chart         200 non-null    int64  
 9   popularity_score      200 non-null    int64  
 10  daily_streams         200 non-null    int64  
 11  daily_streams_change  185 non-null    float64
 12  7Day_streams          200 non-null    int64  
 13  7Day_streams_change   200 non-null    int64  
 14  total_streams         200 non-null    int64  
 15  label                 2

In [21]:
# Load the history accumulated so far and look at its most recent rows
combined_csv_path = 'combined_data/combined.csv'
yesterday_df = pd.read_csv(combined_csv_path)
yesterday_df.tail(5)

Unnamed: 0,date,artist_names,track_name,rank,previous_rank,rank_change,peak_rank,days_at_peak,days_on_chart,popularity_score,daily_streams,daily_streams_change,7Day_streams,7Day_streams_change,total_streams,label,track_id
795,08-02-2024,Ed Sheeran,Shivers,196,183,-13,4,,877,65,49087,-713.0,342125,-1088,81744453,Atlantic Records UK,3xWGA8pa0IKFI7IMPri4P0
796,08-02-2024,"benny blanco, Halsey, Khalid",Eastside (with Halsey & Khalid),197,156,-41,1,22.0,1437,82,48969,-3982.0,299949,48969,109206018,Friends Keep Secrets/Interscope Records,7FGq80cy8juXBCD2nrqdWU
797,08-02-2024,Taylor Swift,Blank Space,198,178,-20,7,,776,88,48968,-1149.0,191081,48968,35304293,"Big Machine Records, LLC",1u8c2t2Cy7UBoG4ArRcF5g
798,08-02-2024,5 Seconds of Summer,Youngblood,199,165,-34,1,41.0,1746,85,48955,-2834.0,296179,-1791,117222507,Capitol,2iUXsYOEPhVqEBwsqP70rE
799,08-02-2024,The Beatles,Here Comes The Sun - Remastered 2009,200,163,-37,111,,63,87,48655,-3391.0,297275,-96,2670238,EMI Catalogue,6dGnYIeXmHdcikdzNNDMm2


In [22]:
# Append today's chart to the stored history.
# Rows already stored for the incoming date(s) are dropped first, so running
# the notebook again for the same chart day replaces that day instead of
# appending it a second time.
already_loaded = yesterday_df['date'].isin(df['date'].unique())
combined_df = pd.concat([yesterday_df[~already_loaded], df], ignore_index=True)

In [23]:
# Row count after appending today's chart to the stored history
print(len(combined_df))

1000


In [24]:
# Write the extended history back to the same CSV it was read from
# (index=False keeps the row index out of the file)
combined_df.to_csv('combined_data/combined.csv', index=False)

## Example usage of `get_tracks_positions_in_playlists`

Example: Checking which playlists and positions Noah Kahan's 'Stick Season' is in

In [25]:
# Example: look up the playlists and positions for Noah Kahan's 'Stick Season'
# using its track_id
track_ids = ['0mflMxspEfB0VbI1kyLiAv']  # ... add other track IDs

track_positions = get_tracks_positions_in_playlists(
    client_id, client_secret, redirect_uri, playlists_dict, track_ids
)

for track_id, playlists in track_positions.items():
    print(f"Track ID {track_id} is in the following playlists at these positions:")
    for playlist_name, positions in playlists.items():
        # Positions are 0-indexed; display them 1-indexed
        one_based = [str(position + 1) for position in positions]
        positions_str = ", ".join(one_based)
        print(f"Playlist: {playlist_name}: Position #{positions_str}")
    if not playlists:
        print("This track is not in any of the given playlists.")

Track ID 0mflMxspEfB0VbI1kyLiAv is in the following playlists at these positions:
Playlist: Hot Hits Australia: Position #5
Playlist: Today's Top Hits: Position #12
