In [1]:
import pandas as pd
import numpy as np
import datetime as dt

# Scrape the chart page and keep the first table that pandas parses from it
df = pd.read_html('https://kworb.net/spotify/country/au_daily.html')[0]

# Date stamp for this snapshot: today minus two days, to account for data lag
current_date_minus_2 = dt.date.today() - dt.timedelta(days=2)

# Attach the date stamp to every row and promote it to the index
# (builds a new frame instead of mutating in place)
df = df.assign(DATE=current_date_minus_2).set_index('DATE')

# Short headers from the scraped table -> descriptive column names used downstream
source_to_feature_names = {
    'Pos': 'SPOTIFY_POS',
    'P+': 'SPOTIFY_MOVEMENT',
    'Artist and Title': 'ARTIST_TITLE',
    'Days': 'DAYS_IN_CHART',
    'Pk': 'SPOTIFY_PEAK',
    '(x?)': 'COUNT_AT_PEAK',
    'Streams': 'SPOTIFY_DAILY_STREAMS',
    'Streams+': 'SPOTIFY_DAILY_STREAMS_MOVEMENT',
    '7Day': 'SPOTIFY_7DAY_STREAMS',
    '7Day+': 'SPOTIFY_7DAY_STREAMS_MOVEMENT',
    'Total': 'SPOTIFY_TOTAL_STREAMS',
}
df = df.rename(columns=source_to_feature_names)

# SPOTIFY_MOVEMENT: strip the explicit "+" sign, then convert to numbers.
# Values that are not numeric are coerced to NaN instead of raising.
# NOTE(review): presumably an unchanged position is marked with a non-numeric
# symbol and therefore becomes NaN rather than 0 — confirm this is intended.
df['SPOTIFY_MOVEMENT'] = pd.to_numeric(
    df['SPOTIFY_MOVEMENT'].str.replace('+', '', regex=False),
    errors='coerce',
)

# COUNT_AT_PEAK: keep only the first run of digits in each value, then convert
# to numbers; rows without any digits stay NaN.
# The pattern is a raw string so that \d is passed to the regex engine as the
# digit class instead of being treated as an (invalid) Python string escape.
df['COUNT_AT_PEAK'] = pd.to_numeric(
    df['COUNT_AT_PEAK'].str.extract(r'(\d+)', expand=False),
    errors='coerce',
)


# Placeholder feature/target columns: created empty (NaN) here
for placeholder_name in ('ON_TOUR', 'ACTIVE_PROMO', 'ARIA_LW', 'ARIA_TW'):
    df[placeholder_name] = np.nan

# Final column layout: identifier first, then the scraped chart metrics,
# then the placeholder columns
columns_order = [
    'ARTIST_TITLE',
    'SPOTIFY_POS',
    'SPOTIFY_MOVEMENT',
    'DAYS_IN_CHART',
    'SPOTIFY_PEAK',
    'COUNT_AT_PEAK',
    'SPOTIFY_DAILY_STREAMS',
    'SPOTIFY_DAILY_STREAMS_MOVEMENT',
    'SPOTIFY_7DAY_STREAMS',
    'SPOTIFY_7DAY_STREAMS_MOVEMENT',
    'SPOTIFY_TOTAL_STREAMS',
    'ON_TOUR',
    'ACTIVE_PROMO',
    'ARIA_LW',
    'ARIA_TW',
]

# Select exactly these columns, in this order (raises KeyError if any is missing)
df = df.loc[:, columns_order]

# Columns stored as pandas' nullable integer type ('Int64'), which can hold
# missing values alongside whole numbers
nullable_int_columns = [
    'SPOTIFY_POS',
    'SPOTIFY_MOVEMENT',
    'DAYS_IN_CHART',
    'SPOTIFY_PEAK',
    'COUNT_AT_PEAK',
    'SPOTIFY_DAILY_STREAMS',
    'SPOTIFY_DAILY_STREAMS_MOVEMENT',
    'SPOTIFY_7DAY_STREAMS',
    'SPOTIFY_7DAY_STREAMS_MOVEMENT',
    'SPOTIFY_TOTAL_STREAMS',
    'ON_TOUR',
    'ACTIVE_PROMO',
    'ARIA_LW',
    'ARIA_TW',
]

# Column name -> target dtype; the only non-integer column is the text label
dtype_mapping = dict.fromkeys(nullable_int_columns, 'Int64')
dtype_mapping['ARTIST_TITLE'] = 'str'

# Cast every mapped column in a single pass
df = df.astype(dtype_mapping)

# The DATE index holds plain date objects; convert it to a proper datetime index
df.index = pd.to_datetime(df.index)


In [3]:
# Preview the first 10 rows of the cleaned, typed chart frame
df.head(10)

Unnamed: 0_level_0,ARTIST_TITLE,SPOTIFY_POS,SPOTIFY_MOVEMENT,DAYS_IN_CHART,SPOTIFY_PEAK,COUNT_AT_PEAK,SPOTIFY_DAILY_STREAMS,SPOTIFY_DAILY_STREAMS_MOVEMENT,SPOTIFY_7DAY_STREAMS,SPOTIFY_7DAY_STREAMS_MOVEMENT,SPOTIFY_TOTAL_STREAMS,ON_TOUR,ACTIVE_PROMO,ARIA_LW,ARIA_TW
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2024-01-30,Noah Kahan - Stick Season,1,,117,1,4.0,358613,11909,2245083,34599,18993056,,,,
2024-01-30,Jack Harlow - Lovin On Me,2,,82,1,72.0,322513,4222,2196751,-30924,26470212,,,,
2024-01-30,Tate McRae - greedy,3,,138,1,9.0,245100,8091,1587211,-2322,31320119,,,,
2024-01-30,"cassö - Prada (w/ RAYE, D-Block Europe)",4,,171,4,,233438,10306,1475683,35515,26972099,,,,
2024-01-30,Taylor Swift - Cruel Summer,5,,360,1,1.0,217606,-2039,1463639,20400,50082515,,,,
2024-01-30,Teddy Swims - Lose Control,6,1.0,88,6,,202485,13159,1242373,7833,8942131,,,,
2024-01-30,Zach Bryan - I Remember Everything (w/ Kacey M...,7,-1.0,159,4,,196059,979,1311455,-2565,26257076,,,,
2024-01-30,Benson Boone - Beautiful Things,8,2.0,12,8,,192669,16808,987639,76230,1587085,,,,
2024-01-30,Sophie Ellis-Bextor - Murder On The Dancefloor,9,-1.0,34,4,,181667,-3913,1326681,-26042,4453921,,,,
2024-01-30,Dom Dolla - Saving Up,10,2.0,63,10,,178875,7807,961793,84262,4299983,,,,


## Possible features to think about...

 - Spotify Popularity Score 
 - ARIA position LW
 - On Tour In Market (yes or no) - Categorical Feature 
 - Promo In Market (yes or no) - Categorical Feature
 - Apple position 
 - Shazam Chart
 - Potentially an aggregated chart score that averages the Apple, Radio, and Shazam charts?
 - YouTube Most Viewed Video ?
 - In Twitter Trending?
 - Grok Twitter Sentiment for the artist

 ## Target Variable?
  - ARIA position TW (target)

In [4]:
# Save to CSV. The chart date lives only in the DATE index, so the index must
# be written out (as the first column); with index=False the saved file would
# contain no date at all.
df.to_csv('main_chart_data.csv', index=True)