In [3]:
import pandas as pd
import requests
from io import StringIO
from sklearn.feature_selection import mutual_info_regression

# Get the Data
url = 'https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-01-21/spotify_songs.csv'
response = requests.get(url)
data = StringIO(response.text)

# Read the CSV file into a DataFrame
spotify_songs = pd.read_csv(data)

# Drop rows containing null values (Only 5 rows out of 30,000)
spotify_songs = spotify_songs.dropna()

# Convert 'release_date' to datetime
spotify_songs['track_album_release_date'] = pd.to_datetime(spotify_songs['track_album_release_date'], errors='coerce')

# Extract the year from 'release_date'
spotify_songs['release_year'] = spotify_songs['track_album_release_date'].dt.year.astype('Int64')

# Drop the original 'release_date' column
spotify_songs = spotify_songs.drop(columns=['track_album_release_date'])

# Select relevant columns
advanced_attributes = ['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms']

# Assuming you have a 'popularity' column as the target variable
target_variable = 'track_popularity'

# Select relevant columns (including the target variable)
selected_columns = advanced_attributes + [target_variable]

# Filter data for the years 1960-1969
selected_data = spotify_songs[(spotify_songs['release_year'] >= 1960) & (spotify_songs['release_year'] <= 1969)]
information_by_attribute = {}
# Calculate mutual information for each attribute
for attribute in advanced_attributes:
    information_value = mutual_info_regression(selected_data[[attribute]], selected_data[target_variable])[0]
    information_by_attribute[attribute] = information_value
# Calculate Information Value for each attribute
for attribute, value in information_by_attribute.items():
    total_information = sum(information_by_attribute.values())
# Normalize the values
normalized_information = {attribute: value / total_information for attribute, value in information_by_attribute.items()}
# Display normalized Information Value for each attribute
print("Normalized Information Value for Each Attribute in the 60s:")
for attribute, value in normalized_information.items():
    print(f"{attribute}: {value}")


print()

# Filter data for the years 1970-1979
selected_data = spotify_songs[(spotify_songs['release_year'] >= 1970) & (spotify_songs['release_year'] <= 1979)]
information_by_attribute = {}
for attribute in advanced_attributes:
    information_value = mutual_info_regression(selected_data[[attribute]], selected_data[target_variable])[0]
    information_by_attribute[attribute] = information_value
for attribute, value in information_by_attribute.items():
    total_information = sum(information_by_attribute.values())
normalized_information = {attribute: value / total_information for attribute, value in information_by_attribute.items()}
print("Normalized Information Value for Each Attribute in the 70s:")
for attribute, value in normalized_information.items():
    print(f"{attribute}: {value}")

print()

# Filter data for the years 1980-1989
selected_data = spotify_songs[(spotify_songs['release_year'] >= 1980) & (spotify_songs['release_year'] <= 1989)]
information_by_attribute = {}
for attribute in advanced_attributes:
    information_value = mutual_info_regression(selected_data[[attribute]], selected_data[target_variable])[0]
    information_by_attribute[attribute] = information_value
for attribute, value in information_by_attribute.items():
    total_information = sum(information_by_attribute.values())
normalized_information = {attribute: value / total_information for attribute, value in information_by_attribute.items()}
print("Normalized Information Value for Each Attribute in the 80s:")
for attribute, value in normalized_information.items():
    print(f"{attribute}: {value}")

print()

# Filter data for the years 1990-1999
selected_data = spotify_songs[(spotify_songs['release_year'] >= 1990) & (spotify_songs['release_year'] <= 1999)]
information_by_attribute = {}
for attribute in advanced_attributes:
    information_value = mutual_info_regression(selected_data[[attribute]], selected_data[target_variable])[0]
    information_by_attribute[attribute] = information_value
for attribute, value in information_by_attribute.items():
    total_information = sum(information_by_attribute.values())
normalized_information = {attribute: value / total_information for attribute, value in information_by_attribute.items()}
print("Normalized Information Value for Each Attribute in the 90s:")
for attribute, value in normalized_information.items():
    print(f"{attribute}: {value}")

print()

# Filter data for the years 2000-2009
selected_data = spotify_songs[(spotify_songs['release_year'] >= 2000) & (spotify_songs['release_year'] <= 2009)]
information_by_attribute = {}
for attribute in advanced_attributes:
    information_value = mutual_info_regression(selected_data[[attribute]], selected_data[target_variable])[0]
    information_by_attribute[attribute] = information_value
for attribute, value in information_by_attribute.items():
    total_information = sum(information_by_attribute.values())
normalized_information = {attribute: value / total_information for attribute, value in information_by_attribute.items()}
print("Normalized Information Value for Each Attribute in the 2000s:")
for attribute, value in normalized_information.items():
    print(f"{attribute}: {value}")

print()

# Filter data for the years 2010-2020
selected_data = spotify_songs[(spotify_songs['release_year'] >= 2010) & (spotify_songs['release_year'] <= 2020)]
information_by_attribute = {}
for attribute in advanced_attributes:
    information_value = mutual_info_regression(selected_data[[attribute]], selected_data[target_variable])[0]
    information_by_attribute[attribute] = information_value
for attribute, value in information_by_attribute.items():
    total_information = sum(information_by_attribute.values())
normalized_information = {attribute: value / total_information for attribute, value in information_by_attribute.items()}
print("Normalized Information Value for Each Attribute in the 2010s:")
for attribute, value in normalized_information.items():
    print(f"{attribute}: {value}")


Normalized Information Value for Each Attribute in the 60s:
danceability: 0.20816123313321178
energy: 0.04965286272005288
key: 0.044472158540738715
loudness: 0.13882484843508963
mode: 0.018873253353303705
speechiness: 0.10218612260435249
acousticness: 0.11704780078267146
instrumentalness: 0.11063811342433671
liveness: 0.027245008085576908
valence: 0.04739049094128216
tempo: 0.03685961506357491
duration_ms: 0.09864849291580874

Normalized Information Value for Each Attribute in the 70s:
danceability: 0.11585095935701552
energy: 0.09579673437368197
key: 0.03093366103364828
loudness: 0.0887334944878867
mode: 0.01568300220170778
speechiness: 0.08395898493218214
acousticness: 0.07815241945152604
instrumentalness: 0.08692282662269228
liveness: 0.09903884900334763
valence: 0.08684269129203731
tempo: 0.08114206955807238
duration_ms: 0.13694430768620197

Normalized Information Value for Each Attribute in the 80s:
danceability: 0.09618818476589094
energy: 0.10533977643879018
key: 0.0543859451386