In [None]:
import kagglehub
import os
from ast import literal_eval
import shutil
import pandas as pd
import numpy as np
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt

# Default (width, height) for every figure that does not pass its own figsize.
plt.rcParams['figure.figsize'] = (10, 5)

In [None]:
# One-time dataset download, left disabled (the whole cell is commented out).
# When enabled it fetches both Kaggle datasets with kagglehub and moves the
# downloaded folders into a "data" directory under the current working directory.
# NOTE(review): the loading cell reads from ../data/1 and ../data/2, which this
# code does not create by itself -- presumably the folders were renamed/moved by
# hand afterwards; confirm before re-enabling.
# # Download latest version
# path1 = kagglehub.dataset_download("tonygordonjr/spotify-dataset-2023")
# path2 = kagglehub.dataset_download("yamaerenay/spotify-dataset-19212020-600k-tracks")

# target_dir = os.path.join(os.getcwd(), "data")
# if not os.path.exists(target_dir):
#     os.makedirs(target_dir)

# shutil.move(path1, target_dir)
# shutil.move(path2, target_dir)

In [None]:
# Load the raw 2020 track and artist tables.
df2020tracks = pd.read_csv("../data/1/tracks.csv")
df2020artists = pd.read_csv("../data/1/artists.csv")

# Keep only the artist columns needed downstream and rename them so they do not
# clash with the track-level 'id' and 'popularity' columns after the merge.
df2020artists = df2020artists[['id', 'followers', 'popularity']].rename(
    columns={'id': 'artist_id', "popularity": "artist_popularity"}
)

# 'id_artists' is stored as the text of a Python list; parse it into a real list
# and take the first entry as the track's primary artist.
df2020tracks['id_artists'] = df2020tracks['id_artists'].apply(literal_eval)
df2020tracks['id_artist_0'] = df2020tracks['id_artists'].apply(
    lambda x: x[0] if isinstance(x, list) and len(x) > 0 else None
)

# Inner join: tracks whose first artist has no row in the artist table are dropped.
df2020 = pd.merge(df2020tracks, df2020artists, left_on="id_artist_0", right_on="artist_id", how="inner")

# Forward slashes work on Windows, macOS and Linux alike; the previous
# backslash-separated path could only be resolved on Windows.
df2023 = pd.read_csv("../data/2/spotify_data_12_20_2023.csv")

# Row counts of the two raw frames (2020 first, then 2023).
print(len(df2020.index))
print(len(df2023.index))

In [None]:
# Parse the 2020 release dates. format='mixed' makes pandas infer the format of
# each value individually instead of assuming a single format for the column.
df2020['release_date'] = pd.to_datetime(df2020['release_date'], format='mixed')

# Columns of the 2023 file that are kept (names as they read after lower-casing).
keep_cols = [
    'track_id', 'track_name', 'artist_0', 'artist_id', 'artist_popularity',
    'danceability', 'energy', 'valence', 'acousticness', 'tempo',
    'speechiness', 'liveness', 'instrumentalness', 'loudness',
    'duration_ms', 'explicit', 'track_popularity', 'release_date',
    'followers'
]

# Month name -> month number, used to rebuild a date from year + month name.
month_map = {
    'January': 1, 'February': 2, 'March': 3, 'April': 4,
    'May': 5, 'June': 6, 'July': 7, 'August': 8,
    'September': 9, 'October': 10, 'November': 11, 'December': 12
}

# NOTE: this cell cannot be re-run on its own. After the column selection and
# rename further down, df2023 no longer has 'release_year', 'release_month' or
# 'track_id', so a second run raises KeyError; re-run the loading cell first.
df2023.columns = df2023.columns.str.strip().str.lower()

# Unparseable years become <NA>; month names missing from month_map become NaN.
df2023['release_year'] = pd.to_numeric(df2023['release_year'], errors='coerce').astype('Int64')
df2023['release_month'] = df2023['release_month'].astype(str).str.strip()
df2023['month_num'] = df2023['release_month'].map(month_map)

# Rows without a usable year or month cannot be dated, so they are dropped.
df2023 = df2023.dropna(subset=['release_year', 'month_num']).copy()

# Build a release date pinned to the first day of the release month.
df2023['release_date'] = pd.to_datetime(
    dict(year=df2023['release_year'].astype(int),
         month=df2023['month_num'].astype(int),
         day=1),
    errors='coerce'
)

# Reduce the 2023 frame to the shared columns and align its names with df2020.
df2023 = df2023[keep_cols]
df2023 = df2023.rename(columns={
    'track_id': 'id',
    'track_name': 'name',
    'artist_0': 'artist',
    'track_popularity': 'popularity'
})

# Normalise 'explicit' to 0/1 integers. Values are compared as lower-cased text
# so that bool (True/False), int (1/0), float (1.0/0.0) and differently cased
# string encodings all map correctly; anything unrecognised (including missing
# values) still falls back to 0, as before.
explicit_map = {'true': 1, 'false': 0, '1': 1, '0': 0, '1.0': 1, '0.0': 0}
df2020['explicit'] = (
    df2020['explicit'].astype(str).str.strip().str.lower()
    .map(explicit_map).fillna(0).astype(int)
)
df2023['explicit'] = (
    df2023['explicit'].astype(str).str.strip().str.lower()
    .map(explicit_map).fillna(0).astype(int)
)

print(df2020.columns)
print(df2023.columns)

In [None]:
# Thresholds and patterns used by the cleaning stages below.
MIN_FOLLOWERS = 10_000
MIN_DURATION_MS = 60_000
MAX_DURATION_MS = 614_000
REQUIRED_COLS = ['popularity', 'danceability', 'energy', 'valence']
# Whole-word match, so titles such as "Alive", "Deliver" or "Olive" are no longer
# removed just because they contain the letters "live". A title that uses one of
# these words in its ordinary sense (e.g. "Live Your Life") is still filtered out.
ALT_VERSION_PATTERN = r'\b(?:remix(?:es|ed)?|live|instrumentals?|acoustic|karaoke)\b'

# Each stage takes a frame and returns the filtered frame. The same stages are
# applied, in this order, to both datasets.
cleaning_stages = [
    # 1. one row per track id
    lambda df: df.drop_duplicates(subset='id'),
    # 2. the core audio features and the target must be present
    lambda df: df.dropna(subset=REQUIRED_COLS),
    # 3. minimum artist following and a bounded track length (both ends inclusive)
    lambda df: df[
        (df['followers'] >= MIN_FOLLOWERS)
        & df['duration_ms'].between(MIN_DURATION_MS, MAX_DURATION_MS)
    ],
    # 4. a tempo of 0 (or a missing tempo) is discarded
    lambda df: df[df['tempo'] > 0],
    # 5. drop alternate versions identified by their title
    lambda df: df[~df['name'].str.contains(ALT_VERSION_PATTERN, case=False, na=False)],
]

# After every stage print the remaining row counts: 2020 first, then 2023.
for stage in cleaning_stages:
    df2020 = stage(df2020)
    df2023 = stage(df2023)
    print(len(df2020.index))
    print(len(df2023.index))


In [None]:
# Write the cleaned frames to disk. DataFrame.to_csv does not create missing
# directories, so make sure the output folder exists first.
output_dir = "../data/3"
os.makedirs(output_dir, exist_ok=True)

df2020.to_csv(f"{output_dir}/spotify_2020_cleaned.csv", index=False)
df2023.to_csv(f"{output_dir}/spotify_2023_cleaned.csv", index=False)

In [None]:
# Show the five highest-popularity rows of the 2020 frame.
top_track_cols = ['name', 'artists', 'popularity']
ranked_2020 = df2020.sort_values(by='popularity', ascending=False)
print(ranked_2020[top_track_cols].head(5))
# print('\n\n\n\n')
# print(df2023.sort_values(by='popularity', ascending=False)[['name', 'artist', 'popularity']].head(5))


In [None]:
# Fixed bin edges 0, 10, ..., 100 so the bars line up with the x ticks and both
# datasets are binned identically (bins=10 would derive the edges from each
# frame's own min/max instead).
popularity_bins = np.arange(0, 101, 10)

# One histogram per dataset; the label in the title tells the figures apart.
for label, frame in (("2020 dataset", df2020), ("2023 dataset", df2023)):
    plt.hist(frame['popularity'], bins=popularity_bins, color='skyblue', edgecolor='black')
    plt.title(f"Distribution of Song Popularity ({label})")
    plt.xlabel("Popularity (0–100)")
    plt.ylabel("Number of Songs")
    plt.xticks(popularity_bins)
    plt.show()

In [None]:
# print(df2020[df2020['id_artists'].apply(lambda x: '06HL4z0CvFAxyc27GXpf02' in x)].sort_values(by='popularity', ascending=False)[['name', 'artists', 'popularity']])
# List every 2023 row of one artist id, most popular first.
lookup_artist_id = '06HL4z0CvFAxyc27GXpf02'
artist_rows_2023 = df2023[df2023['artist_id'] == lookup_artist_id]
artist_rows_2023 = artist_rows_2023.sort_values(by='popularity', ascending=False)
print(artist_rows_2023[['name', 'artist', 'popularity', 'followers']])


In [None]:
# Correlation heatmap of the numeric columns, one figure per dataset.
for label, frame in (("2020 dataset", df2020), ("2023 dataset", df2023)):
    num_cols = frame.select_dtypes(include=np.number)
    corr = num_cols.corr()

    plt.figure(figsize=(10, 8))
    # Pin the colour scale to the full correlation range so that 0 sits at the
    # neutral centre of 'coolwarm' and the two figures are directly comparable.
    plt.imshow(corr, cmap='coolwarm', interpolation='nearest', aspect='auto', vmin=-1, vmax=1)
    plt.colorbar()

    # ha='right' anchors the end of each rotated label at its tick.
    plt.xticks(np.arange(corr.shape[1]), corr.columns, rotation=45, ha='right')
    plt.yticks(np.arange(corr.shape[0]), corr.index)
    plt.title(f"Correlation Heatmap ({label})", fontsize=16)
    plt.tight_layout()
    plt.show()

In [None]:
# Audio features plotted against popularity.
features = ['danceability', 'energy', 'acousticness', 'tempo']

# NOTE(review): artist_id / df2020_artist_songs (and `sample` below) are not used
# by any plot in this cell; they are kept in case later cells rely on them.
artist_id = '06HL4z0CvFAxyc27GXpf02'
df2020_artist_songs = df2020[df2020['artist_id'] == artist_id]

for label, frame in (("2020 dataset", df2020), ("2023 dataset", df2023)):
    # Cap the sample size at the frame length: sample(1000) raises ValueError
    # when fewer than 1000 rows survived the cleaning.
    sample = frame.sample(min(1000, len(frame)), random_state=22)

    for f in features:
        # Only 'popularity', 'danceability', 'energy' and 'valence' were checked
        # for missing values earlier, so drop incomplete pairs here; np.polyfit
        # cannot fit through NaN.
        pairs = frame[[f, 'popularity']].dropna()
        x = pairs[f]
        y = pairs['popularity']

        plt.scatter(x, y, alpha=0.1, s=10)

        # A linear fit needs at least two distinct x values.
        if x.nunique() >= 2:
            m, b = np.polyfit(x, y, 1)
            # A straight line is fully described by its two end points, so there
            # is no need to draw it through every (unsorted) data point.
            x_ends = np.array([x.min(), x.max()])
            plt.plot(x_ends, m * x_ends + b, color='red')

        plt.title(f"{f.title()} vs Popularity (Trendline, {label})")
        plt.xlabel(f.title())
        plt.ylabel("Popularity")
        plt.show()


In [None]:
# Mean popularity per release year, drawn as one line chart per dataset
# (2020 frame first, then the 2023 frame).
for frame in (df2020, df2023):
    by_year = frame['release_date'].dt.year
    # After reset_index the 'release_date' column holds the year number.
    yearly_pop = frame.groupby(by_year)['popularity'].mean().reset_index()

    plt.plot(yearly_pop['release_date'], yearly_pop['popularity'])
    plt.title("Average Popularity of Song from Year")
    plt.xlabel("Year")
    plt.ylabel("Popularity")
    plt.show()