In [None]:
import kagglehub
import os
import shutil
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Notebook-wide default figure size (width, height); individual cells may
# still override it with an explicit figsize.
plt.rcParams.update({'figure.figsize': (10, 5)})
# plt.rcParams['figure.autolayout'] = True

In [None]:
# Download the latest version of each dataset (kagglehub returns the local
# path of the downloaded copy).
path1 = kagglehub.dataset_download("tonygordonjr/spotify-dataset-2023")
path2 = kagglehub.dataset_download("yamaerenay/spotify-dataset-19212020-600k-tracks")

# Working copy of the data lives in ./data relative to the current directory.
target_dir = os.path.join(os.getcwd(), "data")
os.makedirs(target_dir, exist_ok=True)

# Copy (rather than move) each download into target_dir/<basename>:
#   * shutil.move raised on every re-run of this cell because the destination
#     already existed, so the notebook did not survive "Restart & Run All";
#   * moving the directory away from where kagglehub placed it meant the next
#     dataset_download call could no longer find its local copy.
# The resulting layout (target_dir/<basename of the downloaded path>) is the
# same one shutil.move produced.
# NOTE(review): if both downloads share the same basename they end up in the
# same folder -- confirm the two datasets resolve to different names.
for cached_path in (path1, path2):
    destination = os.path.join(
        target_dir, os.path.basename(os.path.normpath(cached_path))
    )
    if os.path.isdir(cached_path):
        shutil.copytree(cached_path, destination, dirs_exist_ok=True)
    else:
        shutil.copy2(cached_path, destination)

In [None]:
# Load the tracks table.
# NOTE(review): the download cell writes to <cwd>/data while this reads from
# ../data -- confirm the two locations agree for your working directory.
df = pd.read_csv("../data/1/tracks.csv")

# Parse release dates; format='mixed' lets pandas infer the format of each
# value individually.
# The former trailing .dropna() was removed: assigning the result back to the
# column realigns it on df's index, so any dropped rows simply reappeared as
# NaT and the call had no effect. Rows with missing dates are therefore still
# present; filter with df.dropna(subset=['release_date']) if they must go.
df['release_date'] = pd.to_datetime(df['release_date'], format='mixed')

# Quick sanity check: show the first tracks whose name is exactly 'Style'.
df[df['name'] == 'Style'].head()
# df[df['id_artists'].apply(lambda x: '06HL4z0CvFAxyc27GXpf02' in x)].head()

In [None]:
# Histogram of the 'popularity' column in ten bins, with an x tick every
# 10 units from 0 to 100.
fig, ax = plt.subplots()
ax.hist(df['popularity'], bins=10, color='skyblue', edgecolor='black')
ax.set_title("Distribution of Song Popularity")
ax.set_xlabel("Popularity (0–100)")
ax.set_ylabel("Number of Songs")
ax.set_xticks(np.arange(0, 101, 10))
plt.show()

In [None]:
# Pairwise correlation matrix of every numeric column in df.
num_cols = df.select_dtypes(include=np.number)
corr = num_cols.corr()

plt.figure(figsize=(10, 8))
# Correlations live in [-1, 1]. Pinning vmin/vmax to that range keeps the
# neutral midpoint of the diverging 'coolwarm' map at exactly zero; without
# it the colours are scaled to the observed min/max, so the midpoint colour
# would not mean "no correlation".
plt.imshow(
    corr,
    cmap='coolwarm',
    interpolation='nearest',
    aspect='auto',
    vmin=-1,
    vmax=1,
)
plt.colorbar()

# One tick per column/row, labelled with the column name. Rotated x labels
# are right-aligned so the end of each label sits under its own tick.
plt.xticks(np.arange(corr.shape[1]), corr.columns, rotation=45, ha='right')
plt.yticks(np.arange(corr.shape[0]), corr.index)
plt.title("Heatmap", fontsize=16)
plt.tight_layout()
plt.show()

In [None]:
# Audio features to compare against popularity, one figure per feature.
features = ['danceability', 'energy', 'acousticness', 'tempo']

# Reproducible random subset used for the scatter points only. The subset was
# previously computed but never used, so every figure drew the entire frame.
# The size is capped at len(df) because DataFrame.sample raises when asked
# for more rows than exist (without replacement).
sample = df.sample(min(1000, len(df)), random_state=22)

for f in features:
    # Fit the linear trend on all rows that have both values, so the line
    # still describes the whole dataset rather than just the plotted subset.
    # NaNs are dropped first because np.polyfit would otherwise return NaN
    # coefficients.
    pairs = df[[f, 'popularity']].dropna()
    x = pairs[f]
    y = pairs['popularity']
    m, b = np.polyfit(x, y, 1)

    plt.scatter(sample[f], sample['popularity'], alpha=0.1, s=10)

    # A straight line is fully defined by its two end points; there is no
    # need to evaluate it at every row of the frame.
    x_line = np.array([x.min(), x.max()])
    plt.plot(x_line, m * x_line + b, color='red')

    plt.title(f"{f.title()} vs Popularity (Trendline)")
    plt.xlabel(f.title())
    plt.ylabel("Popularity")
    plt.show()

In [None]:
# Mean popularity per release year. The grouping key is the year extracted
# from 'release_date'; after reset_index() that year column keeps the name
# 'release_date', which is why it is used as the x values below.
release_year = df['release_date'].dt.year
yearly_pop = df.groupby(release_year)['popularity'].mean().reset_index()

fig, ax = plt.subplots()
ax.plot(yearly_pop['release_date'], yearly_pop['popularity'])
ax.set_title("Average Popularity of Song from Year")
ax.set_xlabel("Year")
ax.set_ylabel("Popularity")
plt.show()