In [10]:
from collections import Counter
import torch
from utils.fds import FDS
from scipy.ndimage import convolve1d
from scipy.ndimage import gaussian_filter1d
from scipy.signal.windows import triang
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
import pandas as pd

In [16]:
data = pd.read_csv('./data/data_smogn_05.csv')

# LDS

In [17]:
# Define the binning function for 'popularity'
def get_bin_idx(label, num_bins=10):
    bin_width = 100 / num_bins
    bin_idx = int(label // bin_width)
    return min(bin_idx, num_bins - 1)  # Ensure it does not go out of bounds

# Binning the 'popularity' scores
bin_index_per_label = [get_bin_idx(label) for label in data['popularity']]

# Calculate the number of bins and empirical label distribution
Nb = max(bin_index_per_label) + 1
num_samples_of_bins = dict(Counter(bin_index_per_label))
emp_label_dist = [num_samples_of_bins.get(i, 0) for i in range(Nb)]

# Define and get the LDS kernel window
def get_lds_kernel_window(kernel='gaussian', ks=5, sigma=2):
    assert kernel in ['gaussian', 'triang', 'laplace']
    half_ks = (ks - 1) // 2
    if kernel == 'gaussian':
        base_kernel = [0.] * half_ks + [1.] + [0.] * half_ks
        kernel_window = gaussian_filter1d(base_kernel, sigma=sigma) / max(gaussian_filter1d(base_kernel, sigma=sigma))
    elif kernel == 'triang':
        kernel_window = triang(ks)
    else:
        laplace = lambda x: np.exp(-abs(x) / sigma) / (2. * sigma)
        kernel_window = list(map(laplace, np.arange(-half_ks, half_ks + 1))) / max(map(laplace, np.arange(-half_ks, half_ks + 1)))

    return kernel_window

lds_kernel_window = get_lds_kernel_window()

# Apply the convolution to get the effective label distribution
eff_label_dist = convolve1d(np.array(emp_label_dist), weights=lds_kernel_window, mode='constant')

eff_label_dist

array([22111, 28429, 35966, 32890, 40154, 58912, 57908, 63167, 56074,
       43001])

In [18]:
bins = np.linspace(0, 100, num=11)  # Adjust based on how 'popularity' was binned
data['bin_index'] = np.digitize(data['popularity'], bins) - 1
data['bin_index'] = data['bin_index'].clip(0, 10-1)   # Assign bins  # total number of samples in your dataset
total_samples = len(data)  # total number of samples in your dataset
weights = 1 / eff_label_dist  # inversely proportional to distribution
weights_normalized = weights / weights.sum() * total_samples
data['weight'] = data['bin_index'].map(lambda x: weights_normalized[x])
data

Unnamed: 0,popularity,duration_ms,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,bin_index,weight
0,67.0,351413.0,0.437119,0.496777,0.000,0.038004,0.621056,0.055218,0.077442,0.329562,127.922188,6,7212.505044
1,68.0,351783.0,0.427469,0.615662,0.000,0.028970,0.348779,0.178368,0.102032,0.201814,92.113026,6,7212.505044
2,68.0,351661.0,0.509178,0.543150,0.000,0.027034,0.234267,0.110648,0.084248,0.328488,114.429474,6,7212.505044
3,67.0,351394.0,0.437674,0.490773,0.000,0.038458,0.635044,0.048935,0.076265,0.335983,129.716280,6,7212.505044
4,68.0,351567.0,0.572096,0.487288,0.000,0.025623,0.153533,0.057433,0.071576,0.424923,131.394058,6,7212.505044
...,...,...,...,...,...,...,...,...,...,...,...,...,...
106364,40.0,246306.0,0.470000,0.938000,-4.722,0.105000,0.000529,0.000000,0.251000,0.453000,128.002000,4,10401.497785
106365,38.0,312566.0,0.475000,0.860000,-4.722,0.042100,0.006500,0.000002,0.246000,0.427000,113.949000,3,12698.745578
106366,21.0,384999.0,0.172000,0.235000,-16.393,0.042200,0.640000,0.928000,0.086300,0.033900,125.995000,2,11612.682591
106367,41.0,283893.0,0.587000,0.506000,-10.889,0.029700,0.381000,0.000000,0.270000,0.413000,135.960000,4,10401.497785


In [19]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer

# Normalize features
scaler = Normalizer()
X_scaled = scaler.fit_transform(data.drop(['popularity', 'bin_index', 'weight'], axis=1))
y = data['popularity']

# Split data, including weights
X_train, X_test, y_train, y_test, weights_train, weights_test = train_test_split(
    X_scaled, y, data['weight'], test_size=0.2, random_state=42
)

# FDS

In [20]:
X_train_torch = torch.tensor(X_train, dtype=torch.float32)
y_train_torch = torch.tensor(y_train, dtype=torch.float32)
X_test_torch = torch.tensor(X_test, dtype=torch.float32)
y_test_numpy = y_test.to_numpy()
y_test_torch = torch.tensor(y_test_numpy, dtype=torch.float32)

fds = FDS(feature_dim = X_train_torch.shape[1],start_smooth=2)
X_train_smoothed_torch = fds.smooth(X_train_torch, y_train_torch, epoch=1)
X_train_smoothed = X_train_smoothed_torch.numpy()

regressorDIR = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100)
regressorDIR.fit(X_train_smoothed, y_train, sample_weight=weights_train)
X_test_smoothed_torch = fds.smooth(X_test_torch, y_test_numpy, epoch=1)  # Same epoch as above
X_test_smoothed = X_test_smoothed_torch.numpy()

AssertionError: Torch not compiled with CUDA enabled

In [None]:
y_pred = regressorDIR.predict(X_test_smoothed)
mse = mean_squared_error(y_test, y_pred, sample_weights = weights_test)
rmse = mse ** 0.5
r2 = r2_score(y_test, y_pred, sample_weights = weights_test)
mse, rmse, r2