In [None]:
# LDS + FDS
from collections import Counter

import numpy as np
import pandas as pd
import torch
from scipy.ndimage import convolve1d
from scipy.ndimage import gaussian_filter1d
from scipy.signal.windows import triang
from sklearn.metrics import mean_squared_error, r2_score

from fds import FDS
# Read the dataset and discard the columns listed in `categorical`, keeping
# everything else for the steps below.
categorical = ['Unnamed: 0','track_id','track_genre','explicit', 'mode', 'key', 'time_signature']
data = pd.read_csv('./data/dataset.csv').drop(columns=categorical)
# Define the binning function for 'popularity'
def get_bin_idx(label, num_bins=10, max_label=100):
    """Return the index of the equal-width bin that ``label`` falls into.

    The range ``[0, max_label]`` is split into ``num_bins`` bins of equal
    width; the upper bound itself is assigned to the last bin.

    Args:
        label: Numeric label to bin (e.g. a popularity score).
        num_bins: Number of equal-width bins.
        max_label: Upper bound of the label range.

    Returns:
        int: Bin index clamped to ``[0, num_bins - 1]``.
    """
    # Multiply before floor-dividing: dividing by a precomputed float width
    # misplaces labels that sit exactly on a bin edge, e.g.
    # 50 // (100 / 6) evaluates to 2.0 although 50 opens bin 3.
    bin_idx = int(label * num_bins // max_label)
    # Clamp both ends so out-of-range labels still map to a valid bin.
    return max(0, min(bin_idx, num_bins - 1))

# Map every 'popularity' score to its bin index
bin_index_per_label = list(map(get_bin_idx, data['popularity']))

# Count samples per bin, then lay the counts out as a dense list covering
# bins 0 .. Nb - 1 (bins without samples count as 0)
num_samples_of_bins = dict(Counter(bin_index_per_label))
Nb = 1 + max(bin_index_per_label)
emp_label_dist = [num_samples_of_bins.get(bin_id, 0) for bin_id in range(Nb)]

# Define and get the LDS kernel window
def get_lds_kernel_window(kernel='gaussian', ks=5, sigma=2):
    """Build the smoothing window used for label distribution smoothing.

    Args:
        kernel: One of 'gaussian', 'triang' or 'laplace'.
        ks: Kernel size. The gaussian and laplace windows span
            ``2 * ((ks - 1) // 2) + 1`` taps; 'triang' uses ``ks`` directly.
        sigma: Scale of the gaussian / laplace kernel (ignored for 'triang').

    Returns:
        Array of window weights. The gaussian and laplace windows are divided
        by their maximum, so their peak value is 1.

    Raises:
        AssertionError: If ``kernel`` is not one of the supported names.
    """
    assert kernel in ['gaussian', 'triang', 'laplace']
    radius = (ks - 1) // 2

    if kernel == 'triang':
        return triang(ks)

    if kernel == 'gaussian':
        # Filter a unit impulse to obtain the kernel's discrete response.
        impulse = [0.] * radius + [1.] + [0.] * radius
        response = gaussian_filter1d(impulse, sigma=sigma)
        return response / max(response)

    # Only 'laplace' is left at this point.
    def laplace(x):
        return np.exp(-abs(x) / sigma) / (2. * sigma)

    taps = [laplace(offset) for offset in np.arange(-radius, radius + 1)]
    return taps / max(taps)

lds_kernel_window = get_lds_kernel_window()

# Apply the convolution to get the effective label distribution.
# The counts are cast to float first: convolve1d returns an array of the
# input's dtype, so integer counts would truncate the smoothed values.
eff_label_dist = convolve1d(
    np.asarray(emp_label_dist, dtype=float),
    weights=lds_kernel_window,
    mode='constant',
)

eff_label_dist

In [None]:
# Assign every sample to one of the 10 equal-width popularity bins
# (edges 0, 10, ..., 100), matching get_bin_idx's default binning.
bins = np.linspace(0, 100, num=11)  # Adjust based on how 'popularity' was binned
# np.digitize is 1-based and places popularity == 100 past the last edge, so
# shift to 0-based indices and clip into the valid range [0, len(bins) - 2].
data['bin_index'] = (np.digitize(data['popularity'], bins) - 1).clip(0, len(bins) - 2)

total_samples = len(data)  # total number of samples in your dataset

# Per-bin weight, inversely proportional to the smoothed label density.
# Bins with zero density get weight 0 instead of inf (division by zero).
eff_density = np.asarray(eff_label_dist, dtype=float)
weights = np.divide(1.0, eff_density, out=np.zeros_like(eff_density), where=eff_density > 0)

# Rescale so that the *per-sample* weights average to 1. Normalising by the
# sum over bins (weights.sum()) would instead make the handful of bin weights
# add up to total_samples, inflating every sample weight by orders of magnitude.
bin_of_sample = data['bin_index'].to_numpy()
weights_normalized = weights * (total_samples / weights[bin_of_sample].sum())
data['weight'] = weights_normalized[bin_of_sample]

data.head()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Normalize features
# NOTE(review): the scaler is fitted on every row before the split below, so
# the held-out rows contribute to the mean/std used to scale the training data.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(data.drop(['popularity', 'bin_index', 'weight'], axis=1))
y = data['popularity']

# Split data, including weights
X_train, X_test, y_train, y_test, weights_train, weights_test = train_test_split(
    X_scaled, y, data['weight'], test_size=0.2, random_state=42
)

# Convert the label Series through NumPy before building tensors: after the
# shuffle their index is no longer 0..n-1, and torch.tensor() reads a Series
# item by item with integer keys, which pandas resolves by label, not position.
y_train_numpy = y_train.to_numpy()
y_test_numpy = y_test.to_numpy()

X_train_torch = torch.tensor(X_train, dtype=torch.float32)
y_train_torch = torch.tensor(y_train_numpy, dtype=torch.float32)
X_test_torch = torch.tensor(X_test, dtype=torch.float32)
y_test_torch = torch.tensor(y_test_numpy, dtype=torch.float32)

In [None]:
# NOTE(review): `xgb` is not imported in the visible cells; presumably
# `import xgboost as xgb` ran elsewhere -- confirm before Restart & Run All.
fds = FDS(feature_dim=X_train_torch.shape[1], start_smooth=2)

# NOTE(review): epoch=1 is below start_smooth=2. If FDS skips smoothing before
# `start_smooth` (its implementation is not visible here), both smooth() calls
# return the features unchanged -- confirm this is intended.
X_train_smoothed_torch = fds.smooth(X_train_torch, y_train_torch, epoch=1)
X_train_smoothed = X_train_smoothed_torch.numpy()

# Fit the regressor on the (smoothed) training features with the LDS weights.
regressorDIR = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100)
regressorDIR.fit(X_train_smoothed, y_train, sample_weight=weights_train)

# Labels are passed as a tensor, matching the training call above (the NumPy
# array was previously passed here while `y_test_torch` went unused).
# NOTE(review): this smooths test features using the test labels, which would
# not be available at inference time -- confirm this is intended.
X_test_smoothed_torch = fds.smooth(X_test_torch, y_test_torch, epoch=1)  # Same epoch as above
X_test_smoothed = X_test_smoothed_torch.numpy()

In [None]:
# Weighted evaluation on the held-out split.
y_pred = regressorDIR.predict(X_test_smoothed)

# scikit-learn's keyword is `sample_weight` (singular); passing
# `sample_weights` raises a TypeError for an unexpected keyword argument.
mse = mean_squared_error(y_test, y_pred, sample_weight=weights_test)
rmse = mse ** 0.5
r2 = r2_score(y_test, y_pred, sample_weight=weights_test)
mse, rmse, r2