In [75]:
import pandas as pd
import numpy as np
import torch
from fds import FDS
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
df = pd.read_csv('data_smogn_new.csv')
categorical = ['Unnamed: 0','track_id','track_genre','explicit', 'mode', 'key', 'time_signature']
df = df.drop(categorical, axis=1)
df

Unnamed: 0,popularity,duration_ms,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo
0,63.0,178436.0,0.500292,0.195348,0.000,0.034719,0.823614,0.001574,0.113995,0.191214,138.280590
1,62.0,179176.0,0.501938,0.194890,0.000,0.036954,0.816710,0.002938,0.111788,0.189084,136.877789
2,62.0,181263.0,0.497959,0.198958,0.000,0.032760,0.822114,0.002681,0.113876,0.183350,136.125354
3,63.0,385072.0,0.080571,0.026935,0.000,0.041978,0.941882,0.932783,0.075154,0.014550,190.492789
4,63.0,384702.0,0.076992,0.028117,0.000,0.041985,0.929955,0.925046,0.071573,0.023929,191.595578
...,...,...,...,...,...,...,...,...,...,...,...
102560,22.0,369049.0,0.579000,0.245000,-16.357,0.038400,0.970000,0.924000,0.101000,0.302000,112.011000
102561,38.0,312566.0,0.475000,0.860000,-4.722,0.042100,0.006500,0.000002,0.246000,0.427000,113.949000
102562,39.0,256026.0,0.505000,0.687000,-4.375,0.028700,0.084100,0.000000,0.188000,0.382000,104.083000
102563,22.0,305454.0,0.331000,0.171000,-15.668,0.035000,0.920000,0.022900,0.067900,0.327000,132.147000


In [76]:
x = df.drop(['popularity'],axis = 1)
y = df['popularity']
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

X_train_numpy = X_train.to_numpy(dtype = np.float32)
y_train_numpy = y_train.to_numpy(dtype = np.float32)

X_train_torch = torch.tensor(X_train_numpy)
y_train_torch = torch.tensor(y_train_numpy)

In [8]:
print(X_train_torch.shape[1])

10


In [90]:
fds = FDS(feature_dim=X_train_torch.shape[1],start_smooth = 2)
X_train_smoothed_torch = fds.smooth(X_train_torch, y_train_torch, epoch=1)
X_train_smoothed = X_train_smoothed_torch.numpy()

regressor = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100)
regressor.fit(X_train_smoothed, y_train)
X_test_numpy = X_test.to_numpy(dtype = np.float32)
y_test_numpy = y_test.to_numpy(dtype = np.float32)
X_test_torch = torch.tensor(X_test_numpy)
X_test_smoothed_torch = fds.smooth(X_test_torch, torch.tensor(y_test_numpy), epoch=1)  # Same epoch as above
X_test_smoothed = X_test_smoothed_torch.numpy()

y_pred = regressor.predict(X_test_smoothed)
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
r2 = r2_score(y_test, y_pred)
print(mse,mse,r2)

263.7888683538074 263.7888683538074 0.5671014054171588


(263.7888683538074, 16.24157838246663, 0.5671014054171588)

In [70]:
# LDS + FDS
from collections import Counter
from scipy.ndimage import convolve1d
from scipy.ndimage import gaussian_filter1d
from scipy.signal.windows import triang
import numpy as np
data = pd.read_csv('data_smogn_new.csv')
categorical = ['Unnamed: 0','track_id','track_genre','explicit', 'mode', 'key', 'time_signature']
data = data.drop(categorical, axis=1)
# Define the binning function for 'popularity'
def get_bin_idx(label, num_bins=10):
    bin_width = 100 / num_bins
    bin_idx = int(label // bin_width)
    return min(bin_idx, num_bins - 1)  # Ensure it does not go out of bounds

# Binning the 'popularity' scores
bin_index_per_label = [get_bin_idx(label) for label in data['popularity']]

# Calculate the number of bins and empirical label distribution
Nb = max(bin_index_per_label) + 1
num_samples_of_bins = dict(Counter(bin_index_per_label))
emp_label_dist = [num_samples_of_bins.get(i, 0) for i in range(Nb)]

# Define and get the LDS kernel window
def get_lds_kernel_window(kernel='triang', ks=5, sigma=2):
    assert kernel in ['gaussian', 'triang', 'laplace']
    half_ks = (ks - 1) // 2
    if kernel == 'gaussian':
        base_kernel = [0.] * half_ks + [1.] + [0.] * half_ks
        kernel_window = gaussian_filter1d(base_kernel, sigma=sigma) / max(gaussian_filter1d(base_kernel, sigma=sigma))
    elif kernel == 'triang':
        kernel_window = triang(ks)
    else:
        laplace = lambda x: np.exp(-abs(x) / sigma) / (2. * sigma)
        kernel_window = list(map(laplace, np.arange(-half_ks, half_ks + 1))) / max(map(laplace, np.arange(-half_ks, half_ks + 1)))

    return kernel_window

lds_kernel_window = get_lds_kernel_window()

# Apply the convolution to get the effective label distribution
eff_label_dist = convolve1d(np.array(emp_label_dist), weights=lds_kernel_window, mode='constant')

eff_label_dist

array([15561, 18564, 21643, 21356, 30848, 42826, 54992, 47825, 29869,
       10654])

In [71]:
bins = np.linspace(0, 100, num=11)  # Adjust based on how 'popularity' was binned
data['bin_index'] = np.digitize(data['popularity'], bins) - 1
data['bin_index'] = data['bin_index'].clip(0, 10-1)   # Assign bins  # total number of samples in your dataset
total_samples = len(data)  # total number of samples in your dataset
weights = 1 / eff_label_dist  # inversely proportional to distribution
weights_normalized = weights / weights.sum() * total_samples
data['weight'] = data['bin_index'].map(lambda x: weights_normalized[x])
data

Unnamed: 0,popularity,duration_ms,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,bin_index,weight
0,63.0,178436.0,0.500292,0.195348,0.000,0.034719,0.823614,0.001574,0.113995,0.191214,138.280590,6,4303.759788
1,62.0,179176.0,0.501938,0.194890,0.000,0.036954,0.816710,0.002938,0.111788,0.189084,136.877789,6,4303.759788
2,62.0,181263.0,0.497959,0.198958,0.000,0.032760,0.822114,0.002681,0.113876,0.183350,136.125354,6,4303.759788
3,63.0,385072.0,0.080571,0.026935,0.000,0.041978,0.941882,0.932783,0.075154,0.014550,190.492789,6,4303.759788
4,63.0,384702.0,0.076992,0.028117,0.000,0.041985,0.929955,0.925046,0.071573,0.023929,191.595578,6,4303.759788
...,...,...,...,...,...,...,...,...,...,...,...,...,...
102560,22.0,369049.0,0.579000,0.245000,-16.357,0.038400,0.970000,0.924000,0.101000,0.302000,112.011000,2,10935.284308
102561,38.0,312566.0,0.475000,0.860000,-4.722,0.042100,0.006500,0.000002,0.246000,0.427000,113.949000,3,11082.241912
102562,39.0,256026.0,0.505000,0.687000,-4.375,0.028700,0.084100,0.000000,0.188000,0.382000,104.083000,3,11082.241912
102563,22.0,305454.0,0.331000,0.171000,-15.668,0.035000,0.920000,0.022900,0.067900,0.327000,132.147000,2,10935.284308


In [91]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Normalize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(data.drop(['popularity', 'bin_index', 'weight'], axis=1))
y = data['popularity']

# Split data, including weights
X_train, X_test, y_train, y_test, weights_train, weights_test = train_test_split(
    X_scaled, y, data['weight'], test_size=0.2, random_state=42
)
X_train_torch = torch.tensor(X_train, dtype=torch.float32)
y_train_torch = torch.tensor(y_train, dtype=torch.float32)
X_test_torch = torch.tensor(X_test, dtype=torch.float32)
y_test_numpy = y_test.to_numpy()
y_test_torch = torch.tensor(y_test_numpy, dtype=torch.float32)

In [92]:
fds = FDS(feature_dim=X_train_torch.shape[1],start_smooth=2)
X_train_smoothed_torch = fds.smooth(X_train_torch, y_train_torch, epoch=1)
X_train_smoothed = X_train_smoothed_torch.numpy()

regressor = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100)
regressor.fit(X_train_smoothed, y_train, sample_weight=weights_train)
X_test_smoothed_torch = fds.smooth(X_test_torch, y_test_numpy, epoch=1)  # Same epoch as above
X_test_smoothed = X_test_smoothed_torch.numpy()

In [93]:
y_pred = regressor.predict(X_test_smoothed)
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
r2 = r2_score(y_test, y_pred)
mse, rmse, r2

(293.51415528595834, 17.13225482200047, 0.5183198362144714)