In [140]:
import pandas as pd
data = pd.read_csv('./data/dataset.csv')
print(data.shape)

(114000, 20)


In [43]:
# LDS 
import pandas as pd
from collections import Counter
from scipy.ndimage import convolve1d
from scipy.ndimage import gaussian_filter1d
from scipy.signal.windows import triang
import numpy as np
data = pd.read_csv('./data/data_smogn_05.csv')
# unstructured = ['track_id', 'artists', 'album_name', 'track_name', 'track_genre']
# categorical = ['explicit', 'mode', 'key', 'time_signature']
# data = data.drop(unstructured,axis = 1)
# data = data.drop(categorical,axis=1)
# Define the binning function for 'popularity'
def get_bin_idx(label, num_bins=10):
    bin_width = 100 / num_bins
    bin_idx = int(label // bin_width)
    return min(bin_idx, num_bins - 1)  # Ensure it does not go out of bounds

# Binning the 'popularity' scores
bin_index_per_label = [get_bin_idx(label) for label in data['popularity']]

# Calculate the number of bins and empirical label distribution
Nb = max(bin_index_per_label) + 1
num_samples_of_bins = dict(Counter(bin_index_per_label))
emp_label_dist = [num_samples_of_bins.get(i, 0) for i in range(Nb)]

# Define and get the LDS kernel window
def get_lds_kernel_window(kernel='triang', ks=5, sigma=2):
    assert kernel in ['gaussian', 'triang', 'laplace']
    half_ks = (ks - 1) // 2
    if kernel == 'gaussian':
        base_kernel = [0.] * half_ks + [1.] + [0.] * half_ks
        kernel_window = gaussian_filter1d(base_kernel, sigma=sigma) / max(gaussian_filter1d(base_kernel, sigma=sigma))
    elif kernel == 'triang':
        kernel_window = triang(ks)
    else:
        laplace = lambda x: np.exp(-abs(x) / sigma) / (2. * sigma)
        kernel_window = list(map(laplace, np.arange(-half_ks, half_ks + 1))) / max(map(laplace, np.arange(-half_ks, half_ks + 1)))

    return kernel_window

lds_kernel_window = get_lds_kernel_window()

# Apply the convolution to get the effective label distribution
eff_label_dist = convolve1d(np.array(emp_label_dist), weights=lds_kernel_window, mode='constant')

eff_label_dist

array([ 8350,  6413,  3892,  1143,   795,   709,   683,   695,   747,
         848,   983,  1064,  1059,   961,   917,  1008,  1272,  1607,
        2014,  2434,  2850,  3093,  3111,  2893,  2615,  2359,  2214,
        2092,  2005,  1896,  1818,  1769,  1787,  1848,  1931,  2041,
        2181,  2348,  2483,  2579,  2632,  2689,  2762,  2835,  2865,
        2799,  2684,  2548,  2435,  2332,  2218,  2100,  2001,  1946,
        1959,  1990,  2040,  2035,  1981,  1847,  1699,  1565,  1480,
        1411,  1338,  1906,  3446,  6223,  9052, 11269, 12470, 12751,
       12441, 11415, 10274,  8791,  7318,  5681,  4377,  3247,  2541,
        1932,  1592,  1207,   894,   597,   413,   304,   231,   168,
         113,    68,    47,    40,    42,    37,    29,  4748,  9469,
       14189])

In [44]:
bins = np.linspace(0, 100, num=11)  # Adjust based on how 'popularity' was binned
data['bin_index'] = np.digitize(data['popularity'], bins) - 1
data['bin_index'] = data['bin_index'].clip(0, 10-1)   # Assign bins  # total number of samples in your dataset
total_samples = len(data)  # total number of samples in your dataset
weights = 1 / eff_label_dist  # inversely proportional to distribution
weights_normalized = weights / weights.sum() * total_samples
data['weight'] = data['bin_index'].map(lambda x: weights_normalized[x])
data

Unnamed: 0,popularity,duration_ms,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,bin_index,weight
0,67.0,351413.0,0.437119,0.496777,0.000,0.038004,0.621056,0.055218,0.077442,0.329562,127.922188,67,78.331965
1,68.0,351783.0,0.427469,0.615662,0.000,0.028970,0.348779,0.178368,0.102032,0.201814,92.113026,68,53.851063
2,68.0,351661.0,0.509178,0.543150,0.000,0.027034,0.234267,0.110648,0.084248,0.328488,114.429474,68,53.851063
3,67.0,351394.0,0.437674,0.490773,0.000,0.038458,0.635044,0.048935,0.076265,0.335983,129.716280,67,78.331965
4,68.0,351567.0,0.572096,0.487288,0.000,0.025623,0.153533,0.057433,0.071576,0.424923,131.394058,68,53.851063
...,...,...,...,...,...,...,...,...,...,...,...,...,...
106364,40.0,246306.0,0.470000,0.938000,-4.722,0.105000,0.000529,0.000000,0.251000,0.453000,128.002000,40,185.205099
106365,38.0,312566.0,0.475000,0.860000,-4.722,0.042100,0.006500,0.000002,0.246000,0.427000,113.949000,38,196.318896
106366,21.0,384999.0,0.172000,0.235000,-16.393,0.042200,0.640000,0.928000,0.086300,0.033900,125.995000,21,157.600976
106367,41.0,283893.0,0.587000,0.506000,-10.889,0.029700,0.381000,0.000000,0.270000,0.413000,135.960000,41,181.279219


In [45]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Normalize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(data.drop(['popularity', 'bin_index', 'weight'], axis=1))
X = data.drop(['popularity','bin_index','weight'],axis = 1)
y = data['popularity']

# Split data, including weights
X_train, X_test, y_train, y_test, weights_train, weights_test = train_test_split(
    X_scaled, y, data['weight'], test_size=0.2, 
)

In [46]:
import xgboost as xgb

# Initialize the XGBoost regressor with weights
regressorLDS = xgb.XGBRegressor(
    n_estimators=100,
    objective='reg:squarederror',
    random_state=42
)

# Train with weights
regressorLDS.fit(X_train, y_train, sample_weight=weights_train)

regressor = xgb.XGBRegressor(
    n_estimators=100,
    objective='reg:squarederror',
    random_state=42
)

# Train with weights
regressor.fit(X_train, y_train, )


In [49]:
from sklearn.metrics import mean_squared_error, r2_score

y_pred = regressorLDS.predict(X_test)
mse = mean_squared_error(y_test, y_pred, sample_weight=weights_test)
rmse = mse ** 0.5
r2 = r2_score(y_test, y_pred, sample_weight=weights_test)

mse, rmse, r2

(341.12120181294745, 18.469466744141464, 0.6111815460203474)

In [48]:
from sklearn.metrics import mean_squared_error, r2_score

y_pred = regressor.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
r2 = r2_score(y_test, y_pred)

mse, rmse, r2

(320.32690047439564, 17.897678633677486, 0.6900472628487972)