In [1]:
import pandas as pd
data = pd.read_csv('./data/data_smogn_new.csv')

In [3]:
from collections import Counter
from scipy.ndimage import convolve1d
from scipy.ndimage import gaussian_filter1d
import numpy as np

# Define the binning function for 'popularity'
def get_bin_idx(label, num_bins=10):
    bin_width = 100 / num_bins
    bin_idx = int(label // bin_width)
    return min(bin_idx, num_bins - 1)  # Ensure it does not go out of bounds

# Binning the 'popularity' scores
bin_index_per_label = [get_bin_idx(label) for label in data['popularity']]

# Calculate the number of bins and empirical label distribution
Nb = max(bin_index_per_label) + 1
num_samples_of_bins = dict(Counter(bin_index_per_label))
emp_label_dist = [num_samples_of_bins.get(i, 0) for i in range(Nb)]

# Define and get the LDS kernel window
def get_lds_kernel_window(kernel='gaussian', ks=5, sigma=2):
    assert kernel in ['gaussian', 'triang', 'laplace']
    half_ks = (ks - 1) // 2
    if kernel == 'gaussian':
        base_kernel = [0.] * half_ks + [1.] + [0.] * half_ks
        kernel_window = gaussian_filter1d(base_kernel, sigma=sigma) / max(gaussian_filter1d(base_kernel, sigma=sigma))
    elif kernel == 'triang':
        kernel_window = triang(ks)
    else:
        laplace = lambda x: np.exp(-abs(x) / sigma) / (2. * sigma)
        kernel_window = list(map(laplace, np.arange(-half_ks, half_ks + 1))) / max(map(laplace, np.arange(-half_ks, half_ks + 1)))

    return kernel_window

lds_kernel_window = get_lds_kernel_window()

# Apply the convolution to get the effective label distribution
eff_label_dist = convolve1d(np.array(emp_label_dist), weights=lds_kernel_window, mode='constant')

eff_label_dist

array([21054, 27128, 34437, 31605, 55869, 68530, 69674, 61900, 52785,
       22825])

In [4]:
bins = np.linspace(0, 100, num=11)  # Adjust based on how 'popularity' was binned
data['bin_index'] = np.digitize(data['popularity'], bins) - 1
data['bin_index'] = data['bin_index'].clip(0, 10-1)   # Assign bins  # total number of samples in your dataset
total_samples = len(data)  # total number of samples in your dataset
weights = 1 / eff_label_dist  # inversely proportional to distribution
weights_normalized = weights / weights.sum() * total_samples
data['weight'] = data['bin_index'].map(lambda x: weights_normalized[x])
data

Unnamed: 0.1,Unnamed: 0,track_id,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre,bin_index,weight
0,97826.0,6oF8ueLn5hIl4PRp17sxW6,63.0,178436.0,False,0.500292,0.195348,1.0,0.000,1.0,0.034719,0.823614,0.001574,0.113995,0.191214,138.280590,4.0,show-tunes,6,5436.134904
1,97329.0,6oF8ueLn5hIl4PRp17sxW6,62.0,179176.0,False,0.501938,0.194890,2.0,0.000,1.0,0.036954,0.816710,0.002938,0.111788,0.189084,136.877789,3.0,show-tunes,6,5436.134904
2,98678.0,6oF8ueLn5hIl4PRp17sxW6,62.0,181263.0,False,0.497959,0.198958,2.0,0.000,1.0,0.032760,0.822114,0.002681,0.113876,0.183350,136.125354,3.0,show-tunes,6,5436.134904
3,38535.0,6oF8ueLn5hIl4PRp17sxW6,63.0,385072.0,False,0.080571,0.026935,4.0,0.000,1.0,0.041978,0.941882,0.932783,0.075154,0.014550,190.492789,2.0,show-tunes,6,5436.134904
4,38935.0,6oF8ueLn5hIl4PRp17sxW6,63.0,384702.0,False,0.076992,0.028117,5.0,0.000,1.0,0.041985,0.929955,0.925046,0.071573,0.023929,191.595578,3.0,show-tunes,6,5436.134904
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
102560,113990.0,2A4dSiJmbviL56CBupkh6C,22.0,369049.0,False,0.579000,0.245000,4.0,-16.357,1.0,0.038400,0.970000,0.924000,0.101000,0.302000,112.011000,3.0,world-music,2,10998.555720
102561,113992.0,3FjOBB4EyIXHYUtSgrIdY9,38.0,312566.0,False,0.475000,0.860000,10.0,-4.722,1.0,0.042100,0.006500,0.000002,0.246000,0.427000,113.949000,4.0,world-music,3,11984.093128
102562,113993.0,4OkMK49i3NApR1KsAIsTf6,39.0,256026.0,False,0.505000,0.687000,10.0,-4.375,1.0,0.028700,0.084100,0.000000,0.188000,0.382000,104.083000,3.0,world-music,3,11984.093128
102563,113994.0,4WbOUe6T0sozC7z5ZJgiAA,22.0,305454.0,False,0.331000,0.171000,1.0,-15.668,1.0,0.035000,0.920000,0.022900,0.067900,0.327000,132.147000,3.0,world-music,2,10998.555720


In [16]:
!pip install scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.4.2-cp310-cp310-win_amd64.whl.metadata (11 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Downloading joblib-1.4.0-py3-none-any.whl.metadata (5.4 kB)
Collecting threadpoolctl>=2.0.0 (from scikit-learn)
  Downloading threadpoolctl-3.4.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.4.2-cp310-cp310-win_amd64.whl (10.6 MB)
   ---------------------------------------- 0.0/10.6 MB ? eta -:--:--
   ------ --------------------------------- 1.7/10.6 MB 35.5 MB/s eta 0:00:01
   ------------- -------------------------- 3.5/10.6 MB 37.4 MB/s eta 0:00:01
   --------------------- ------------------ 5.8/10.6 MB 40.9 MB/s eta 0:00:01
   ----------------------------- ---------- 7.7/10.6 MB 41.1 MB/s eta 0:00:01
   ---------------------------------- ----- 9.2/10.6 MB 39.5 MB/s eta 0:00:01
   ------------------------------------ --- 9.7/10.6 MB 34.7 MB/s eta 0:00:01
   ------------------------------------- -- 10.0/10.6 MB 31.9 MB/s e

In [5]:
categorical = ['Unnamed: 0','track_id','track_genre','explicit', 'mode', 'key', 'time_signature']
data = data.drop(categorical, axis=1)
data

Unnamed: 0,popularity,duration_ms,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,bin_index,weight
0,63.0,178436.0,0.500292,0.195348,0.000,0.034719,0.823614,0.001574,0.113995,0.191214,138.280590,6,5436.134904
1,62.0,179176.0,0.501938,0.194890,0.000,0.036954,0.816710,0.002938,0.111788,0.189084,136.877789,6,5436.134904
2,62.0,181263.0,0.497959,0.198958,0.000,0.032760,0.822114,0.002681,0.113876,0.183350,136.125354,6,5436.134904
3,63.0,385072.0,0.080571,0.026935,0.000,0.041978,0.941882,0.932783,0.075154,0.014550,190.492789,6,5436.134904
4,63.0,384702.0,0.076992,0.028117,0.000,0.041985,0.929955,0.925046,0.071573,0.023929,191.595578,6,5436.134904
...,...,...,...,...,...,...,...,...,...,...,...,...,...
102560,22.0,369049.0,0.579000,0.245000,-16.357,0.038400,0.970000,0.924000,0.101000,0.302000,112.011000,2,10998.555720
102561,38.0,312566.0,0.475000,0.860000,-4.722,0.042100,0.006500,0.000002,0.246000,0.427000,113.949000,3,11984.093128
102562,39.0,256026.0,0.505000,0.687000,-4.375,0.028700,0.084100,0.000000,0.188000,0.382000,104.083000,3,11984.093128
102563,22.0,305454.0,0.331000,0.171000,-15.668,0.035000,0.920000,0.022900,0.067900,0.327000,132.147000,2,10998.555720


In [6]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Normalize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(data.drop(['popularity', 'bin_index', 'weight'], axis=1))
y = data['popularity']

# Split data, including weights
X_train, X_test, y_train, y_test, weights_train, weights_test = train_test_split(
    X_scaled, y, data['weight'], test_size=0.2, random_state=42
)

In [21]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-2.0.3-py3-none-win_amd64.whl.metadata (2.0 kB)
Downloading xgboost-2.0.3-py3-none-win_amd64.whl (99.8 MB)
   ---------------------------------------- 0.0/99.8 MB ? eta -:--:--
   ---------------------------------------- 0.1/99.8 MB 3.2 MB/s eta 0:00:32
   ---------------------------------------- 0.4/99.8 MB 6.5 MB/s eta 0:00:16
   ---------------------------------------- 0.9/99.8 MB 7.7 MB/s eta 0:00:13
    --------------------------------------- 1.3/99.8 MB 8.3 MB/s eta 0:00:12
    --------------------------------------- 1.9/99.8 MB 9.4 MB/s eta 0:00:11
    --------------------------------------- 2.4/99.8 MB 9.4 MB/s eta 0:00:11
   - -------------------------------------- 2.8/99.8 MB 9.3 MB/s eta 0:00:11
   - -------------------------------------- 3.3/99.8 MB 9.5 MB/s eta 0:00:11
   - -------------------------------------- 3.7/99.8 MB 9.3 MB/s eta 0:00:11
   - -------------------------------------- 3.9/99.8 MB 9.3 MB/s eta 0:00:11
   - --------

In [7]:
import xgboost as xgb

# Initialize the XGBoost regressor with weights
regressor = xgb.XGBRegressor(
    n_estimators=100,
    objective='reg:squarederror',
    random_state=42
)

# Train with weights
regressor.fit(X_train, y_train, sample_weight=weights_train)

In [8]:
from sklearn.metrics import mean_squared_error, r2_score

y_pred = regressor.predict(X_test)
# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
r2 = r2_score(y_test, y_pred)

mse, rmse, r2

(294.9086890497437, 17.172905667060064, 0.5160312949647653)