In [1]:
# https://platform.olimpiada-ai.ro/problems/45

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.auto import tqdm

In [2]:
# Read both competition splits from the Kaggle input mount.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/song-popularity-prediction/train.csv",
        "/kaggle/input/song-popularity-prediction/test.csv",
    )
)

In [3]:
def process_df(df):
    """Return a feature-engineered copy of a raw track DataFrame.

    The input frame is left untouched; all changes are made on a copy so the
    cell can be re-run safely and the raw data stays available.

    Transformations applied:
      * ``artists``, ``album_name``, ``track_name``: missing values become
        ``''`` and every value is coerced to ``str``.
      * ``artists``: each ``';'`` is replaced by ``' \\n '``.
      * ``explicit``: cast to ``int``.
      * ``key``, ``mode``, ``time_signature``: string copies are added as
        ``<name>_cat`` columns.
      * ``speechiness``, ``acousticness``, ``instrumentalness``:
        ``log10(x * 100 + 1)`` is added as ``<name>_log`` columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing all of the columns named above.

    Returns
    -------
    pandas.DataFrame
        A new frame with the original columns (cleaned) followed by the six
        derived columns, in the order listed above.
    """
    out = df.copy()

    # Coerce to str after filling gaps so a stray non-string value cannot
    # break the string replacement below.
    for col in ['artists', 'album_name', 'track_name']:
        out[col] = out[col].fillna('').astype(str)

    # Literal (non-regex) replacement of the artist separator.
    out['artists'] = out['artists'].str.replace(';', ' \n ', regex=False)
    out['explicit'] = out['explicit'].astype(int)

    # String-typed duplicates of the discrete musical attributes.
    for col in ['key', 'mode', 'time_signature']:
        out[f'{col}_cat'] = out[col].astype(str)

    # Log-scaled versions of the listed numeric columns.
    for col in ['speechiness', 'acousticness', 'instrumentalness']:
        out[f'{col}_log'] = np.log10(out[col] * 100 + 1)

    return out

# Run the same feature engineering over both splits.
train, test = (process_df(frame) for frame in (train, test))

In [4]:
# Display the column index of the processed training frame.
train.columns

Index(['track_id', 'artists', 'album_name', 'track_name', 'popularity',
       'duration_ms', 'explicit', 'danceability', 'energy', 'key', 'loudness',
       'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness',
       'valence', 'tempo', 'time_signature', 'track_genre', 'key_cat',
       'mode_cat', 'time_signature_cat', 'speechiness_log', 'acousticness_log',
       'instrumentalness_log'],
      dtype='object')

In [5]:
from catboost import Pool
from sklearn.model_selection import train_test_split

# Column roles handed to CatBoost: free-text columns and categorical columns.
text_features = ['artists', 'album_name', 'track_name']
cat_features = ['mode_cat', 'key_cat', 'time_signature_cat', 'track_genre']

# Every column except the identifier and the target is used as a model input.
features = [
    column
    for column in train.columns
    if column not in ['track_id', 'popularity']
]

X = train[features]
y = train['popularity']
X_test = test[features]

# Hold out 10% of the training rows for validation.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.1, random_state=42
)

# Wrap each split in a Pool that carries the column-role declarations.
train_pool, valid_pool = (
    Pool(part, target, cat_features=cat_features, text_features=text_features)
    for part, target in ((X_train, y_train), (X_valid, y_valid))
)

In [9]:
from catboost import CatBoostRegressor

# Training configuration: MAE is both the optimised loss and the reported
# metric; metrics are printed once every `metric_period` iterations.
params = dict(
    iterations=100000,
    learning_rate=1,
    loss_function='MAE',
    eval_metric='MAE',
    metric_period=10000,
    max_depth=4,
    random_state=42,
    task_type='GPU',
)

model = CatBoostRegressor(**params)

# Fit on the training pool while tracking the metric on the validation pool.
model.fit(train_pool, eval_set=valid_pool)

0:	learn: 18.7173148	test: 18.6002449	best: 18.6002449 (0)	total: 9.82ms	remaining: 16m 22s
10000:	learn: 8.0567333	test: 8.1755382	best: 8.1755382 (10000)	total: 55.8s	remaining: 8m 21s
20000:	learn: 7.1921987	test: 7.6473456	best: 7.6473456 (20000)	total: 1m 51s	remaining: 7m 26s
30000:	learn: 6.6631685	test: 7.3994390	best: 7.3994390 (30000)	total: 2m 48s	remaining: 6m 33s
40000:	learn: 6.2711611	test: 7.2373881	best: 7.2373881 (40000)	total: 3m 46s	remaining: 5m 38s
50000:	learn: 5.9719660	test: 7.1320459	best: 7.1320459 (50000)	total: 4m 43s	remaining: 4m 43s
60000:	learn: 5.7335855	test: 7.0558589	best: 7.0558589 (60000)	total: 5m 41s	remaining: 3m 47s
70000:	learn: 5.5415777	test: 7.0010855	best: 7.0010855 (70000)	total: 6m 38s	remaining: 2m 50s
80000:	learn: 5.3789185	test: 6.9536143	best: 6.9536143 (80000)	total: 7m 36s	remaining: 1m 54s
90000:	learn: 5.2456794	test: 6.9216927	best: 6.9216927 (90000)	total: 8m 34s	remaining: 57.1s
99999:	learn: 5.1286237	test: 6.8937939	best: 

<catboost.core.CatBoostRegressor at 0x79825d86c2d0>

In [11]:
from sklearn.metrics import mean_absolute_error

# Mean absolute error of the fitted model on the held-out validation rows.
y_pred = model.predict(X_valid).flatten()
score = mean_absolute_error(y_true=y_valid, y_pred=y_pred)

print(f"Score: {score:.5f}")

Score: 6.89378


In [13]:
# Predict popularity for the test rows.
y_pred = model.predict(X_test).flatten()

# Two-column submission frame: track identifier plus predicted popularity.
subm = test[['track_id']].assign(popularity=y_pred)

subm.head()

Unnamed: 0,track_id,popularity
0,4zI0WV1Qz7c7BJfQWVFBNO,14.646352
1,7Meqm9jgQXhh08vdnbrvfY,35.894475
2,4JnX8tY9me7jHqDwgifhwU,56.103065
3,7me5d5XmtCCfbDk3SsyXqM,51.493937
4,30Hm5tUVPVoJzGIEBchOT3,10.550152


In [14]:
# Write the submission file without the DataFrame index column.
subm.to_csv(path_or_buf="submission.csv", index=False)