In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

(Based on the RateYourMusic Top 5000 dataset.)

**Loading data**

In [None]:
# Read the chart CSV keyed by chart position, then discard the leftover
# 'Unnamed: 0' column.
df = (
    pd.read_csv('/kaggle/input/rym-top-5000/rym_clean1.csv', index_col='position')
    .drop(columns='Unnamed: 0')
)

In [None]:
# Show the freshly loaded frame as this cell's output.
df

**Creating genres and descriptors features**

In [None]:
def _split_tags(raw):
    """Turn one comma-separated tag cell into a set of stripped tag strings.

    Missing values become an empty set rather than the literal tag 'nan',
    and empty fragments (e.g. from a trailing comma) are dropped.  A value
    that is already a set is only re-stripped, so re-running this cell is
    harmless.
    """
    if isinstance(raw, (set, frozenset)):
        pieces = raw
    elif pd.isna(raw):
        return set()
    else:
        pieces = str(raw).split(',')
    return {piece.strip() for piece in pieces if piece.strip()}


# Convert each tag column in a single vectorised pass.  Assigning whole
# columns avoids chained assignment (df.col[i] = ...), which does not write
# through under pandas copy-on-write, and avoids assuming that the index is
# exactly 1..5000.
for tag_column in ('primary_genres', 'secondary_genres', 'descriptors'):
    df[tag_column] = df[tag_column].apply(_split_tags)

In [None]:
# Every distinct genre that appears in either genre column, over all rows.
# Iterating the columns directly (instead of looking up labels 1..5000)
# keeps this correct for any number of rows and any index.
genres = {
    tag.strip()
    for tag_column in ('primary_genres', 'secondary_genres')
    for tags in df[tag_column]
    for tag in tags
}

# Every distinct descriptor, over all rows.
desc = {tag.strip() for tags in df['descriptors'] for tag in tags}

In [None]:
# Build every 0/1 indicator column first and attach them to `df` in a single
# concat.  Inserting columns one at a time fragments the frame and makes
# pandas emit a PerformanceWarning once there are many of them.
indicator_columns = {}

pr_col_list = []
sec_col_list = []
for genre in genres:
    pr_name = genre + '_pr'
    sec_name = genre + '_sec'
    # 1 when the genre is among the row's primary / secondary genres, else 0.
    indicator_columns[pr_name] = df['primary_genres'].map(lambda tags: int(genre in tags))
    pr_col_list.append(pr_name)
    indicator_columns[sec_name] = df['secondary_genres'].map(lambda tags: int(genre in tags))
    sec_col_list.append(sec_name)

des_col_list = []
for des in desc:
    des_name = des + '_desc'
    # 1 when the descriptor is among the row's descriptors, else 0.
    indicator_columns[des_name] = df['descriptors'].map(lambda tags: int(des in tags))
    des_col_list.append(des_name)

# Drop any indicator columns left over from a previous run of this cell so
# that re-running does not produce duplicate column names.
df = pd.concat(
    [
        df.drop(columns=list(indicator_columns), errors='ignore'),
        pd.DataFrame(indicator_columns, index=df.index),
    ],
    axis=1,
)

**Creating year and month features**

In [None]:
# Parse the release dates once, then derive the calendar year and month
# as separate feature columns.
release_dates = pd.to_datetime(df['release_date'])
df['release_date'] = release_dates
df['year'] = release_dates.dt.year
df['month'] = release_dates.dt.month

In [None]:
# Show the frame again, now with the indicator and year/month columns added.
df

**Top genres and descriptors**

In [None]:
def _column_totals(columns):
    """Sum the given indicator columns of `df`, largest total first."""
    return df[columns].sum().sort_values(ascending=False)


# For each feature family keep the ranked totals and the names of the
# 100 most frequent columns.
df_pr = _column_totals(pr_col_list)
top_pr = df_pr.index[:100].tolist()

df_sec = _column_totals(sec_col_list)
top_sec = df_sec.index[:100].tolist()

df_des = _column_totals(des_col_list)
top_des = df_des.index[:100].tolist()

**Defining target and train data**

In [None]:
# Columns that are identifiers, raw text, or outcome measures and must not
# be fed to the model as features.
non_feature_columns = [
    'release_name', 'artist_name', 'release_date', 'release_type',
    'primary_genres', 'secondary_genres', 'descriptors',
    'avg_rating', 'rating_count', 'review_count',
]

# The value to predict.  Alternative target: df['rating_count'].
target = df['avg_rating']

# From here on `df` holds the feature matrix only; `col_list` records its
# column layout.
df = df.drop(columns=non_feature_columns)
col_list = df.columns

**Defining model**

In [None]:
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

import catboost

# Training budget and logging cadence.
MAX_ITER = 5000
PATIENCE = 100
DISPLAY_FREQ = 100

# Keyword arguments forwarded to CatBoostRegressor.
# NOTE(review): early_stopping_rounds is set, but fit() below receives no
# eval_set -- presumably there is nothing for early stopping to monitor;
# confirm against the CatBoost documentation.
MODEL_PARAMS = dict(
    random_seed=1234,
    learning_rate=0.1,
    iterations=MAX_ITER,
    early_stopping_rounds=PATIENCE,
    metric_period=DISPLAY_FREQ,
    # use_best_model=True,
    eval_metric='RMSE',
    # task_type='GPU',
)

# Alternatives tried in place of CatBoost: Ridge(), RandomForestRegressor().
model = catboost.CatBoostRegressor(**MODEL_PARAMS)

# Train on the full feature matrix against the chosen target.
model.fit(df, target)

In [None]:
def pr_rating(genres, year = 2022, month = 10):
    """Predict the target for a hypothetical release with the given features on.

    A single feature row is laid out like ``col_list``: every column is 0
    except ``year`` and ``month`` (set from the arguments) and the columns
    named in ``genres`` (set to 1).  The row is passed to ``model.predict``.

    Parameters
    ----------
    genres : iterable of str
        Full feature column names to switch on, i.e. names as they appear
        in ``col_list`` (callers in this notebook pass the suffixed
        ``*_pr`` / ``*_sec`` / ``*_desc`` names).
    year, month : int
        Values written to the ``year`` and ``month`` columns.

    Returns
    -------
    Whatever ``model.predict`` returns for the one-row frame.

    Raises
    ------
    KeyError
        If a requested name is not a column of ``col_list``.  Without this
        check an unknown name would silently add an extra column to the
        row, so the model would see a layout it was not trained on.
    """
    requested = list(genres)
    known_columns = set(col_list)
    unknown = [name for name in requested if name not in known_columns]
    if unknown:
        raise KeyError(f"unknown feature column(s): {unknown}")

    # Build the row as a mapping first so the frame is created in one go.
    row = dict.fromkeys(col_list, 0.0)
    row['year'] = year
    row['month'] = month
    for name in requested:
        row[name] = 1

    # columns=col_list fixes the column order to match the training layout.
    pr_data = pd.DataFrame([row], columns=col_list)
    prediction = model.predict(pr_data)
    return prediction

def pr_rating_sort(genre):
    """Sort key: the prediction for a release with only `genre` switched on."""
    single_feature = [genre]
    return pr_rating(single_feature)


In [None]:
# pr_col_list.sort(reverse=True, key=pr_rating_sort)
# sec_col_list.sort(reverse=True, key=pr_rating_sort)
# des_col_list.sort(reverse=True, key=pr_rating_sort)

# Re-order each shortlist in place, highest single-feature prediction first.
for shortlist in (top_pr, top_sec, top_des):
    shortlist.sort(key=pr_rating_sort, reverse=True)

In [None]:
# Predict every combination of the ten best primary genres, secondary genres
# and descriptors.  A comprehension keeps the loop variables local; the
# previous loop variable `desc` overwrote the notebook-level `desc` set, so
# re-running the feature-building cell afterwards would have iterated over
# the characters of a string.
pred = {
    (genre1, genre2, descriptor): pr_rating([genre1, genre2, descriptor])
    for genre1 in top_pr[:10]
    for genre2 in top_sec[:10]
    for descriptor in top_des[:10]
}
        

In [None]:
# Tabulate the predictions: `pred` is keyed by (primary, secondary, descriptor)
# tuples, each mapped to the value returned by pr_rating for that combination.
pred_df = pd.DataFrame(pred)

**Most successful/popular music today :)**

In [None]:
# One row per combination, best predicted rating first; show the top 20.
(
    pred_df.T
    .rename(columns={0: 'rating'})
    .sort_values(by='rating', ascending=False)
    .head(20)
)

In [None]:
# Number of rows with a non-null month in each release year, as a line plot.
df.groupby('year')['month'].count().plot()

In [None]:
# Number of rows with a non-null year in each release month, as a line plot.
df.groupby('month')['year'].count().plot()