In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor

In [2]:
# Artist table; its 'genres' and 'stars' columns feed the model further down.
artists_df = pd.read_csv(
    '../output/artists_with_genres_single_stars.csv'
)

In [3]:
# Remove the unnamed leading column picked up by read_csv (presumably an index
# that was saved together with the data -- confirm against the file).
# errors='ignore' keeps this cell safe to re-run: executing it a second time
# no longer raises KeyError once the column is already gone.
artists_df = artists_df.drop(['Unnamed: 0'], axis=1, errors='ignore')

In [4]:
# Sorted vocabulary of the distinct genre labels; a label's position in this
# list is the integer code stored in 'genre_num'.
genres_list = list(pd.unique(artists_df['genres']))
genres_list.sort()
def genres_num_gen(row, genres_list=genres_list):
    """Return the position of ``row['genres']`` inside ``genres_list``.

    ``row`` is one row of ``artists_df`` (as passed by ``apply(axis=1)``).
    Raises ValueError if the genre is not present in ``genres_list``.
    """
    return genres_list.index(row['genres'])
# Row-wise encoding of every artist's genre.
artists_df['genre_num'] = artists_df.apply(genres_num_gen, axis=1)

Merging user_profiles with train and artists

In [5]:
# Input tables: the user profiles (sex, age, country) and the training rows.
profiles = pd.read_csv('../data/profiles.csv')
train = pd.read_csv('../data/train.csv')

In [6]:
# Sorted vocabularies of the distinct country / sex values; the position of a
# value in its list is the integer code produced by country_num_gen and
# gender_num_gen.
countries_list = sorted(list(pd.unique(profiles['country'])))
genders = sorted(list(pd.unique(profiles['sex'])))
# Mean age used by age_num_gen to replace missing or implausible ages.
# Only ages inside the [0, 150] range that age_num_gen accepts are averaged,
# so any out-of-range values being replaced cannot skew the replacement
# itself. (Missing ages never satisfy between(), so they are excluded too.)
age_mean = np.mean(profiles['age'][profiles['age'].between(0, 150)])
def country_num_gen(row, countries_list=countries_list):
    """Return the position of ``row['country']`` inside ``countries_list``.

    Raises ValueError if the country is not present in ``countries_list``.
    """
    return countries_list.index(row['country'])
def gender_num_gen(row, genders=genders):
    """Return the position of ``row['sex']`` inside ``genders``.

    Raises ValueError if the value is not present in ``genders``.
    """
    return genders.index(row['sex'])
def age_num_gen(row, age_mean=age_mean):
    """Return ``row['age']`` divided by the 150-year upper bound.

    Ages that are missing, below 0 or above 150 are replaced by ``age_mean``
    before the division.

    Parameters
    ----------
    row : mapping with an ``'age'`` entry (one row of ``profiles``).
    age_mean : replacement value for missing / out-of-range ages.
    """
    lower_bound = 0
    upper_bound = 150
    age = row['age']
    # Check for a missing value first and explicitly: comparisons against NaN
    # are always False, so the range test alone would let NaN through.
    if pd.isnull(age) or age < lower_bound or age > upper_bound:
        age = age_mean
    return age/upper_bound
    
# Add one encoded column per raw attribute, in this fixed order, by running
# the matching row-wise encoder over the whole profiles table.
for _column, _encoder in (('country_num', country_num_gen),
                          ('gender_num', gender_num_gen),
                          ('age_num', age_num_gen)):
    profiles[_column] = profiles.apply(_encoder, axis=1)

In [7]:
# Attach the user attributes, then the artist attributes, to each training
# row. Both joins are inner joins, so rows whose user or artist has no match
# are dropped.
train_merge = (
    train
    .merge(profiles, on='user', how='inner')
    .merge(artists_df, on='artist', how='inner')
)

In [8]:
# Peek at the first five merged rows to check the joined and encoded columns.
train_merge.head(n=5)

Unnamed: 0,user,artist,plays,sex,age,country,country_num,gender_num,age_num,name,genres,stars,genre_num
0,eb1c57ddc9e0e2d005169d3a1a96e8dd95e3af03,5a8e07d5-d932-4484-a7f7-e700793a9c94,554,m,25,Sweden,204,2,0.166667,Robyn,european,4.65,168
1,0ff4166398f035b5fcb8824cc16c8daeb4643911,5a8e07d5-d932-4484-a7f7-e700793a9c94,169,f,18,United Kingdom,224,1,0.12,Robyn,european,4.65,168
2,b3f9fa56429c3b7fd348c471452e65747ba9ed50,5a8e07d5-d932-4484-a7f7-e700793a9c94,292,m,23,United Kingdom,224,2,0.153333,Robyn,european,4.65,168
3,0ffff52af79555e8fe72289c429b2fdfc8ea684b,5a8e07d5-d932-4484-a7f7-e700793a9c94,92,m,26,Germany,79,2,0.173333,Robyn,european,4.65,168
4,985253be0dc82ffa15a0ad006d0284aa4b7d1e3d,5a8e07d5-d932-4484-a7f7-e700793a9c94,159,m,19,Sweden,204,2,0.126667,Robyn,european,4.65,168


In [9]:
# Target: the 'plays' column of each merged training row.
response = train_merge.loc[:, 'plays']
# Model inputs; the column order here must match the order used at
# prediction time.
train_set = train_merge.loc[:, [
    'gender_num', 'age_num', 'country_num', 'genre_num', 'stars',
]]

In [10]:
# 50-tree random forest built with 4 parallel jobs. random_state is pinned so
# that re-running the notebook rebuilds the same forest and therefore the
# same predictions; without it every run yields a different submission.
RF = RandomForestRegressor(n_estimators=50, n_jobs=4, random_state=0)

In [11]:
# Fit the forest on the training features; the fitted estimator is the cell's
# displayed value.
RF.fit(X=train_set, y=response)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=50, n_jobs=4, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

Running on a test set

In [12]:
test = pd.read_csv('../data/test.csv')
# Same two inner joins as for the training data: user attributes first, then
# artist attributes.
test_merge = test.merge(profiles, on='user', how='inner')
test_merge = test_merge.merge(artists_df, on='artist', how='inner')
# An inner join silently drops test rows whose user/artist has no match (and
# repeats rows if a key occurs more than once on the right side). Either case
# yields a submission with missing or duplicated Ids, so report it instead of
# hiding it.
if len(test_merge) != len(test):
    print('WARNING: test set has %d rows but %d rows after merging; '
          'the submission will not contain every Id exactly once.'
          % (len(test), len(test_merge)))

In [13]:
# Predict with the feature columns in the same order that was used to fit RF.
test_pred = RF.predict(X=test_merge[[
    'gender_num', 'age_num', 'country_num', 'genre_num', 'stars',
]])

In [14]:
# Store the predictions next to their rows; test_pred was computed from
# test_merge, so the values line up row for row.
test_merge['plays'] = test_pred

In [15]:
# Write the submission file: one predicted 'plays' value per 'Id', with 'Id'
# as the index column of the CSV.
out = test_merge[['Id', 'plays']].set_index('Id')
out.to_csv('../output/submission_RF_stars.csv')