In [4]:
import numpy as np
import pandas as pd
import lightgbm as lgb
import datetime
import math
import gc
data_path = '/Users/alvira/Desktop/ml1/kaggle/music/'

In [2]:
print('Loading data...')
data_path = '/Users/alvira/Desktop/ml1/kaggle/music/'
train = pd.read_csv(data_path + 'train.csv', dtype={'msno' : 'object',
                                                'source_system_tab' : 'object',
                                                  'source_screen_name' : 'object',
                                                  'source_type' : 'object',
                                                  'target' : np.uint8,
                                                  'song_id' : 'object'})
test = pd.read_csv(data_path + 'test.csv', dtype={'msno' : 'object',
                                                'source_system_tab' : 'object',
                                                'source_screen_name' : 'object',
                                                'source_type' : 'object',
                                                'song_id' : 'object'})
songs = pd.read_csv(data_path + 'songs.csv',dtype={'genre_ids': 'object',
                                                  'language' : 'object',
                                                  'artist_name' : 'object',
                                                  'composer' : 'object',
                                                  'lyricist' : 'object',
                                                  'song_id' : 'object'})
members = pd.read_csv(data_path + 'members.csv',dtype={'city' : 'object',
                                                      'bd' : np.uint8,
                                                      'gender' : 'object',
                                                      'registered_via' : 'object'},
                     parse_dates=['registration_init_time','expiration_date'])
songs_extra = pd.read_csv(data_path + 'song_extra_info.csv')
print('Done loading...')

Loading data...
Done loading...


In [3]:
def object2cat(df):
    object_cols = list(df.select_dtypes(include=['object']).columns)
    for col in object_cols:
        df[col]=df[col].astype('category')
object2cat(train)
object2cat(test)
object2cat(songs)
object2cat(members)

In [4]:
print('Data merging...')


train = train.merge(songs, on='song_id', how='left')
test = test.merge(songs, on='song_id', how='left')

members['membership_days'] = members['expiration_date'].subtract(members['registration_init_time']).dt.days.astype(int)

members['registration_year'] = members['registration_init_time'].dt.year
members['registration_month'] = members['registration_init_time'].dt.month
members['registration_date'] = members['registration_init_time'].dt.day

members['expiration_year'] = members['expiration_date'].dt.year
members['expiration_month'] = members['expiration_date'].dt.month
members['expiration_date'] = members['expiration_date'].dt.day
members = members.drop(['registration_init_time'], axis=1)

def isrc_to_year(isrc):
    if type(isrc) == str:
        if int(isrc[5:7]) > 17:
            return 1900 + int(isrc[5:7])
        else:
            return 2000 + int(isrc[5:7])
    else:
        return np.nan
        
songs_extra['song_year'] = songs_extra['isrc'].apply(isrc_to_year)
songs_extra.drop(['isrc', 'name'], axis = 1, inplace = True)

train = train.merge(members, on='msno', how='left')
test = test.merge(members, on='msno', how='left')

train = train.merge(songs_extra, on = 'song_id', how = 'left')
train.song_length.fillna(200000,inplace=True)
train.song_length = train.song_length.astype(np.uint32)
train.song_id = train.song_id.astype('category')


test = test.merge(songs_extra, on = 'song_id', how = 'left')
test.song_length.fillna(200000,inplace=True)
test.song_length = test.song_length.astype(np.uint32)
test.song_id = test.song_id.astype('category')
import gc
del members, songs; gc.collect();

print('Done merging...')

Data merging...
Done merging...


In [5]:
train.dtypes

msno                    object
song_id               category
source_system_tab     category
source_screen_name    category
source_type           category
target                   uint8
song_length             uint32
genre_ids             category
artist_name           category
composer              category
lyricist              category
language              category
city                  category
bd                       uint8
gender                category
registered_via        category
expiration_date          int64
membership_days          int64
registration_year        int64
registration_month       int64
registration_date        int64
expiration_year          int64
expiration_month         int64
song_year              float64
dtype: object

In [None]:
print ("Adding new features")

def genre_id_count(x):
    if x == 'no_genre_id':
        return 0
    else:
        return x.count('|') + 1


train['genre_ids'].cat.add_categories('no_genre_id').fillna('no_genre_id',inplace=True)
test['genre_ids'].cat.add_categories('no_genre_id').fillna('no_genre_id',inplace=True)
train['genre_ids_count'] = train['genre_ids'].apply(genre_id_count).astype(np.int8)
test['genre_ids_count'] = test['genre_ids'].apply(genre_id_count).astype(np.int8)

def lyricist_count(x):
    if x == 'no_lyricist':
        return 0
    else:
        return sum(map(x.count, ['|', '/', '\\', ';'])) + 1
    return sum(map(x.count, ['|', '/', '\\', ';']))

train['lyricist'].cat.add_categories('no_lyricist').fillna('no_lyricist',inplace=True)
test['lyricist'].cat.add_categories('no_lyricist').fillna('no_lyricist',inplace=True)
train['lyricists_count'] = train['lyricist'].apply(lyricist_count).astype(np.int8)
test['lyricists_count'] = test['lyricist'].apply(lyricist_count).astype(np.int8)

def composer_count(x):
    if x == 'no_composer':
        return 0
    else:
        return sum(map(x.count, ['|', '/', '\\', ';'])) + 1

train['composer'].cat.add_categories('no_composer').fillna('no_composer',inplace=True)
test['composer'].cat.add_categories('no_composer').fillna('no_composer',inplace=True)
train['composer_count'] = train['composer'].apply(composer_count).astype(np.int8)
test['composer_count'] = test['composer'].apply(composer_count).astype(np.int8)

def is_featured(x):
    if 'feat' in str(x) :
        return 1
    return 0

train['artist_name'].cat.add_categories('no_artist').fillna('no_artist',inplace=True)
test['artist_name'].cat.add_categories('no_artist').fillna('no_artist',inplace=True)
train['is_featured'] = train['artist_name'].apply(is_featured).astype(np.int8)
test['is_featured'] = test['artist_name'].apply(is_featured).astype(np.int8)

def artist_count(x):
    if x == 'no_artist':
        return 0
    else:
        return x.count('and') + x.count(',') + x.count('feat') + x.count('&')

train['artist_count'] = train['artist_name'].apply(artist_count).astype(np.int8)
test['artist_count'] = test['artist_name'].apply(artist_count).astype(np.int8)

In [10]:
train['artist_name'].astype("object") == train['composer'].astype("object")

0          False
1          False
2          False
3          False
4          False
5          False
6          False
7          False
8          False
9          False
10         False
11         False
12         False
13         False
14         False
15         False
16         False
17         False
18         False
19         False
20         False
21         False
22         False
23          True
24         False
25         False
26         False
27         False
28         False
29         False
           ...  
7377388    False
7377389    False
7377390    False
7377391    False
7377392    False
7377393    False
7377394    False
7377395    False
7377396    False
7377397    False
7377398    False
7377399    False
7377400    False
7377401     True
7377402    False
7377403    False
7377404    False
7377405    False
7377406    False
7377407    False
7377408    False
7377409    False
7377410    False
7377411    False
7377412    False
7377413    False
7377414    False
7377415    Fal

In [11]:
# if artist is same as composer
train['artist_composer'] = (train['artist_name'].astype("object") == train['composer'].astype("object")).astype(np.int8)
test['artist_composer'] = (test['artist_name'].astype("object") == test['composer'].astype("object")).astype(np.int8)


# if artist, lyricist and composer are all three same
train['artist_composer_lyricist'] = ((train['artist_name'].astype("object") == train['composer'].astype("object")) & (train['artist_name'].astype("object") == train['lyricist'].astype("object")) & (train['composer'].astype("object") == train['lyricist'].astype("object"))).astype(np.int8)
test['artist_composer_lyricist'] = ((test['artist_name'].astype("object") == test['composer'].astype("object")) & (test['artist_name'].astype("object") == test['lyricist'].astype("object")) & (test['composer'].astype("object") == test['lyricist'].astype("object"))).astype(np.int8)

# is song language 17 or 45. 
def song_lang_boolean(x):
    if '17.0' in str(x) or '45.0' in str(x):
        return 1
    return 0

train['song_lang_boolean'] = train['language'].apply(song_lang_boolean).astype(np.int8)
test['song_lang_boolean'] = test['language'].apply(song_lang_boolean).astype(np.int8)


_mean_song_length = np.mean(train['song_length'])
def smaller_song(x):
    if x < _mean_song_length:
        return 1
    return 0

train['smaller_song'] = train['song_length'].apply(smaller_song).astype(np.int8)
test['smaller_song'] = test['song_length'].apply(smaller_song).astype(np.int8)

# number of times a song has been played before
_dict_count_song_played_train = {k: v for k, v in train['song_id'].value_counts().iteritems()}
_dict_count_song_played_test = {k: v for k, v in test['song_id'].value_counts().iteritems()}
def count_song_played(x):
    try:
        return _dict_count_song_played_train[x]
    except KeyError:
        try:
            return _dict_count_song_played_test[x]
        except KeyError:
            return 0
    

train['count_song_played'] = train['song_id'].apply(count_song_played).astype(np.int64)
test['count_song_played'] = test['song_id'].apply(count_song_played).astype(np.int64)

# number of times the artist has been played
_dict_count_artist_played_train = {k: v for k, v in train['artist_name'].value_counts().iteritems()}
_dict_count_artist_played_test = {k: v for k, v in test['artist_name'].value_counts().iteritems()}
def count_artist_played(x):
    try:
        return _dict_count_artist_played_train[x]
    except KeyError:
        try:
            return _dict_count_artist_played_test[x]
        except KeyError:
            return 0

train['count_artist_played'] = train['artist_name'].apply(count_artist_played).astype(np.int64)
test['count_artist_played'] = test['artist_name'].apply(count_artist_played).astype(np.int64)


print("Done adding features")

Done adding features


In [None]:
print ("Train test and validation sets")
for col in train.columns:
    if train[col].dtype == object:
        train[col] = train[col].astype('category')
        test[col] = test[col].astype('category')


X_train = train.drop(['target'], axis=1)
y_train = train['target'].values


X_test = test.drop(['id'], axis=1)
ids = test['id'].values


# del train, test; gc.collect();

d_train_final = lgb.Dataset(X_train, y_train)
watchlist_final = lgb.Dataset(X_train, y_train)
print('Processed data...')

In [None]:
train.to_csv(data_path+"processed_train_1.csv")
test.to_csv(data_path+"test_1.csv")

In [None]:
train = pd.read_csv(data_path+"processed_train_1.csv")
test = pd.read_csv(data_path+"test_1.csv")

In [13]:
for col in train.columns:
    if train[col].dtype == object:
        train[col] = train[col].astype('category')
        test[col] = test[col].astype('category')
        
X_train = train.drop(['target'], axis=1)
y_train = train['target'].values


X_test = test.drop(['id'], axis=1)
ids = test['id'].values
d_train_final = lgb.Dataset(X_train, y_train)
watchlist_final = lgb.Dataset(X_train, y_train)

In [13]:
params = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'boosting': 'gbdt',
        'learning_rate': 0.3 ,
        'verbose': 0,
        'num_leaves': 108,
        'bagging_fraction': 0.95,
        'bagging_freq': 1,
        'bagging_seed': 1,
        'feature_fraction': 0.9,
        'feature_fraction_seed': 1,
        'max_bin': 256,
        'max_depth': 10,
        'num_rounds': 200,
        'metric' : 'auc'
    }

%time model_f1 = lgb.train(params, train_set=d_train_final,  valid_sets=watchlist_final, verbose_eval=5)



[5]	valid_0's auc: 0.732289
[10]	valid_0's auc: 0.744243
[15]	valid_0's auc: 0.749907
[20]	valid_0's auc: 0.75449
[25]	valid_0's auc: 0.758073
[30]	valid_0's auc: 0.761668
[35]	valid_0's auc: 0.764899
[40]	valid_0's auc: 0.767419
[45]	valid_0's auc: 0.769794
[50]	valid_0's auc: 0.772216
[55]	valid_0's auc: 0.774278
[60]	valid_0's auc: 0.776162
[65]	valid_0's auc: 0.77792
[70]	valid_0's auc: 0.779679
[75]	valid_0's auc: 0.781276
[80]	valid_0's auc: 0.782856
[85]	valid_0's auc: 0.78435
[90]	valid_0's auc: 0.785883
[95]	valid_0's auc: 0.787045
[100]	valid_0's auc: 0.788105
[105]	valid_0's auc: 0.78923
[110]	valid_0's auc: 0.790446
[115]	valid_0's auc: 0.791654
[120]	valid_0's auc: 0.792567
[125]	valid_0's auc: 0.793635
[130]	valid_0's auc: 0.794609
[135]	valid_0's auc: 0.795625
[140]	valid_0's auc: 0.796467
[145]	valid_0's auc: 0.797273
[150]	valid_0's auc: 0.798051
[155]	valid_0's auc: 0.79887
[160]	valid_0's auc: 0.799713
[165]	valid_0's auc: 0.800497
[170]	valid_0's auc: 0.801472
[175]

In [14]:
params = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'boosting': 'dart',
        'learning_rate': 0.3 ,
        'verbose': 0,
        'num_leaves': 108,
        'bagging_fraction': 0.95,
        'bagging_freq': 1,
        'bagging_seed': 1,
        'feature_fraction': 0.9,
        'feature_fraction_seed': 1,
        'max_bin': 256,
        'max_depth': 10,
        'num_rounds': 200,
        'metric' : 'auc'
    }

%time model_f2 = lgb.train(params, train_set=d_train_final,  valid_sets=watchlist_final, verbose_eval=5)

[5]	valid_0's auc: 0.732289
[10]	valid_0's auc: 0.741758
[15]	valid_0's auc: 0.748537
[20]	valid_0's auc: 0.753688
[25]	valid_0's auc: 0.75818
[30]	valid_0's auc: 0.761594
[35]	valid_0's auc: 0.763197
[40]	valid_0's auc: 0.765998
[45]	valid_0's auc: 0.767162
[50]	valid_0's auc: 0.768695
[55]	valid_0's auc: 0.771585
[60]	valid_0's auc: 0.772915
[65]	valid_0's auc: 0.774763
[70]	valid_0's auc: 0.775261
[75]	valid_0's auc: 0.776247
[80]	valid_0's auc: 0.776816
[85]	valid_0's auc: 0.776734
[90]	valid_0's auc: 0.777526
[95]	valid_0's auc: 0.778664
[100]	valid_0's auc: 0.77964
[105]	valid_0's auc: 0.780192
[110]	valid_0's auc: 0.779888
[115]	valid_0's auc: 0.781764
[120]	valid_0's auc: 0.782169
[125]	valid_0's auc: 0.782851
[130]	valid_0's auc: 0.783813
[135]	valid_0's auc: 0.784713
[140]	valid_0's auc: 0.785098
[145]	valid_0's auc: 0.785466
[150]	valid_0's auc: 0.785916
[155]	valid_0's auc: 0.786436
[160]	valid_0's auc: 0.78593
[165]	valid_0's auc: 0.787615
[170]	valid_0's auc: 0.78873
[175

In [20]:
params = {
        'objective': 'binary',
        'boosting': 'gbdt',
        'learning_rate': 0.2 ,
        'verbose': 0,
        'num_leaves': 100,
        'bagging_fraction': 0.95,
        'bagging_freq': 1,
        'bagging_seed': 1,
        'feature_fraction': 0.9,
        'feature_fraction_seed': 1,
        'max_bin': 256,
        'num_rounds': 200,
        'metric' : 'auc'
    }
%time model_f3 = lgb.train(params, train_set=d_train_final,  valid_sets=watchlist_final, verbose_eval=5)



[5]	valid_0's auc: 0.750656
[10]	valid_0's auc: 0.767176
[15]	valid_0's auc: 0.780871
[20]	valid_0's auc: 0.79207
[25]	valid_0's auc: 0.798997
[30]	valid_0's auc: 0.805066
[35]	valid_0's auc: 0.809566
[40]	valid_0's auc: 0.813247
[45]	valid_0's auc: 0.815732
[50]	valid_0's auc: 0.818926
[55]	valid_0's auc: 0.821573
[60]	valid_0's auc: 0.823882
[65]	valid_0's auc: 0.825931
[70]	valid_0's auc: 0.827648
[75]	valid_0's auc: 0.829214
[80]	valid_0's auc: 0.831138
[85]	valid_0's auc: 0.832627
[90]	valid_0's auc: 0.834122
[95]	valid_0's auc: 0.835225
[100]	valid_0's auc: 0.836374
[105]	valid_0's auc: 0.837527
[110]	valid_0's auc: 0.838872
[115]	valid_0's auc: 0.840052
[120]	valid_0's auc: 0.841375
[125]	valid_0's auc: 0.842399
[130]	valid_0's auc: 0.843378
[135]	valid_0's auc: 0.844201
[140]	valid_0's auc: 0.84481
[145]	valid_0's auc: 0.845411
[150]	valid_0's auc: 0.846337
[155]	valid_0's auc: 0.847064
[160]	valid_0's auc: 0.847848
[165]	valid_0's auc: 0.848659
[170]	valid_0's auc: 0.849224
[1

In [21]:
print('Making predictions')
# p_test_1 = model_f1.predict(X_test)
# p_test_2 = model_f2.predict(X_test)
p_test_3 = model_f3.predict(X_test)
# p_test_avg1 = np.mean([p_test_1, p_test_2,  p_test_3], axis = 0)
# p_test_avg2 = np.mean([p_test_1, p_test_2], axis = 0)

Making predictions


In [22]:
# Writing output to file
subm = pd.DataFrame()
subm['id'] = ids
subm['target'] = p_test_3
subm.to_csv(data_path + 'lgbm_submission_3.csv.gz', compression = 'gzip', index=False, float_format = '%.5f')

print('Done!')

Done!


In [29]:
subm['target'] = np.mean([p_test_avg1, p_test_3], axis = 0)
subm.to_csv(data_path + 'lgbm_submission_my.csv.gz', compression = 'gzip', index=False, float_format = '%.5f')

In [24]:
subm['target'] = p_test_avg2
subm.to_csv(data_path + 'lgbm_submission_as2.csv.gz', compression = 'gzip', index=False, float_format = '%.5f')

In [34]:
X_train[:5]

Unnamed: 0,msno,song_id,source_system_tab,source_screen_name,source_type,song_length,genre_ids,artist_name,composer,lyricist,...,lyricists_count,composer_count,is_featured,artist_count,artist_composer,artist_composer_lyricist,song_lang_boolean,smaller_song,count_song_played,count_artist_played
0,FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=,BBzumQNXUHKdEBOB7mAJuzok+IJA1c2Ryg/yzTF6tik=,explore,Explore,online-playlist,206471,359,Bastille,Dan Smith| Mark Crew,,...,1,2,0,0,0,0,0,1,215,1140
1,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,bhp/MpSNoqoxOIB+/l8WPqu6jldth4DIpCm3ayXnJqM=,my library,Local playlist more,local-playlist,284584,1259,Various Artists,,,...,1,1,0,0,0,0,0,0,1,303616
2,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,JNWfrrC7zNN7BdMpsISKa4Mw+xVJYNnxXh3/Epw7QgY=,my library,Local playlist more,local-playlist,225396,1259,Nas,N. Jones、W. Adams、J. Lordan、D. Ingle,,...,1,1,0,0,0,0,0,1,4,289
3,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,2A87tzfnJTSWqD7gIZHisolhe4DMdzkbd6LzO1KHjNs=,my library,Local playlist more,local-playlist,255512,1019,Soundway,Kwadwo Donkoh,,...,1,1,0,0,0,0,0,0,1,1
4,FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=,3qm6XTZ6MOCU11x8FIVbAGH5l5uMkT3/ZalWG1oo2Gc=,explore,Explore,online-playlist,187802,1011,Brett Young,Brett Young| Kelly Archer| Justin Ebach,,...,1,3,0,0,0,0,0,1,412,427
