# WSDM - KKBox's Music Recommendation Challenge

In [37]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Load the data

In [38]:
data = pd.read_csv("./processed/train.csv.gz", compression="gzip")
test = pd.read_csv("./processed/test.csv.gz", compression="gzip")
user = pd.read_csv("./processed/members.csv.gz", compression="gzip")
song = pd.read_csv("./processed/songs.csv.gz", compression="gzip")
extraSong = pd.read_csv("./processed/song_extra_info.csv.gz", compression="gzip")

## Merge all dataset into a single table

Merge train data and eval data

In [39]:
data = pd.merge(data, user, on='msno', how='left')
data = pd.merge(data, song, on='song_id', how='left')
data = pd.merge(data, extraSong, on='song_id', how='left')
data = data.fillna(-1)

Merge test data

In [40]:
test = pd.merge(test, user, on='msno', how='left')
test = pd.merge(test, song, on='song_id', how='left')
test = pd.merge(test, extraSong, on='song_id', how='left')
test = test.fillna(-1)

## Have a look at the entire table and columns for the prediction

In [41]:
data.head()

Unnamed: 0,msno,song_id,source_system_tab,source_screen_name,source_type,target,city,bd,gender,registered_via,...,expiration_date,song_length,genre_ids,artist_name,composer,lyricist,language,name,isrc,ccodes
0,9176,474849,1,7,6,1,0,2,2,2,...,20171005,45944.0,535.0,18678.0,68708.0,82979.0,8.0,316725.0,GBUM71602854,5981.0
1,19273,1425656,3,8,4,1,11,21,0,3,...,20170911,76229.0,176.0,195506.0,286899.0,82979.0,8.0,498977.0,US3C69910183,11873.0
2,19273,768950,3,8,4,1,11,21,0,3,...,20170911,53693.0,176.0,129054.0,210645.0,82979.0,8.0,351234.0,USUM70618761,13443.0
3,19273,150624,3,8,4,1,11,21,0,3,...,20170911,65705.0,7.0,167740.0,170530.0,82979.0,0.0,213919.0,GBUQH1000063,5994.0
4,9176,210388,1,7,6,1,0,2,2,2,...,20171005,38606.0,3.0,25820.0,40768.0,82979.0,8.0,749571.0,QM3E21606003,8604.0


## Do some modification on several columns

In [42]:
from sklearn import preprocessing

def process(df):
    
    # Process song length -> minutes
    df.song_length = df.song_length // 60000
    
    # Use only a few columns
    use_columns = ['source_system_tab', 'source_screen_name', \
                   'source_type', 'city', 'bd', 'gender', \
                   'registered_via', 'song_length', 'genre_ids', \
                   'artist_name', 'composer', 'lyricist', \
                   'language', 'ccodes']
    df = df[use_columns]
    
    for column in df.columns:
        le = preprocessing.LabelEncoder()
        le.fit(df[column])
        df[column] = le.transform(df[column])
    
    return df

## Split data into train, test and eval

In [43]:
df     = process(data)
df['target'] = data['target']
# Test
testDF = process(test)

msk = np.random.rand(len(df)) < 0.8
# Train and eval
trainDF = df[msk]
evalDF  = df[~msk]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [44]:
trainDF.head()

Unnamed: 0,source_system_tab,source_screen_name,source_type,city,bd,gender,registered_via,song_length,genre_ids,artist_name,composer,lyricist,language,ccodes,target
0,1,7,6,0,2,2,2,1,286,3278,14582,21155,9,1846,1
1,3,8,4,11,21,0,3,2,91,31961,60954,21155,9,3739,1
3,3,8,4,11,21,0,3,2,7,27440,36701,21155,1,1850,1
5,1,7,6,0,2,2,2,2,91,8147,55580,21155,9,4359,1
6,3,8,4,11,21,0,3,1,345,2903,57876,20136,6,3376,1


# Use GBDT model to train

In [45]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score

x = trainDF.drop("target", axis=1)
y = trainDF.target

clf = GradientBoostingClassifier()
clf.fit(x, y)

x_ = evalDF.drop("target", axis=1)
y_ = evalDF.target

y_pred = clf.predict(x_)

accuracy_score(y_, y_pred)

0.63179952904047076

In [46]:
test_pred = clf.predict(testDF)
test = pd.read_csv("./processed/test.csv.gz", compression="gzip")
test['target'] = test_pred
resultDF = test[['id', 'target']]
resultDF = resultDF.sort_values(by='id')
resultDF.to_csv('./processed/my_result_2.csv.gz', compression='gzip', index=False)

In [47]:
resultDF.shape

(2556790, 2)