In [1]:
import numpy as no
import pandas as pd
import sklearn
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.model_selection import train_test_split
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
import pydotplus

In [15]:
# Dataframes
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
members = pd.read_csv('members.csv')
songs = pd.read_csv('songs.csv')
song_extra_info = pd.read_csv('song_extra_info.csv')

In [16]:
# Give each DF labels in order to properly split after concatenating and setting dummy variables
train['label'] = 'train_label'
test['label'] = 'test_label'

# Before getting dummy variables, need to concatenate the dataframes to ensure same number of features in the tree
concat = pd.concat([train, test])

In [17]:
concat.head()

Unnamed: 0,id,label,msno,song_id,source_screen_name,source_system_tab,source_type,target
0,,train_label,FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=,BBzumQNXUHKdEBOB7mAJuzok+IJA1c2Ryg/yzTF6tik=,Explore,explore,online-playlist,1.0
1,,train_label,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,bhp/MpSNoqoxOIB+/l8WPqu6jldth4DIpCm3ayXnJqM=,Local playlist more,my library,local-playlist,1.0
2,,train_label,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,JNWfrrC7zNN7BdMpsISKa4Mw+xVJYNnxXh3/Epw7QgY=,Local playlist more,my library,local-playlist,1.0
3,,train_label,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,2A87tzfnJTSWqD7gIZHisolhe4DMdzkbd6LzO1KHjNs=,Local playlist more,my library,local-playlist,1.0
4,,train_label,FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=,3qm6XTZ6MOCU11x8FIVbAGH5l5uMkT3/ZalWG1oo2Gc=,Explore,explore,online-playlist,1.0


In [18]:
# Set dummy variables before merging the dataframes to prevent kernel from dying

# Get dummies for source_system_tab
sst_dummies = pd.get_dummies(concat['source_system_tab'], sparse = True)

# Replace original column with dummy columns
concat.drop('source_system_tab', axis = 1, inplace = True)
concat = pd.concat([concat, sst_dummies], axis = 1)

In [19]:
# Repeat for the rest of the columns in train dataframe
# Could make this a function so there are fewer lines of code
ssn_dummies = pd.get_dummies(concat['source_screen_name'], sparse = True)
concat.drop('source_screen_name', axis = 1, inplace = True)
concat = pd.concat([concat, ssn_dummies], axis = 1)
st_dummies = pd.get_dummies(concat['source_type'], sparse = True)
concat.drop('source_type', axis = 1, inplace = True)
concat = pd.concat([concat, st_dummies], axis = 1)

In [20]:
concat.head()

Unnamed: 0,id,label,msno,song_id,target,discover,explore,listen with,my library,notification,...,listen-with,local-library,local-playlist,my-daily-playlist,online-playlist,radio,song,song-based-playlist,top-hits-for-artist,topic-article-playlist
0,,train_label,FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=,BBzumQNXUHKdEBOB7mAJuzok+IJA1c2Ryg/yzTF6tik=,1.0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,,train_label,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,bhp/MpSNoqoxOIB+/l8WPqu6jldth4DIpCm3ayXnJqM=,1.0,0,0,0,1,0,...,0,0,1,0,0,0,0,0,0,0
2,,train_label,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,JNWfrrC7zNN7BdMpsISKa4Mw+xVJYNnxXh3/Epw7QgY=,1.0,0,0,0,1,0,...,0,0,1,0,0,0,0,0,0,0
3,,train_label,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,2A87tzfnJTSWqD7gIZHisolhe4DMdzkbd6LzO1KHjNs=,1.0,0,0,0,1,0,...,0,0,1,0,0,0,0,0,0,0
4,,train_label,FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=,3qm6XTZ6MOCU11x8FIVbAGH5l5uMkT3/ZalWG1oo2Gc=,1.0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [21]:
concat.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9934208 entries, 0 to 2556789
Data columns (total 48 columns):
id                        float64
label                     object
msno                      object
song_id                   object
target                    float64
discover                  uint8
explore                   uint8
listen with               uint8
my library                uint8
notification              uint8
null                      uint8
radio                     uint8
search                    uint8
settings                  uint8
Album more                uint8
Artist more               uint8
Concert                   uint8
Discover Chart            uint8
Discover Feature          uint8
Discover Genre            uint8
Discover New              uint8
Explore                   uint8
Local playlist more       uint8
My library                uint8
My library_Search         uint8
Online playlist more      uint8
Others profile more       uint8
Payment         

In [22]:
# Set dummies for members df
gender_dummies = pd.get_dummies(members['gender'], sparse = True)
members.drop('gender', axis = 1, inplace = True)
members = pd.concat([members, gender_dummies], axis = 1)

In [23]:
members.head()

Unnamed: 0,msno,city,bd,registered_via,registration_init_time,expiration_date,female,male
0,XQxgAYj3klVKjR3oxPPXYYFp4soD4TuBghkhMTD4oTw=,1,0,7,20110820,20170920,0,0
1,UizsfmJb9mV54qE9hCYyU07Va97c0lCRLEQX3ae+ztM=,1,0,7,20150628,20170622,0,0
2,D8nEhsIOBSoE6VthTaqDX8U6lqjJ7dLdr72mOyLya2A=,1,0,4,20160411,20170712,0,0
3,mCuD+tZ1hERA/o5GPqk38e041J8ZsBaLcu7nGoIIvhI=,1,0,9,20150906,20150907,0,0
4,q4HRBfVSssAFS9iRfxWrohxuk9kCYMKjHOEagUMV6rQ=,1,0,4,20170126,20170613,0,0


In [24]:
# Set dummies for songs df
# This is going to have a very long runtime bc of the many unique names for artists, composers, lyricist
# Need to optimize time - drop any of these?
genre_dummies = pd.get_dummies(songs['genre_ids'], sparse = True)
songs.drop('genre_ids', axis = 1, inplace = True)
songs = pd.concat([songs, genre_dummies], axis = 1)
#artist_dummies = pd.get_dummies(songs['artist_name'], sparse = True)
songs.drop('artist_name', axis = 1, inplace = True)
#songs = pd.concat([songs, artist_dummies], axis = 1)
#composer_dummies = pd.get_dummies(songs['composer'], sparse = True)
songs.drop('composer', axis = 1, inplace = True)
#songs = pd.concat([songs, composer_dummies], axis = 1)
#lyricist_dummies = pd.get_dummies(songs['lyricist'], sparse = True)
songs.drop('lyricist', axis = 1, inplace = True)
#songs = pd.concat([songs, lyricist_dummies], axis = 1)

In [25]:
songs.head()

Unnamed: 0,song_id,song_length,language,1000,1000|2154|751,1007,1011,1011|2189|367,1011|359,1011|691,...,958|779,958|786,958|947,965,972,979,986,986|1955,993,993|751
0,CXoTN1eb7AI+DntdU1vbcwGRV4SCIDxZu+YD8JP8r4E=,247640,3.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,o0kFgae9QtnYgRkVPqLJwa05zIhRlUjfF7O1tDw0ZDU=,197328,31.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,DwVvVurfpuz+XPuFvucclVQEyPqcpUkHR0ne1RQzPs0=,231781,31.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,dKMBWoZyScdxSkihKG+Vf47nc18N9q4m58+b4e7dSSE=,273554,3.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,W3bqWd3T+VeHFzHAUfARgW9AvVRaF4N5Yzm4Mr6Eo/o=,140329,52.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [26]:
# Set dummies for song_extra_info df
# Same problem here, maybe keep these variables out of the decision tree and check for them separately
#name_dummies = pd.get_dummies(songs['name'], sparse = True)
song_extra_info.drop('name', axis = 1, inplace = True)
#song_extra_info = pd.concat([song_extra_info, name_dummies], axis = 1)
#isrc_dummies = pd.get_dummies(songs['isrc'], sparse = True)
song_extra_info.drop('isrc', axis = 1, inplace = True)
#song_extra_info = pd.concat([song_extra_info, isrc_dummies], axis = 1)

In [27]:
song_extra_info.head()

Unnamed: 0,song_id
0,LP7pLJoJFBvyuUwvu+oLzjT+bI+UeBPURCecJsX1jjs=
1,ClazTFnk6r0Bnuie44bocdNMM3rdlrq0bCGAsGUWcHE=
2,u2ja/bZE3zhCGxvbbOB3zOoUjx27u40cf5g09UXMoKQ=
3,92Fqsy0+p6+RHe2EoLKjHahORHR1Kq1TBJoClW9v+Ts=
4,0QFmz/+rJy1Q56C1DuYqT9hKKqi5TUqx0sN0IwvoHrw=


NameError: name 'concat_df' is not defined

In [28]:
# Merge members, songs, song_extra_info with the concat df (training set and testing set)
# this will be the training set, because it is the first 10000 entries in the concatenated df
# Takes a couple min to run, try to optimize
concat_df = pd.merge(concat[:5000], members, on='msno')
concat_df = pd.merge(concat, songs, on="song_id")
concat_df = pd.merge(concat, song_extra_info, on="song_id")
concat_df.head()


Unnamed: 0,id,label,msno,song_id,target,discover,explore,listen with,my library,notification,...,listen-with,local-library,local-playlist,my-daily-playlist,online-playlist,radio,song,song-based-playlist,top-hits-for-artist,topic-article-playlist
0,,train_label,FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=,BBzumQNXUHKdEBOB7mAJuzok+IJA1c2Ryg/yzTF6tik=,1.0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,,train_label,e5Ezre9HPuPos+CXQXtmo32E/hHIZTMmo6jG3yRf6UA=,BBzumQNXUHKdEBOB7mAJuzok+IJA1c2Ryg/yzTF6tik=,1.0,0,0,0,1,0,...,0,0,1,0,0,0,0,0,0,0
2,,train_label,pouJqjNRmZOnRNzzMWWkamTKkIGHyvhl/jo4HgbncnM=,BBzumQNXUHKdEBOB7mAJuzok+IJA1c2Ryg/yzTF6tik=,0.0,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,,train_label,sSexP400TJOZRhx3JB+0s9cqrCnqrlV51B9njoKR1II=,BBzumQNXUHKdEBOB7mAJuzok+IJA1c2Ryg/yzTF6tik=,0.0,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,,train_label,hKdGiUKHVqKkXGHLrc+EzdSW6q0ERAJ2Cs7/L1N0Ae4=,BBzumQNXUHKdEBOB7mAJuzok+IJA1c2Ryg/yzTF6tik=,0.0,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [29]:
concat_df.drop('id', axis = 1, inplace = True)
concat_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9931982 entries, 0 to 9931981
Data columns (total 47 columns):
label                     object
msno                      object
song_id                   object
target                    float64
discover                  uint8
explore                   uint8
listen with               uint8
my library                uint8
notification              uint8
null                      uint8
radio                     uint8
search                    uint8
settings                  uint8
Album more                uint8
Artist more               uint8
Concert                   uint8
Discover Chart            uint8
Discover Feature          uint8
Discover Genre            uint8
Discover New              uint8
Explore                   uint8
Local playlist more       uint8
My library                uint8
My library_Search         uint8
Online playlist more      uint8
Others profile more       uint8
Payment                   uint8
People global     

In [32]:
concat_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9931982 entries, 0 to 9931981
Data columns (total 45 columns):
label                     object
target                    float64
discover                  uint8
explore                   uint8
listen with               uint8
my library                uint8
notification              uint8
null                      uint8
radio                     uint8
search                    uint8
settings                  uint8
Album more                uint8
Artist more               uint8
Concert                   uint8
Discover Chart            uint8
Discover Feature          uint8
Discover Genre            uint8
Discover New              uint8
Explore                   uint8
Local playlist more       uint8
My library                uint8
My library_Search         uint8
Online playlist more      uint8
Others profile more       uint8
Payment                   uint8
People global             uint8
People local              uint8
Radio               

In [39]:
# split the concatenated df back into the train and test sets - already split them, just make sure split correctly
train_df = concat_df[concat_df['label'] == 'train_label']
test_df = concat_df[concat_df['label'] == 'test_label']

In [40]:
#test_df = pd.merge(test_df, members, on='msno')
#test_df = pd.merge(test_df, songs, on="song_id")
#test_df = pd.merge(test_df, song_extra_info, on="song_id")
#test_df.head()

In [42]:
#test_df.drop('msno',  axis = 1, inplace = True)
#test_df.drop('song_id',  axis = 1, inplace = True)
test_df.head()

Unnamed: 0,label,target,discover,explore,listen with,my library,notification,null,radio,search,...,listen-with,local-library,local-playlist,my-daily-playlist,online-playlist,radio.1,song,song-based-playlist,top-hits-for-artist,topic-article-playlist
215,test_label,,0,0,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
216,test_label,,1,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
217,test_label,,0,0,1,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
218,test_label,,0,0,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
219,test_label,,0,0,0,1,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


In [43]:
#test_df.drop('id', axis = 1, inplace = True)
test_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2556019 entries, 215 to 9931981
Data columns (total 45 columns):
label                     object
target                    float64
discover                  uint8
explore                   uint8
listen with               uint8
my library                uint8
notification              uint8
null                      uint8
radio                     uint8
search                    uint8
settings                  uint8
Album more                uint8
Artist more               uint8
Concert                   uint8
Discover Chart            uint8
Discover Feature          uint8
Discover Genre            uint8
Discover New              uint8
Explore                   uint8
Local playlist more       uint8
My library                uint8
My library_Search         uint8
Online playlist more      uint8
Others profile more       uint8
Payment                   uint8
People global             uint8
People local              uint8
Radio             

In [44]:
# now drop the temprorary labels
train_df = train_df.drop('label', axis=1)
test_df = test_df.drop('label', axis=1)

In [45]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7375963 entries, 0 to 9611928
Data columns (total 44 columns):
target                    float64
discover                  uint8
explore                   uint8
listen with               uint8
my library                uint8
notification              uint8
null                      uint8
radio                     uint8
search                    uint8
settings                  uint8
Album more                uint8
Artist more               uint8
Concert                   uint8
Discover Chart            uint8
Discover Feature          uint8
Discover Genre            uint8
Discover New              uint8
Explore                   uint8
Local playlist more       uint8
My library                uint8
My library_Search         uint8
Online playlist more      uint8
Others profile more       uint8
Payment                   uint8
People global             uint8
People local              uint8
Radio                     uint8
Search               

In [46]:
# Create the training sets
train_x = train_df.drop('target', 1)
train_y = train_df['target']

In [47]:
# Too many columns now, need to find a new way to do this- specify parameters like max depth or max features?
# Create the model for the decision tree
tree = DecisionTreeClassifier(criterion = "entropy")

In [48]:
test_df.drop('msno', axis = 1, inplace = True)
test_df.drop('song_id', axis = 1, inplace = True)

In [49]:
train_df.head()

Unnamed: 0,target,discover,explore,listen with,my library,notification,null,radio,search,settings,...,listen-with,local-library,local-playlist,my-daily-playlist,online-playlist,radio.1,song,song-based-playlist,top-hits-for-artist,topic-article-playlist
0,1.0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,1.0,0,0,0,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,0.0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,0.0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,0.0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [51]:
test_df.head()

Unnamed: 0,target,discover,explore,listen with,my library,notification,null,radio,search,settings,...,listen-with,local-library,local-playlist,my-daily-playlist,online-playlist,radio.1,song,song-based-playlist,top-hits-for-artist,topic-article-playlist
215,,0,0,0,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
216,,1,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
217,,0,0,1,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
218,,0,0,0,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
219,,0,0,0,1,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


In [52]:
#train_x.drop('id', axis = 1, inplace = True)
#train_df.drop('id', axis = 1, inplace = True)
train_y.tail()
#test_df = test_df.sort_values(by='id')

9611924    0.0
9611925    0.0
9611926    0.0
9611927    1.0
9611928    0.0
Name: target, dtype: float64

In [53]:
test_df = test_df.reset_index()

In [54]:
test_df.drop('id', axis = 1, inplace = True)

In [55]:
test_df

Unnamed: 0,index,target,discover,explore,listen with,my library,notification,null,radio,search,...,listen-with,local-library,local-playlist,my-daily-playlist,online-playlist,radio.1,song,song-based-playlist,top-hits-for-artist,topic-article-playlist
0,215,,0,0,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,216,,1,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,217,,0,0,1,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,218,,0,0,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,219,,0,0,0,1,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
5,220,,0,0,0,1,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
6,221,,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
7,222,,1,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
8,223,,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,224,,0,0,0,1,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


In [56]:
train_x.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7375963 entries, 0 to 9611928
Data columns (total 43 columns):
discover                  uint8
explore                   uint8
listen with               uint8
my library                uint8
notification              uint8
null                      uint8
radio                     uint8
search                    uint8
settings                  uint8
Album more                uint8
Artist more               uint8
Concert                   uint8
Discover Chart            uint8
Discover Feature          uint8
Discover Genre            uint8
Discover New              uint8
Explore                   uint8
Local playlist more       uint8
My library                uint8
My library_Search         uint8
Online playlist more      uint8
Others profile more       uint8
Payment                   uint8
People global             uint8
People local              uint8
Radio                     uint8
Search                    uint8
Search Home            

In [57]:
# Train the data
tree.fit(train_x, train_y)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [58]:
test_df.drop('target', axis = 1, inplace = True)

In [59]:
test_df = test_df[:5000]

In [62]:
#test_df.info()
test_df.drop('index', axis = 1, inplace = True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 44 columns):
index                     5000 non-null int64
discover                  5000 non-null uint8
explore                   5000 non-null uint8
listen with               5000 non-null uint8
my library                5000 non-null uint8
notification              5000 non-null uint8
null                      5000 non-null uint8
radio                     5000 non-null uint8
search                    5000 non-null uint8
settings                  5000 non-null uint8
Album more                5000 non-null uint8
Artist more               5000 non-null uint8
Concert                   5000 non-null uint8
Discover Chart            5000 non-null uint8
Discover Feature          5000 non-null uint8
Discover Genre            5000 non-null uint8
Discover New              5000 non-null uint8
Explore                   5000 non-null uint8
Local playlist more       5000 non-null uint8
My library         

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [63]:
# Make predictions based on the testing set
tree_predictions = tree.predict(test_df)

In [64]:
# Analyze the gain for each attribute
pd.DataFrame({'Gain': tree.feature_importances_}, index = train_x.columns).sort_values('Gain', ascending = False)

Unnamed: 0,Gain
Local playlist more,0.702287
Radio,0.135744
local-library,0.044172
local-playlist,0.030294
Discover Chart,0.017138
my library,0.015095
Others profile more,0.01358
Search,0.00783
discover,0.004069
Discover Feature,0.00362


In [65]:
submission = pd.DataFrame(data=tree_predictions)

In [66]:
submission

Unnamed: 0,0
0,1.0
1,0.0
2,0.0
3,1.0
4,1.0
5,1.0
6,0.0
7,0.0
8,1.0
9,1.0


In [67]:
submission.to_csv("music_predictions.csv", sep = '\t')

In [None]:
# Problems that need to be fixed
# 1. dropped important columns like artist... somehow need to account for that
# 2. in the test_df, we dropped the id, which we need for the final submission... need to keep that- try just dropping it for the train_df
# 3. need to organize this and can save a lot of space by compressing the code to fewer lines
# 4. only using a small subset of the data for the training and testing sets
# 5. parsed the training, but not the testing set... now testing is larger than training