In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy
import datetime
import numpy as np

### Read Data

In [2]:
data = pd.read_csv("./lastfm-dataset-1K/userid-timestamp-artid-artname-traid-traname.tsv", 
                   delimiter="\t", header=None, 
                   names = ["userid","timestamp","musicbrainz-artist-id",
                            "artist-name","musicbrainz-track-id","track-name"])


In [3]:
user_profiles = pd.read_csv("./lastfm-dataset-1K/userid-profile.tsv", delimiter = "\t")
user_profiles.columns = ["userid", "gender","age","country","registered"]
user_profiles.drop(["country", "registered"], axis=1, inplace=True)

### Make timestamps, days of week, hours. Sort by time and users

In [4]:
data = data.rename(columns={0:'userid', 1:'timestamp', 2:'artistid', 3:'artistname', 4:'trackid', 5:'trackname'})
data['timestamp'] = pd.to_datetime(data['timestamp'])

In [5]:
data['weekday'] = data['timestamp'].dt.weekday_name.astype('category',
                                                           categories=["Sunday","Monday","Tuesday","Wednesday",
                                                                       "Thursday","Friday","Saturday"],
                                                           ordered=True)
data['hour'] = data['timestamp'].dt.hour


In [6]:
data = data.sort_values(['userid', 'timestamp'], ascending=[True, True])

### Time of day (morning, noon etc)

In [7]:
data['weekend'] = 0 
data.loc[data['weekday'].isin(['Saturday', 'Sunday']), 'weekend'] = 1

# data['timestamp'] = pd.to_datetime(data['timestamp'])
# data_user['hour'] = data_user['timestamp'].dt.hour

data['daytime'] = 0
data.loc[(data['hour'] >= 0) & (data['hour'] <= 3), 'daytime'] = 0
data.loc[(data['hour'] > 3) & (data['hour'] <= 5), 'daytime'] = 1
data.loc[(data['hour'] > 5) & (data['hour'] <= 11), 'daytime'] = 2
data.loc[(data['hour'] > 11) & (data['hour'] <= 19), 'daytime'] = 3
data.loc[(data['hour'] > 19) & (data['hour'] <= 23), 'daytime'] = 4

### count of how many times an SONG was listened to in given day and time

In [8]:
data['counted'] = 1

#data_user_counted = data.groupby(['userid','track-name', 'daytime', 'weekend'])['counted'].count().reset_index()
#data_user_counted.columns = ['userid','track-name', 'daytime', 'weekend', 'track-weekday-daytime-count']

data['user-track-total-count'] = data.groupby(['userid','track-name']).cumcount() + 1
data['track-weekday-count'] = data.groupby(['userid','track-name', 'weekend']).cumcount() + 1
data['track-daytime-count'] = data.groupby(['userid','track-name', 'daytime']).cumcount() + 1

#data = pd.merge(data, data_user_counted, on = ['userid','track-name', 'daytime', 'weekend'], how = 'left')

### Determine how long a song was played and when was it last played

In [9]:
data['songlength'] = data.groupby('userid')['timestamp'].diff()
data['last-seen-song'] = data.groupby(['track-name'])['timestamp'].diff()
data['last-seen-song'] = data['last-seen-song'].dt.total_seconds() / (24 * 60 * 60)

### Quarters of the year

In [10]:
data['month'] = data['timestamp'].dt.month
data['quarter'] = 1
data.loc[(data['month'] > 3) & (data['month'] <= 6), 'quarter'] = 2
data.loc[(data['month'] > 6) & (data['month'] <= 9), 'quarter'] = 3
data.loc[(data['month'] > 9) & (data['month'] <= 12), 'quarter'] = 4

## SKIPS variable - The independent variable

In [11]:
data['skipped'] = (data.songlength  < datetime.timedelta(minutes=1)).astype('int')

### count of how many times an ARTIST was listened to in given day and time

In [12]:
#data_user_counted = data.groupby(['userid','artist-name', 'daytime', 'weekend'])['counted'].count().reset_index()
#data_user_counted.columns = ['userid','artist-name', 'daytime', 'weekend', 'artist-weekday-daytime-count']

data['user-artist-count'] = data.groupby(['userid','artist-name']).cumcount() + 1
data['user-artist-weekday-daytime-count'] = data.groupby(['userid','artist-name', 'daytime', 'weekend']).cumcount() + 1

#data = pd.merge(data, data_user_counted, on = ['userid','artist-name', 'daytime', 'weekend'], how = 'left')

### Determine when an artist was last played

In [13]:
data['last-seen-artist'] = data.groupby(['artist-name'])['timestamp'].diff()
data['last-seen-artist'] = data['last-seen-artist'].dt.total_seconds() / (24 * 60 * 60)

### Merge user metadata

In [14]:
data = pd.merge(data, user_profiles, on = ['userid'], how = 'left')

In [15]:
data.drop(["musicbrainz-artist-id", "musicbrainz-track-id","counted"], axis=1, inplace=True)

### Convert days into numerical values (Mon = 0, Sun = 6)

In [16]:
data['weekday'] = data['timestamp'].dt.dayofweek

### Does the user skip songs at all?

In [17]:
data = data.sort_values(['userid', 'timestamp'], ascending=[True, True])
data['user-song-skips'] = data.groupby(['userid', 'track-name'])['skipped'].cumsum()
data['user-song-skip-percentage'] = data['user-song-skips']/data['user-track-total-count']
data['user-artist-skips'] = data.groupby(['userid', 'artist-name'])['skipped'].cumsum()
data['user-artist-skip-percentage'] = data['user-artist-skips']/data['user-artist-count']


### Song popularities (based on skips)

In [19]:
data['global-song-skips'] = data.groupby(['track-name'])['skipped'].cumsum()
data['global-artist-skips'] = data.groupby(['artist-name'])['skipped'].cumsum()

data['artist_total_count'] = data.groupby(['artist-name']).cumcount() + 1
data['song_total_count'] = data.groupby(['track-name']).cumcount() + 1

data['global-song-skip-percentage'] = data['global-song-skips']/data['song_total_count']
data['global-artist-skip-percentage'] = data['global-artist-skips']/data['song_total_count']


### Pick top 1000 song for each user

In [20]:
songs = data.groupby(['userid', 'track-name', 'artist-name']).size().reset_index(name="count")


In [21]:
top1000 = songs.groupby(['userid'])['track-name','artist-name','count'].apply(lambda grp: grp.nlargest(1000, 'count')).reset_index()


In [22]:
top1000.head()

Unnamed: 0,userid,level_1,track-name,artist-name,count
0,user_000001,1654,Music,Cornelius,70
1,user_000001,998,Gum,Cornelius,63
2,user_000001,1525,Mario Basanov & Vidis ‘Test’,Gilles Peterson,52
3,user_000001,441,Child Song,The Cinematic Orchestra,45
4,user_000001,1051,Hibari,坂本龍一,42


In [23]:
top1000.drop(["level_1", "count"], axis=1, inplace=True)
data_subset = pd.merge(top1000, data, on = ['userid','track-name', 'artist-name'], how = 'left')

In [24]:
print(np.shape(data))
print(np.shape(data_subset))

(19098862, 31)
(10543035, 31)


In [25]:
data_subset = data_subset.sort_values(['userid', 'timestamp'], ascending=[True, True])

In [26]:
data_subset.head(5)

Unnamed: 0,userid,track-name,artist-name,timestamp,weekday,hour,weekend,daytime,user-track-total-count,track-weekday-count,...,user-song-skips,user-song-skip-percentage,user-artist-skips,user-artist-skip-percentage,global-song-skips,global-artist-skips,artist_total_count,song_total_count,global-song-skip-percentage,global-artist-skip-percentage
8512,user_000001,The Launching Of Big Face,Plaid & Bob Jaroc,2006-08-13 13:59:20,6,13,1,3,1,1,...,0,0.0,0,0.0,0,0,1,1,0.0,0.0
7890,user_000001,Zn Zero,Plaid & Bob Jaroc,2006-08-13 14:03:29,6,14,1,3,1,1,...,0,0.0,0,0.0,0,0,2,1,0.0,0.0
3782,user_000001,The Return Of Super Barrio - End Credits,Plaid & Bob Jaroc,2006-08-13 14:10:43,6,14,1,3,1,1,...,0,0.0,0,0.0,0,0,3,1,0.0,0.0
3101,user_000001,Dayvan Cowboy,Boards Of Canada,2006-08-13 15:44:17,6,15,1,3,1,1,...,0,0.0,0,0.0,0,0,1,1,0.0,0.0
5295,user_000001,A Moment Of Clarity,Boards Of Canada,2006-08-13 16:46:52,6,16,1,3,1,1,...,1,1.0,1,0.333333,1,1,3,1,1.0,1.0


### Write to CSV

In [27]:
data_subset.to_csv("data_engineered_features.csv", index=False)

### Also do just top 500?

In [28]:
top500 = songs.groupby(['userid'])['track-name','artist-name','count'].apply(lambda grp: grp.nlargest(500, 'count')).reset_index()

In [29]:
top500.drop(["level_1", "count"], axis=1, inplace=True)
data_subset = pd.merge(top500, data, on = ['userid','track-name', 'artist-name'], how = 'left')
data_subset = data_subset.sort_values(['userid', 'timestamp'], ascending=[True, True])
data_subset.to_csv("data_engineered_features500.csv", index=False)

In [30]:
np.shape(top500)

(468483, 3)