# My DJ RL Collaborative Filtering Preprocessing
This notebook implements the algorithm proposed in "RLCF: A Collaborative Filtering Approach Based on Reinforcement Learning With Sequential Ratings" on the Last.FM dataset. http://mllab.sogang.ac.kr/publications/RLCF.pdf


In [80]:
import numpy as np
import pandas as pd
import csv

In [81]:
from io import StringIO

def load_data(data_file="lastfm-dataset-1K/userid-timestamp-artid-artname-traid-traname.tsv",
              nrows=None):
    """Read Last.fm listening events from a headerless, tab-separated file.

    Parameters
    ----------
    data_file : str or path-like, optional
        Location of the TSV file. Defaults to the path used throughout this
        notebook.
    nrows : int, optional
        Read only the first ``nrows`` lines. ``None`` (the default) reads the
        whole file.

    Returns
    -------
    pandas.DataFrame
        One row per line of the file, with the columns ``userid``,
        ``timestamp``, ``artist-id``, ``artist-name``, ``track-id`` and
        ``track-name``.
    """
    return pd.read_csv(data_file,
                       delimiter="\t",
                       # The file has no header row; the names below label the columns.
                       header=None,
                       names=["userid", "timestamp", "artist-id", "artist-name", "track-id", "track-name"],
                       # Fields are plain tab-separated text, not CSV-quoted. Treat '"'
                       # as an ordinary character so a stray quote in a title cannot
                       # swallow the following tabs and newlines.
                       quoting=csv.QUOTE_NONE,
                       nrows=nrows)

## Load a random subset of the Last.fm user listening data

In [82]:
# Number of listening events to keep for this preprocessing run
num_rows = 1000
# Fixed seed so that Restart & Run All draws the same sample every time
sample_seed = 0

fm_full = load_data()
# Cap the sample size at the number of available rows: sampling without
# replacement raises if more rows are requested than exist.
fm_df = fm_full.sample(n=min(num_rows, len(fm_full)), replace=False, random_state=sample_seed)
# Drop the reference to the full frame so its memory can be reclaimed
del fm_full
fm_df.head()

KeyboardInterrupt: 

## Parse Timestamps of User --> Artist listenings
We will transform these timestamps into timesteps for our MDP formulation. For each user, the listening events are ordered chronologically: the earliest event is labelled timestep 0, the next one timestep 1, and so on. At each timestep the user listens to exactly one artist.

In [156]:
from dateutil import parser

# Group the listening events by user. Each event is stored as a tuple of
# (row label in fm_df, parsed timestamp, user id, artist name).
user_time_dict = {}
listen_columns = zip(fm_df.index, fm_df['userid'], fm_df['timestamp'], fm_df['artist-name'])
for row_label, listener, raw_time, artist in listen_columns:
    events = user_time_dict.setdefault(listener, [])
    events.append((row_label, parser.parse(raw_time), listener, artist))

user_time_dict

{'user_000002': [(27490,
   datetime.datetime(2008, 5, 10, 16, 29, 2, tzinfo=tzutc()),
   'user_000002',
   'Damien Rice'),
  (37600,
   datetime.datetime(2007, 5, 31, 8, 43, 52, tzinfo=tzutc()),
   'user_000002',
   'Die Ärzte'),
  (18603,
   datetime.datetime(2009, 2, 5, 18, 34, 4, tzinfo=tzutc()),
   'user_000002',
   'Simon & Garfunkel'),
  (52479,
   datetime.datetime(2006, 11, 25, 6, 5, 48, tzinfo=tzutc()),
   'user_000002',
   'Coheed And Cambria')],
 'user_000003': [(80255,
   datetime.datetime(2008, 1, 24, 2, 44, 31, tzinfo=tzutc()),
   'user_000003',
   'Handsome Boy Modeling School')],
 'user_000006': [(156507,
   datetime.datetime(2006, 10, 6, 14, 29, 58, tzinfo=tzutc()),
   'user_000006',
   'Bruno Pronsato')],
 'user_000008': [(191338,
   datetime.datetime(2009, 1, 16, 2, 49, 13, tzinfo=tzutc()),
   'user_000008',
   'Kanye West')],
 'user_000012': [(237833,
   datetime.datetime(2008, 12, 2, 22, 31, 37, tzinfo=tzutc()),
   'user_000012',
   'Animal Collective'),
  (266719

### Sort the list of artists that users listened to by their timestamps

In [157]:
# Order each user's events chronologically (element 1 of an event is its parsed timestamp)
user_time_dict = {
    listener: sorted(events, key=lambda event: event[1])
    for listener, events in user_time_dict.items()
}
user_time_dict

{'user_000002': [(52479,
   datetime.datetime(2006, 11, 25, 6, 5, 48, tzinfo=tzutc()),
   'user_000002',
   'Coheed And Cambria'),
  (37600,
   datetime.datetime(2007, 5, 31, 8, 43, 52, tzinfo=tzutc()),
   'user_000002',
   'Die Ärzte'),
  (27490,
   datetime.datetime(2008, 5, 10, 16, 29, 2, tzinfo=tzutc()),
   'user_000002',
   'Damien Rice'),
  (18603,
   datetime.datetime(2009, 2, 5, 18, 34, 4, tzinfo=tzutc()),
   'user_000002',
   'Simon & Garfunkel')],
 'user_000003': [(80255,
   datetime.datetime(2008, 1, 24, 2, 44, 31, tzinfo=tzutc()),
   'user_000003',
   'Handsome Boy Modeling School')],
 'user_000006': [(156507,
   datetime.datetime(2006, 10, 6, 14, 29, 58, tzinfo=tzutc()),
   'user_000006',
   'Bruno Pronsato')],
 'user_000008': [(191338,
   datetime.datetime(2009, 1, 16, 2, 49, 13, tzinfo=tzutc()),
   'user_000008',
   'Kanye West')],
 'user_000012': [(266719,
   datetime.datetime(2007, 7, 20, 19, 49, 8, tzinfo=tzutc()),
   'user_000012',
   'Interpol'),
  (252514,
   datet

### Create a mini data frame with the timestamp reduced to a non-negative integer timestep

In [158]:
# Pair each event's row label with its position in the user's chronologically
# sorted event list; that position is the event's timestep.
label_step_pairs = [
    (event[0], step)
    for events in user_time_dict.values()
    for step, event in enumerate(events)
]
indices = [label for label, step in label_step_pairs]
d = {'timestep': [step for label, step in label_step_pairs]}

time_df = pd.DataFrame(data=d, index=indices)
time_df

Unnamed: 0,timestep
2202409,0
2202245,1
10443481,0
10435806,1
10435222,2
10425085,3
10424634,4
10357363,0
10344080,1
10326204,2


In [159]:
# Attach the per-user timestep to every sampled listening event by joining on
# the row label. validate="one_to_one" makes pandas raise if either index has
# duplicate labels instead of silently duplicating rows.
fm_mdp = fm_df.merge(time_df, left_index=True, right_index=True, validate="one_to_one")

# The merge is an inner join, so a stale time_df (built from a different
# sample of fm_df) would silently drop events; fail loudly instead.
assert len(fm_mdp) == len(fm_df), "time_df does not cover every row of fm_df; re-run the timestep cells"

# Show every column but only a preview of the rows
with pd.option_context('display.max_columns', None):
    display(fm_mdp.head(10))

Unnamed: 0,userid,timestamp,artist-id,artist-name,track-id,track-name,timestep
2202409,user_000110,2008-07-20T16:20:23Z,fd85cfd1-597b-49c6-962e-ea8350b0466d,Afro Kolektyw,294f600d-3ecc-4b4d-9a52-ac3377b0767f,Gramy Dalej,0
10443481,user_000552,2007-02-02T04:26:50Z,5bae9c05-fabd-420f-b806-c5b8ba88bd7e,Rbd,a048ea50-d785-4818-8e53-c9d672f473f1,Celestial,0
10322452,user_000544,2007-05-05T08:34:04Z,74134743-106c-47a6-bb8d-5c9efde7818c,Mstrkrft,252a8358-2077-4e55-99f0-872dea2c08a9,The Looks,3
4465401,user_000233,2007-06-16T23:28:46Z,eb28b766-56db-4644-85c9-487f27df47ed,Supafly,6c97df06-5b21-4b20-9637-f2cd03636812,Moving Too Fast (Freemasons Full Vocal Mix),3
1336969,user_000067,2009-02-04T21:04:55Z,90218af4-4d58-4821-8d41-2ee295ebbe21,Kaiser Chiefs,1e8857d3-55ad-417c-8370-ff3e45bd2255,I Predict A Riot,0
5678525,user_000293,2006-05-16T09:53:44Z,fbd86487-ccb5-4a57-a860-cc3d360b5115,Starsailor,ff1ee1d4-86cc-4d5d-8f42-db124aa55aee,Way To Fall,2
17843938,user_000932,2009-01-17T13:02:47Z,5ffb73c0-b739-4231-ba22-f8d361be1d45,菅野よう子,,Forever Broke,0
14234557,user_000758,2006-02-28T01:42:08Z,e7a8e086-16df-44a7-91e4-640986bcff72,Gackt,7f8eda0d-67d9-4605-9ba8-cbddccb22ea9,Kimi Ni Aitakute,0
10062793,user_000536,2008-05-26T00:38:50Z,86c20d59-4cd7-4814-b6b7-e371c1016097,The Spinto Band,151f29bf-deff-4284-b767-a49938bb6d50,Trust Vs. Mistrust,0
469501,user_000021,2006-03-22T23:10:19Z,0f96c38b-8e21-4571-afe0-cb11196a8acd,Gossip,7c60a521-8465-4d2d-b983-1734e4d9d93d,Keeping You Alive,2


## Generate user-song matrix
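Adapted from Abe's Last.FM preprocessing notebook for the recommender system. Despite the name `user_song_matrix`, the rows are users and the columns are artists: an entry is 1 if the user listened to that artist at least once in the sample, and 0 otherwise.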

In [160]:
from sortedcontainers import SortedList

# Distinct users and artists present in the sample
unique_users = set(fm_mdp['userid'])
unique_artists = set(fm_mdp['artist-name'])

# Sorted orderings: a value's position in these lists is its row (user) or
# column (artist) in the matrix.
user_order = SortedList(sorted(unique_users))
artist_order = SortedList(sorted(unique_artists))

matrix_shape = (len(user_order), len(artist_order))
user_song_matrix = np.zeros(matrix_shape)
print(user_song_matrix.shape)

(474, 829)


In [161]:
# Keep one row per distinct (user, artist) pair
pair_columns = ['userid', 'artist-name']
user_artist_df = fm_mdp[pair_columns].drop_duplicates()
print(user_artist_df.head())

               userid    artist-name
2202409   user_000110  Afro Kolektyw
10443481  user_000552            Rbd
10322452  user_000544       Mstrkrft
4465401   user_000233        Supafly
1336969   user_000067  Kaiser Chiefs


In [162]:
# Mark every distinct (user, artist) pair with a 1 in the matrix
user_artist_pairs = zip(user_artist_df['userid'], user_artist_df['artist-name'])
for i, (user_id, artist_name) in enumerate(user_artist_pairs):
    matrix_cell = (user_order.index(user_id), artist_order.index(artist_name))
    user_song_matrix[matrix_cell] = 1
    # Progress marker, printed every 100000 pairs
    if i % 100000 == 0:
        print(i)

0


In [163]:
# Build the export frame: one row per user, a leading "userid" column, then
# one 0/1 column per artist in matrix column order.
df = pd.DataFrame(user_song_matrix.astype(int))
column_names = ["userid"] + list(artist_order)
df.insert(0, "userid", list(user_order))
df.columns = column_names

# Hand pandas the path rather than an already-open text handle: pandas then
# opens the file itself with the requested encoding and with newline
# translation disabled, which avoids doubled carriage returns on Windows.
df.to_csv('user-id-to-listened-artist-name-matrix-RL.csv', sep=',', encoding='utf-8')

## Generate RL Episodic Information

In [167]:
# One episode per user, stored at the user's position in user_order. An
# episode is the chronological list of (artist name, artist column index) pairs.
episodes = [[] for slot in range(len(user_order))]
for events in user_time_dict.values():
    for row_label, listened_at, listener, artist in events:
        episode = episodes[user_order.index(listener)]
        episode.append((artist, artist_order.index(artist)))
episodes

[[('Coheed And Cambria', 151),
  ('Die Ärzte', 182),
  ('Damien Rice', 162),
  ('Simon & Garfunkel', 616)],
 [('Handsome Boy Modeling School', 294)],
 [('Bruno Pronsato', 118)],
 [('Kanye West', 360)],
 [('Interpol', 326),
  ('Jason Collett', 334),
  ('Matthew Good Band', 433),
  ('Animal Collective', 38),
  ('Little Boots', 396)],
 [('Caesars', 124)],
 [('Matt Costa', 432), ('Azam Ali', 58)],
 [('Muse', 476), ('A Perfect Circle', 9), ('Fiona Apple', 243)],
 [('Turpentine Brothers', 779),
  ('Queens Of The Stone Age', 557),
  ('Gossip', 284),
  ('Scared Of Chaka', 595),
  ('Dillinger Four', 183),
  ('Afi', 15)],
 [('The Owls', 726),
  ('Benjamin Wetherill', 79),
  ('The Capstan Shafts', 686),
  ('Afternoon Naps', 17),
  ('Fanfarlo', 234)],
 [('Massive Attack', 429)],
 [('God Is An Astronaut', 274)],
 [('Telefon Tel Aviv', 665)],
 [('Mates Of State', 431), ('The Smashing Pumpkins', 740), ('Ween', 796)],
 [('The Pist', 729)],
 [('Dir En Grey', 184),
  ('Xavier Naidoo', 804),
  ('清春', 826

In [168]:
import pickle

# Serialize the per-user episodes to a binary file named "episodes"
with open("episodes", "wb") as episodes_out:
    pickle.dump(episodes, episodes_out)