## Create Train and Test Split
Testing of the Spotify Recommender will be done by predicting missing songs in a playlist.

A set of 10,000 test playlists is selected, where each playlist has at least 50 tracks.

From each test playlist, 10 songs are withheld, so the test data consists of two datasets: one holding the 'given' tracks of each playlist and another holding its 'withheld' tracks.

A train dataset is created which excludes all the test playlists.

The goal of a recommender is to predict the 'withheld' tracks when the 'given' tracks are provided.

In [1]:
import os, sys
import numpy as np
import pandas as pd

from sqlalchemy import create_engine
from sqlalchemy import func #Table, Column, Integer, String, Float, MetaData, and_, or_, 

sys.path.append('../../')
from spotify_api import get_spotify_data, get_tracks, get_artists, get_audiofeatures
from spotify_database import get_session, display_time
from spotify_utils import Table_Generator, List_Generator, pickle_load, pickle_save

from tqdm import tqdm_notebook as tqdm

In [2]:
# Locations of the dataset directory and of the SQLite database file
data_path = '../../data/SpotifyDataSet'
db_path = '../../data/SpotifyDataSet/spotify_songs.db'

# Parameters of the train/test split
num_test_playlists, num_withheld_tracks, min_playlist_length = 10000, 10, 50

# Sanity check: every test playlist must be longer than the number of
# tracks that will be withheld from it.
if not num_withheld_tracks < min_playlist_length:
    print("WARNING: Unpredictable results when the number of withheld tracks exceeds the minimum playlist length.")

# Open a session on the database and create an engine for the same file
session = get_session(db_path)
engine = create_engine('sqlite:///' + db_path)

# Table classes used by the queries below.
# NOTE(review): these are read as attributes of the `get_session` function
# object itself - confirm that spotify_database attaches them there.
Playlists = get_session.Playlists
Artists = get_session.Artists
Tracks = get_session.Tracks

### Train/Test Creation Process
1. Get a list of all playlists and their length
2. Define a list of test playlist IDs, where each playlist has at least the required minimum playlist length
3. Define a list of train playlist IDs that includes all playlists except those designated as test playlists
4. Split the test playlists into 'given' and 'withheld' groups.

#### Step 1 - get list of all playlists and their length

In [3]:
def create_counts_csv(data_path, filename:str="df_counts.csv")->str:
    """
    Create a CSV file listing every playlist id in the database together
    with the number of tracks in that playlist.

    The file has two columns: 'playlist_id' (written as the index) and
    'song_count'.

    If `filename` already exists in `data_path`, a numeric suffix is added
    to the stem ("df_counts.csv" -> "df_counts_1.csv" -> "df_counts_2.csv"
    ...) until an unused name is found, so existing files are never
    overwritten. Filenames without an extension are handled as well.

    Relies on the module-level `session`, `Playlists`, `func` and
    `display_time` names.

    Parameters:
        data_path: directory in which the CSV file is written.
        filename:  desired name of the CSV file (without the path).

    Returns:
        The name of the saved file without the path, or False (after
        printing a message) when `data_path` is not an existing directory.
    """
    # see if directory exists
    if not os.path.isdir(data_path):
        print ("'{}' is not a directory.".format(data_path))
        print ("Provide a valid directory path and rerun function.")
        return False

    # see if file already exists; if so, keep increasing the numeric suffix
    # on the stem until the name is free
    stem, extension = os.path.splitext(filename)
    suffix = 0
    while os.path.isfile(os.path.join(data_path, filename)):
        suffix += 1
        filename = "{}_{}{}".format(stem, suffix, extension)

    print("Creating counts csv file: {}".format(filename))

    # Count the number of tracks in every playlist (no length filter is
    # applied here; filtering by minimum length happens on the saved counts).
    # The bound `.all` method is handed to display_time uncalled;
    # presumably display_time invokes it, reports the elapsed time and
    # returns the query result - verify against spotify_database.
    count_query = session.query(
        Playlists.playlist_id,
        func.count(Playlists.track_name).label('count'),
    ).group_by(Playlists.playlist_id)
    songs_per_playlist = display_time(count_query.all)

    # Cast to dataframe, indexed by playlist id
    df_counts = pd.DataFrame(data=songs_per_playlist,
                             columns=["playlist_id","song_count"]).set_index('playlist_id', drop=True)

    output_path = os.path.join(data_path, filename)
    df_counts.to_csv(output_path)

    print("Saved csv file: {}".format(output_path))

    return filename
    

In [4]:
# To regenerate the counts file from the database, uncomment:
# filename = create_counts_csv(data_path, filename="df_counts.csv")

# Load the per-playlist track counts from the previously saved CSV file
counts_csv_path = os.path.join(data_path, "df_counts.csv")
df_counts = pd.read_csv(counts_csv_path)



#### Step 2 and 3 - define train and test playlists

In [5]:
# Fix the global numpy seed so the same test playlists are drawn on every run
np.random.seed(1)

# ids of the playlists that have at least `min_playlist_length` songs;
# only these are eligible to become test playlists
playlistids_gt_minlen = df_counts[df_counts.song_count>=min_playlist_length].playlist_id.values

# draw the test playlists without replacement (np.random.choice raises
# ValueError if fewer than `num_test_playlists` playlists are eligible)
test_playlistids  = np.random.choice(playlistids_gt_minlen, num_test_playlists, replace=False)

# the train playlists are all playlists EXCEPT the test playlists, so that
# no test playlist leaks into the training data
train_playlistids = list(set(df_counts.playlist_id.values) - set(test_playlistids))

In [6]:

# Columns of the Playlists table that are pulled into the dataframe
playlist_columns = ["playlist_id","track_uri","track_name","artist_uri","artist_name"]

# Fetch those columns for every row of the Playlists table (about 30 minutes)
playlist_query = session.query(*[getattr(Playlists, column) for column in playlist_columns])
playlist_db = display_time(playlist_query.all)

# Turn the fetched rows into a dataframe (about 10 minutes)
df_playlists = pd.DataFrame(data=playlist_db, columns=playlist_columns)

# Persist the dataframe as CSV (about 5 minutes, roughly an 8gb file)
playlists_csv_path = os.path.join(data_path, "df_playlists.csv")
df_playlists.to_csv(playlists_csv_path)

Time to Execute: 1654.25 seconds


In [7]:
# Row masks: does a row's playlist_id appear in the train / test id lists?
in_train_ids = df_playlists["playlist_id"].isin(train_playlistids)
in_test_ids = df_playlists["playlist_id"].isin(test_playlistids)

# Train dataframe: rows whose playlist is listed in train_playlistids
df_playlists_train = df_playlists[in_train_ids]

# Test dataframe: rows whose playlist is listed in test_playlistids
df_playlists_test = df_playlists[in_test_ids]

#### Step 4 - split test set into 'given' and 'withheld' datasets

In [8]:
# Break the test dataframe into 'given' and 'withheld' groups.
#
# For every test playlist, `num_withheld_tracks` randomly chosen rows go to
# the 'withheld' dataframe and the remaining rows go to the 'given' dataframe.
#
# The per-playlist pieces are collected in lists and concatenated once at
# the end: growing a dataframe with DataFrame.append inside the loop copies
# the whole frame on every iteration, and DataFrame.append no longer exists
# in pandas >= 2.0. Grouping once also avoids re-scanning the full test
# dataframe for every playlist.

# A single RandomState shared across all playlists keeps the split reproducible
seed = np.random.RandomState(seed=2)

test_playlist_ids = np.unique(df_playlists_test.playlist_id.values)

withheld_parts = []
given_parts = []

# sort=True visits playlists in ascending id order, the same order as
# np.unique, so the random draws are consumed in the same sequence as before
playlists_by_id = df_playlists_test.groupby("playlist_id", sort=True)
for playlist_id, plist in tqdm(playlists_by_id,
                               total=len(test_playlist_ids),
                               desc="split test into given and withheld"):
    # rows to hide from the recommender
    withheld_tracks = plist.sample(num_withheld_tracks, random_state=seed, replace=False, axis=0)
    # everything that was not sampled stays in the 'given' part
    given_tracks = plist.drop(index=withheld_tracks.index)

    withheld_parts.append(withheld_tracks)
    given_parts.append(given_tracks)

# pd.concat raises on an empty list, so fall back to an empty frame that
# keeps the column layout when there are no test playlists
df_playlists_test_withheld = pd.concat(withheld_parts) if withheld_parts else df_playlists_test.iloc[:0]
df_playlists_test_given = pd.concat(given_parts) if given_parts else df_playlists_test.iloc[:0]

HBox(children=(IntProgress(value=0, description='split test into given and withheld', max=10000, style=Progres…




In [9]:
# Write the three datasets to CSV files in `data_path`; the dataframe index
# is stored in a column labelled 'index'.
for frame, csv_name in (
    (df_playlists_test_withheld, "df_playlists_test_withheld.csv"),
    (df_playlists_test_given, "df_playlists_test_given.csv"),
    (df_playlists_train, "df_playlists_train.csv"),
):
    frame.to_csv(os.path.join(data_path, csv_name), index_label='index')

## To read in saved Train and Test Datasets

In [10]:
# Load the saved datasets back from `data_path`, using the stored 'index'
# column as the dataframe index. Files are read in the order listed.
df_playlists_test_withheld, df_playlists_test_given, df_playlists_train = (
    pd.read_csv(os.path.join(data_path, csv_name), index_col='index')
    for csv_name in (
        "df_playlists_test_withheld.csv",
        "df_playlists_test_given.csv",
        "df_playlists_train.csv",
    )
)

  mask |= (ar1 == a)
