In [8]:
import sys
import sqlite3
import time

import unicodedata

# Path of the SQLite database that the query cell below opens with sqlite3.connect().
# It is a relative path, so the database (or a sample of it) must be in the
# current working directory.
db_file = 'Sample_Song_Dataset.db'

def strip_accents(s):
   """Return `s` with all combining marks removed.

   The string is first decomposed with Unicode NFD normalization, so that an
   accented letter becomes a base letter followed by its combining mark(s);
   every character whose Unicode category is 'Mn' (nonspacing mark) is then
   dropped. Characters that have no such decomposition are kept unchanged.
   The result is left in decomposed form (it is not re-normalized).
   """
   decomposed = unicodedata.normalize('NFD', s)
   kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
   return ''.join(kept)

In [9]:
# Sample sizes. Both are arbitrary and can be changed freely.
NUM_ARTISTS = 20      # how many artists to pull from the `artists` table
ROWS_PER_ARTIST = 20  # how many joined rows to keep for each artist

# Using a sqlite3 connection as a context manager only commits or rolls back
# the transaction; it does NOT close the connection. Close it explicitly so
# the database handle is released even if a query fails.
conn = sqlite3.connect(db_file)
try:
    cursor = conn.cursor()

    # Choose the artists based on artist id.
    # Note that it does not need to be ordered at all, this was just a way to
    # get somewhat cleaner names.
    # Other specifications to consider: only taking nonzero years?
    cursor.execute(
        'SELECT DISTINCT artist_id, artist_name FROM artists '
        'ORDER BY artist_id DESC LIMIT ?',
        (NUM_ARTISTS,),
    )
    distinct_artists = cursor.fetchall()

    # Rows of (artist_name, title, year, term), accumulated across all artists.
    all_data = []

    for artist in distinct_artists:

        artist_id = (artist[0],)

        # For each artist, get artist names, track titles, years, genres.
        # The two joins are both on artist_id, so the result holds one row per
        # (track, term) combination of that artist.
        cursor.execute(
            'SELECT artist_name, title, year, term FROM artists '
            'INNER JOIN tracks ON artists.artist_id=tracks.artist_id '
            'INNER JOIN artist_term ON artists.artist_id=artist_term.artist_id '
            'WHERE artists.artist_id = ?',
            artist_id,
        )

        # Keep only the first ROWS_PER_ARTIST rows. fetchmany() stops reading
        # there instead of materializing every joined row and slicing, and
        # extend() avoids re-copying the accumulated list on every iteration.
        # NOTE(review): the query has no ORDER BY, so which rows come first is
        # up to SQLite; they may all belong to a single track.
        all_data.extend(cursor.fetchmany(ROWS_PER_ARTIST))
finally:
    conn.close()


In [10]:
# Write three tab-separated facts per row of `all_data` to songs_data.txt:
#   by_artist_song  en.artist.<artist>  en.song.<track>
#   song_year       en.song.<track>     (date <year> -1 -1)
#   artist_genre    en.artist.<artist>  en.genre.<genre>
#
# Artist and track names: strip accents, replace spaces with _, remove
# apostrophes, lowercase. Genre terms only get spaces replaced with _.
# Not sure if removing accents/apostrophes is strictly necessary, but worth testing.
# Other things to consider removing: parentheses, question marks, etc.
#
# The file is opened as UTF-8 explicitly: strip_accents only drops combining
# marks, so other non-ASCII characters can remain in names, and the platform
# default encoding may be unable to encode them.
# NOTE(review): `all_data` has one row per (track, term) combination, so the
# same by_artist_song / song_year / artist_genre line can be written more than
# once. Confirm whether the consumer of this file tolerates duplicates.
with open('songs_data.txt', 'w', encoding='utf-8') as myfile:
    for artist_name, title, year, term in all_data:

        artist = strip_accents(artist_name.replace(' ', '_').replace("'", "").lower())
        track = strip_accents(title.replace(' ', '_').replace("'", "").lower())
        genre = term.replace(' ', '_')

        myfile.write('by_artist_song\ten.artist.{}\ten.song.{}\n'.format(artist, track))
        myfile.write('song_year\ten.song.{}\t(date {} -1 -1)\n'.format(track, year))
        myfile.write('artist_genre\ten.artist.{}\ten.genre.{}\n'.format(artist, genre))
        