In [None]:
# putting info into csv file (song title,song artist,mood1,mood2,...)

# for every file in millionsongsubset, we need to get the songs
# for every song, we need to webscrape allmusic to find the moods

# can download a subset by using wget and a url from the million song database website

In [2]:
# imports
import tarfile
import os
import hdf5_getters
import numpy as np
import csv
import allmusic_scraping
from bs4 import BeautifulSoup
import requests
from pyquery import PyQuery as pq
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from seleniumbase import Driver
from selenium.common.exceptions import TimeoutException



In [2]:
# extract msd data from zip file
def extract_tar_gz(file_path, extract_path):
    # Open the tar.gz file
    with tarfile.open(file_path, 'r:gz') as tar:
        # Extract all files
        tar.extractall(path=extract_path)

tar_gz_file = 'millionsongsubset.tar.gz'
extract_directory = 'millionsongsubset'

# extract directory exists
os.makedirs(extract_directory, exist_ok=True)

# Extract the tar.gz file
extract_tar_gz(tar_gz_file, extract_directory)

# after extracting the file, flatten the directories with this terminal call:
# $ find coms4995-appliedcv/millionsongsubset/MillionSongSubset -type f -exec mv -i '{}' ./coms4995-appliedcv/millionsongsubset \;

# now remove the folder with the redundant files
# $ cd coms4995-appliedcv/millionsongsubset/
# $ rm -r MillionSongSubset/

In [3]:
# write to a csv file (song title,song artist,mood1,mood2,...)
def read_moods(file_path):
    with open(file_path, 'r') as file:
        moods = []
        moods_dict = {}
        count = 0
        for line in file:
            m = line.strip().lower()
            moods.append(m)
            moods_dict[m] = count
            count += 1
        return moods, moods_dict
        #return [line.strip() for line in file]

# moods.txt from https://github.com/fdlm/listening-moods/blob/master/data/moods.txt
moods_file = 'moods.txt'

# Read moods from file
moods, moods_dict = read_moods(moods_file)

# Headers contain song title, song artist, and moods
headers = ['song_title', 'song_artist'] + moods

In [4]:
seen_songs = set()


In [5]:
data = []

In [10]:
# Directory containing the .h5 files
directory = './millionsongsubset'

num_moods = len(moods)
count = 0

# List all files in the directory
for filename in os.listdir(directory):
    # Check if the file is an .h5 file
    if filename.endswith('.h5'):
        # Process the .h5 file
        file_path = os.path.join(directory, filename)

        # process the .h5 file
        h5 = hdf5_getters.open_h5_file_read(file_path)
        num_songs = hdf5_getters.get_num_songs(h5)
        for i in range(num_songs):
            count += 1
            if count % 100 == 0:
                print("finished songs:", count)
            song_data = []
            song_title = hdf5_getters.get_title(h5,i)
            song_title = song_title.decode('utf-8').lower()
            
            try:
                parentheses = song_title.index('(')
            except ValueError:
                parentheses = -1
            
            if parentheses != -1:
                song_title = song_title[:parentheses]
                
            song_artist = hdf5_getters.get_artist_name(h5,i)
            song_artist = song_artist.decode('utf-8').lower()
            
            song_data.append(song_title)
            song_data.append(song_artist)
            
            if (song_title,song_artist) in seen_songs:
                continue
            seen_songs.add((song_title,song_artist))
            
            
            song_data += [0]*num_moods
            
            # song_search_matching
            # https://github.com/jack-arms/allmusic-python/blob/master/allmusic.py
            # query: chart_song.title + ' ' + main_artist

            allmusic_song = allmusic_scraping.song_search(song_title + ' ' + song_artist, 3)

            # find mood for the song
            # check if there was an error during the search
            if isinstance(allmusic_song, dict) and 'error' in allmusic_song:
                print("Error")
                continue # throw error?

            # check if any search results were found
            if not allmusic_song:
                print("No search results found for the given query.")
                continue # throw error?

            top_result = allmusic_song[0]

            # get html url
            if top_result is not None:
                html_page_url = top_result['title']['url'] + "#moodsThemes"

            # fetch html
            if html_page_url:
                print(html_page_url)
                html_page = requests.get(html_page_url)
                html_page = requests.get(html_page_url, headers={
                    'Host': 'www.allmusic.com',
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
                    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                    'Accept-Encoding': 'gzip, deflate, sdch',
                    'Accept-Language': 'en-US,en;q=0.8'
                })
                driver = Driver(browser="chrome", headless=False)
                
                try:
                    driver.get(html_page_url)
                except Exception:
                    continue
                
                try:
                    try:
                        scraped_moods = WebDriverWait(driver, 2).until(
                            EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'div#moodsGrid > div > a'))
                        )
                    except TimeoutException:
                        print("didn't find moods")
                        continue
                    
                    mood_words = [m.text[:m.text.index(' ')].lower() for m in scraped_moods]
                    print(mood_words)
                    # go through mood_words and one hot encode it in data
                    for mood in mood_words:
                        if mood in moods_dict:
                            song_data[moods_dict[mood]+2] = 1
                    
                finally:
                    driver.quit()

            else:
                print("No URL found for the top search result.")
            
            data.append(song_data)
            
        h5.close()
        

finished songs: 100
finished songs: 200
finished songs: 300
finished songs: 400
finished songs: 500
finished songs: 600
finished songs: 700
finished songs: 800
finished songs: 900
finished songs: 1000
finished songs: 1100
inside scraper old friends simon & garfunkel
https://www.allmusic.com/song/old-friends-mt0032283994#moodsThemes
['autumnal', 'bittersweet', 'delicate', 'literate', 'poignant', 'restrained', 'melancholy', 'earnest', 'searching', 'wistful', 'yearning']
inside scraper alli mia fora peggy zina
https://www.allmusic.com/song/alli-mia-fora-mt0029357269#moodsThemes
didn't find moods
inside scraper almost genuine 9th wonder featuring defcon_ phonte
No search results found for the given query.
inside scraper primordial breath eluveitie
https://www.allmusic.com/song/primordial-breath-mt0033067898#moodsThemes
['aggressive', 'lively']
inside scraper enclosed_ one broken heart eddy arnold
https://www.allmusic.com/song/one-broken-heart-for-sale-mt0000961927#moodsThemes
['brooding']


TypeError: unsupported operand type(s) for +: 'NoneType' and 'str'

In [8]:
def write_csv(file_path, data, headers):
    with open(file_path, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(headers)
        for row in data:
            writer.writerow(row)

In [7]:
len(data)

236

In [9]:
song_csv = 'song_moods.csv'

# Write to CSV
write_csv(song_csv, data, headers)

In [None]:
# go through all the files in /millionsongsubset
# for each file, find the songs in allmusic.com (id="moodsGrid")

In [None]:
# for each song in msd, webscrape allmusic for moods

In [None]:
# find spotify id for song

In [None]:
# store in dictionary {mood: [spotify id]}