In [None]:
!pip install requests_html
from requests_html import AsyncHTMLSession
from bs4 import BeautifulSoup
import pandas as pd
import asyncio

### Start asynchronous HTML session

In [None]:
asession = AsyncHTMLSession() # Shared asynchronous HTML session; fetch_with_new_session() sends every request through it

### Fetch a webpage using new session

In [None]:
"""
Asynchronous function is used to fetch a webpage using new session with retry logic
Parameters:
    - url
    - retries (Number of attempts to try in case of errors; default is 3)
"""
async def fetch_with_new_session(url, retries=3):
    """Fetch *url*, render its JavaScript and return the resulting HTML.

    Despite the name, every call goes through the module-level ``asession``;
    no new session is created here.

    Parameters:
        - url: address of the page to download and render
        - retries: maximum number of attempts (default 3); must be at least 1

    Returns:
        The rendered page HTML (``response.html.html``).

    Raises:
        ValueError: if ``retries`` is smaller than 1.
        Exception: whatever the final attempt raised, once every attempt
            has failed.
    """
    if retries < 1:
        # With no attempts the loop body would never run and the caller
        # would silently receive None instead of page content.
        raise ValueError(f"retries must be at least 1, got {retries}")

    for attempt in range(retries):
        try:
            response = await asession.get(url)
            # Raised from the library default of 8.
            # NOTE(review): requests_html documents this timeout in seconds,
            # so 30000 looks like it was meant as milliseconds - confirm the
            # intended value.
            await response.html.arender(timeout=30000)
            return response.html.html
        except Exception as e:
            print(f"attempt {attempt + 1} failed {e}")
            if attempt < retries - 1:
                # Short pause before the next attempt.
                await asyncio.sleep(2)
            else:
                print(f"all attempts failes for {url}")
                # Bare raise keeps the traceback of the failing attempt.
                raise
    # The shared session is intentionally not closed here: later fetches
    # reuse it.

### Get page URL

In [None]:
"""
Asynchronous function to get the rendered HTML of the page at the given url
"""
async def get_page(url):
    """Return the rendered HTML of *url*, fetched through fetch_with_new_session() with its default retry count."""
    rendered_html = await fetch_with_new_session(url)
    return rendered_html

### Get all albums from artist

In [None]:
"""
Asynchronous function to extract all albums from an artist's discography page.
  This function:
  1. Retrieves the page content using an asynchronous HTTP request
  2. Parses HTML content to identify the first <table> tag, which contains albums data
  3. Finds all <td> tags with class 'meta', where album links are stored
  4. Iterates through the anchor <a> tags to find href attributes
  5. Stores external (http) links into a set to avoid duplicates, then converts the set to a list
  6. Returns a list of all albums
"""
async def get_album(url):
    """Collect the album links listed on an artist's discography page.

    Parameters:
        - url: address of the artist's discography page

    Returns:
        A list of unique absolute (http/https) links found in the
        <td class="meta"> cells of the first <table>, in the order they first
        appear on the page. The list is empty when the page has no <table>.
    """
    content = await get_page(url)
    soup = BeautifulSoup(content, 'html.parser')

    # The album listing is looked up in the first <table> of the page.
    table = soup.find('table')
    if table is None:
        return []

    # Dict keys de-duplicate the links while keeping page order.
    albums = {}
    for td in table.find_all('td', class_='meta'):
        for a in td.find_all('a'):
            href = a.get('href')
            # Anchors without an href yield None. Protocol-relative ('//...')
            # and site-relative links do not start with 'http', so they are
            # skipped by the same test.
            if href and href.startswith('http'):
                albums[href] = None

    return list(albums)

### Get all songs from album

In [None]:
"""
Asynchronous function to extract all songs from album link from a list of albums.
  This function:
  1. Retrieves the page content using an asynchronous HTTP request
  2. Parses HTML content to identify all <div> tags with class 'disc', which contains songs data
  3. Finds all <div> tags with class 'title', where song titles are stored
  4. Iterates through the title <div> tags to find href attributes
  5. Stores external (http) links into a set to avoid duplicates, then converts the set to a list
  6. Returns a list of all song URLs
"""
async def get_all_songs(album):
    """Collect the song links from the track listing of an album page.

    Parameters:
        - album: URL of the album page

    Returns:
        A list of unique links containing 'allmusic.com/song' that sit inside
        a <div class="title"> within a <div class="disc">, in the order they
        first appear on the page.
    """
    content = await get_page(album)
    soup = BeautifulSoup(content, 'html.parser')

    # Dict keys de-duplicate the links while keeping page order.
    song_links = {}
    for div_disc in soup.find_all('div', class_='disc'):
        for title_div in div_disc.find_all('div', class_='title'):
            # Search only inside the current title <div>. Scanning the whole
            # document here would repeat the same work for every title and
            # also pick up song links outside the track listing.
            for a in title_div.find_all('a'):
                href = a.get('href')
                # Anchors without an href yield None and are skipped.
                if href and 'allmusic.com/song' in href:
                    song_links[href] = None

    return list(song_links)

### Get all track information

In [None]:
"""
Asynchronous function to extract information about the track as a dictionary: track name, composer, genres, styles, moods and themes.
  This function:
  1. Retrieves the page content using an asynchronous HTTP request
  2. Parses HTML content to identify all <div> tags and <h1> tag, which contain songs and title data
  3. Stores results in lists: composers, genres, styles, moods and themes
  4. Return a dictionary containing all the extracted information
"""
async def get_track_info(song):
    """Scrape one song page and return its metadata as a dictionary.

    Parameters:
        - song: URL of the song page

    Returns:
        A dict with the keys 'track_name', 'composers', 'genres', 'styles',
        'moods' and 'themes'.
        - 'track_name' is the text of the first <h1>, or None when the page
          has none.
        - 'composers', 'genres' and 'styles' are comma-separated strings, or
          None when their <div> is missing.
        - 'moods' and 'themes' are comma-separated strings, empty when their
          grid <div> is missing.
    """
    content = await get_page(song)
    soup = BeautifulSoup(content, 'html.parser')

    # A page without <h1> yields None instead of raising AttributeError,
    # matching how get_album_info treats a missing album title.
    track_name_tag = soup.find('h1')
    track_name = track_name_tag.get_text(strip=True) if track_name_tag is not None else None

    composers = _track_link_texts(soup.find('div', class_='composer'))
    genres = _track_link_texts(soup.find('div', class_='genre'))
    styles = _track_link_texts(soup.find('div', class_='styles'))

    moods = _track_grid_labels(soup.find('div', id='moodsGrid'))
    themes = _track_grid_labels(soup.find('div', id='themesGrid'))

    return {
        'track_name': track_name,
        'composers': composers,
        'genres': genres,
        'styles': styles,
        'moods': ', '.join(moods),
        'themes': ', '.join(themes),
    }


def _track_link_texts(tag):
    """Return the text of every <a> under *tag* joined by ', ', or None when *tag* is None."""
    if tag is None:
        return None
    return ', '.join(a.get_text(strip=True) for a in tag.find_all('a'))


def _track_grid_labels(tag):
    """Return the text before the first '(' of every <a> under *tag*, or [] when *tag* is None."""
    if tag is None:
        return []
    return [a.get_text(strip=True).split('(')[0].strip() for a in tag.find_all('a')]

### Get all album information

In [None]:
"""
Asynchronous function to extract information about albums: album name, user rating, release date, album duration, genres, styles, recording location, recording date, moods and themes
 This function:
  1. Retrieves the page content using an asynchronous HTTP request
  2. Parses HTML content to identify various types of html tags, which contain album data
  3. Stores results in lists: rating, date, duration, album_genres, album_styles, location, recording_period, album_moods, album_themes
  4. Return a dictionary containing all the extracted information
"""
async def get_album_info(album):
    """Scrape one album page and return its metadata as a dictionary.

    Parameters:
        - album: URL of the album page

    Returns:
        A dict with the keys 'album_name', 'user_rating',
        'album_release_date', 'album_duration', 'album_genre',
        'album_styles', 'album_recording_location', 'album-recording-date',
        'album_moods' and 'album_themes'.
        - 'user_rating' is an int, 0 when no rating could be found.
        - 'album_moods' and 'album_themes' are lists of strings.
        - Every other value is a string, or None when its element is missing.
        The hyphenated key 'album-recording-date' is read under that exact
        name by create_dataframe().
    """
    content = await get_page(album)
    soup = BeautifulSoup(content, 'html.parser')

    album_name = soup.find('h1')

    # The rating is encoded in a CSS class such as 'ratingAverage07' on the
    # 'averageUserRating' <div>.
    user_rating = soup.find('div', class_=lambda value: value and value.startswith('averageUserRating ratingAverage'))
    # soup.find returns None when the page has no rating <div>; treat that
    # the same as a <div> without a rating class instead of raising TypeError.
    rating_classes = user_rating['class'] if user_rating is not None else []
    rating_class = next((cls for cls in rating_classes if cls.startswith('ratingAverage')), None)
    if rating_class:
        # Strip the prefix and leading zeros; an empty remainder counts as 0.
        rating = int(rating_class.replace('ratingAverage', '').lstrip('0') or '0')
    else:
        rating = 0

    release_date = soup.find('div', class_='release-date')
    date = release_date.find('span') if release_date else None

    album_duration = soup.find('div', class_='duration')
    duration = album_duration.find('span') if album_duration else None

    genres_tag = soup.find('div', class_='genre')
    album_genres = ', '.join([a.get_text(strip=True) for a in genres_tag.find_all('a')]) if genres_tag else None

    styles_tag = soup.find('div', class_='styles')
    album_styles = ', '.join([a.get_text(strip=True) for a in styles_tag.find_all('a')]) if styles_tag else None

    recording_location = soup.find('div', class_='recording-location')
    location = recording_location.find('div') if recording_location else None

    recording_date = soup.find('div', class_='recording-date')
    recording_period = recording_date.find('div') if recording_date else None

    # Mood and theme labels keep only the text before the first '('.
    moods_tag = soup.find('div', id='moodsGrid')
    moods_links = moods_tag.find_all('a') if moods_tag else []
    album_moods = [link.get_text(strip=True).split('(')[0].strip() for link in moods_links]

    themes_tag = soup.find('div', id='themesGrid')
    themes_links = themes_tag.find_all('a') if themes_tag else []
    album_themes = [link.get_text(strip=True).split('(')[0].strip() for link in themes_links]

    return {
        'album_name': album_name.text if album_name else None,
        'user_rating': rating,
        'album_release_date': date.text if date else None,
        'album_duration': duration.text if duration else None,
        'album_genre': album_genres,
        'album_styles': album_styles,
        'album_recording_location': location.text if location else None,
        'album-recording-date': recording_period.text if recording_period else None,
        'album_moods': album_moods,
        'album_themes': album_themes,
    }

In [None]:
# Try get_album_info() on a single album page and print the returned dictionary.
album_info = await get_album_info('https://www.allmusic.com/album/long-may-you-run-mw0000197512#moodsThemes')
print(album_info)

### Get all information about songs and albums

In [None]:
"""
Asynchronous function to extract information about all songs from given album
 This function:
  1. Iterates through all_albums, which contains discography URLs of the given artists
  2. Fetches detailed information about the album by calling the function get_album_info().
  3. Fetches all songs from the given album by calling the function get_all_songs().
  4. Iterates through songs, and fetches detailed information about the songs by calling the function get_track_info().
  5. Stores results in a dictionary album_data, and then appends it to the list albums_data for each album
  6. Return a list albums_data containing all the extracted information
"""
async def get_artist_data(url):
    """Scrape every album of an artist together with the tracks of each album.

    Parameters:
        - url: address of the artist's discography page

    Returns:
        A list with one dict per album: 'album' holds the result of
        get_album_info() and 'songs' holds the list of get_track_info()
        results for that album.
    """
    album_urls = await get_album(url)
    print('all albums')
    print(album_urls)

    collected = []
    for album_url in album_urls:
        print('current album url ' + album_url)

        # The '#moodsThemes' anchor is appended with the intent of getting
        # the moods/themes section of the page loaded.
        album_details = await get_album_info(album_url + '#moodsThemes')
        song_urls = await get_all_songs(album_url)

        # Song pages are fetched one after another, in list order.
        track_details = [
            await get_track_info(song_url + '#moodsThemes')
            for song_url in song_urls
        ]

        collected.append({'album': album_details, 'songs': track_details})

    return collected


In [None]:
# One get_artist_data() call per artist discography; only the Stills-Young Band call is active.
# NOTE(review): presumably the other artists were scraped in earlier runs by enabling their line - confirm.
# albums_data = await get_artist_data('https://www.allmusic.com/artist/crosby-stills-nash-young-mn0000130036#discography')
# albums_data_david_crosby = await get_artist_data('https://www.allmusic.com/artist/david-crosby-mn0000644880#discography')
# albums_data_stephen_stills = await get_artist_data('https://www.allmusic.com/artist/stephen-stills-mn0000021744#discography')
# albums_graham_nash = await get_artist_data('https://www.allmusic.com/artist/graham-nash-mn0000153590#discography')
# albums_neil_young = await get_artist_data('https://www.allmusic.com/artist/neil-young-mn0000379125#discography')
# albums_crosby_nash = await get_artist_data('https://www.allmusic.com/artist/crosby-nash-mn0000846357#discography')
# albums_crosby_stills_nash = await get_artist_data('https://www.allmusic.com/artist/crosby-stills-nash-mn0000131581#discography')
albums_stills_young_band = await get_artist_data('https://www.allmusic.com/artist/stills-young-band-mn0002151611#discography')

all albums
['https://www.allmusic.com/album/long-may-you-run-mw0000197512']
current album url https://www.allmusic.com/album/long-may-you-run-mw0000197512


### Create DataFrame

In [None]:
"""
Function to create a DataFrame from the list of album data.
  This function:
  1. Iterates through albums_data, and extracts album details
  2. Iterates through songs in the current album's song list
  3. Creates dictionary 'row' which contains information about a track and its corresponding album
  4. Stores the results into a list data
  5. Creates DataFrame from the list, and returns the constructed DataFrame
"""
def create_dataframe(albums_data):
    """Flatten scraped album data into a DataFrame with one row per track.

    Parameters:
        - albums_data: list of dicts, each with an 'album' dict and a 'songs'
          list, as returned by get_artist_data()

    Returns:
        A pandas DataFrame in which every row combines the fields of one
        track with the fields of its album. Albums without songs produce
        no rows.
    """
    # (output column, key in the album dict) for values copied unchanged.
    album_fields = (
        ('album_name', 'album_name'),
        ('user_rating', 'user_rating'),
        ('album_release_date', 'album_release_date'),
        ('album_duration', 'album_duration'),
        ('album_genre', 'album_genre'),
        ('album_styles', 'album_styles'),
        ('album_recording_location', 'album_recording_location'),
        ('album_recording_period', 'album-recording-date'),
    )
    # Album fields stored as lists; they are flattened to comma-separated text.
    joined_fields = ('album_moods', 'album_themes')
    # Track fields copied unchanged under the same name.
    track_fields = ('track_name', 'composers', 'genres', 'styles', 'moods', 'themes')

    records = []
    for entry in albums_data:
        album = entry['album']
        for track in entry['songs']:
            record = {column: album[key] for column, key in album_fields}
            record.update((column, ', '.join(album[column])) for column in joined_fields)
            record.update((column, track[column]) for column in track_fields)
            records.append(record)

    return pd.DataFrame(records)

In [None]:

# Build one DataFrame per scraped artist; only the Stills-Young Band one is active.
# df_csny = create_dataframe(albums_data)
# df_david_crosby = create_dataframe(albums_data_david_crosby)
# df_stephen_stills = create_dataframe(albums_data_stephen_stills)
# df_graham_nash = create_dataframe(albums_graham_nash)
# df_neil_young = create_dataframe(albums_neil_young)
# df_crosby_nash = create_dataframe(albums_crosby_nash)
# df_crosby_stills_nash = create_dataframe(albums_crosby_stills_nash)
df_stills_young_band = create_dataframe(albums_stills_young_band)

### Save DataFrame as .csv file

In [None]:
"""
Function to save a DataFrame to a CSV file
"""
def save_as_csv(df, filename):
    """Write *df* to *filename* in CSV format, leaving out the index column.

    Parameters:
        - df: DataFrame to write
        - filename: path of the CSV file to create or overwrite
    """
    df.to_csv(path_or_buf=filename, index=False)

In [None]:

# Write each artist's DataFrame to its own CSV file; only the Stills-Young Band call is active.
# save_as_csv(df_david_crosby, 'allmusic/allmusic-david-crosby.csv')
# save_as_csv(df_stephen_stills, 'allmusic/allmusic-stephen-stills.csv')
# save_as_csv(df_graham_nash, 'allmusic/allmusic-graham-nash.csv')
# save_as_csv(df_neil_young, 'allmusic/allmusic-neil-young.csv')
# save_as_csv(df_crosby_stills_nash, 'allmusic/allmusic-crosby-stills-nash.csv')
# NOTE(review): this file goes to the working directory, while the other paths use the 'allmusic/' folder - confirm that is intended.
save_as_csv(df_stills_young_band, 'allmusic-stills-young-band.csv')


### Combine all datasets to allmusic.csv

In [None]:
"""
Combine all datasets to allmusic.csv
"""
# Load the per-artist CSV files from the 'allmusic/' folder.
df_david_crosby = pd.read_csv('allmusic/allmusic-david-crosby.csv')
df_stephen_stills = pd.read_csv('allmusic/allmusic-stephen-stills.csv')
df_nash_graham = pd.read_csv('allmusic/allmusic-graham-nash.csv')
df_neil_young = pd.read_csv('allmusic/allmusic-neil-young.csv')
df_crosby_nash = pd.read_csv('allmusic/allmusic-crosby-nash.csv')
df_crosby_stills_nash = pd.read_csv('allmusic/allmusic-crosby-stills-nash.csv')
df_csny = pd.read_csv('allmusic/allmusic-csny.csv')
# NOTE(review): no Stills-Young Band file is loaded here - confirm whether it belongs in the combined dataset.

# Stack all frames row-wise. Each frame keeps its own index, so index labels repeat in the printed output.
df_allmusic = pd.concat([df_david_crosby, df_stephen_stills, df_nash_graham, df_neil_young, df_crosby_nash, df_crosby_stills_nash, df_csny])
print(df_allmusic)

# save_as_csv() writes with index=False, so the repeated index labels do not reach the file.
save_as_csv(df_allmusic, 'allmusic/allmusic.csv')
