In [1]:
"""  
Dataset generator module.  
Includes functions for web scraping artists discographies from Discogs.com  
"""  
import math  
import time  
import re  
from contextlib import closing  
from requests import get  
from bs4 import BeautifulSoup as bs  

In [10]:
class Release:  
    """  
    Release class.  
    Scrapes discogs and stores data about release.  
    """  
    def __init__(self, url):  
        """  
        Gets data and stores in variables.  
        Args:  
        url (str): url to release  
        """  
        if url:  
        # scrape release data from discogs  
            data = scrape_release(url)  
            self.url = url  
            self.discogs_release_id = get_id(url)  
            self.title = data["title"]  
            self.label = data["label"]  
            self.year = data["year"]  
            self.country = data["country"]  
            self.format_all = data["format_all"]  
            self.issue_type = data["issue_type"]  
            self.limited = data["limited"]  
            self.picture_disc = data["picture_disc"]  
            self.box_set = data["box_set"]  
            self.numbered = data["numbered"]  
            self.test_pressing = data["test_pressing"]  
            self.promo = data["promo"]  
            self.colored = data["colored"]  
            self.price = data["price"]  
        else:  
        # leave attributes blank until later  
            self.url = None  
            self.discogs_release_id = None  
            self.title = None  
            self.label = None  
            self.year = None  
            self.country = None  
            self.format_all = None  
            self.issue_type = None  
            self.limited = None  
            self.picture_disc = None  
            self.box_set = None  
            self.numbered = None  
            self.test_pressing = None  
            self.promo = None  
            self.colored = None
            self.price = None  
   
        # gets set after init  
        self.artist = None  
        self.discogs_master_id = 0  
        self.format_type = None  
        self.versions = None  
        self.vinyl_exclusive = None  
        self.chronology = None  

In [13]:
def get_links(artist_url):  
     """  
     Gets links for all releases by an artist from Discogs.  
     Args:  
     artist_url (str): url for artists page, the part after 'arists/' in                      
    url  
     Returns:  
     list: list of links  
     """  
     # create base URLs for discogs pages  
     url = "https://www.discogs.com/artist/" + artist_url + "?sort=year%2Casc&limit=500&subtype=_format&" + "filter_anv=0&type=Releases&page="

     url_base = url[:23]  

     links = dict() # to store links to all items  
     
     # collect links for Albums and Singles  
     for frmt in ["albums", "singles"]:  
        # create correct word for use in url  
        frmt_text = "Albums" if frmt == "albums" else "Singles-EPs"  
        
        # get first page of artists discography for this category  
        pages = []  
        pages.append(get_soup(url.replace("_format", frmt_text) + "1"))  
        
        # continue if no results was found for category  
        if not results_found(pages[0]):  
            links[frmt] = None  
            continue  
        
        # find out how many results were found  
        soup = pages[0]  
        items = soup.find("strong", {"class": "pagination_total"})  
        items = int(items.get_text().strip().split()[-1].replace(",", ""))  
        
        # check total amount of pages  
        page_count = items/500  
        page_count = items/500 if not page_count%500 else math.ceil(items/500)  
        
        # download rest of pages if more than one  
        count = 1  
        while count < page_count:  
            pages.append(get_soup(url.replace("_format", frmt_text) +    
           str(count+1)))  
            count += 1  
        
        # create variables to store links  
        masters = list()  
        uniques = list()

        # loop through each page and save in correct list  
        for soup in pages:  
            # find all cards  
            cards = soup.findAll("tr", {"class": "card"})  
        
            # loop through cards and collect links  
            for card in cards:  
                # get link href and put in right list  
                link = card.find("a").get("href")  
                slash = link.rfind("/")  
                link_type = link[slash-7:slash]  
                
                if link_type == "/master":  
                    # for masters, make a dict entry storing a link and list  
                    # and bool (for storing vinyl exclusivity variable)  
                   masters.append([link, list(), 1])  
                
                elif link_type == "release":  
                # do not add if it is not a vinyl  
                    if is_valid_release(card):  
                       uniques.append(link)  
        
        # save lists in dicts for right formats  
        links[frmt] = dict()  
        links[frmt]["uniques"] = uniques  
        links[frmt]["masters"] = masters  

        # loop through masters and get links to releases  
        for frmt in ["albums", "singles"]:  
             # continue only if results were found  
            if not links[frmt]:  
                continue  
         
            for master in links[frmt]["masters"]:  
                # download master page and create bs object  
                soup = get_soup(url_base + master[0])  
                # get cards  
                cards = soup.findAll("tr", {"class": "card"})  
                # loop through cards  
                for card in cards:  
                    # set master[2] to False if non-vinyl item found among items  
                    if master[2] == 1:  
                        if not is_vinyl(card) and is_official(card):  
                            master[2] = 0  
                    # save links for valid items  
                    if is_valid_release(card):  
                        # get title td  
                        title_td = card.find("td", {"class": "title"})  
                        # get link that includes '/release/'  
                        title_link = title_td.select("a[href*=release]")[-1]  
                        master[1].append(title_link.get("href"))  
        return links 

In [14]:
def get_releases(artist_name, artist_url, to_csv=False):  
    """  
    Get Release objects for all releases by artist from Discogs.  
    Args:  
    artist_name (str): name of artist  
    artist_url (str): url for artists page, the part after 'arists/' in url  
    to_csv (bool): write results to file if True  
    Returns:  
    list: Release objects
    """

    links = get_links(artist_url)  
    url_base = "https://www.discogs.com"  
    
    # create list for all Release objects  
    releases = list()  
    # scrape release pages for data and create objects  
    for frmt in ["albums", "singles"]:  
        # continue only if results were found  
        if links[frmt]:  
            # unique releases  
            for link in links[frmt]["uniques"]:  
                # init object with basic data from discogs release page  
                release = Release(url_base + link)  
                # add extra data  
                release.artist = artist_name  
                release.format_type = frmt[:-1]  
                release.vinyl_exclusive = 1  
                release.issue_type = 1  
                release.versions = 0  
                releases.append(release)  
            # master releases  
            for mstr in links[frmt]["masters"]:  
                for link in mstr[1]:  
                    # init object with basic data from discogs release page  
                    release = Release(url_base + link)  
                    # add extra data  
                    release.artist = artist_name  
                    release.format_type = frmt[:-1]  
                    release.vinyl_exclusive = mstr[2]  
                    release.discogs_master_id = get_id(mstr[0])  
                    releases.append(release)
    # save to csv if to_csv is True  
    if to_csv:  
        with open(artist_name + ".txt", "w", encoding="utf-8") as file:  
            for release in releases:  
                file.write(f"{release.get_csv()}\n")  
    
    return releases  

In [15]:
def get_releases_from_csv(artist):  
    """  
    Reads from file and creates objects of releases in the same stage as  
    after the get_releases function, then returns them.  
    Primarily used in development stage.  

    Args:  
        artist (string): name of artist  

    Returns:  
        list: Release objects  

    """  
    releases = list()

    with open("data " + artist + ".txt") as data_file:  
        for line in data_file:  
            release = Release(None)  
            release.add_csv(line)  
            releases.append(release)
    return releases

In [35]:
def scrape_release(url):  
    """  
    Gets soup of release page, gets necessary data and returns it. 

    Args:  
        url (str): url to release  

    Raises:  
        TypeError: if price retrieved is not in USD format  

    Returns:  
        dict: data scraped from discogs release page  

    """  

    # create dict and soup object  
    data = dict()  
    soup = get_soup(url) 

    # find title
    title_re = re.compile('title_*')
    title = soup.find("h1", {"class": title_re})  
    data["title"] = title.findAll("span")[0].get_text().strip()
    print("Title: " , data)

    # find label  
    label = soup.find(text=re.compile("Label:")).find_parent()  
    label = label.find_next_sibling().find("a")  
    data["label"] = label.get_text() if label else None  

    # find year  
    year = soup.find(text=re.compile("Released:")).find_parent()  
    year = year.find_next_sibling().get_text().strip()[-4:]  
    data["year"] = int(year) if is_valid_int(year) else None 

    # find country  
    country = soup.find(text=re.compile("Country:")).find_parent()  
    data["country"] = country.find_next_sibling().get_text().strip()  

    # find format div  
    frmt = soup.find(text=re.compile("Format:")).find_parent()  
    frmt_soup = frmt.find_next_sibling() # save soup for later  
    frmt = frmt_soup.get_text().lower().replace("\n", " ")  
    frmt = re.sub(' +', ' ', frmt.strip())  
    data["format_all"] = frmt  

    # check if reissue  
    data["issue_type"] = None  
    words = ["reissue", "reprint", "repress"]  
    if any(x in frmt for x in words):  
        data["issue_type"] = 3  

    # check if limited  
    data["limited"] = 1 if "limited" in frmt else 0  

    # check if picture disc  
    data["picture_disc"] = 1 if "picture" in frmt else 0  

    # check if box set  
    data["box_set"] = 1 if "box" in frmt else 0  

    # check if numbered  
    data["numbered"] = 1 if "numbered" in frmt else 0

    # check if test pressing  
    data["test_pressing"] = 1 if "test pressing" in frmt else 0  

    # check if promotional pressing  
    data["promo"] = 1 if "promo" in frmt else 0

    # check if colored vinyl  
    data["colored"] = 0  
    italics = frmt_soup.findAll("i")  

    # phrases indicating something else than colored vinyl  
    not_color = ["gatefold", "lenticular", "180g", "autographed", "signed",  
    "gatefold, 180g", "180g, gatefold", "180 gram", "numbered",  
    "hand numbered", "black", "black vinyl", "single"]  

    # loop through italic phrases  
    for i in italics:  
        i = i.get_text().lower()  
        # continue to next word if exact match with non-vinyl indicator  
        if i in not_color:  
            continue  
        # set colored to 1  
        data["colored"] = 1  

    # check for median selling price  
    try:  
        price = soup.find(text=re.compile("Median:")).find_parent()  
        price = price.find_parent().get_text().replace("\n", " ")  
        price = re.sub(' +', ' ', price.strip())  
        data["price"] = price[8:]  
    except AttributeError:  
        data["price"] = None  
        
    # raise error if price is not in USD format  
    if data["price"] and data["price"][:1] != "$" and data["price"] != "--":  
        raise TypeError(f"Price is not in USD format. ({data['price']})")  

    return data

In [36]:
def post_scrape(releases):  
    """  

    For use after get_releases.  
    Calculate chronology (and add year if missing), add info about issue type  
    and amount of other versions available.  
    Remove items with no prices available. 

    Args:  
        releases (list): list of Release objects  

    Returns:  
        list: Release objects  

    """ 

    # get earliest and latest release years of artist  
    earliest_year, latest_year = get_earliest_and_latest_year(releases) 

    # get average year of the artists career  
    career_mean = get_mean(earliest_year, latest_year) 

    # make separate lists of uniques and masters  
    uniques = [x for x in releases if x.discogs_master_id == 0]  
    masters = [x for x in releases if x.discogs_master_id != 0] 

    # loop through uniques and calculate chronology and set missing years  
    for rel in uniques:
        calculate_chronology(rel, earliest_year, career_mean)  
    
    # make set for done masters  
    done = set()  
    
    # loop through masters and add data  
    for rel in masters:  
        # continue to next if master id is already done  
        if rel.discogs_master_id in done:  
            continue  
        
        # add id to done  
        done.add(rel.discogs_master_id)  
    
        # get releases of master  
        m_releases = get_releases_by_master_id(rel.discogs_master_id, masters)  
    
        try:  
            # try to get earliest and latest release year of master  
            m_earliest, m_latest = get_earliest_and_latest_year(m_releases)  
    
            # get average year of the master release  
            master_mean = get_mean(m_earliest, m_latest)  
        
        except AttributeError:  
            # no years found, use general artist year mean instead  
            master_mean = career_mean  

            m_earliest, m_latest = None, None 

        reissue_found = False  

        # loop through every release in master group  
        for mstr in m_releases:  
            # add issue type for reissues  
            if not mstr.year or mstr.year > m_earliest:  
                # set to reissue if release doesn't have year or if released  
                # later than earliest release for master  
                mstr.issue_type = 3  
        
            # calculate chronology and set missing years for all releases  
            # with current master id  
            calculate_chronology(mstr, earliest_year, master_mean)  
        
            # add other versions number  
            mstr.versions = len(m_releases)-1  
        
            # set reissue_found to True if reissue is found  
            if mstr.issue_type == 3:  
                reissue_found = True  
        
        # loop again and set issue type for originals  
        for mstr in m_releases:  
            # if release is from earliest year and not set yet  
            if mstr.issue_type is None:  
                # set to 1 (original without reissue) if no reissue found  
                # and to 2 (original with reissue) if reissue found  
                mstr.issue_type = 1 if not reissue_found else 2  
        
    # delete releases with price missing  
    releases = [x for x in releases if x.price and x.price != "--"]

    return releases   

In [37]:
def get_soup(url):  
    """  
    
    Downloads webpage, makes soup and returns it.
    
    Args:  
    
        url (str): url for webpage  
    
    Returns: 
    
        obj: soup object representing html page 
    
    """  
    
    print("Downloading URL: " + url)  
    
    while True:  
        # download page  
        with closing(get(url, stream=True)) as resp:  
            # decode bytes object to string  
            html = resp.content.decode("utf-8")  
            soup = bs(html, "html.parser")  
        # get page title  
        title = soup.title.string  
    
        # check if too many requests were made  
    
        if "Error 429" in title:  
            # if yes, wait for 10 seconds and try again  
            print("Too many requests made. Waiting 10 sec and trying again.")  
            time.sleep(10)  
    
        else:  
            # return soup of page if no error occured  
            return soup.find("div", {"id": "page"})

In [39]:
def results_found(soup):  
    """  

    Check for 404 error, if it is found, returns False, or else return True.  

    Args:  

        soup (obj): soup for full page  
    
    Returns:  
        boolean  
    
    """  

    if not soup.find(text=re.compile("404! Oh no!")):  
        return True  
    
    return False 

In [40]:
def get_releases_by_master_id(m_id, releases):  
    """  

    Return all releases with a certain master id.  

    Args: 

        m_id (int): discogs master id  

    releases (list): list of Release objects 

    Returns: 

        list: Release objects 

    """  
    return [x for x in releases if x.discogs_master_id == m_id]

In [41]:
def calculate_chronology(release, earliest, mean):  
    """ 

    Calculate chronology score and add year to object if missing.

    Args:  
    
        release (Release): object to calculate  
        earliest: earliest release year of artist  
        mean: mean value to use if year is missing  

    """  
    
    # set year for release if not set  
    release.year = mean if not release.year else release.year  

    now = this_year()  
    max_score = 20  
    span = now - earliest  

    # set chronology to 1 if released first year  
    release.chronology = 1 if release.year == earliest else release.chronology  
    
    # set chronology to max if released this year  
    release.chronology = max_score if release.year == now else release.chronology  

    # find release year relative to career start  
    year = release.year - earliest #0  
    year = year if year >= 1 else 1 #1  
    year = year if year <= span else span  

    # calculate score  
    score = int(round((year / span) * max_score))  
    score = score if score >= 1 else 1  
    score = score if score <= max_score else max_score  
    
    # set score in object if not set yet  
    release.chronology = score if not release.chronology else release.chronology 

In [42]:
def get_earliest_and_latest_year(releases):  
    """  

    Generate and return the earliest and latest occuring years among a list  
    of Release objects.

    Args: 

        releases: list of Release objects  
    
    Raises:  
        AttributeError: no year found among releases  
    
    Returns:  
        tuple: earliest and latest years 

    """  

    earliest = this_year()  
    latest = 0  
    found = False  

    # loop through releases and find years  
    for release in releases:  
        if is_valid_int(release.year):  
            found = True  
            if release.year < earliest:  
                earliest = release.year  
            if release.year > latest:  
                latest = release.year 

    # raise exception if no year was found  
    if not found:
        raise AttributeError("No year found among releases.")  

    return (earliest, latest)


In [43]:
def get_format_from_card(card):  
    """  

    Get string from html card that represents the items format.  

    Args:  
    
        card (obj): bs object representing discogs 'card'  
    
    Returns:  
        string  

    """  

    try:  
        res = card.find("span", {"class": "format"}).get_text()  

    except:  
        res = card.find("td", {"class": "title"}).get_text()  
        res = (res[res.index("("):res.index(")")+1])  
    
    return res 

In [44]:
def is_valid_release(card):  
    """  

    Check if discogs item is valid, i.e is a vinyl record and is  
    official (not a pirate/bootleg release). 

    Args:  
    
        card (obj): bs object representing discogs 'card'  
    
    Returns:  
        boolean  

    """  
    if not is_official(card):  
        return False  

    return is_vinyl(card) 

In [45]:
def is_official(card):  
    """  

    Check if discogs item is official or not.  
    
    Args:  
        card (obj): bs object representing discogs 'card'  
    
    Returns:  
        boolean  

    """  

    frmt = get_format_from_card(card)  

    # return false if unofficial item  
    if "Unofficial" in frmt:  
        return False  

    return True


In [46]:
def is_vinyl(card): 

    """  
    
    Check if discogs item is a vinyl item.  
    
    Args:  
    
        card (obj): bs object representing discogs 'card'
    
    Returns:
    
        boolean

    """

    frmt = get_format_from_card(card)  

    # strings to look for in format string  
    vinyl = ["LP", "7\"", "10\"", "12\""]  

    # return true if any of accepted strings appear in format string  
    if any(x in frmt for x in vinyl):  
        return True  

    return False  


In [47]:
def is_valid_int(int_string):  
    """ 

    Check if string represents a valid integer and return true/false. 

    Args: 

        int_string (str) 

    Returns:  
    
        boolean

    """  
    try:  
        int(int_string)  
        return True  

    except (ValueError, TypeError):  
        return False 

In [49]:
def get_id(url):  

    """  

    Returns number at end of string following a '/'  
    I.e "../12345" returns "12345"  

    Args:  

        url (str): url string  

    Returns:  

        int: id  

    """ 

    return int(url[url.rfind("/")+1:])  

In [50]:
def get_mean(earliest, latest):

    """  
    
    Calculates mean of two integers and returns it.  
    
    Args:  
    
        earliest (int)  
        latest (int)  
    
    Returns:  
        int: mean of two ints  
    
    """  

    return int(round(latest-((latest-earliest)/2)))  

In [51]:
def this_year():  
    """  

    Returns the current year as int. 

    Returns:  
        int: current year  
    """  

    return time.localtime().tm_year

In [52]:
def generate_artist_discography(db, artist_name, artist_url):

    """ 

    Generate dataset for artist, delete any existing rows from database of the  
    artist, and add the new dataset.  

    Args:

        db (Database): database object  
        artist_name (str): name of artist  
        artist_url (str): url to artists discogs profile  

    """  

    # scrape and generate data for artist  
    dataset = post_scrape(get_releases(artist_name, artist_url)) 

    # delete already existing entries of artist in database  
    db.delete_dataset(artist_name)  

    # add dataset to database  
    db.add_dataset(dataset)

    return len(dataset)

In [None]:
scrape_release("https://www.discogs.com/release/22990682-Fleddy-Melculy-De-Kerk-Van-Melculy")

Downloading URL: https://www.discogs.com/release/22990682-Fleddy-Melculy-De-Kerk-Van-Melculy
Title:  {'title': 'Fleddy Melculy'}


AttributeError: 'NoneType' object has no attribute 'find_parent'