# 1. Most collected releases
Start from this page https://www.discogs.com/search/?limit=250&sort=have%2Cdesc&ev=gs_mc&type=release&layout=sm&page=1

1. Go through all items in `<div class="cards cards_layout_text-only" id="search_results">`
2. Within each card, extract the link to the release page `<a class="search_result_title " href="/release/4570366-Daft-Punk-Random-Access-Memories" data-followable="true">Random Access Memories</a>`
3. Create an array/dataframe with 
- Artist
- Release Title
- Release URL
4. Paginate 40 times to get the 10,000 most collected releases
`https://www.discogs.com/search/?limit=250&sort=have%2Cdesc&ev=gs_mc&type=release&layout=sm&page=2`

In [10]:
!pip install cchardet

Defaulting to user installation because normal site-packages is not writeable
Collecting cchardet
  Downloading cchardet-2.1.7-cp39-cp39-macosx_10_9_x86_64.whl (124 kB)
[K     |████████████████████████████████| 124 kB 619 kB/s eta 0:00:01
[?25hInstalling collected packages: cchardet
Successfully installed cchardet-2.1.7


In [1]:
"""  
Dataset generator module.  
Includes functions for web scraping artists discographies from Discogs.com  
"""  
import math  
import time  
import re  
from contextlib import closing  
from requests import get  
from bs4 import BeautifulSoup as bs
import pandas as pd
import lxml
import cchardet

In [2]:
def get_soup(url, link_type="release"):  
    """  
    
    Downloads webpage, makes soup and returns it.
    
    Args:  
    
        url (str): url for webpage  
    
    Returns: 
    
        obj: soup object representing html page 
    
    """   
    
    while True:  
        # download page  
        with closing(get(url, stream=True)) as resp:  
            # decode bytes object to string  
            html = resp.content.decode("utf-8")  
            soup = bs(html, "lxml")  
        # get page title  
      #title = soup.title.string  
    
      ## check if too many requests were made  
    
      #if "Error 429" in title:  
      #    # if yes, wait for 10 seconds and try again  
      #    print("Too many requests made. Waiting 10 sec and trying again.")  
      #    time.sleep(10)  
    
      #else:  
            # return soup of page if no error occured 
        if link_type == "list":
            return soup.find("div", {"id": "search_results"})
        elif link_type == "release":
            release_data = []
            release_data.append(soup.find("div", {"class": re.compile("info_")}))
            release_data.append(soup.find("section", {"id" : "release-stats"}))
            return release_data
        else:
            return soup

In [3]:
urltofetch = "https://www.discogs.com/search/?limit=250&sort=have%2Cdesc&ev=gs_mc&type=release&layout=sm&page=2&format_exact=Vinyl&format_exact=LP"
search_results = get_soup(urltofetch,"list")



In [4]:
pagesToFetch = 100
itemsPerPage = 100
titles = []
artists = []
release_links = []

page = 1
while True:

    urltofetch = "https://www.discogs.com/search/?limit=" + str(itemsPerPage) + "&sort=have%2Cdesc&ev=gs_mc&type=release&layout=sm&page=" + str(page) + "&format_exact=Vinyl&format_exact=LP"
    print("fetching page ", page)
    search_results = get_soup(urltofetch,"list")
    cards = search_results.findAll("div",{"class":"card"})

    for release in cards:
        release_data = release.find("a", {"class": "search_result_title"}) 
        artist_data = release.find("a", href=re.compile("artist"))

        release_links.append("https://www.discogs.com" + release_data["href"])
        titles.append(release_data.getText())
        artists.append(artist_data.getText())
    remainingPages = pagesToFetch - page
    print("fetching next page, remaining pages : ", remainingPages-1)
    if remainingPages == 0:
        break
    page += 1


all_releases = { "artist" : artists, "title" : titles, "link_to_release" : release_links}

data_releases = pd.DataFrame.from_dict(all_releases)


fetching page  1
fetching next page, remaining pages :  98
fetching page  2
fetching next page, remaining pages :  97
fetching page  3
fetching next page, remaining pages :  96
fetching page  4
fetching next page, remaining pages :  95
fetching page  5
fetching next page, remaining pages :  94
fetching page  6
fetching next page, remaining pages :  93
fetching page  7
fetching next page, remaining pages :  92
fetching page  8
fetching next page, remaining pages :  91
fetching page  9
fetching next page, remaining pages :  90
fetching page  10
fetching next page, remaining pages :  89
fetching page  11
fetching next page, remaining pages :  88
fetching page  12
fetching next page, remaining pages :  87
fetching page  13
fetching next page, remaining pages :  86
fetching page  14
fetching next page, remaining pages :  85
fetching page  15
fetching next page, remaining pages :  84
fetching page  16
fetching next page, remaining pages :  83
fetching page  17
fetching next page, remaining p

In [5]:
data_releases

Unnamed: 0,artist,title,link_to_release
0,Daft Punk,Random Access Memories,https://www.discogs.com/release/4570366-Daft-P...
1,Kendrick Lamar,"Good Kid, M.A.A.d City",https://www.discogs.com/release/3975953-Kendri...
2,Michael Jackson,Thriller,https://www.discogs.com/release/2911293-Michae...
3,Pink Floyd,The Dark Side Of The Moon,https://www.discogs.com/release/9287809-Pink-F...
4,Fleetwood Mac,Rumours,https://www.discogs.com/release/526351-Fleetwo...
...,...,...,...
9995,Minor Threat,Minor Threat,https://www.discogs.com/release/1267538-Minor-...
9996,Carole King,Wrap Around Joy,https://www.discogs.com/release/1602932-Carole...
9997,Tom Jones,This Is Tom Jones,https://www.discogs.com/release/1804019-Tom-Jo...
9998,Pixies,Bossanova,https://www.discogs.com/release/664574-Pixies-...


# 2. Get Release Details and stats

1. Go through each instance in the dataframe of MostCollectedRecords10000 and soup the URL
2. Extract release details from `<table class="table_1fWaB">` selector : class = "table_*"
3. Extract release stats from `<section id="release-stats" class="section_9nUx6 open_BZ6Zt">` selector : id = release-stats
4. Add this data to the original dataframe for each release

In [6]:
def scrape_release(url):  
    """  
    Gets soup of release page, gets necessary data and returns it. 

    Args:  
        url (str): url to release  

    Raises:  
        TypeError: if price retrieved is not in USD format  

    Returns:  
        dict: data scraped from discogs release page  

    """  

    # create dict and soup object  
    data = dict()  
    soup = get_soup(url) 
    
    #get release date
    try:
        date_of_release = soup[0].find("time")
        release_date_full = date_of_release.getText().strip().split(",")
        data["year"] = release_date_full[1].strip() if len(release_date_full) == 2 else release_date_full[0].strip()
        data["date"]=date_of_release["datetime"]
    except AttributeError:
        data["year"] = None
        data["date"] = None
        
    #get genres
    try:
        genre_links = soup[0].findAll("a",{"href":re.compile("/genre/")})
        genres = [link.getText() for link in genre_links]
        data["genres_all"] = genres
    except AttributeError:
        data["genres_all"] = None
    
    #get styles
    try:
        style_links = soup[0].findAll("a",{"href":re.compile("/style/")})
        styles = [link.getText() for link in style_links]
        data["styles_all"] = styles
    except AttributeError:
        data["styles_all"] = None
    
    #get countries
    try:
        country_links = soup[0].find("a",{"href":re.compile("country")})
        data["countries_all"] = country_links.getText()
    except AttributeError:
        data["countries_all"] = None
    frmt = []
    #get format details
    try:
        release_format = soup[0].findAll("div",{"class":re.compile("format_item")})
        format_all = []
        for line in release_format:
            clean_line = line.getText().lower().replace("\n", " ").strip()
            frmt = clean_line.split(",")
            frmt = [s.strip() for s in frmt]
            format_all.append(frmt)
        data["format_all"] = format_all
    except AttributeError:
        data["format_all"] = None
    
    # check if reissue
    try:
        data["issue_type"] = 1 #original  
        words = ["reissue", "reprint", "repress"]  
        if any(x in frmt for x in words):  
            data["issue_type"] = 2 #reissue
    except AttributeError:
        data["issue_type"] = None
    
    # check if limited
    try:
        data["limited"] = 0 #not limited  
        words = ["limited", "limited edition", "ltd"]  
        if any(x in frmt for x in words):  
            data["limited"] = 1 #limited edition
    except AttributeError:
        data["limited"] = None
    
    # check if picture disc
    try:
        data["picture_disc"] = 0 #not picture disc  
        words = ["picture", "picture disc", "pic"]  
        if any(x in frmt for x in words):  
            data["picture_disc"] = 1 #is a picture disc
    except AttributeError:
        data["picture_disc"] = None

    # check if box set
    try:
        data["box_set"] = 1 if "box" in frmt else 0  
    except AttributeError:
        data["box_set"] = None

    # check if numbered 
    try:
        data["numbered"] = 1 if "numbered" in frmt else 0
    except AttributeError:
        data["numbered"] = None

    # check if test pressing  
    try:
        data["test_pressing"] = 1 if "test pressing" in frmt else 0  
    except AttributeError:
        data["test_pressing"] = None 

    # check if promotional pressing
    try:
        data["promo"] = 1 if "promo" in frmt else 0
    except AttributeError:
        data["promo"] = None
    
    # check for mono/stereo
    try:
        data["channels"] = 2 if "stereo" in frmt else 1 #number of channels, 2 for stereo, 1 for mono records
    except AttributeError:
        data["channels"] = None
    
    # check for gatefold
    try:
        data["gatefold"] = 0 #not gatefold  
        gatefold_words = ["gatefold", "gatefold, 180g", "180g, gatefold"] 
        if any(x in frmt for x in gatefold_words):  
            data["gatefold"] = 1 #gatefold release
    except AttributeError:
        data["gatefold"] = None
        
    # check if colored vinyl
    try:
        data["colored"] = 0  
        italics = soup[0].findAll("i")  
        # phrases indicating something else than colored vinyl  
        not_color = ["gatefold", "lenticular", "180g", "autographed", "signed",  
        "gatefold, 180g", "180g, gatefold", "180 gram", "numbered",  
        "hand numbered", "black", "black vinyl", "single"]  

        # loop through italic phrases  
        for i in italics:  
            i = i.get_text().lower()  
            # continue to next word if exact match with non-vinyl indicator  
            if i in not_color:  
                continue  
            # set colored to 1  
            data["colored"] = 1
    except AttributeError:
        data["colored"] = None
        
    # check for median selling price

    try:  
        stats = soup[1].findAll("li")
        
        haves = stats[0].find("a").getText()
        data["haves"] = haves
        
        wants = stats[1].find("a").getText()
        data["wants"] = wants
        
        avg_rating = stats[2].findAll("span")[1].getText().split("/")[0].strip()
        data["avg_rating"] = avg_rating
        
        num_ratings = stats[3].find("a").getText().strip()
        data["num_ratings"] = num_ratings
        
        median_price = stats[6].findAll("span")[1].getText()
        data["median_price"] = median_price
        
        lowest_price = stats[5].findAll("span")[1].getText()
        data["lowest_proice"] = lowest_price
        
        max_price = stats[7].findAll("span")[1].getText()
        data["max_price"] = max_price

    except AttributeError:  
        data["median_price"] = None
        data["lowest_proice"] = None
        data["max_price"] = None
        data["haves"] = None
        data["wants"] = None

    return data
release_url = "https://www.discogs.com/release/550017-Bob-Marley-The-Wailers-Uprising"
scrape_release(release_url)

{'year': None,
 'date': None,
 'genres_all': None,
 'styles_all': None,
 'countries_all': None,
 'format_all': None,
 'issue_type': 1,
 'limited': 0,
 'picture_disc': 0,
 'box_set': 0,
 'numbered': 0,
 'test_pressing': 0,
 'promo': 0,
 'channels': 1,
 'gatefold': 0,
 'colored': None,
 'median_price': None,
 'lowest_proice': None,
 'max_price': None,
 'haves': None,
 'wants': None}

In [7]:
rows = []
for index, row in data_releases.iterrows():
    release_details = []
    if index % 100 == 0:
        print(index, "of 10,000 records fetched")
        print("Current release : ", row['link_to_release'])
    #add artist to row
    release_details.append(row['artist'])
    
    #add title to row
    release_details.append(row['title'])
    
    #add link to release to row
    release_details.append(row['link_to_release'])
    
    #scrape release
    scraped_release = scrape_release(row['link_to_release'])
    
    for key in scraped_release:
        release_details.append(scraped_release[key])
    
    rows.append(release_details)
    
    if index == 2500:
        break
print("wait 30 seconds")
time.sleep(30)
for index, row in data_releases.iterrows():
    if index < 2500:
        continue
    release_details = []
    if index % 100 == 0:
        print(index, "of 10,000 records fetched")
        print("Current release : ", row['link_to_release'])
    #add artist to row
    release_details.append(row['artist'])
    
    #add title to row
    release_details.append(row['title'])
    
    #add link to release to row
    release_details.append(row['link_to_release'])
    
    #scrape release
    scraped_release = scrape_release(row['link_to_release'])
    
    for key in scraped_release:
        release_details.append(scraped_release[key])
    
    rows.append(release_details)
    
    if index == 5000:
        break
print("wait 30 seconds")
time.sleep(30)
for index, row in data_releases.iterrows():
    if index < 5000:
        continue
    release_details = []
    if index % 100 == 0:
        print(index, "of 10,000 records fetched")
        print("Current release : ", row['link_to_release'])
    #add artist to row
    release_details.append(row['artist'])
    
    #add title to row
    release_details.append(row['title'])
    
    #add link to release to row
    release_details.append(row['link_to_release'])
    
    #scrape release
    scraped_release = scrape_release(row['link_to_release'])
    
    for key in scraped_release:
        release_details.append(scraped_release[key])
    
    rows.append(release_details)
    
    if index == 7500:
        break
print("wait 30 seconds")
time.sleep(30)
for index, row in data_releases.iterrows():
    if index < 7500:
        continue
    release_details = []
    if index % 100 == 0:
        print(index, "of 10,000 records fetched")
        print("Current release : ", row['link_to_release'])
    #add artist to row
    release_details.append(row['artist'])
    
    #add title to row
    release_details.append(row['title'])
    
    #add link to release to row
    release_details.append(row['link_to_release'])
    
    #scrape release
    scraped_release = scrape_release(row['link_to_release'])
    
    for key in scraped_release:
        release_details.append(scraped_release[key])
    
    rows.append(release_details)


all_release_data = pd.DataFrame(rows, columns=["artist", "title","url","year","date","genres","styles","countries","formats",
                                               "issue_type","limited","picture_disc","box_set","numbered","test_pressing","promo","channels","gatefold",
                                              "colored","haves","wants","avg_rating","num_ratings","price_median","price_lowest","price_highest"])

all_release_data

0 of 10,000 records fetched
Current release :  https://www.discogs.com/release/4570366-Daft-Punk-Random-Access-Memories
100 of 10,000 records fetched
Current release :  https://www.discogs.com/release/4819086-Nirvana-Nevermind
200 of 10,000 records fetched
Current release :  https://www.discogs.com/release/6191421-Iron-Maiden-The-Number-Of-The-Beast
300 of 10,000 records fetched
Current release :  https://www.discogs.com/release/3917263-Godspeed-You-Black-Emperor-Allelujah-Dont-Bend-Ascend
400 of 10,000 records fetched
Current release :  https://www.discogs.com/release/10901377-Phoebe-Bridgers-Stranger-In-The-Alps
500 of 10,000 records fetched
Current release :  https://www.discogs.com/release/508401-The-B-52s-The-B-52s
600 of 10,000 records fetched
Current release :  https://www.discogs.com/release/3577011-The-Lumineers-The-Lumineers
700 of 10,000 records fetched
Current release :  https://www.discogs.com/release/2709851-PJ-Harvey-Let-England-Shake
800 of 10,000 records fetched
Curren

Unnamed: 0,artist,title,url,year,date,genres,styles,countries,formats,issue_type,...,channels,gatefold,colored,haves,wants,avg_rating,num_ratings,price_median,price_lowest,price_highest
0,Daft Punk,Random Access Memories,https://www.discogs.com/release/4570366-Daft-P...,2013,2013-05-17,"[Electronic, Funk / Soul, Pop]","[Disco, Funk, Synth-pop]","UK, Europe & US","[[2 x vinyl, lp, album, stereo, 180 gram]]",1,...,2,0,0.0,53521,18838,4.56,6337,€45.00,€21.11,€189.99
1,Kendrick Lamar,"Good Kid, M.A.A.d City",https://www.discogs.com/release/3975953-Kendri...,2012,2012-10-22,[Hip Hop],[Conscious],US,"[[2 x vinyl, lp, album, deluxe edition]]",1,...,1,0,0.0,41176,12493,4.66,2957,€20.90,€13.29,€35.76
2,Michael Jackson,Thriller,https://www.discogs.com/release/2911293-Michae...,1982,1982-11-30,"[Funk / Soul, Pop]","[Contemporary R&B, Disco, Soul]",US,"[[vinyl, lp, album, stereo, pitman pressing, g...",1,...,2,1,1.0,40452,9254,4.48,2744,€15.06,€4.53,€99.99
3,Pink Floyd,The Dark Side Of The Moon,https://www.discogs.com/release/9287809-Pink-F...,2016,2016-11-04,[Rock],"[Prog Rock, Psychedelic Rock, Classic Rock]",Europe,"[[vinyl, lp, album, reissue, remastered, stere...",2,...,2,1,0.0,38314,3961,4.83,4201,€23.62,€19.90,€33.25
4,Fleetwood Mac,Rumours,https://www.discogs.com/release/526351-Fleetwo...,1977,1977,[Rock],"[Soft Rock, Classic Rock]",US,"[[vinyl, lp, album, los angeles pressing]]",1,...,1,0,1.0,35108,8351,4.5,2713,€21.84,€2.85,€189.05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9998,Minor Threat,Minor Threat,https://www.discogs.com/release/1267538-Minor-...,1987,1987,[Rock],"[Hardcore, Punk]",US,"[[vinyl, lp, compilation, repress, $5, green c...",2,...,1,0,1.0,2290,532,4.59,160,€34.99,€13.00,€47.50
9999,Carole King,Wrap Around Joy,https://www.discogs.com/release/1602932-Carole...,1974,1974,[Rock],[Pop Rock],US,"[[vinyl, lp, album, terre haute pressing]]",1,...,1,0,1.0,2289,82,3.58,111,€2.57,€0.99,€8.00
10000,Tom Jones,This Is Tom Jones,https://www.discogs.com/release/1804019-Tom-Jo...,1969,1969,[Pop],"[Ballad, Vocal]",US,"[[vinyl, lp, album]]",1,...,1,0,0.0,2292,31,3.5,84,€2.33,€0.93,€7.60
10001,Pixies,Bossanova,https://www.discogs.com/release/664574-Pixies-...,1990,1990,[Rock],[Indie Rock],UK,"[[vinyl, lp, album]]",1,...,1,0,0.0,2291,1286,4.26,185,€34.50,€9.00,€60.00


In [8]:
all_release_data.describe()

Unnamed: 0,issue_type,limited,picture_disc,box_set,numbered,test_pressing,promo,channels,gatefold,colored
count,10003.0,10003.0,10003.0,10003.0,10003.0,10003.0,10003.0,10003.0,10003.0,9998.0
mean,1.244327,0.061981,0.003899,0.0,0.004299,0.0,0.0008,1.281915,0.155953,0.382777
std,0.429709,0.241134,0.062322,0.0,0.065427,0.0,0.02827,0.449955,0.362829,0.486089
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
25%,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
50%,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
75%,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,1.0
max,2.0,1.0,1.0,0.0,1.0,0.0,1.0,2.0,1.0,1.0


In [9]:
all_release_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10003 entries, 0 to 10002
Data columns (total 26 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   artist         10003 non-null  object 
 1   title          10003 non-null  object 
 2   url            10003 non-null  object 
 3   year           9852 non-null   object 
 4   date           9852 non-null   object 
 5   genres         9998 non-null   object 
 6   styles         9998 non-null   object 
 7   countries      9928 non-null   object 
 8   formats        9998 non-null   object 
 9   issue_type     10003 non-null  int64  
 10  limited        10003 non-null  int64  
 11  picture_disc   10003 non-null  int64  
 12  box_set        10003 non-null  int64  
 13  numbered       10003 non-null  int64  
 14  test_pressing  10003 non-null  int64  
 15  promo          10003 non-null  int64  
 16  channels       10003 non-null  int64  
 17  gatefold       10003 non-null  int64  
 18  colore

In [10]:
all_release_data["artist"].value_counts()

The Beatles           193
Various               180
Pink Floyd            166
David Bowie           130
Led Zeppelin           99
                     ... 
Max Richter             1
The Vaccines            1
MC Solaar               1
The Staple Singers      1
Tom Jones               1
Name: artist, Length: 2081, dtype: int64

In [11]:
all_release_data.isnull().sum()

artist             0
title              0
url                0
year             151
date             151
genres             5
styles             5
countries         75
formats            5
issue_type         0
limited            0
picture_disc       0
box_set            0
numbered           0
test_pressing      0
promo              0
channels           0
gatefold           0
colored            5
haves              5
wants              5
avg_rating         5
num_ratings        5
price_median       5
price_lowest       5
price_highest      5
dtype: int64

In [12]:
all_release_data.to_csv("datasets/all_release_data_most_collected_08052022_1200.csv")