In [1]:
import requests
import numpy as np
import re
import json
import time
import pandas as pd
from collections import deque
from bs4 import BeautifulSoup

# Data Collection and Cleaning from last.fm
For a music recommendation final project in Dr. Bodwin's DATA 301 course Winter 2022

## Method for Username Collection

I decided to implement neighborhood search first, as it was the simplest and created grounds for later methods. However, it is inherently limited by how many users I can use as "seeds." Initially, I simply used my username (aidan.bbq) and gathered my top 50 neighbors, then their top 50 neighbors, etc. 

However, to diversify the data, I then later add in usernames collected from the top artists of all time, and random artists collected along the way (top artists of usernames already collected). In the end, after an adequate amount of neighbor and artist searches, recursively adding and exploring new artists/usernames, I converged to a point where no new artists or users were being added (or perhaps 3 users for every 500 explored were added). Here, I considered the diversity of the dataset enough and ended the scraping there. 

For future analysis, randomly sampling the usernames should produce (smaller) but more representative data. Additionally, it would be interesting to see if this community which I converged to represents any sub community of the last.fm user database.

## API Logistics and Helper Functions

To gather username information from last.fm you must be logged in under a proper user account. As such, I provide my personal email and password to allow the web scraper to login and collect usernames. As this is sensitive information, it is contained locally in a "secrets" file

In [8]:
f=open("secrets.json")
s = json.load(f)
f.close()

#### Other constants

In [9]:
API_URL = 'https://ws.audioscrobbler.com/2.0/'

### Helper Functions for Common API Calls

In [33]:
def getLastFm(payload):
    # Sets up and gets information from the lastfm API
    
    # Announces username to API
    headers = {'user-agent': s["USER_AGENT"]}

    # Add API key and format to the payload
    payload['api_key'] = s["API_KEY"]
    payload['format'] = 'json'

    response = requests.get(API_URL, headers=headers, params=payload)
    return response

In [11]:
def getTopArtists(n=50):
    # Wrapper for lastfm API request for top artists of CHARTS (not user)
    
    payload = {
        "method": "chart.gettopartists",
        "limit": n
    }
    r = getLastFm(payload)
    artists = r.json()['artists']["artist"]
    return [a['name'] for a in artists]

In [13]:
def getUserInfo(user):
    # Gets selected information about a user
    
    payload = {
        "method": "user.getinfo",
        "user": user
    }
    r = getLastFm(payload)
    j = r.json()["user"]
    return [j["name"], j["country"], j["playcount"], j["registered"]["unixtime"], j["gender"]]

In [14]:
def getTopArtists_user(user, client, file, userid, artistids, periods=["overall"], n=50):
    # Gets the top $n artists of $user(id) for every time period in $periods using $client 
    # and writing to $file, compressing each artist name to index in #artistids
    
    # periods = ["overall", "1month", "12month"]
    output=[]
    for p in periods:
        payload = {
            "method": "user.gettopartists",
            "user": user,
            "period": p,
            "limit": n
        }
        r = getLastFm(payload)
        j = r.json()
        if "topartists" in j and "artist" in j["topartists"]:
            artists = r.json()['topartists']["artist"]
            for a in artists:
                if a['name'] not in artistids:
                    artistids.append(a['name'])
                    f.write(f"{userid},{len(artistids)-1},{int(a['playcount'])}\n")
                else:
                    f.write(f"{userid},{list(artistids).index(a['name'])},{int(a['playcount'])}\n")

### Helper Functions for Web Scraping

Opening a client, cleaning web data, etc. 

In [16]:
def openLastfmClient(user_, password_):
    # Opens a session with the last.fm website to allow for 
    # username collect (login protected)
    
    payload = payload = {
        "username_or_email": user_,
        "password": password_
    }
    loginurl = "https://www.last.fm/login"
    
    client = requests.Session()
    client.get(loginurl)
    
    if 'csrftoken' in client.cookies:
        # Django 1.6 and up
        csrftoken = client.cookies['csrftoken']
    else:
        # older versions
        csrftoken = client.cookies['csrf']
    payload["csrfmiddlewaretoken"] = csrftoken
    client.headers.update({'referer': loginurl})
    client.post(loginurl, data=payload)
    return client

In [17]:
def getListeners_artist(artist, client, pages=[1,2,3]):
    # Gets a list of the first 3 pages of top listener usernames of the artist
    
    users = []
    for p in pages:
        url = f"https://www.last.fm/music/{artist}/+listeners?page={p}"
        resp = client.get(url)
        if resp.status_code == 200:
            soup = BeautifulSoup(resp.text, 'html.parser')
            us = soup.findAll("h3", {"class": "top-listeners-item-name"})
            users += [u.find("a").text for u in us]
    return users

# Scraping the last.fm API and website for usernames via top artists

In [32]:
client = openLastfmClient(s["USER_AGENT"],s["PASSWORD"])    
artists  = getTopArtists(n=1000)
my_artists = np.array(getTopArtists_user("aidan_bbq"))
explore = my_artists[~np.isin(my_artists,artists)]

with open("./output/toplisteners_me.csv", "a", encoding="utf-8") as f:
    for a in explore:
        print(f"on artist: {a}")
        [f.write(f"{a},{u}\n") for u in getListeners(a,client)]

AttributeError: 'dict' object has no attribute 'USER_AGENT'

# Scraping last.fm directly for usernames via neighborhood searches

This is the first method implemented, described above in the methods section.

In [15]:
def getNeighbors(username):
    # Given a username, use lastfm's "neighbours" page to find new usernames.
    # Usernames are inherently clustered by similarity here - which is why 
    # top artists are used to mix up the dataset
    
    url = f"https://www.last.fm/user/{username}/neighbours"
    resp = requests.get(url)    
    
    if resp.status_code==200:
        soup = BeautifulSoup(resp.text, 'html.parser')
        us = soup.find("section", {"class": "neighbours-items-section"})
        if us is not None:
            us = us.findAll("a", {"class": "user-list-link link-block-target"})
            return np.asarray([u.text for u in us])
        
    return np.array([])

In [18]:
getNeighbors("aidan_bbq")

array(['Benthorn13', 'IvanMalison', 'eIliottsmithfan', 'jimmyyyhill',
       'emilynadeau', 'talia-hayes', 'Samotsupertramp', 'julietsvor',
       'alexhemley', 'Ezradepreum', 'keishaemeryy', 'festus08',
       'Max_langlinais', 'sarinasabouri', 'itsmeurlitllev1', 'tamsennn',
       'driedlilacs', 'sadiemasket', 'shhadam', 'cleo1233',
       'arthurbaker999', 'lilyfisher5', 'St0Ing', 'arthurbaker',
       'JanaHeylen', 'jaggie_mones', 'HugeSalad', 'sophiepeterson',
       'tomsentaylor', 'woodsgwen3', 'Ch3wbaccaw0k', 'michaelspeights',
       'Ffeelixx', 'celinehu0402', 'anawinston', 'jannafranco', 'Kosa12',
       'turnip1756', 'chrlwhtng', 'cud1995', 'owenheptinstall',
       'maxdotorg7', 'eviesteinwright', 'max_wolfson', 'Coomberlane',
       'myheartyearns', 'kanaan_', 'stanspruyt', 'Gracismart', 'janeybug'],
      dtype='<U15')

### Recursively following neighbors

In [22]:
datafile = "./output/toplisteners_me.csv"
with open(datafile, 'r', encoding="utf-8") as f:
    lines = [[node for node in re.split('[,](?!\s)',edge.strip('\n'))[:2]] for edge in f.readlines() if edge[0][0] != '#']
    topls = pd.DataFrame(lines, columns = ["artist", "user"])

In [23]:
userheads = topls.user
searchstack = deque(userheads)
userset = set(userheads)
explored = np.array([]) 
useredges = None # to make graph of users

In [25]:
edges = None
datafile = "./output/both-neighs.csv"
with open(datafile, 'r') as f:
    lines = [[node for node in re.split(',',edge.strip('\n'))[:2]] for edge in f.readlines() if edge[0][0] != '#']
    edges = np.array(lines)

In [26]:
tot_userset = set(edges.flatten())
tot_explored = np.array(set(edges[:,0]))
explored = tot_explored

IndexError: too many indices for array: array is 1-dimensional, but 2 were indexed

In [27]:
newf = open("./output/new-neighs.csv", "a", encoding="utf-8")
totf = open("./output/both-neighs.csv", "a", encoding="utf-8")

In [28]:
numnew10 = deque([11]) # number of new users added in the past 10 iterations
while sum(numnew10) > 10  and len(searchstack) > 0:
    curruser = searchstack.popleft()
    if np.isin(curruser, explored):
        continue
    explored = np.append(explored, curruser)

    neighs = getneighbors(curruser)
    prevlen = len(userset)
    userset = userset.union(neighs)
    
    # stoppage conditions
    numnew10.append(len(userset) - prevlen)
    if len(numnew10) > 10:
        numnew10.popleft()
        
    # keeping track of overall additions
    tot_prevlen = len(tot_userset)
    tot_userset = tot_userset.union(userset)
    print(f"explored {curruser}, found {numnew10[-1]} new users, {len(tot_userset) - tot_prevlen} totally new, {len(tot_userset)} total")
    

    edges = np.asarray([np.repeat(curruser, len(neighs)), neighs]).T
    [newf.write(f"{e[0]},{e[1]}\n") for e in edges]
    [totf.write(f"{e[0]},{e[1]}\n") for e in edges]
    
    searchstack.extend(neighs[~np.isin(neighs, explored)]) # get rid of already visited neighs, add rest to searchstack

In [None]:
totf.close()
newf.close()