In [1]:
# UT-TOR-DATA-PT-01-2020-U-C Group Project 2
# Find as many artist wiki pages as possible
# Use Artist table for list of artists; save detected Wikipedia page links (and scraped details) back into the Artist table ('wiki' column)

## Imports

In [2]:
# SQLite Database related imports
import sqlalchemy
from sqlalchemy.engine import Engine
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, event, inspect

In [3]:
# Scraping related imports
from bs4 import BeautifulSoup
import requests
import json
import re
import time

## Initializations

In [4]:
# Turn on PRAGMA foreign_keys to enforce foreign key constraints (it is disabled by default in SQLite)
@event.listens_for(Engine, "connect")
def set_sqlite_pragma(dbapi_connection, connection_record):
    """Switch on foreign-key enforcement for a newly opened DBAPI connection.

    Parameters:
        dbapi_connection - raw DBAPI connection; only its cursor() is used
        connection_record - supplied by the event system, not used here
    """
    cursor = dbapi_connection.cursor()
    try:
        cursor.execute("PRAGMA foreign_keys=ON")
    finally:
        # Release the cursor even when the PRAGMA statement fails
        cursor.close()

# Create engine to access the database (SQLite file, path relative to the working directory)
engine = create_engine("sqlite:///../data/CanadaTop100v2.sqlite")

# Automap base: mapped classes are generated from the existing schema rather than declared by hand
AutomapBase = automap_base()

# Reflect the tables
# NOTE(review): `reflect=True` is deprecated in SQLAlchemy 1.4+ (replaced by `autoload_with=engine`) - confirm the installed version before upgrading
AutomapBase.prepare(engine, reflect=True)

# Save a reference to the reflected Artist class
Artist = AutomapBase.classes.Artist

# Create our session (link) from Python to the DB
db_session = Session(engine)

# Debug output (uncomment to list the reflected classes / the Artist columns)
#AutomapBase.classes.items()
#list(inspect(Artist).columns)

# Prepare for massive querying of Wikipedia: one requests session is shared by all HTTP calls
session = requests.Session()

[Column('id', INTEGER(), table=<Artist>, primary_key=True, nullable=False),
 Column('name', TEXT(), table=<Artist>, nullable=False),
 Column('is_band', INTEGER(), table=<Artist>),
 Column('genre', TEXT(), table=<Artist>),
 Column('image', TEXT(), table=<Artist>),
 Column('wiki', TEXT(), table=<Artist>),
 Column('dob', TEXT(), table=<Artist>),
 Column('origin', TEXT(), table=<Artist>)]

## Helper functions v2

In [5]:
# Returns list of URLs to Wikipedia pages that possibly match our artist
# Parameters:
#    session - requests session to use for querying Wikipedia API
#    name - artist name
# Returns a possibly empty list of URLs to Wikipedia pages:
#    the first search hit (assumed most relevant) followed by every other hit
#    whose URL ends with a music-related suffix

# URL suffixes that indicate a music-related page, e.g. ".../Drake_(musician)".
# The alternatives are grouped so that the parentheses and the end anchor apply
# to every keyword, not only to the first and the last one.
url_tails = re.compile(r"_\((?:musician|singer|band|rapper|DJ)\)$")
api_url = "https://en.wikipedia.org/w/api.php"
# Template of the OpenSearch query; "search" is filled in for each request
params = {
    "action": "opensearch",
    "namespace": "0",
    "search": "",
    "limit": "15",
    "format": "json"
}

def get_wiki_urls(session, name):
    # Build a per-call copy so the shared template is never mutated
    query = dict(params, search=name)
    response = session.get(url=api_url, params=query)
    data = response.json()

    # OpenSearch answers with [query, titles, descriptions, urls];
    # anything else (e.g. an API error object) yields no candidates
    if not isinstance(data, list) or len(data) < 4 or not data[3]:
        return []

    url_list = data[3]
    result = [url_list[0]] # first URL should be the most relevant
    result.extend(url for url in url_list[1:] if url_tails.search(url))
    return result

In [6]:
# Heuristic check of whether a page is about a musician or a band
# Looks at the text nodes of the first <table class="vcard"> on the page
# Returns True if one of them begins with a keyword (case-insensitive), False otherwise
keywords = re.compile("Musician|Singer|Rapper|Members|Label", flags=re.IGNORECASE)

def is_good(soup):
    card = soup.find('table', class_="vcard")
    if not card:
        # No vcard table on the page - nothing to inspect
        return False

    # match() anchors at the start of each text node
    return any(keywords.match(text) for text in card.strings)

In [7]:
# Walks the URL list in order, downloads each page and checks it with is_good()
# Returns (url, soup) for the first page that passes the check, or (None, None) if none does
def get_wiki_page(session, url_lst):
    for candidate_url in url_lst:
        page = session.get(candidate_url)
        parsed = BeautifulSoup(page.text, 'lxml')
        if not is_good(parsed):
            continue # not a musician/band page, try the next candidate
        return candidate_url, parsed

    return None, None

In [8]:
# Stores scraped data and the page URL in the Artist row with the given id, then commits
# Parameters:
#    artist_id - artist/band id in the Artist table
#    url - Wikipedia link, written to the 'wiki' column
#    data - dict of scraped values; keys missing from it are written as NULL
def update_artist(artist_id, url, data):
    columns = ('is_band', 'genre', 'image', 'wiki', 'dob', 'origin')
    # 'wiki' comes from the url argument, every other column from the scraped dict
    values = {col: (url if col == 'wiki' else data.get(col)) for col in columns}

    artist_rows = db_session.query(Artist).filter_by(id = artist_id)
    artist_rows.update(values)
    db_session.commit()

In [9]:
# Function parses Genre info
# A genre starts with a word character and may continue with word characters, spaces, '-', '&' or '/'
genre_template = re.compile(r"[\w][-&/ \w]*")

# Returns the matched prefix of the first text node in the cell that begins like a genre,
# or None if no text node does
def parse_genre(td):
    hits = (genre_template.match(text) for text in td.strings)
    return next((hit[0] for hit in hits if hit), None)

In [10]:
# Function parses Origin info
# A text node is kept when its first character is a word character, a comma or a space
origin_template = re.compile(r"[\w, ]")

# Returns the kept text nodes of the cell glued together, or None when nothing was kept
def parse_origin(td):
    kept = [text for text in td.strings if origin_template.match(text)]
    # An empty result must become None so that NULL is stored in the table
    return "".join(kept) or None

In [11]:
# Function parses Birth info (date and place)
# Parameters:
#    td - infobox cell that follows the 'Born' header
# Returns a tuple (bday, bplace); each element is None when it could not be found
def parse_born(td):
    # Handle birth date: taken from <span class="bday">
    bday = None
    span = td.find('span', class_='bday')
    if span is not None and span.string is not None:
        # A span without a single string would otherwise be stored as the text "None"
        bday = str(span.string)

    # Handle birth place
    # May be 2 <br> - 1st after birth name, 2nd after birth date.
    # Birth name may be missing, so only 1 <br> is possible. We look for the last <br>.
    fragments = []
    td_brs = td.find_all('br')
    if td_brs:
        for sib in td_brs[-1].next_siblings:
            # Text nodes are str instances; tags expose their text through .strings.
            # An explicit type check replaces a bare except that hid every error.
            texts = [sib] if isinstance(sib, str) else sib.strings
            fragments.extend(s for s in texts if origin_template.match(s))

    # We need NULL in the table, and "" isn't NULL, only None is NULL
    bplace = "".join(fragments) or None

    return bday, bplace

In [12]:
# Based on 'scrape_wikipedia' by Leah Lindy
# Refactored to accommodate new interface and improve search of data
# Infobox row headers we care about (case-insensitive, must match the whole header text)
header_genre  = re.compile("Genres?$", re.I)
header_born   = re.compile("Born$", re.I)
header_origin = re.compile("Origin$", re.I)
header_member = re.compile("Members?$", re.I)

# Extracts artist data from the first <table class="infobox"> of a parsed page
# Parameters:
#    soup - parsed Wikipedia page
# Returns a dict with keys 'is_band', 'genre', 'image', 'dob', 'origin'
# (a value is None when it was not found), or None when the page has no infobox table
def soup_to_data(soup):
    # Step 1: Get Info Table
    artist_table = soup.find('table', class_='infobox')
    if artist_table is None:
        # Nothing to scrape; callers test the result for truthiness
        return None

    # Initialize columns
    is_band = None
    genre = None
    image = None
    dob = None
    origin = None
    pob = None # Sometimes they give place of birth and origin separately

    # Step 2: Find image
    table_img = artist_table.find('img')
    if table_img is not None:
        src = table_img.get('src')
        if src:
            # Prefix 'https:' unless the link is already absolute
            image = src if src.startswith(('http://', 'https://')) else 'https:' + src

    # Steps 3 and so on: Iterate through table rows and try to squeeze useful data from them
    for th in artist_table.find_all('th'):
        label = str(th.string)
        if header_member.match(label):
            # A 'Members' row is decisive: band it is
            is_band = 1
        elif header_genre.match(label):
            # Ask for the <td> explicitly: the direct next sibling may be missing or a text node
            cell = th.find_next_sibling('td')
            if cell is not None:
                genre = parse_genre(cell)
                # It is a musician; assume a person unless a 'Members' row says otherwise
                if genre and is_band is None:
                    is_band = 0
        elif header_born.match(label):
            cell = th.find_next_sibling('td')
            if cell is not None:
                dob, pob = parse_born(cell)
                # Sometimes DOB isn't recognized, e.g. https://en.wikipedia.org/wiki/Saint_Jhn
                if dob and is_band is None:
                    is_band = 0
        elif header_origin.match(label):
            cell = th.find_next_sibling('td')
            if cell is not None:
                origin = parse_origin(cell)

    # Handle situation when they give place of birth but don't give Origin
    # If they give both prefer Origin over place of birth
    if origin is None:
        origin = pob

    return {
        'is_band': is_band,
        'genre': genre,
        'image': image,
        'dob': dob,
        'origin': origin
    }


In [13]:
# Scrapes a parsed wiki page and stores the result in the Artist table
# Parameters:
#    artist_id - artist/band id in the Artist table
#    url - link to Wikipedia page to save in the Artist table
#    soup - page contents to scrape
# Returns True if data was scraped and saved, False if scraping produced nothing
def scrape_wiki(artist_id, url, soup):
    scraped = soup_to_data(soup)
    if scraped:
        update_artist(artist_id, url, scraped)
        return True
    return False

## Page Search

In [None]:
# Artists without a Wikipedia link yet (wiki column is NULL)
artists = db_session.query(Artist.id, Artist.name).filter_by(wiki=None).all()

# Prepare for massive querying of Wikipedia
session = requests.Session()

# Main loop: prints "<id>+ " when the artist was scraped and saved, "<id>- " otherwise
for artist_id, name in artists:
    print(artist_id, end="")

    # Candidate pages from the search API; without candidates there is nothing to download
    url_lst = get_wiki_urls(session, name)
    if url_lst:
        # First candidate that looks like a musician/band page
        url, soup = get_wiki_page(session, url_lst)
    else:
        url, soup = None, None

    # Scrape the page and update DB only when a suitable page was found
    saved = soup is not None and scrape_wiki(artist_id, url, soup)
    print("+ " if saved else "- ", end="")

print("\nDone.")

## Load data from manually added Wikipedia links

In [None]:
# Artists that already have a wiki link but whose is_band column is still NULL
artists = db_session.query(Artist.id, Artist.wiki).filter_by(is_band = None).filter(Artist.wiki != None).all()

# Main loop: prints "<id>+ " when the artist was scraped and saved, "<id>- " otherwise
for artist_id, url in artists:
    print(artist_id, end="")

    # The stored link is the only candidate; it still has to pass the page check
    url, soup = get_wiki_page(session, [url])

    # Scrape the page and update DB only when the page passed the check
    saved = soup is not None and scrape_wiki(artist_id, url, soup)
    print("+ " if saved else "- ", end="")

print("\nDone.")

# Sandbox

In [14]:
# Scratch cell: fresh HTTP session for the experiments below
# (requests is also imported in the imports section near the top)
import requests
session = requests.Session()

In [15]:
# Scratch cell: download one sample page and pick out its infobox table for the next cell
url = 'https://en.wikipedia.org/wiki/Tones_and_I'
response = session.get(url)
soup = BeautifulSoup(response.text, 'lxml')
# find() returns None when the page has no <table class="infobox">
artist_table=soup.find('table', class_='infobox')

In [21]:
# Scratch cell: walk the infobox headers of the sample page and print every field as it is parsed
headers = artist_table.find_all('th')
td = None
is_band = None
for th in headers:
    label = str(th.string)        # header text ("None" when the header has no single string)
    value_cell = th.next_sibling  # element right after the header
    if re.match("Genre", label, re.I):
        genre = parse_genre(value_cell)
        print("Genre:", genre)
        is_band = 0
    elif re.match("Born", label, re.I):
        dob, pob = parse_born(value_cell)
        print("DOB:", dob, " POB:", pob)
        if dob:
            is_band = 0
    elif re.match("Origin", label, re.I):
        origin = parse_origin(value_cell)
        print("Origin:", origin)
    elif re.match("Member", label, re.I):
        print("is band")
        is_band = 1
print(f"is_band={is_band}")

DOB: None  POB: None
Origin: Frankston, Victoria, Australia
Genre: Pop
is_band=0
