In [1]:
# UT-TOR-DATA-PT-01-2020-U-C Group Project 2
# Find as many artist wiki pages as possible
# Use the Artist table for the list of artists; save detected Wikipedia page links in the WikiLinks table

## Imports

In [2]:
# SQLite Database related imports
import sqlalchemy
from sqlalchemy.engine import Engine
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, event, inspect

In [3]:
# Scraping related imports
from bs4 import BeautifulSoup
import requests

In [4]:
import time
from urllib.parse import quote

## Initializations

### DB access initialization

In [None]:
# Turn on PRAGMA foreign_keys to enforce foreign key constraints (it is disabled by default in SQLite)
@event.listens_for(Engine, "connect")
def set_sqlite_pragma(dbapi_connection, connection_record):
    """Enable foreign-key enforcement on a freshly opened DB-API connection.

    Registered as a "connect" listener on the ``Engine`` class, so it applies
    to every engine created after this cell runs, not only the one below.

    Args:
        dbapi_connection: The raw DB-API connection that was just opened.
        connection_record: Pool bookkeeping object supplied by SQLAlchemy;
            unused here.
    """
    cursor = dbapi_connection.cursor()
    try:
        cursor.execute("PRAGMA foreign_keys=ON")
    finally:
        # Release the cursor even if the PRAGMA statement fails
        cursor.close()

# Create engine to access the database
# (three-slash SQLite URL, i.e. a path relative to the notebook's working directory)
engine = create_engine("sqlite:///../data/CanadaTop100.sqlite")

# Reflect an existing database into a new model
AutomapBase = automap_base()

# Reflect the tables
# NOTE(review): `reflect=True` is the legacy spelling; newer SQLAlchemy releases
# expect `prepare(autoload_with=engine)` - confirm against the installed version.
AutomapBase.prepare(engine, reflect=True)

# Save references to each table
# (the later cells query and write through these two mapped classes)
Artist = AutomapBase.classes.Artist
WikiLinks = AutomapBase.classes.WikiLinks

# Create our session (link) from Python to the DB
# The same session object is reused by every cell below.
session = Session(engine)

# Debug output
#AutomapBase.classes.items()
# Last expression of the cell: lists the columns reflected for Artist
list(inspect(Artist).columns)

## Helper functions

In [6]:
# Tries to detect if a page is about a musician or a band
# Returns True if the page is detected as about a musician or a band
def check_url(url, delay=1.5, timeout=10):
    """Fetch ``url`` and report whether it looks like a musician/band article.

    A page counts as a match when it contains a ``<table>`` with class
    ``vcard`` that has a row header (``<th scope="row">``) whose text is
    exactly ``Genres``.

    Args:
        url: Address of the page to inspect.
        delay: Seconds to sleep before the request, to throttle the scrape.
            Pass 0 to disable.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        True if the page matches, False if the page does not exist (HTTP 404)
        or has no such infobox row.

    Raises:
        requests.RequestException: On a timeout, connection failure, or an
            HTTP error other than 404 (e.g. rate limiting). These are raised
            rather than reported as "not found" so that a transient failure
            is not stored as a negative result.
    """
    if delay and delay > 0:
        time.sleep(delay)

    # Without a timeout a single stalled connection would hang the whole run
    response = requests.get(url, timeout=timeout)

    # A missing article is an ordinary negative answer...
    if response.status_code == 404:
        return False
    # ...but any other HTTP error says nothing about the article itself
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'lxml')
    infobox = soup.find('table', class_="vcard")
    if infobox is None:
        return False

    # Search the whole table instead of `infobox.tbody`, which is None
    # (and would raise AttributeError) when the markup has no <tbody>
    genres = infobox.find('th', string='Genres', scope='row')
    return genres is not None


## Page Search

In [14]:
# Pick the artists that still need a lookup: those with no WikiLinks row yet
checked_ids = session.query(WikiLinks.artist_id)

artists = (
    session.query(Artist.id, Artist.name)
    .filter(Artist.id.notin_(checked_ids))
    .order_by(Artist.id)
    .all()
)

# Article-title suffixes, tried in this order for each artist
suffixes = ['', '_(musician)', '_(band)', '_(rapper)']

In [None]:
# For every pending artist, probe the candidate article titles and store the
# first URL that check_url() accepts (or NULL when none does).
# Progress output: "<id>+ " for a hit, "<id>- " for a miss.
for artist_id, artist_name in artists:
    print(artist_id, end="")

    # Prepare the name for URL usage: spaces become underscores, and characters
    # that would otherwise end or corrupt the path ('?', '#', '%', ...) are
    # percent-encoded. Punctuation that is valid in a path is left untouched.
    name = quote(artist_name.replace(' ', '_'), safe="/:;@$!*(),'")

    # Try different variants of article names
    for sfx in suffixes:
        url = 'https://en.wikipedia.org/wiki/' + name + sfx
        if check_url(url):
            print("+ ", end="")
            break
    else:
        # No variant matched (or there were no suffixes to try)
        print("- ", end="")
        url = None

    # Save the result; committing per artist lets an interrupted run resume
    session.add(WikiLinks(artist_id=artist_id, url=url))
    try:
        session.commit()
    except Exception:
        # Leave the session usable for a re-run, then surface the error
        session.rollback()
        raise

print("\nDone")

In [22]:
# Second take: retry only the artists whose stored URL is still NULL
# Suffixes
suffixes = ['_(rapper)']

# Artist ids whose WikiLinks row has no URL
undetected = session.query(WikiLinks.artist_id).filter(WikiLinks.url.is_(None))

artists = (
    session.query(Artist.id, Artist.name)
    .filter(Artist.id.in_(undetected))
    .order_by(Artist.id)
    .all()
)

In [None]:
# For every artist still without a URL, probe the candidate article titles and
# update the existing WikiLinks row with the first URL check_url() accepts.
# Progress output: "<id>+ " for a hit, "<id>- " for a miss.
for artist_id, artist_name in artists:
    print(artist_id, end="")

    # Prepare the name for URL usage: spaces become underscores, and characters
    # that would otherwise end or corrupt the path ('?', '#', '%', ...) are
    # percent-encoded. Punctuation that is valid in a path is left untouched.
    name = quote(artist_name.replace(' ', '_'), safe="/:;@$!*(),'")

    # Try different variants of article names
    for sfx in suffixes:
        url = 'https://en.wikipedia.org/wiki/' + name + sfx
        if check_url(url):
            print("+ ", end="")
            break
    else:
        # No variant matched (or there were no suffixes to try)
        print("- ", end="")
        url = None

    # Save the result. The rows selected for this pass already hold NULL,
    # so a miss needs no write.
    if url is not None:
        session.query(WikiLinks).\
            filter(WikiLinks.artist_id == artist_id).\
            update({ "url" : url })
        try:
            session.commit()
        except Exception:
            # Leave the session usable for a re-run, then surface the error
            session.rollback()
            raise

print("\nDone")