# GEOG 464 - Term Project - Cameron Brubacher

## An interactive map of Canadian literature

### Importing modules:

In [13]:
import pandas as pd
import bs4 as bs
import re
import wikipedia as wiki
import geocoder as geo
import geopandas as gpd

### Importing data:

In [2]:
def import_data(articles_file, text_file):
    """Load the article table and the full-text citation record.

    Parameters
    ----------
    articles_file
        Path of the CSV file with articles from the library database;
        passed straight to ``pd.read_csv``.
    text_file
        Path of the text version of the HTML file with full-text citations.

    Returns
    -------
    tuple
        ``(articles, record)``: the DataFrame read from ``articles_file`` and
        a list holding every line of ``text_file`` with leading/trailing
        whitespace stripped (blank lines are kept as empty strings).
    """
    # Article table exported from the library database:
    articles = pd.read_csv(articles_file)

    # Full-text citations, one stripped string per line of the file:
    with open(text_file, 'r') as handle:
        record = [raw.strip() for raw in handle]
    return articles, record

### Cleaning article data:

In [3]:
def _split_work_type(title):
    """Return ``(work_type, work_sub_type)`` parsed from an article title.

    The split is driven by how many ';' and ':' separators the title holds.
    ``None`` in either position means "leave that column untouched".
    """
    semicolons = title.count(';')
    colons = title.count(':')
    first_semi = title.find(';')
    last_semi = title.rfind(';')
    last_colon = title.rfind(':')

    if semicolons >= 2:
        return title[first_semi + 2:last_semi], title[last_semi + 2:]
    if semicolons == 1 and colons > 1:
        return title[first_semi + 2:last_colon], title[last_colon + 2:]
    if semicolons == 1:
        return title[last_semi + 2:], None
    if colons > 2:
        return title[title.find(':') + 2:last_colon], title[last_colon + 2:]
    if colons == 1:
        # No semicolons remain possible here, so this is the plain
        # 'Part n: <type>' form.
        return title[last_colon + 2:], None
    return None, None


def clean_art(dataset):
    """Clean the article DataFrame in place and return it.

    Drops unused columns, renames ambiguous ones, inserts a ':' at position 6
    of article titles that lack it, trims the first two and the last
    character of the ISBN, and adds 'Work Type' / 'Work Sub-Type' columns
    parsed from the article title.

    The title is read from the first column and the ISBN from the fourth
    column (by position, after the drop), exactly as before; positional
    access now goes through ``.iloc`` because integer keys on a
    label-indexed Series are deprecated as positional lookups.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Article table; must contain every column listed in the drop step.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, modified in place.
    """
    # Remove unnecessary/empty columns:
    dataset.drop(
        columns=['ISSN', 'Publication Date', 'Issue', 'DOI', 'Doctype', 'Keywords', 'Abstract'],
        inplace=True,
    )

    # Rename ambiguous columns:
    dataset.rename(
        columns={'Author': 'Journal Author', 'Publisher': 'Journal Publisher', 'PLink': 'Permalink'},
        inplace=True,
    )

    # Reformat article titles so all begin with 'Part n:':
    for i, row in dataset.iterrows():
        title = row.iloc[0]
        if title[6] != ':':
            dataset.loc[i, 'Article Title'] = title[:6] + ':' + title[6:]

    # Reformat ISBN (strip the two leading and one trailing wrapper characters):
    for i, row in dataset.iterrows():
        dataset.loc[i, 'ISBN'] = row.iloc[3][2:-1]

    # Create empty work type and work sub-type columns:
    dataset['Work Type'] = pd.Series(dtype='string')
    dataset['Work Sub-Type'] = pd.Series(dtype='string')

    # Assign work type and work sub-types based on content of article titles:
    for i, row in dataset.iterrows():
        work_type, work_sub_type = _split_work_type(row.iloc[0])
        if work_type is not None:
            dataset.loc[i, 'Work Type'] = work_type
        if work_sub_type is not None:
            dataset.loc[i, 'Work Sub-Type'] = work_sub_type
    return dataset

### Cleaning record data:

In [4]:
def clean_record(dataset):
    """Remove entries of three characters or fewer from the record, in place.

    Parameters
    ----------
    dataset : list of str
        Stripped lines of the full-text record.

    Returns
    -------
    list of str
        The same list object, now holding only entries longer than three
        characters, in their original order.
    """
    # Remove entries based on length. Slice assignment filters in one pass
    # while keeping the caller's list object (the function mutates its
    # argument as well as returning it).
    dataset[:] = [entry for entry in dataset if len(entry) > 3]
    return dataset

### Tabulating metadata:

In [5]:
def table_meta(dataset):
    """Tabulate the metadata of one sub-record into a single-row DataFrame.

    Every entry ending in ':' is treated as a header and becomes a column.
    A header's value is the entry that directly follows it, unless that
    entry is itself a header (or the header is the last entry), in which
    case the column keeps its initial ``None``.

    Parameters
    ----------
    dataset : list of str
        Entries of one sub-record.

    Returns
    -------
    pandas.DataFrame
        One row, one column per header entry.
    """
    # Create list of headers based on sub-record metadata:
    headers = [entry for entry in dataset if entry.endswith(':')]

    # Create empty metadata dataframe with list of headers:
    meta = pd.DataFrame(data=[[None]*len(headers)], columns=headers)

    # Add metadata to dataframe. Pairing each entry with its successor makes
    # the "header is the last entry" case fall out naturally, and an entry
    # is a header exactly when it ends in ':', so no scan over the header
    # list is needed.
    for entry, following in zip(dataset, dataset[1:]):
        if entry.endswith(':') and not following.endswith(':'):
            meta[entry] = following
    return meta

### Cleaning metadata:

In [6]:
def clean_meta(dataset):
    """Clean a sub-record metadata DataFrame in place and return it.

    Drops the 'Other Title:' and 'Links:' columns when present, renames the
    raw header columns, and rewrites each 'Work Author' value from the
    subject string into title-cased "First Last" order.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Metadata table whose columns are the raw header strings.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, modified in place.
    """
    # Remove unnecessary/empty columns (columns that are absent are ignored,
    # and the frame is no longer modified while its columns are iterated):
    dataset.drop(columns=['Other Title:', 'Links:'], errors='ignore', inplace=True)

    # Rename columns:
    dataset.rename(columns={'Title:': 'Part Title', 'Record Type:': 'Record Type', 'Series Title:': 'Series Title', 'Author(s):': 'Series Author', 'Genre(s):': 'Series Genre', 'Subject(s):': 'Work Author', 'Book Source:': 'Source', 'Accession Number:': 'Accession Number', 'Database:': 'Database'}, inplace=True)

    # Nothing to reformat when the sub-record had no 'Subject(s):' header:
    if 'Work Author' not in dataset.columns:
        return dataset

    # Reformat work author column:
    for i in dataset.index:
        subject = dataset.loc[i, 'Work Author']
        if not isinstance(subject, str):
            # Header without a value (None/NaN): leave the cell as it is
            # instead of failing on a missing string method.
            continue
        # Text between the first ':' (plus the following space) and the
        # first ';' is taken as the author, written as 'LAST, First'.
        author = subject[subject.find(':') + 2:subject.find(';')]
        comma = author.find(',')
        author = author[comma + 2:] + author[:comma]
        dataset.loc[i, 'Work Author'] = author.title()
    return dataset

### Adding geographic and biographic information to metadata:

In [7]:
# Birthplaces of the authors covered by the bibliography, keyed by the
# reformatted 'Work Author' name:
_BIRTHPLACES = {
    'Alice Munro': 'Wingham, Ontario',
    'Anne Hebert': 'Sainte-Catherine-de-la-Jacques-Cartier, Quebec',
    'Earle Birney': 'Calgary, Alberta',
    'Ernest Buckler': 'West Dalhousie, Nova Scotia',
    'Ethel Wilson': 'Vancouver, British Columbia',
    'Gabrielle Roy': 'Saint Boniface, Manitoba',
    'Hugh Hood': 'Toronto, Ontario',
    'Marian Engel': 'Toronto, Ontario',
    'Morley Callaghan': 'Toronto, Ontario',
    'Margaret Atwood': 'Ottawa, Ontario',
    'Margaret Laurence': 'Neepawa, Manitoba',
    'Mavis Gallant': 'Montreal, Quebec',
    'Michael Ondaatje': 'Montreal, Quebec',
    'Mordecai Richler': 'Montreal, Quebec',
    'Patricia K. Page': 'Red Deer, Alberta',
    'Robert Kroetsch': 'Heisler, Alberta',
    'Robertson Davies': 'Thamesville, Ontario',
    'Sinclair Ross': 'Shellbrook, Saskatchewan',
    'Thomas Raddall': 'Halifax, Nova Scotia',
    'W.O. Mitchell': 'Weyburn, Saskatchewan',
    'Leonard Cohen': 'Westmount, Quebec',
}


def geo_bio(dataset):
    """Add biography, birthplace and coordinates columns to the metadata.

    For every row the 'Work Author' name is used to fetch a one-sentence
    Wikipedia summary (best effort), to look up the birthplace in the
    author table, and to geocode that birthplace with the OSM geocoder.
    Rows whose author is not in the table, or whose birthplace cannot be
    geocoded, keep missing values in the affected columns.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Metadata table; must contain a 'Work Author' column.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with 'Author Biography',
        'Author Birthplace', 'latitude' and 'longitude' columns added.

    Raises
    ------
    KeyError
        If ``dataset`` has no 'Work Author' column.
    """
    # Create columns for author's biography, birthplace and coordinates:
    dataset['Author Biography'] = pd.Series(dtype='string')
    dataset['Author Birthplace'] = pd.Series(dtype='string')
    dataset['latitude'] = pd.Series(dtype='float')
    dataset['longitude'] = pd.Series(dtype='float')

    for i in dataset.index:
        # Extract author's name:
        author = dataset.loc[i, 'Work Author']

        # Add short biography to dataframe. The lookup is best effort (missing
        # or ambiguous pages, network failures), but only ordinary errors are
        # swallowed so that KeyboardInterrupt/SystemExit still propagate.
        try:
            dataset.loc[i, 'Author Biography'] = wiki.summary(author, sentences=1, auto_suggest=True, redirect=True)
        except Exception:
            pass

        # Add birthplace to dataframe:
        place = _BIRTHPLACES.get(author)
        if place is None:
            # Unknown author: there is nothing to geocode.
            continue
        dataset.loc[i, 'Author Birthplace'] = place

        # Geocode coordinates of birthplace:
        location = geo.osm(place)
        latlng = location.latlng

        # Add coordinates to dataframe (latlng is empty/None when the
        # geocoder found no match, in which case the row keeps NaN):
        if latlng:
            dataset.loc[i, 'latitude'] = latlng[0]
            dataset.loc[i, 'longitude'] = latlng[1]
    return dataset

### Tabulating citation data:

In [8]:
def table_cit(dataset):
    """Extract the bibliographic entries of a sub-record as lists of fields.

    An entry is bibliographic when it starts with an item code: 'A' followed
    by one or more digits. The code and the spaces after it are removed,
    the remainder is split on '.', the piece after the final '.' is
    discarded, and every remaining piece is stripped of whitespace.

    Only the item code itself is removed, so titles that begin with 'A' or
    with a digit (e.g. 'A Bird in the House') are kept intact.

    Parameters
    ----------
    dataset : list of str
        Entries of one sub-record.

    Returns
    -------
    list of list of str
        One list of fields per bibliographic entry, in record order.
    """
    # Item code at the start of a bibliographic entry, plus trailing spaces:
    code = re.compile(r'A\d+ *')

    cits = []
    for entry in dataset:
        found = code.match(entry)
        if found is None:
            continue

        # Drop the item code, then split the entry into its fields:
        work = entry[found.end():]
        fields = [part.strip() for part in work.split('.')[:-1]]
        cits.append(fields)
    return cits

### Cleaning citation data:

In [9]:
def clean_cit(dataset):
    """Turn lists of citation fields into a citation DataFrame.

    For each entry the first field is the work title. A later field that
    contains both ':' and ',' is split into publisher city, publisher and
    year; a field containing 'pp' is stored as the page count; every other
    field is collected into the notes, joined with '. '. Entries without
    any note get ``None`` in 'Notes', and an entry with no fields at all
    yields a row of missing values.

    Parameters
    ----------
    dataset : list of list of str
        Citation fields, one inner list per work.

    Returns
    -------
    pandas.DataFrame
        One row per entry with the columns 'Work Title', 'Year', 'Pages',
        'Work Publisher', 'Publisher City' and 'Notes'.
    """
    headers = ['Work Title', 'Year', 'Pages', 'Work Publisher', 'Publisher City', 'Notes']
    cit = pd.DataFrame(index=range(len(dataset)), columns=headers)

    for i, entry in enumerate(dataset):
        notes = []
        for position, string in enumerate(entry):
            if position == 0:
                cit.loc[i, 'Work Title'] = string
            elif (':' in string) and (',' in string):
                # 'City: Publisher, Year'
                colon = string.find(':')
                comma = string.rfind(',')
                cit.loc[i, 'Publisher City'] = string[:colon]
                cit.loc[i, 'Work Publisher'] = string[colon + 2:comma]
                cit.loc[i, 'Year'] = string[comma + 2:]
            elif 'pp' in string:
                cit.loc[i, 'Pages'] = string
            else:
                notes.append(string)

        # Written once per entry, outside the field loop, so that an entry
        # without fields still gets a well-defined (missing) value.
        cit.loc[i, 'Notes'] = '. '.join(notes) if notes else None
    return cit

### Calling functions:

In [10]:
# Import citation and full-text html data:
articles, record = import_data('data/articles.csv', 'data/EBSCOhost.txt')

# Clean article and HTML record data:
articles = clean_art(articles)
record = clean_record(record)

# Create list of split points (at 'Record: n') in HTML record data:
splits = [i for i, entry in enumerate(record) if 'Record:' in entry]

# Collect the per-record frames in lists and concatenate once after the loop
# (growing a dataframe with pd.concat on every iteration copies all earlier
# rows each time):
meta_frames = []
cit_frames = []
metacit_frames = []

# Iterate through the HTML record data:
for i, n in enumerate(splits):
    # Create sub-record for each group of entries in HTML record data; the
    # last sub-record runs to the end of the record:
    end = splits[i+1] if i+1 < len(splits) else None
    sub_rec = record[n:end]

    # Tabulate and clean sub-record metadata:
    meta = table_meta(sub_rec)
    meta = clean_meta(meta)
    meta = geo_bio(meta)

    # Append sub-record metadata to metadata frames:
    meta_frames.append(meta)

    # Tabulate and clean sub-record citation data:
    citlist = table_cit(sub_rec)
    cit = clean_cit(citlist)

    # A sub-record without citations contributes no citation or joined rows
    # (and pd.concat cannot be called on an empty list of frames):
    if cit.empty:
        continue

    # Append sub-record citation data to citation frames:
    cit_frames.append(cit)

    # Append sub-record citation data to sub-record metadata (the metadata
    # row is repeated once per citation so the two line up on the index):
    metas = pd.concat([meta]*(len(cit)), ignore_index=True)
    submetacit = metas.join(cit)

    # Append sub-record metadata and citation data to joined frames:
    metacit_frames.append(submetacit)

# Create metadata, citation, and joined dataframes:
metadata = pd.concat(meta_frames, ignore_index=True) if meta_frames else pd.DataFrame()
citations = pd.concat(cit_frames, ignore_index=True) if cit_frames else pd.DataFrame()
metacit = pd.concat(metacit_frames, ignore_index=True) if metacit_frames else pd.DataFrame()

# Merge articles dataframe with joined metadata-citation dataframe to create master dataframe:
master = articles.merge(metacit, on='Accession Number')

In [14]:
# Convert master dataframe to geodataframe:
# The coordinates are the latitude/longitude pairs returned by the OSM
# geocoder, so the geometry is declared as WGS 84 (EPSG:4326); without a CRS
# the geodataframe cannot be reprojected or overlaid on other layers.
geometry = gpd.points_from_xy(master.longitude, master.latitude)
master_gdf = gpd.GeoDataFrame(master, geometry=geometry, crs='EPSG:4326')