# GEOG 464 - Term Project - Cameron Brubacher

## An interactive map of Canadian literature

### Importing modules:

In [1]:
import pandas as pd
import bs4 as bs

### Importing data:

In [2]:
# Article records exported as CSV.
articles = pd.read_csv('data/delivery.csv')
# Read the saved HTML page up front so the file handle is closed right away;
# BeautifulSoup accepts the document text just as well as an open file.
with open('data/EBSCOhost.html') as html_file:
    html = html_file.read()

### Cleaning article data:

In [3]:
def clean_art(dataset):
    """Tidy the article table in place and return it.

    Drops the unused columns, renames 'Author', 'Publisher' and 'PLink',
    adds string-typed 'Work Type' and 'Work Genre' columns, and then fills
    'ISBN', 'Work Type' and 'Work Genre' row by row.

    The row values are read by column *position* (0 and 3, counted after the
    drop). NOTE(review): position 3 is presumably the raw ISBN column and
    position 0 a semicolon-separated descriptor string -- confirm against the
    CSV layout.

    Args:
        dataset: DataFrame holding the exported article records.

    Returns:
        The same DataFrame object, modified in place.

    Raises:
        KeyError: if any of the columns to drop is missing.
    """
    dataset.drop(
        columns=['ISSN', 'Publication Date', 'Issue', 'DOI', 'Doctype',
                 'Keywords', 'Abstract'],
        inplace=True,
    )
    dataset.rename(
        columns={'Author': 'Journal Author',
                 'Publisher': 'Journal Publisher',
                 'PLink': 'Permalink'},
        inplace=True,
    )
    dataset['Work Type'] = pd.Series(dtype='string')
    dataset['Work Genre'] = pd.Series(dtype='string')
    for i, row in dataset.iterrows():
        # .iloc makes the positional access explicit; plain row[3] on a
        # string-labelled row is a deprecated fallback in pandas.
        descriptor = row.iloc[0]
        first_sep = descriptor.find(';')
        last_sep = descriptor.rfind(';')
        # Strip the two leading characters and the trailing one from the
        # raw value at position 3.
        dataset.loc[i, 'ISBN'] = row.iloc[3][2:-1]
        # Text between the first and the last semicolon.
        dataset.loc[i, 'Work Type'] = descriptor[first_sep + 2:last_sep]
        # Text after the last semicolon.
        dataset.loc[i, 'Work Genre'] = descriptor[last_sep + 2:]
    return dataset

### Parsing HTML files:

In [4]:
def parse_html(file):
    """Collect the text fragments found inside the 'records' div.

    Only the <div id="records"> element is parsed. Each whitespace-stripped
    string in it is returned as its repr(), i.e. wrapped in quotes and with
    non-printable characters shown as escape sequences.

    Args:
        file: the HTML document, passed straight to BeautifulSoup.

    Returns:
        A list of repr() strings in document order.
    """
    records_only = bs.SoupStrainer('div', {'id': 'records'})
    soup = bs.BeautifulSoup(file, 'html.parser', parse_only=records_only)
    return [repr(text) for text in soup.stripped_strings]

### Cleaning HTML data:

In [5]:
def clean_html(dataset):
    """Strip quoting from the parsed strings and discard the noise entries.

    Every entry has leading/trailing single quotes removed. An entry is then
    discarded when it is three characters or shorter, or when it contains
    any of the marker substrings below.

    The result is built as a new list. Deleting from the list while
    enumerating it skips the element after each deletion and can delete an
    unrelated neighbour when one entry matches two markers.

    Args:
        dataset: iterable of strings.

    Returns:
        A new list with the surviving, quote-stripped entries in order.
    """
    unwanted = ('underbar', 'Entry', 'publication', 'cross references')
    cleaned = []
    for entry in dataset:
        entry = entry.strip("'")
        if len(entry) <= 3:
            continue
        if any(marker in entry for marker in unwanted):
            continue
        cleaned.append(entry)
    return cleaned

### Tabulating metadata:

In [6]:
def table_meta(dataset):
    """Arrange 'Header:' / value pairs into a one-row DataFrame.

    Every entry ending in ':' becomes a column (colon removed), initialised
    to None. Each entry is then paired with the entry that follows it: when
    the follower is not itself a header, it is stored under every column
    whose name occurs anywhere inside the current entry (substring match, so
    a later match overwrites an earlier one).

    Args:
        dataset: list of strings.

    Returns:
        A DataFrame with a single row and one column per header.
    """
    headers = [entry[:-1] for entry in dataset if entry.endswith(':')]
    meta = pd.DataFrame(data=[[None] * len(headers)], columns=headers)
    # The last entry has no follower, so it can never supply a value.
    for entry, following in zip(dataset, dataset[1:]):
        if following.endswith(':'):
            continue
        for header in headers:
            if header in entry:
                meta[header] = following
    return meta

### Cleaning metadata:

In [7]:
def clean_meta(dataset):
    """Tidy the metadata table in place and return it.

    Drops the unused columns, renames 'Other Title', 'Author(s)', 'Genre(s)'
    and 'Authors discussed', then rewrites 'Work Author' and 'Series Title'
    row by row.

    The row values are read by column *position* (2 and 5, counted after the
    drop). NOTE(review): position 5 is presumably the 'Authors discussed'
    value in "SURNAME, Given; ..." form and position 2 the series title
    wrapped in one extra character on each side -- confirm against the
    scraped record.

    Args:
        dataset: DataFrame of metadata.

    Returns:
        The same DataFrame object, modified in place.

    Raises:
        KeyError: if any of the columns to drop is missing.
    """
    dataset.drop(
        columns=['Title', 'Subject(s)', 'Links', 'Linking Note',
                 'Book Source', 'Source', 'Item Number', 'Cut and Paste',
                 'Persistent link to this record (Permalink)'],
        inplace=True,
    )
    dataset.rename(
        columns={'Other Title': 'Part Title',
                 'Author(s)': 'Series Author',
                 'Genre(s)': 'Series Genre',
                 'Authors discussed': 'Work Author'},
        inplace=True,
    )
    for i, row in dataset.iterrows():
        # .iloc makes the positional access explicit; plain row[5] on a
        # string-labelled row is a deprecated fallback in pandas.
        # partition() keeps the whole text when the separator is absent,
        # whereas slicing with find() == -1 would drop the last character.
        name = row.iloc[5].partition(';')[0]
        surname, comma, given = name.partition(',')
        if comma:
            # "Surname, Given" -> "Given Surname"
            name = ' '.join([given.strip(), surname.strip()])
        else:
            name = name.strip()
        dataset.loc[i, 'Work Author'] = name.title()
        # Remove the first and the last character of the raw series title.
        dataset.loc[i, 'Series Title'] = row.iloc[2][1:-1]
    return dataset

### Tabulating citation data:

In [8]:
def table_cit(dataset):
    """Split citation strings into a table of raw citation fields.

    An entry is treated as a citation list when it starts with 'A' and
    contains 'xa'. Such an entry is split on the backslash character, and
    every resulting fragment that contains a '.' is split on '.' again. The
    first three pieces are stored as 'Work Title', 'Work Publisher' and
    'Pages'; 'Publisher City' and 'Year' are left as None.

    All rows are collected first and the DataFrame is built once, which
    avoids concatenating onto an empty frame on every iteration.

    Args:
        dataset: iterable of strings.

    Returns:
        A DataFrame with columns 'Work Title', 'Work Publisher',
        'Publisher City', 'Year' and 'Pages', one row per fragment.

    Raises:
        IndexError: if a fragment yields fewer than three '.'-separated
            pieces.
    """
    headers = ['Work Title', 'Work Publisher', 'Publisher City', 'Year',
               'Pages']
    rows = []
    for entry in dataset:
        if not (entry.startswith('A') and 'xa' in entry):
            continue
        for fragment in entry.split('\\'):
            if '.' not in fragment:
                continue
            parts = fragment.split('.')
            rows.append([parts[0], parts[1], None, None, parts[2]])
    return pd.DataFrame(rows, columns=headers)


### Cleaning citation data:

In [9]:
def clean_cit(dataset):
    """Split the raw citation fields into title, city, publisher and year.

    For every row, 'Work Title' is read and 'Work Publisher' is expected to
    hold text shaped like " City: Publisher, Year". The row is then
    rewritten in place:

    * 'Work Title' loses a leading 'xa0' (the residue of an escaped
      non-breaking space), if present. Only that prefix is removed, so
      titles that contain the digit 0 stay intact.
    * 'Publisher City' is the text from the second character up to the last
      ':'.
    * 'Year' is the text after the last ', ', without its first and last
      character when it contains '['.
    * 'Work Publisher' is the text between the last ': ' and the last ','.

    Args:
        dataset: DataFrame with the columns 'Work Title', 'Work Publisher',
            'Publisher City' and 'Year'.

    Returns:
        The same DataFrame object, modified in place.
    """
    for i, row in dataset.iterrows():
        # Read by label: integer keys on a string-labelled row are a
        # deprecated positional fallback in pandas.
        title = row['Work Title']
        imprint = row['Work Publisher']
        if title.startswith('xa0'):
            title = title[len('xa0'):]
        colon = imprint.rfind(':')
        comma = imprint.rfind(',')
        year = imprint[comma + 2:]
        if '[' in year:
            # Drop the first and last character, e.g. "[1985]" -> "1985".
            year = year[1:-1]
        dataset.loc[i, 'Work Title'] = title
        dataset.loc[i, 'Publisher City'] = imprint[1:colon]
        dataset.loc[i, 'Year'] = year
        dataset.loc[i, 'Work Publisher'] = imprint[colon + 2:comma]
    return dataset

### Combining data:

In [11]:
def combine(articles, metadata, citations):
    """Join article, metadata and citation tables into one table of works.

    The metadata frame is repeated once per citation row and joined to the
    citations on the index. The 'Accession Number' in the first row of that
    joined frame selects the matching article rows, which are then merged
    with it on 'Accession Number'. The result is restricted to, and ordered
    by, a fixed list of columns.

    Args:
        articles: DataFrame of article records.
        metadata: DataFrame of record metadata.
        citations: DataFrame of cited works.

    Returns:
        A new DataFrame with the columns listed in ``column_order``.
    """
    column_order = [
        'Database', 'Record Type', 'Series Genre', 'Series Title', 'Volume',
        'Journal Title', 'Journal Publisher', 'Journal Author',
        'Series Author', 'First Page', 'Page Count', 'Article Title',
        'Part Title', 'Topics', 'Subjects', 'Work Author', 'Work Type',
        'Work Genre', 'Work Title', 'Year', 'Pages', 'Work Publisher',
        'Publisher City', 'ISBN', 'Accession Number', 'Permalink',
    ]
    # One copy of the metadata per cited work, lined up by row index.
    repeated = pd.concat([metadata] * len(citations), ignore_index=True)
    enriched = repeated.join(citations)
    accession = enriched.loc[0, 'Accession Number']
    matching = articles.loc[articles['Accession Number'] == accession]
    merged = matching.merge(enriched, on='Accession Number')
    return merged[column_order]

### Calling functions:

In [12]:
# Article table: tidy the CSV export.
articles = clean_art(articles)

# HTML record: extract the text fragments, then filter out the noise.
record = clean_html(parse_html(html))

# Build and tidy the metadata and citation tables from the same record.
metadata = clean_meta(table_meta(record))
citations = clean_cit(table_cit(record))

# Merge everything into the final table of works.
works = combine(articles, metadata, citations)