1. Create a Flask-based website that provides a query interface to BibTeX
bibliographic data.

The website must allow a user to upload a BibTeX file,
store the contents of the file in a database, and provide a query interface to the
database.

Upon insertion, each set of bibliography entries (contained within the “.bib”
file) is a “collection”, and the collection name is provided by the user. 

This is
useful if a user has multiple BibTeX files, possibly relating to different research
projects or pertaining to different fields of study.

The database must have columns for citation tag, author list, journal, volume,
pages, year, title, and collection. 

The website provides a query interface by
passing user-entered SQL statements to sqlite3.

2. Get your (public) GitHub repo set up with Travis CI and make sure that you
have at least 70% code coverage.

### Stuff below: how to upload files. Copied from flask website.

In [1]:
%%writefile app.py
# https://flask.palletsprojects.com/en/2.1.x/patterns/fileuploads/
import os
import pandas as pd
import numpy as np
from flask import Flask, flash, request, redirect, url_for, send_from_directory, render_template
from werkzeug.utils import secure_filename
from pybtex.database import parse_file 
import sqlite3

"""
Useful examples: https://stackoverflow.com/questions/62004957/crating-an-if-statement-for-different-html-option-in-flask-python
"""

# Directory on disk where uploaded .bib files are saved.
UPLOAD_FOLDER = '/home/jovyan/python-ay250-homeworks/hw_7/bibuploads'
# File extensions accepted by allowed_file().
ALLOWED_EXTENSIONS = {'bib'}

app = Flask(__name__)
app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
# flash() keeps its messages in the session, and Flask refuses to use the
# session without a secret key.  Take the key from the environment when it
# is provided; otherwise fall back to a random per-process key, which is
# enough for the development server (flashed messages simply do not
# survive a restart).
app.config['SECRET_KEY'] = os.environ.get('FLASK_SECRET_KEY') or os.urandom(24)
def allowed_file(filename):
    """Return True when *filename* ends in an extension from ALLOWED_EXTENSIONS.

    The comparison is case-insensitive; names without a dot are rejected.
    """
    if '.' not in filename:
        return False
    extension = filename.rsplit('.', 1)[1]
    return extension.lower() in ALLOWED_EXTENSIONS

def _strip_outer_braces(text):
    """Remove one pair of braces that encloses the whole string.

    '{Fernley}' becomes 'Fernley'; text that is not wrapped in a single
    matching pair (e.g. 'Smith' or '{A} and {B}') is returned unchanged.
    """
    if len(text) < 2 or text[0] != '{' or text[-1] != '}':
        return text
    depth = 0
    for char in text[:-1]:
        if char == '{':
            depth += 1
        elif char == '}':
            depth -= 1
        if depth == 0:
            # The opening brace closes before the end of the string, so the
            # first and last characters are not a matching pair.
            return text
    return text[1:-1]


def _int_or_nan(text):
    """Convert a numeric BibTeX field to int; non-numeric text becomes NaN."""
    try:
        return int(text)
    except (TypeError, ValueError):
        return np.nan


def parse_bib_to_df(bibfile, collection_name):
    """
    Parse a bibtex file into a pandas dataframe.
    Keeps only the tag, author, journal, volume, pages, year, and title,
    plus the user-supplied collection name.
    (For easy insertion into an SQL database later.)

    The result has one row per (entry, author) pair.  Entries without an
    'Author' person list (e.g. editor-only entries) are skipped.  A field
    that is missing from an entry is stored as NaN for that entry; volume
    and year are also stored as NaN when they are not plain integers.

    bibfile : a .bib file
    collection_name : string

    Returns a DataFrame with the columns tag, author, journal, volume,
    pages, year, title and collection, in that order.
    """
    bib_data = parse_file(bibfile, "bibtex")

    columns = ['tag', 'author', 'journal', 'volume',
               'pages', 'year', 'title', 'collection']
    records = []

    for tag, entry in bib_data.entries.items():
        if 'Author' not in entry.persons:
            continue

        # Resolve the per-entry fields once.  Every field gets its own
        # default, so a missing value can neither clobber the journal nor
        # inherit the value of the previously processed entry.
        fields = entry.fields

        journal = np.nan
        if 'journal' in fields:
            journal = fields['journal']
            # Journal macros such as "\aap" are stored without the backslash.
            if journal.startswith('\\'):
                journal = journal[1:]

        volume = _int_or_nan(fields['volume']) if 'volume' in fields else np.nan
        pages = fields['pages'] if 'pages' in fields else np.nan
        year = _int_or_nan(fields['year']) if 'year' in fields else np.nan
        title = _strip_outer_braces(fields['title']) if 'title' in fields else np.nan

        for person in entry.persons['Author']:
            records.append({
                'tag': tag,
                'author': _strip_outer_braces(person.last_names[0]),
                'journal': journal,
                'volume': volume,
                'pages': pages,
                'year': year,
                'title': title,
                'collection': collection_name,
            })

    # Passing columns explicitly keeps the schema even when no row was built.
    return pd.DataFrame(records, columns=columns)

def df_to_sql(df):
    """Append the rows of *df* to the ``bibliography`` table of bibliography.db.

    The database file is opened in the current working directory.  The table
    is created on first use and reused afterwards, so several collections
    can be uploaded one after another.

    df : DataFrame with the columns tag, author, journal, volume, pages,
         year, title and collection.  Missing values (NaN) are stored as
         SQL NULL.

    Returns None.
    """
    columns = ['tag', 'author', 'journal', 'volume',
               'pages', 'year', 'title', 'collection']

    # IF NOT EXISTS: a plain CREATE TABLE raises "table already exists" on
    # every upload after the first one.
    create_cmd = """CREATE TABLE IF NOT EXISTS bibliography
                (iid INTEGER  NOT NULL  PRIMARY KEY  AUTOINCREMENT DEFAULT 0,
                tag TEXT,
                author TEXT,
                journal TEXT,
                volume FLOAT,
                pages TEXT,
                year FLOAT,
                title TEXT,
                collection TEXT)"""

    insert_cmd = """INSERT INTO bibliography
                    (tag, author, journal, volume, pages, year, title, collection)
                    VALUES (?, ?, ?, ?, ?, ?, ?, ?)"""

    # Cast to object first so the values handed to sqlite3 are plain Python
    # objects and NaN can be replaced by None in numeric columns as well.
    values = df[columns].astype(object)
    values = values.where(values.notna(), None)
    rows = list(values.itertuples(index=False, name=None))

    connection = sqlite3.connect("bibliography.db")
    try:
        cursor = connection.cursor()
        cursor.execute(create_cmd)
        cursor.executemany(insert_cmd, rows)
        connection.commit()
    finally:
        # Always release the file handle, even when an insert fails.
        connection.close()
    return

@app.route('/', methods=['GET', 'POST'])
def start_page():
    """Landing page.

    Renders 'start_empty.html' (upload link only) while UPLOAD_FOLDER has no
    files, and 'start_filled.html' (upload and query links) otherwise.
    """
    has_uploads = bool(os.listdir(UPLOAD_FOLDER))
    upload_url = url_for('upload_file')

    if not has_uploads:
        return render_template('start_empty.html', upload_file=upload_url)

    return render_template('start_filled.html',
                           upload_file=upload_url, query_db=url_for('query_db'))

@app.route('/update', methods=['GET', 'POST'])
def upload_file():
    """Upload form for a BibTeX file and its collection name.

    GET renders 'upload_file.html'.  On POST, the uploaded file is saved in
    the upload folder, parsed with parse_bib_to_df() using the collection
    name from the 'collname' form field, and stored with df_to_sql(); the
    client is then redirected to the start page.  A POST whose file has a
    disallowed extension falls through and re-renders the form.
    """
    if request.method == 'POST':
        # check if the post request has the file part
        if 'file' not in request.files:
            flash('No file part')
            return redirect(request.url)
        file = request.files['file']
        # If the user does not select a file, the browser submits an
        # empty file without a filename.
        if file.filename == '':
            flash('No selected file')
            return redirect(request.url)
        if file and allowed_file(file.filename):
            collection_name = request.form['collname']
            filename = secure_filename(file.filename)

            # The upload folder is a hard-coded path that may not exist on a
            # fresh checkout; create it so file.save() cannot fail on a
            # missing directory.
            upload_dir = app.config['UPLOAD_FOLDER']
            os.makedirs(upload_dir, exist_ok=True)
            bibfile = os.path.join(upload_dir, filename)
            file.save(bibfile)

            # Save bib into dataframe, then into the database.
            bibdf = parse_bib_to_df(bibfile, collection_name)
            df_to_sql(bibdf)

            return redirect(url_for('start_page'))
    return render_template('upload_file.html')

    
@app.route('/query', methods=['GET', 'POST'])
def query_db():
    """Query page.

    GET renders 'query.html'.  POST runs the SQL text from the 'query' form
    field against bibliography.db and renders the fetched rows with
    'display.html'.
    """
    if request.method == 'POST':
        # Read the form before opening the database so that a request
        # without a 'query' field does not leave a connection behind.
        sql_cmd = request.form['query']

        # NOTE(review): the statement is user-supplied SQL executed verbatim,
        # which is what the assignment asks for; do not expose this endpoint
        # to untrusted users.
        connection = sqlite3.connect("bibliography.db")
        try:
            cursor = connection.cursor()
            cursor.execute(sql_cmd)
            db_info = cursor.fetchall()
        finally:
            # Closed without commit(), as before; the handle is now released
            # even when the statement is invalid.
            connection.close()

        return render_template('display.html', len = len(db_info), db_info = db_info)

    return render_template('query.html')

@app.route('/uploads/<name>')
def download_file(name):
    """Send back the file called *name* from the configured upload folder."""
    upload_dir = app.config["UPLOAD_FOLDER"]
    return send_from_directory(upload_dir, name)

# Start Flask's built-in server (debug mode, port 8000) when run as a script.
if __name__ == '__main__':
    app.run(debug=True, port=8000)

Overwriting app.py


In [None]:
!python app.py

 * Serving Flask app 'app' (lazy loading)
 * Environment: production
[2m   Use a production WSGI server instead.[0m
 * Debug mode: on
 * Running on http://127.0.0.1:8000/ (Press CTRL+C to quit)
 * Restarting with stat
 * Debugger is active!
 * Debugger PIN: 450-013-370
[('1998A&A...330..515F',), ('1998A&A...330..515F',), ('1998A&A...330..515F',), ('1998A&A...330..515F',), ('1998A&A...330..515F',), ('1998A&A...330..515F',), ('1998A&A...330..515F',), ('1998A&A...330..515F',), ('2001AJ....122.2587C',), ('2001AJ....122.2587C',), ('2001AJ....122.2587C',), ('2001AJ....122.2587C',), ('2001AJ....122.2587C',), ('2001AJ....122.2587C',), ('2001AJ....122.2587C',), ('2001AJ....122.2587C',), ('2001AJ....122.2587C',), ('2001AJ....122.2587C',), ('2007A&A...474..653V',), ('1994ApJ...427..628F',), ('1994ApJ...427..628F',), ('1994ApJ...427..628F',), ('1994ApJ...427..628F',), ('1994ApJ...427..628F',), ('1994ApJ...427..628F',), ('1994ApJ...427..628F',), ('1994ApJ...427..628F',), ('1994ApJ...427..628F',),

### Stuff below: just playing around with pybtex and making sure I know how to read files.

In [1]:
from pybtex.database import parse_file 

bib_data = parse_file('/home/jovyan/python-seminar/Homeworks/hw_7/hw_7_data/homework_7_refs.bib', "bibtex")

In [2]:
# Get all citation tags.
list(bib_data.entries.keys())

['1998A&A...330..515F',
 '2001AJ....122.2587C',
 '2007A&A...474..653V',
 '1994ApJ...427..628F',
 '2006ASPC..351..751B',
 '1996A&AS..117..393B',
 '1997ApJ...486...60D',
 '1997ESASP.402..657S',
 '2003LNP...635..281R',
 '2003LNP...635..265W',
 '2003LNP...635..243C',
 '2003LNP...635..229G',
 '2003LNP...635..203H',
 '2003LNP...635..187S',
 '2003LNP...635..175P',
 '2003LNP...635..123K',
 '2003LNP...635..105C',
 '2003LNP...635...85B',
 '2003LNP...635...71S',
 '2003LNP...635...45F',
 '2003LNP...635...21F',
 '2003LNP...635....1M',
 '2008MNRAS.389.1336K',
 '1987ARA&A..25..345F',
 '2006ARA&A..44...93S',
 '2002AJ....124.1213D',
 '1978MNRAS.183..569D',
 '2008JPhCS.118a2010E',
 '1995AJ....110.1476K',
 '1908AnHar..60...87L',
 '1912HarCi.173....1L',
 '2001ApJ...553...47F',
 '1997ESASP1200.....P',
 '1997MNRAS.286L...1F',
 '1998A&A...335L..81L',
 '1997MNRAS.287..955V',
 '2006MNRAS.372.1675S',
 '2008arXiv0806.3019F',
 '2008ApJ...679...71F',
 '2008MNRAS.386.2115F',
 '1998salg.conf..263M',
 '2006MNRAS.370.

In [3]:
# For a given citation tag, get the author list.
bib_data.entries['1998A&A...330..515F'].persons['Author']
# Get just the last name of the entry.
#bib_data.entries['1998A&A...330..515F'].persons['Author'][0].last_names

[Person('{Fernley}, J.'),
 Person('{Barnes}, T. G.'),
 Person('{Skillen}, I.'),
 Person('{Hawley}, S. L.'),
 Person('{Hanley}, C. J.'),
 Person('{Evans}, D. W.'),
 Person('{Solano}, E.'),
 Person('{Garrido}, R.')]

In [6]:
# For one citation tag, show each stored field together with its Python type.
entry_fields = bib_data.entries['1998A&A...330..515F'].fields._dict


def _show(label, value):
    """Print a 'label: ' header, the value, and the value's type."""
    print(f'{label}: ')
    print(value)
    print(type(value))


journal = entry_fields['journal']
_show('journal', journal)

volume = entry_fields['volume']
_show('volume', volume)

pages = entry_fields['pages']
_show('pages', pages)

# The year is displayed after conversion to int; the raw value is kept in `year`.
year = entry_fields['year']
_show('year', int(year))

title = entry_fields['title']
_show('title', title)
# collection: the user defines this.

journal: 
\aap
<class 'str'>
volume: 
330
<class 'str'>
pages: 
515-520
<class 'str'>
year: 
1998
<class 'int'>
title: 
{The absolute magnitudes of RR Lyraes from HIPPARCOS parallaxes and proper motions}
<class 'str'>


### STUFF BELOW: Framework for reading in .bib file into sqlite3 database.
Note to fix: the collection should be passed in by the user.

In [7]:
import pandas as pd
import numpy as np

bib_data = parse_file('/home/jovyan/python-seminar/Homeworks/hw_7/hw_7_data/homework_7_refs.bib', "bibtex")

# Placeholder collection name; in the website it is supplied by the user.
collection = 'astronomy'

tag_list = []
author_list = []
journal_list = []
volume_list = []
pages_list = []
year_list = []
title_list = []
collection_list = []

for tag, entry in bib_data.entries.items():
    # Entries without an 'Author' person list (e.g. editor-only) are skipped.
    if 'Author' not in entry.persons:
        continue

    fields = entry.fields._dict

    # Each field falls back to NaN on its own.  Previously every fallback
    # overwrote `journal`, so a missing volume/pages/year/title erased the
    # journal and silently reused the value of the previous entry.
    journal = fields['journal'][1:] if 'journal' in fields else np.nan  # drop leading "\" of macros like \aap
    volume = int(fields['volume']) if 'volume' in fields else np.nan
    pages = fields['pages'] if 'pages' in fields else np.nan
    year = int(fields['year']) if 'year' in fields else np.nan
    title = fields['title'][1:-1] if 'title' in fields else np.nan  # drop enclosing braces

    # One row per author of the entry.
    for author_name in entry.persons['Author']:
        author = author_name.last_names[0][1:-1]  # '{Fernley}' -> 'Fernley'

        tag_list.append(tag)
        author_list.append(author)
        journal_list.append(journal)
        volume_list.append(volume)
        pages_list.append(pages)
        year_list.append(year)
        title_list.append(title)
        collection_list.append(collection)

df = pd.DataFrame({'tag': tag_list, 'author': author_list,
                    'journal' : journal_list, 'volume' : volume_list,
                    'pages' : pages_list, 'year' : year_list,
                    'title' : title_list, 'collection' : collection_list})

In [8]:
import pandas as pd
import sqlite3

!rm bibliography.db

# Open the SQLite database file in the current working directory.
connection = sqlite3.connect("bibliography.db")

# `connection` and `cursor` are reused by the cells that follow.
cursor = connection.cursor()

# Schema of the bibliography table; iid is the auto-incrementing primary key,
# the remaining columns mirror the DataFrame built above.
sql_cmd = """CREATE TABLE bibliography 
            (iid INTEGER  NOT NULL  PRIMARY KEY  AUTOINCREMENT DEFAULT 0, 
            tag TEXT, 
            author TEXT, 
            journal TEXT, 
            volume FLOAT, 
            pages TEXT,
            year FLOAT,
            title TEXT,
            collection TEXT)"""

# Plain CREATE TABLE: this raises if the table already exists, which is why
# the cell removes bibliography.db first.
cursor.execute(sql_cmd)
connection.commit()

In [9]:
# if no author, sometimes it's editor.
df = df.where(pd.notnull(df), None)
for ii, row in df.iterrows():
    iparams = (row['tag'], row['author'], row['journal'], row['volume'], 
               row['pages'], row['year'], row['title'], row['collection'])
    sql_cmd = """INSERT INTO bibliography
                (tag, author, journal, volume, pages, year, title, collection)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?)"""
    
    cursor.execute(sql_cmd, iparams)
connection.commit()

In [10]:
# Compare the number of distinct tags in the database with the number of
# entries in the .bib file.
sql_cmd = "SELECT tag FROM bibliography"
db_info = cursor.execute(sql_cmd).fetchall()

distinct_tags_in_db = len(np.unique(db_info))
entries_in_bib = len(bib_data.entries.keys())

print(distinct_tags_in_db)
print(entries_in_bib) # one entry had no authors, but rather, editor.

45
46


# SCRATCH WORK BELOW

Useful things (besides flask website):

    * https://medium.com/featurepreneur/uploading-files-using-flask-ec9fb4c7d438

In [2]:
%%writefile fhello.py
from flask import Flask
app = Flask(__name__)

@app.route("/")
def hello():
    """View for "/": respond with a fixed greeting."""
    greeting = "Hello World, for real!"
    return greeting

if __name__ == "__main__":
    app.run(port=8000)

Overwriting fhello.py


In [2]:
%run fhello.py 

 * Serving Flask app 'fhello' (lazy loading)
 * Environment: production
[2m   Use a production WSGI server instead.[0m
 * Debug mode: off


 * Running on http://127.0.0.1:8000/ (Press CTRL+C to quit)
127.0.0.1 - - [08/Apr/2022 18:46:59] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [08/Apr/2022 18:47:00] "GET /favicon.ico HTTP/1.1" 404 -


In [3]:
%%writefile urls.py
from flask import Flask

app = Flask(__name__)
app.debug = True

## this will route / and /hello 
## the function "hi()" is called a "view function"
@app.route("/")
@app.route('/hello')
def hi():
    """View function registered for both "/" and "/hello"."""
    markup = "<font color='red'>Hello!</font>"
    return markup

## this will route URLs like: /user/josh 
@app.route('/user/<username>')
def show_user_profile(username):
    """Greet the user named in the URL, e.g. /user/josh.

    The name comes straight from the request URL and the response is served
    as HTML, so it is escaped before being placed in the page.
    """
    # Local import: this script only imports Flask at the top.
    from html import escape

    return 'hello, username =  %s' % escape(username)

## this will route URLs like: /post/1234
@app.route('/post/<int:post_id>')
def show_post(post_id):
    """Show the numeric post id taken from URLs such as /post/1234."""
    template = 'Post # = %d'
    return template % post_id

# here we show off multiple input and defaults
@app.route("/doc/<int:docid>/page/<int:pageid>")
@app.route("/doc/<int:docid>", defaults={'pageid': 10})
def show_document_pages(docid,pageid):
    """Report the document and page ids; pageid is 10 when the URL omits it."""
    ids = (docid, pageid)
    return "Doc = %i  and Page = %i" % ids

## a different way to add URL rules
## this connects the function hi() to the url /hola
## nice thing by doing it this way is that you could see all your
## mappings in one place
app.add_url_rule('/hola', "say_hola", hi)

if __name__ == "__main__":
    app.run(port=5012)

Writing urls.py


In [5]:
%run urls.py 

 * Serving Flask app 'urls' (lazy loading)
 * Environment: production
[2m   Use a production WSGI server instead.[0m
 * Debug mode: on


 * Running on http://127.0.0.1:5012/ (Press CTRL+C to quit)
 * Restarting with stat
 * Debugger is active!
 * Debugger PIN: 450-013-370


In [2]:
from pybtex.database import parse_file    

bib_data = parse_file('/home/jovyan/python-seminar/Homeworks/hw_7/hw_7_data/homework_7_refs.bib', "bibtex")

In [1]:
%%writefile app.py
# https://flask.palletsprojects.com/en/2.1.x/patterns/fileuploads/
import os
from flask import Flask, flash, request, redirect, url_for
from werkzeug.utils import secure_filename

# UPLOAD_FOLDER = '/path/to/the/uploads'
UPLOAD_FOLDER = '/home/jovyan/python-ay250-homeworks/hw_7/static'
ALLOWED_EXTENSIONS = {'txt', 'pdf', 'png', 'jpg', 'jpeg', 'gif'}

app = Flask(__name__)
app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
def allowed_file(filename):
    """Return True when *filename* ends in an extension from ALLOWED_EXTENSIONS.

    The comparison is case-insensitive; names without a dot are rejected.
    """
    if '.' not in filename:
        return False
    extension = filename.rsplit('.', 1)[1]
    return extension.lower() in ALLOWED_EXTENSIONS

@app.route('/', methods=['GET', 'POST'])
def upload_file():
    """Show the upload form and handle its submission.

    A valid POST saves the file in the upload folder and redirects to the
    download URL of the saved file.  Any other request, including a POST
    with a disallowed extension, gets the upload form.
    """
    upload_form = '''
    <!doctype html>
    <title>Upload new File</title>
    <h1>Upload new File</h1>
    <form method=post enctype=multipart/form-data>
      <input type=file name=file>
      <input type=submit value=Upload>
    </form>
    '''

    if request.method != 'POST':
        return upload_form

    # The POST request must carry a 'file' part.
    if 'file' not in request.files:
        flash('No file part')
        return redirect(request.url)

    uploaded = request.files['file']
    # Browsers submit an empty file without a filename when nothing was chosen.
    if uploaded.filename == '':
        flash('No selected file')
        return redirect(request.url)

    if not (uploaded and allowed_file(uploaded.filename)):
        return upload_form

    safe_name = secure_filename(uploaded.filename)
    uploaded.save(os.path.join(app.config['UPLOAD_FOLDER'], safe_name))
    return redirect(url_for('download_file', name=safe_name))

from flask import send_from_directory

@app.route('/uploads/<name>')
def download_file(name):
    """Send back the file called *name* from the configured upload folder."""
    upload_dir = app.config["UPLOAD_FOLDER"]
    return send_from_directory(upload_dir, name)

if __name__ == '__main__':
    app.run(port=8000, debug = True)

Overwriting app.py
