### JSON Objects in PostgreSQL

**Author**: Aaron Liu & Ron Volkovinsky

**Date**: 5/10/2023

**Objective**: Practice basic SQL insert queries, and the corresponding Python code that automates the process with psycopg2

## Setup

Some Python helper functions are already provided in the cells below:

**DOI JSON retrieval (doi2dict)**
(Credit Ron Volkovinsky)

In [None]:
import requests
import json
import pandas as pd
import bibtexparser
import pprint

def doi2dict(doi, timeout=30):
    """Fetch the BibTeX metadata for a DOI and return it as a dictionary.

    The DOI resolver is asked, through the ``Accept`` header, to answer with
    a BibTeX record; that text is then parsed with ``bibtexparser``.

    Parameters
    ----------
    doi : str
        The DOI to look up, e.g. ``'10.1021/acsami.1c20994'``.
    timeout : float, optional
        Seconds to wait for the server before ``requests`` raises a timeout
        error. Without a timeout a stalled server blocks the call forever.

    Returns
    -------
    dict or None
        The first parsed BibTeX entry, or ``None`` when the server answers
        with an error status or the response contains no BibTeX entry
        (for example an unknown or malformed DOI).
    """
    # Resolve over HTTPS so the metadata is not sent in clear text
    url = "https://doi.org/" + doi

    # Content negotiation: ask the resolver for BibTeX instead of a web page
    headers = {"accept": "application/x-bibtex"}

    response = requests.get(url, headers=headers, timeout=timeout)

    # An error page is not BibTeX; report "no metadata" rather than parse it
    if not response.ok:
        return None

    # Parse the returned BibTeX text.
    # NOTE: bibtexparser.customization can split strings into lists, etc.
    # (https://bibtexparser.readthedocs.io/en/master/bibtexparser.html?highlight=bparser#module-bibtexparser.bparser)
    bibdata = bibtexparser.bparser.BibTexParser().parse(response.text)

    if bibdata.entries:
        return bibdata.entries[0]
    return None


# Two sample DOIs used to try out doi2dict.
doi = '10.1021/acsami.1c20994'
# NOTE(review): there is no journal code before ".9b00476" here, whereas a
# later cell uses '10.1021/acscentsci.9b00476'. Presumably left malformed on
# purpose to show the None result -- confirm.
doi2 = '10.1021/.9b00476'

# Look up the second DOI and print whatever doi2dict returned
# (it returns None when no BibTeX entry could be parsed).
doidict = doi2dict(doi2)
print(doidict)

**Connection Details**

Fill in your connection details here. Note that `127.0.0.1`, `localhost`, and your **local IP address** (found using the `ipconfig` command in your command line) are all synonymous with your local computer as a server. If you are connecting to an external server, you of course need to find the appropriate connection details of that server.

I recommend creating your own database as a test environment for interacting with your database. You must do this either through psql or pgAdmin, externally from Python. Call the database whatever you want, like `pg_practice` or `ofetdb_testenv`, etc. Either way, the default username and password are what go into the connection details. The port by default for PostgreSQL is almost always `5432`, unless this was specified differently during your installation of PostgreSQL.

In [None]:
import psycopg2 as pg
import sys


# Keyword arguments passed straight through to psycopg2's connect();
# edit them to match your own server.
conn_kwargs = dict(
    host="localhost",
    database="test_tutor",  # FILL IN CONNECTION DETAILS HERE
    user="postgres",
    password="password",
    port="5432",
)

def connect(**params_dict):
    """Open and return a connection to the PostgreSQL server.

    All keyword arguments are forwarded unchanged to ``pg.connect``. If the
    attempt fails for any reason, the error is printed and the interpreter
    exits with status 1.
    """
    try:
        print('Connecting to the PostgreSQL database...')
        connection = pg.connect(**params_dict)
    except Exception as failure:
        # pg.DatabaseError is itself an Exception, so one clause covers both
        print(failure)
        sys.exit(1)
    print("Connection successful")
    return connection

# Smoke-test the settings: open a connection, then close it again right away.
conn = connect(**conn_kwargs)

conn.close()
print("Connection Closed")

In [None]:
# Postgres python
import psycopg2 as pg
import numpy as np
from psycopg2.extensions import AsIs

# import os
# import functools
# import sys

# Adapters necessary for converting python data types to PostgreSQL compatible data types 
def addapt_numpy_float64(numpy_float64):
    """Adapt a ``numpy.float64`` for use as a psycopg2 query parameter.

    NaN is rendered as SQL ``NULL``, the same way ``nan_to_null`` treats
    plain Python floats; emitted verbatim it would appear in the query as
    the bare word ``nan``, which is not a valid numeric literal. Every other
    value is written into the query as-is.
    """
    if np.isnan(numpy_float64):
        return AsIs('NULL')
    return AsIs(numpy_float64)

def addapt_numpy_int64(numpy_int64):
    """Adapt a ``numpy.int64`` by writing its value into the query verbatim."""
    verbatim = AsIs(numpy_int64)
    return verbatim

def nan_to_null(f,
        _NULL=AsIs('NULL'),
        _Float=pg.extensions.Float):
    """Adapt a float for psycopg2, turning NaN into SQL ``NULL``.

    ``_NULL`` and ``_Float`` are bound once, as default arguments, when the
    function is defined; callers pass only ``f``.
    """
    return _NULL if np.isnan(f) else _Float(f)

# Register the adapters with psycopg2 so query parameters of these types are
# converted by the functions defined above. The last line applies to every
# plain Python float passed as a parameter, not only to NumPy values.
pg.extensions.register_adapter(np.float64, addapt_numpy_float64)
pg.extensions.register_adapter(np.int64, addapt_numpy_int64)
pg.extensions.register_adapter(float, nan_to_null)

def pg_query(sql, tup=None):
    """Run one SQL statement in its own connection and commit it.

    A new connection is opened from the module-level ``conn_kwargs`` on
    every call. On success the transaction is committed and
    "Operation Successful" is printed. On any failure the error is printed
    and the transaction is rolled back; the exception is not re-raised.

    Parameters
    ----------
    sql : str
        The statement to execute, with psycopg2 ``%s`` placeholders.
    tup : tuple, optional
        Values for the placeholders.

    Returns
    -------
    None
    """
    # Bound before the try so the handlers below can tell whether the
    # connection was ever opened (a failed connect leaves it as None).
    conn = None
    try:
        # Database connection
        conn = pg.connect(**conn_kwargs)

        # Pass SQL query, using string and placeholders; the cursor is
        # closed automatically when the with-block ends
        with conn.cursor() as cur:
            cur.execute(sql, tup)

        # Commit result
        conn.commit()
        print("Operation Successful")

    except Exception as error:
        print("Error: %s" % error)
        # Nothing to roll back if the connection itself could not be opened
        if conn is not None:
            conn.rollback()

    finally:
        # Release the connection on both the success and the failure path
        if conn is not None:
            conn.close()

    return

In [None]:
# Create a table that holds journal article information.
# NOTE: DROP TABLE discards any rows already stored, so re-running this cell
# resets EXPERIMENT_INFO.
sql = '''

    DROP TABLE IF EXISTS EXPERIMENT_INFO;

    CREATE TABLE EXPERIMENT_INFO (
        exp_id              SERIAL          PRIMARY KEY,
        citation_type       VARCHAR(20),
        meta                JSONB,
        UNIQUE(citation_type, meta)
    );
'''

conn = connect(**conn_kwargs)
try:
    cur = conn.cursor()

    cur.execute(sql)
    conn.commit()

    print("Operation successful")
finally:
    # Always release the connection, even when the statement fails
    conn.close()

In [None]:
# Show the current value of doidict as the cell output.
doidict

In [None]:
from psycopg2.extras import Json

# Sample DOIs; only `doi` is looked up in this cell.
doi = '10.1021/acsami.1c20994'
doi2 = '10.1021/acscentsci.9b00476'

# Fetch the BibTeX metadata for the first DOI.
doidict = doi2dict(doi)

# Wrap the dictionary in psycopg2's Json adapter and print the wrapper.
a = Json(doidict)
print(a)

In [None]:
import psycopg2

# Connection settings for this section, passed to psycopg2.connect() below.
kwargs = dict(
    database='test_tutor',
    user='postgres',
    password='password',
    host='127.0.0.1',
    port='5432',
)

# %% Create Tables for EXPERIMENT_INFO

conn = psycopg2.connect(**kwargs)
try:
    print("Connection Successful")

    cur = conn.cursor()
    # IF NOT EXISTS makes this cell safe to re-run: an existing table and
    # its rows are left untouched.
    cur.execute(
        '''
    CREATE TABLE IF NOT EXISTS EXPERIMENT_INFO (
        exp_id              SERIAL          PRIMARY KEY,
        citation_type       VARCHAR(20),
        meta                JSONB,
        UNIQUE(citation_type, meta)
    );
    '''
    )

    print("Table(s) created successfully")
    conn.commit()

    print("Operation successful")
finally:
    # Always release the connection, even when the statement fails
    conn.close()

In [None]:
# Show the Json-wrapped metadata as the cell output.
Json(doidict)

In [None]:
# Column names cannot be sent as ordinary query parameters (they would be
# quoted as string literals), so the comma-joined list is wrapped in AsIs and
# spliced into the statement verbatim. Only do this with trusted, hard-coded
# names such as the ones below.
sql = "INSERT INTO experiment_info(%s) VALUES %s"

columns = ['citation_type', 'meta']
values = ['literature', Json(doidict)]

# psycopg2 renders a Python tuple parameter as a parenthesised value list
tup = (AsIs(','.join(columns)), tuple(values))

conn = psycopg2.connect(**kwargs)
try:
    print("Connection Successful")

    cur = conn.cursor()
    cur.execute(sql, tup)

    conn.commit()
    print("Row inserted successfully")

    print("Operation successful")
finally:
    # Always release the connection; re-running this cell with the same
    # values violates UNIQUE(citation_type, meta) and raises an error
    conn.close()


### Exercise

Populate a journal article DOI table

1. Create a table called Articles. We want the table to contain columns that store a unique id for each row, the year it was published, and a metadata field with variable information mined using bibtexparser
2. Populate the table with all the information using the Excel file "articles.xlsx" provided

#### Create table

In [None]:
# Connection settings used by the exercise cells and by pg_query.
conn_kwargs = dict(
    database='test_tutor',
    user='postgres',
    password='password',
    host='localhost',
    port='5432',
)

In [None]:
# This query (re)creates the ARTICLES table; DROP TABLE discards any rows
# stored by an earlier run.
# Note, inside the triple quotes is exactly what you would type into an SQL interface
sql = '''

    DROP TABLE IF EXISTS ARTICLES;

    CREATE TABLE ARTICLES (
        id            BIGSERIAL         PRIMARY KEY,
        year          INT ,
        meta          JSONB,
        UNIQUE(meta)
    );
    '''

# pg.connect returns a connection instance, based on the login parameters
conn = pg.connect(**conn_kwargs)
try:
    print("Connection Successful")

    # A cursor object is used to query the database from Python
    cur = conn.cursor()

    # The execute command takes a query as an argument
    cur.execute(sql)

    print("Table(s) created successfully")
    conn.commit()

    print("Operation successful")
finally:
    # Always release the connection, even when the statement fails
    conn.close()

#### Get data

In [None]:
## Load the provided spreadsheet into a DataFrame; the insert cell below reads each row's 'doi' column
df = pd.read_excel('articles.xlsx')

#### Insert Data Into Table

In [None]:
columns = ["year", "meta"]

# Insert one article. When a row with the same meta already exists, the
# ON CONFLICT clause rewrites meta with the incoming (identical) value
# instead of failing on the UNIQUE(meta) constraint.
sql = '''
INSERT INTO ARTICLES (%s) VALUES %s
ON CONFLICT (meta) 
DO UPDATE SET meta = excluded.meta;
'''

for index, row in df.iterrows():
    article = row['doi']

    # Blank spreadsheet cells are read as NaN, which cannot be joined onto
    # the resolver URL inside doi2dict
    if pd.isna(article):
        print(f'Skipping row {index}: no doi given')
        continue

    # NOTE: despite its name, `doi` holds the metadata dictionary (or None)
    doi = doi2dict(article)
    if doi is None:
        print(f'No metadata found for doi={article}')
        continue

    data = Json(doi)
    # Not every BibTeX record has a 'year' field; .get() stores NULL for
    # those instead of stopping the whole load with a KeyError
    values = [doi.get('year'), data]
    tup = (AsIs(','.join(columns)), tuple(values))
    pg_query(sql, tup)

    
#     values = tuple(row)
#     tup = (AsIs(','.join(columns)),values)
#     pg_insert(sql,tup)

In [None]:
## TODO: pick up from inserting tuples that already exist in the table and therefore violate key constraints. What happens to the id sequence in that case?
## Try inserting about five DOIs and see what happens.