### JSON Objects in PostgreSQL

**Author**: Aaron Liu & Ron Volkovinsky

**Date**: 5/10/2023

**Objective**: Practice basic insert queries using SQL, and the corresponding programming to automate the process in psycopg2

## Setup

Some functions in python are already given in the cells below:

**DOI JSON retrieval (doi2dict)**
(Credit Ron Volkovinsky)

In [1]:
import requests
import json
import pandas as pd
import bibtexparser
import pprint

def doi2dict(doi):
    #create url
    url = "http://dx.doi.org/" + doi
    
    #create dictionary of http bibtex headers that requests will retrieve from the url
    headers = {"accept": "application/x-bibtex"}
    
    #reqeusts information specified by bibtex from url
    r = requests.get(url, headers = headers).text    

    #parse the returned bibtex text to a dictionary
    #NOTE: USE bibtexparser.customization to split strings into list, etc. (https://bibtexparser.readthedocs.io/en/master/bibtexparser.html?highlight=bparser#module-bibtexparser.bparser)
    bibdata = bibtexparser.bparser.BibTexParser().parse(r)
    
    #return dict of metadata
    return bibdata.entries[0]

#doi = '10.1021/acsami.1c20994'
doi2 = '10.1021/acscentsci.9b00476'

doidict = doi2dict(doi2)



**Connection Details**

Fill in your connection details here. Note that `127.0.0.1`, `localhost`, and your **local IP address** (found using the `ipconfig` command in your command line) are all synonymous with your local computer as a server. If you are connecting to an external server, you of course need to find the appropriate connection details of that server.

I recommend creating your own database as a test environment for interacting with your database. You must do this either through psql or pgAdmin, externally from Python. Call the database whatever you want, like `pg_practice` or `ofetdb_testenv`, etc. Either way, the default username and password are what go into the connection details. The port by default for PostgreSQL is almost always `5432`, unless this was specified differently during your installation of PostgreSQL.

In [2]:
import psycopg2 as pg

conn_kwargs = {
    "host"      : "localhost",
    "database"  : "ofetdb_authorinfo", ## FILL IN CONNECTION DETAILS HERE
    "user"      : "postgres",
    "password"  : "Rahul2411!",
    "port"      : "5432",
}

def connect(**params_dict):
    """ Connect to the PostgreSQL database server """
    conn = None
    try:
        # connect to the PostgreSQL server
        print('Connecting to the PostgreSQL database...')
        conn = pg.connect(**params_dict)
    except (Exception, pg.DatabaseError) as error:
        print(error)
        sys.exit(1) 
    print("Connection successful")
    return conn

conn = connect(**conn_kwargs)

conn.close()
print("Connection Closed")

Connecting to the PostgreSQL database...
Connection successful
Connection Closed


In [3]:
# Postgres python
import psycopg2 as pg
import numpy as np
from psycopg2.extensions import AsIs

# import os
# import functools
# import sys

# Adapters necessary for converting python data types to PostgreSQL compatible data types 
def addapt_numpy_float64(numpy_float64):
    return AsIs(numpy_float64)

def addapt_numpy_int64(numpy_int64):
    return AsIs(numpy_int64)

def nan_to_null(f,
        _NULL=AsIs('NULL'),
        _Float=pg.extensions.Float):
    if not np.isnan(f):
        return _Float(f)
    return _NULL

pg.extensions.register_adapter(np.float64, addapt_numpy_float64)
pg.extensions.register_adapter(np.int64, addapt_numpy_int64)
pg.extensions.register_adapter(float, nan_to_null)

def pg_query(sql, tup):
    
    try:
        # Database connection
        conn = pg.connect(**conn_kwargs)
        cur = conn.cursor()
        
        # Pass SQL query, using string and placeholders
        cur.execute(sql, tup)
        
#         # Fetch result
#         fetched = cur.fetchone()[0]
        
        # Commit result
        conn.commit()
        print("Operation Successful")

        cur.close()
        conn.close()
        
    except (Exception, pg.DatabaseError) as error:
        # If database connection unsuccessful, then close connection 
        print("Error: %s" % error)
        conn.rollback()
        cur.close()
        conn.close()
    
    return 

In [4]:
# Create a table that holds journal article information
conn = connect(**conn_kwargs)

cur = conn.cursor()

sql = '''

    DROP TABLE IF EXISTS EXPERIMENT_INFO;

    CREATE TABLE EXPERIMENT_INFO (
        exp_id              SERIAL          PRIMARY KEY,
        citation_type       VARCHAR(20),
        meta                JSONB,
        UNIQUE(citation_type, meta)
    );
'''

cur.execute(sql)
conn.commit()

print("Operation successful")
conn.close()

Connecting to the PostgreSQL database...
Connection successful
Operation successful


In [9]:
from psycopg2.extras import Json

#doi = '10.1021/acsami.1c20994'
doi2 = '10.1021/acscentsci.9b00476'

doidict = doi2dict(doi)

a = Json(doidict)

print(a)


'{"journal": "Journal of Materials Chemistry C", "title": "Engineering gate dielectric surface properties for enhanced polymer field-effect transistor performance", "author": "Yanlian Lei and Bo Wu and Wing-Kin Edward Chan and Furong Zhu and Beng S. Ong", "pages": "12267--12272", "number": "47", "volume": "3", "publisher": "Royal Society of Chemistry ({RSC})", "year": "2015", "url": "https://doi.org/10.1039%2Fc5tc02579f", "doi": "10.1039/c5tc02579f", "ENTRYTYPE": "article", "ID": "Lei_2015"}'


In [10]:
Json(doidict)

<psycopg2._json.Json at 0x1ca4c5f6198>

In [None]:
sql = "INSERT INTO experiment_info(%s) VALUES %s"

columns = ['citation_type', 'meta']
values = ['literature', Json(doidict)]

tup = (AsIs(','.join(columns)), tuple(values))

conn = pg.connect(**conn_kwargs)

print("Connection Successful")

cur = conn.cursor()
cur.execute(sql, tup)

print("Table(s) created successfully")
conn.commit()

print("Operation successful")
conn.close()


In [None]:
delete_query = "DELETE FROM EXPERIMENT_INFO*"

# pg.connect returns a connection instance, based on the login parameters
conn = pg.connect(**conn_kwargs)
print("Connection Successful")

# A cursor object is used to query the database from Python
cur = conn.cursor()

cur.execute(delete_query)

print("All rows deleted")
conn.commit()

print("Operation successful")
conn.close()
print("Connection closed")

### Exercise

Populate a journal article DOI table

1. Create a table called Articles. We want the table to contain columns that store a unique id for each row, the year it was published, and a metadata field with variable information mined using bibtexparser
2. Populate the table with all the information using the Excel file "articles.xlsx" provided

In [11]:
import pandas as pd

df = pd.read_excel('articles - Copy.xlsx')
df

Unnamed: 0,doi
0,10.1039/C5TC02579F
1,10.1021/acs.chemmater.7b03019
2,10.1021/acs.chemmater.8b05224
3,10.1038/srep00754
4,10.1038/srep24476
5,10.1016/j.cplett.2020.137507
6,10.1021/acs.chemmater.0c02199
7,10.1016/j.spmi.2018.12.022
8,10.1039/c8sm02517g
9,10.1002/adfm.201909787


In [12]:
# create a new table in postgreSQL called articles

conn = connect(**conn_kwargs)

cur = conn.cursor()

sql = '''

    DROP TABLE IF EXISTS ARTICLE_INFO;

    CREATE TABLE ARTICLE_INFO (
        article_id          SERIAL          PRIMARY KEY,
        First_Author        VARCHAR(255),
        year                INTEGER,
        journal             VARCHAR(255),
        doi                VARCHAR(255),
        UNIQUE(First_Author,year,journal,doi)
    );
'''

cur.execute(sql)
conn.commit()

print("Operation successful")
conn.close()

Connecting to the PostgreSQL database...
Connection successful
Operation successful


In [13]:
#returning just the first author

import re

def split_authors(author_string):
    authors = re.split(r' and ', author_string)
    return authors[0]

author_string = "Yanlian Lei and Bo Wu and Wing-Kin Edward Chan and Furong Zhu and Beng S. Ong"
authors = split_authors(author_string)
print(authors)


Yanlian Lei


In [8]:
df_copy = df.copy()

df_copy['First Author'] = np.nan
df_copy['First Author'] = df_copy['First Author'].astype('object')
df_copy['Year'] = np.nan
df_copy['Journal'] = np.nan
df_copy['Journal'] = df_copy['Journal'].astype('object')

# Iterate through rows of the dataframe
for index, row in df_copy.iterrows():
    doi = row['doi']
    metadata = Json(doi2dict(doi))
    author = metadata.adapted['author']
    year = metadata.adapted['year']
    journal = metadata.adapted['journal']
    first_author = split_authors(author)
    df_copy.at[index, 'First Author'] = first_author
    df_copy.at[index, 'Year'] = year
    df_copy.at[index, 'Journal'] = journal

    
df_copy    

NameError: name 'Json' is not defined

In [None]:
sql = '''
INSERT INTO ARTICLE_INFO (%s) VALUES %s
'''
df_copy = df_copy.iloc[:, -2:] #storing only the year and metadata info

columns = list(df_copy.columns)
for row in df_copy.itertuples(index=False):
    values = tuple(row)
    tup = (AsIs(','.join(columns)),values)
    pg_query(sql,tup)

In [None]:
## I left off talking about inserting new tuples that already exist... and violating key constraints. What about sequencing?
## Let's insert like 5 doi's, see what happens