## Python Code to Extract Data From Template and Transfer to PostGRE SQL
#### Authors : Aaron Liu, Rahul Venkatesh, Jessica Bonsu, Myeongyeon Lee 
##### Date Edited : 06-07-2023

In [27]:
## Required Packages

import pandas as pd
import numpy as np
import psycopg2 as pg

import os
from psycopg2.extras import Json
from psycopg2.extensions import AsIs
import functools
import json
import sys

import requests
# import bibtexparser
import pprint

In [2]:
## Required Functions To Extract Information from Template

# Function to remove rows that have no value (NaN) in the second column
def remove_emptyrows(df):
    nan_mask = ~df.iloc[:,1].isna() 
    return df[nan_mask]

# Function to convert a sheet into dictionary data type
def read_sheet(filepath, sheet_name, ordering=False, usecols="A,B,D", meas=False):

    ## NOTE: ADD AN ARGUMENT TO DECIDE WHETHER OR NOT TO BRACKET THE SHEET
    ## NOTE : The argument "ordering" is used for sheets like solution processing or substrate pretreatmant where the order of the processing step matters
    ## NOTE : The argument "usecols" is to store information from particular columns in the excel sheet
    ## NOTE : The argument "meas" is used to 
    
    ## Read Sheet Information
    df = pd.read_excel(
        filepath,
        sheet_name=sheet_name,
        usecols=usecols
    )
    
    # Call Function To Remove empty rows
    df_ = remove_emptyrows(df)
    
    # Create an empty dictionary
    sheet_dict = dict()

    # To account for sheets where processing order is important
    if ordering==True:
        df_list = split_df(df_) #calls function split_df
        for i, df in enumerate(df_list):
            sheet_dict[i] = table_to_dict(df) #adds each table to the dictionary
    else:
        sheet_dict = table_to_dict(df_)
    
    return sheet_dict #returns a dataframe

def split_df(df_):
    #For sheets where processing order is important, this function finds tables with '#' in the name of the first column title and turns it into a df
    
    split_idx_mask = df_.iloc[:,0].str.contains('#') #Find the object splits
    w = df_[split_idx_mask].index.values
    
    df_list = []
    
    for i in range(len(w)-1):
        next_df = df_.loc[w[i]+1:w[i+1]-1,:]
        df_list.append(next_df)    
    
    return df_list

def table_to_dict(df_):
    
    main_mask = pd.isna(df_.JSON) # it flags rows that dont have a value for JSON column
    step_dict = dict(df_[main_mask].iloc[:,:2].values) # Stores rows that have "NaN" for JSON column in df_ as dict

    
    
    for json_field in pd.unique(df_.JSON): #read through unique JSON types (e.g. NaN, meta or data)

        if pd.isna(json_field): #ignore fields with JSON type as NaN
            continue
            
        # dictionary to store information with JSON type "data"
        elif json_field=='data':
            data_mask = df_.JSON=='data'
            
            # lump key:value pairs into a second nested data dict
            step_dict['data'] = dict()
            
            for i, s in df_[data_mask].iterrows():
                step_dict['data'][s[s.index[0]]] = s['value':'error_type'].dropna().to_dict()
        else:
            json_mask = df_.JSON==json_field
            step_dict[json_field] = dict(df_[json_mask].iloc[:,:2].values) # creates a new key for JSON types like meta and params and adds its corresponding values to it 

    return step_dict

# f = pd.ExcelFile(fpath)


### Reading and Extracting Data From Sheets in Template

In [3]:
#Reading Data From Sheets in Template

fpath = r'..\db_feed\v6_example.xlsx' #Add path for template file

#Storing each sheet in the template file as a dictionary
exp_info = read_sheet(fpath, 'Data Origin')
solution_makeup = read_sheet(fpath, 'Solution Makeup', ordering=True)
solution_processing = read_sheet(fpath, 'Solution Treatment', ordering=True)
device_fab = read_sheet(fpath, 'Device Fabrication')
substrate_pretreat = read_sheet(fpath, 'Substrate Pretreat', ordering=True)
coating_process = read_sheet(fpath, 'Coating Process')
post_process = read_sheet(fpath, 'Post-Processing', ordering=True)
device_meas = read_sheet(fpath, 'Device Measurement', usecols="A:G", ordering=True)
other_meas = read_sheet(fpath, 'Other Measurements', usecols="A:G", ordering=True)

  warn(msg)


In [4]:
#Use this code block to check how each sheet has been converted to a dictionary
exp_info

{'citation_type': 'literature',
 'meta': {'first_name': 'Rahul',
  'last_name': 'Venkatesh',
  'email': 'rvenkatesh6@gatech.edu',
  'doi': '10.1038/srep24476 ',
  'publication_type': 'journal_article'}}

### Transferring Information From Template To PostgreSQL

In [5]:
# Postgres python
from psycopg2.extras import Json 

# Adapters necessary for converting python data types to PostgreSQL compatible data types 
def addapt_numpy_float64(numpy_float64):
    return AsIs(numpy_float64)

def addapt_numpy_int64(numpy_int64):
    return AsIs(numpy_int64)

def nan_to_null(f,
        _NULL=AsIs('NULL'),
        _Float=pg.extensions.Float):
    if not np.isnan(f):
        return _Float(f)
    return _NULL

pg.extensions.register_adapter(np.float64, addapt_numpy_float64)
pg.extensions.register_adapter(np.int64, addapt_numpy_int64)
pg.extensions.register_adapter(float, nan_to_null)

param_dict = {
    "host"      : "127.0.0.1",
    "database"  : "ofetdb_testenv",
    "user"      : "postgres",
    "password"  : "Rahul2411!",
    "port"      : "5432",
}

def connect(params_dict):
    """ Connect to the PostgreSQL database server """
    conn = None
    try:
        # connect to the PostgreSQL server
        print('Connecting to the PostgreSQL database...')
        conn = pg.connect(**params_dict)
    except (Exception, pg.DatabaseError) as error:
        print(error)
        sys.exit(1) 
    print("Connection successful")
    return conn

def pg_query(sql, tup):
    
    try:
        # Database connection
        conn = connect(param_dict)
        cur = conn.cursor()
        
        # Pass SQL query, using string and placeholders
        cur.execute(sql, tup)
        
        # Fetch result
        fetched = cur.fetchone()[0]
        
        # Commit result
        conn.commit()
        print("Operation Successful")

        cur.close()
        conn.close()
        
    except (Exception, pg.DatabaseError) as error:
        # If database connection unsuccessful, then close connection 
        print("Error: %s" % error)
        conn.rollback()
        cur.close()
        conn.close()
    
    return fetched #return query result

In [7]:
from psycopg2.extras import Json

def convert_entry(entry_dict):
    
    #This function reads a dictionary and extracts the column names and values from it
    
    pg_entry = entry_dict
    for key in pg_entry.keys():
        if type(pg_entry[key])==dict:
            pg_entry[key]=Json(pg_entry[key])
    columns = pg_entry.keys()
    values = [pg_entry[column] for column in columns]
    
    return pg_entry, columns, values


#### 1.Checking and Storing Experiment Information

In [41]:
import psycopg2
from psycopg2 import _json

pg_entry, columns, values = convert_entry(exp_info)

# Access the JSON data directly
print(values)
values_updated = ['literature',]
json_data = values[1].adapted
print(json_data)
doi = json_data['doi']
print (doi)


['literature', <psycopg2._json.Json object at 0x000001F0D92CE128>]
{'first_name': 'Rahul', 'last_name': 'Venkatesh', 'email': 'rvenkatesh6@gatech.edu', 'doi': '10.1038/srep24476 ', 'publication_type': 'journal_article'}
10.1038/srep24476 


In [49]:
sql = '''
    INSERT INTO experiment_info (%s) 
    VALUES %s
    ON CONFLICT (citation_type, meta) DO UPDATE
    SET (%s) = %s
    RETURNING exp_id
    
    '''
tup = (AsIs(','.join(columns)), tuple(values), AsIs(','.join(columns)), tuple(values))

exp_id = pg_query(sql, tup)
exp_id


Connecting to the PostgreSQL database...
Connection successful
Error: duplicate key value violates unique constraint "experiment_info_pkey"
DETAIL:  Key (exp_id)=(9) already exists.



UnboundLocalError: local variable 'fetched' referenced before assignment

In [52]:
import psycopg2

# Establish a connection to the PostgreSQL database
connection = psycopg2.connect(
    host="127.0.0.1",
    database="ofetdb_testenv",
    user="postgres",
    password="Rahul2411!",
    port = '5432'
)

# Create a cursor object
cursor = connection.cursor()


# Check if a record with the same citation_type and meta exists
cursor.execute(
    "SELECT COUNT(*) FROM experiment_info WHERE citation_type = %s AND meta = %s",
    (values[0], values[1])
)
record_count = cursor.fetchone()[0]

# If no record exists, insert the new record
if record_count == 0:
    cursor.execute(
        "INSERT INTO experiment_info (citation_type, meta) VALUES (%s, %s) RETURNING exp_id",
        (values[0], values[1])
    )
    new_exp_id = cursor.fetchone()[0]
    print("New record inserted with exp_id:", new_exp_id)
else:
    print("Record already exists. No action taken.")

# Commit the transaction
connection.commit()

# Close the cursor and the database connection
cursor.close()
connection.close()


UniqueViolation: duplicate key value violates unique constraint "experiment_info_pkey"
DETAIL:  Key (exp_id)=(11) already exists.
