## Python Code to Extract Data From Template and Transfer to PostGRE SQL
#### Authors : Aaron Liu, Rahul Venkatesh, Jessica Bonsu, Myeongyeon Lee 
##### Date Edited : 06-07-2023

In [1]:
## Required Packages

import pandas as pd
import numpy as np
import psycopg2 as pg

import os
from psycopg2.extras import Json
from psycopg2.extensions import AsIs
import functools
import json
import sys

import requests
# import bibtexparser
import pprint

In [2]:
## Required Functions

# Function to remove rows that have no value (NaN) in the second column
def remove_emptyrows(df):
    nan_mask = ~df.iloc[:,1].isna() 
    return df[nan_mask]

# create boolean mask where "True" represents non-missing value
# create boolean mask where "False" represents mssing value
# for second column (all row)

# Function to convert a sheet into dictionary data type
def read_sheet(filepath, sheet_name, ordering=False, usecols="A,B,D", meas=False):

    ## NOTE: ADD AN ARGUMENT TO DECIDE WHETHER OR NOT TO BRACKET THE SHEET
    ## NOTE : The argument "ordering" is used for sheets like solution processing or substrate pretreatmant where the order of the processing step matters
    ## NOTE : The argument "usecols" is to store information from particular columns in the excel sheet
    ## NOTE : The argument "meas" is used to 
    
    ## Read Sheet Information
    df = pd.read_excel(
        filepath,
        sheet_name=sheet_name,
        usecols=usecols
    )
    
    # Call Function To Remove empty rows
    df_ = remove_emptyrows(df)
    
    # Create an empty dictionary
    sheet_dict = dict()

    # To account for sheets where processing order is important
    if ordering==True:
        df_list = split_df(df_) #calls function split_df
        for i, df in enumerate(df_list):
            sheet_dict[i] = table_to_dict(df) #adds each table to the dictionary
    else:
        sheet_dict = table_to_dict(df_)
    
    return sheet_dict #returns a dataframe

def split_df(df_):
    #For sheets where processing order is important, this function finds tables with '#' in the name of the first column title and turns it into a df
    
    split_idx_mask = df_.iloc[:,0].str.contains('#') #Find the object splits
    # Boolean mask
    # True = including '#' symbol in the first column
    # False = Not including '#' symbol in the frist column
    w = df_[split_idx_mask].index.values
    #index values of the true rows.
    
    df_list = []
    
    for i in range(len(w)-1):
        #why range is len(w)-1, because of the counting start 0? or 1?
        next_df = df_.loc[w[i]+1:w[i+1]-1,:]
        # I don't understand why loc[w[i]+1:w[i+1]]
        df_list.append(next_df)   
    
    return df_list

def table_to_dict(df_):
    
    main_mask = pd.isna(df_.JSON) # it flags rows that dont have a value for JSON column
    step_dict = dict(df_[main_mask].iloc[:,:2].values) # Stores rows that have "NaN" for JSON column in df_ as dict

    
    
    for json_field in pd.unique(df_.JSON): #read through unique JSON types (e.g. NaN, meta or data)

        if pd.isna(json_field): #ignore fields with JSON type as NaN
            continue
            
        # dictionary to store information with JSON type "data"
        elif json_field=='data':
            data_mask = df_.JSON=='data'
            
            # lump key:value pairs into a second nested data dict
            step_dict['data'] = dict()
            
            for i, s in df_[data_mask].iterrows():
                step_dict['data'][s[s.index[0]]] = s['value':'error_type'].dropna().to_dict()
        else:
            json_mask = df_.JSON==json_field
            step_dict[json_field] = dict(df_[json_mask].iloc[:,:2].values) # creates a new key for JSON types like meta and params and adds its corresponding values to it 

    return step_dict

# f = pd.ExcelFile(fpath)


### Reading and Extracting Data From Sheets in Template

In [3]:
#Reading Data From Sheets in Template

fpath = r'..\db_feed\v6_example.xlsx' #Add path for template file

#Storing each sheet in the template file as a dictionary
exp_info = read_sheet(fpath, 'Data Origin')
solution_makeup = read_sheet(fpath, 'Solution Makeup', ordering=True)
solution_processing = read_sheet(fpath, 'Solution Treatment', ordering=True)
device_fab = read_sheet(fpath, 'Device Fabrication')
substrate_pretreat = read_sheet(fpath, 'Substrate Pretreat', ordering=True)
coating_process = read_sheet(fpath, 'Coating Process')
post_process = read_sheet(fpath, 'Post-Processing', ordering=True)
device_meas = read_sheet(fpath, 'Device Measurement', usecols="A:G", ordering=True)
other_meas = read_sheet(fpath, 'Other Measurements', usecols="A:G", ordering=True)

  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)


In [5]:
exp_info

{'citation_type': 'literature',
 'meta': {'first_name': 'Rahul',
  'last_name': 'Venkatesh',
  'email': 'rvenkatesh6@gatech.edu',
  'doi': '10.1038/srep24476 ',
  'publication_type': 'journal_article'}}

### Transferring Information From Template To PostgreSQL

In [28]:
# Postgres python
from psycopg2.extras import Json 

# Adapters necessary for converting python data types to PostgreSQL compatible data types 
def addapt_numpy_float64(numpy_float64):
    return AsIs(numpy_float64)

def addapt_numpy_int64(numpy_int64):
    return AsIs(numpy_int64)

def nan_to_null(f,
        _NULL=AsIs('NULL'),
        _Float=pg.extensions.Float):
    if not np.isnan(f):
        return _Float(f)
    return _NULL

pg.extensions.register_adapter(np.float64, addapt_numpy_float64)
pg.extensions.register_adapter(np.int64, addapt_numpy_int64)
pg.extensions.register_adapter(float, nan_to_null)

param_dict = {
    "host"      : "127.0.0.1",
    "database"  : "ofetdb_testenv",
    "user"      : "postgres",
    "password"  : "password",
    "port"      : "5432",
}

def connect(params_dict):
    """ Connect to the PostgreSQL database server """
    conn = None
    try:
        # connect to the PostgreSQL server
        print('Connecting to the PostgreSQL database...')
        conn = pg.connect(**params_dict)
    except (Exception, pg.DatabaseError) as error:
        print(error)
        sys.exit(1) 
    print("Connection successful")
    return conn

def pg_query(sql, tup):
    
    try:
        # Database connection
        conn = connect(param_dict)
        cur = conn.cursor()
        
        # Pass SQL query, using string and placeholders
        cur.execute(sql, tup)
        
        # Fetch result
        fetched = cur.fetchone()[0]
        
        # Commit result
        conn.commit()
        print("Operation Successful")

        cur.close()
        conn.close()
        
    except (Exception, pg.DatabaseError) as error:
        # If database connection unsuccessful, then close connection 
        print("Error: %s" % error)
        conn.rollback()
        cur.close()
        conn.close()
    
    return fetched #return query result

In [30]:
values

['literature', <psycopg2._json.Json at 0x2c953146dc0>]

In [6]:
from psycopg2.extras import Json

def convert_entry(entry_dict):

    pg_entry = entry_dict
    for key in pg_entry.keys():
        if type(pg_entry[key])==dict:
            pg_entry[key]=Json(pg_entry[key])
    columns = pg_entry.keys()
    values = [pg_entry[column] for column in columns]
    
    return pg_entry, columns, values


def insert_from_template(fpath):
    """
    This function reads a template, and makes all the necessary data validation checks to insert into PostgreSQL. The associated
    directory will include this template, as well as any associated data files in a "data" folder
    """
    
    # Step 1: Read all the sheets, and store them as tailored objects
    exp_info = read_sheet(fpath, 'Data Origin')
    solution_makeup = read_sheet(fpath, 'Solution Makeup', ordering=True)
    solution_processing = read_sheet(fpath, 'Solution Treatment', ordering=True)
    device_fab = read_sheet(fpath, 'Device Fabrication')
    substrate_pretreat = read_sheet(fpath, 'Substrate Pretreat', ordering=True)
    coating_process = read_sheet(fpath, 'Coating Process')
    post_process = read_sheet(fpath, 'Post-Processing', ordering=True)
    device_meas = read_sheet(fpath, 'Device Measurement', usecols="A:G", ordering=True)
    other_meas = read_sheet(fpath, 'Other Measurements', usecols="A:G", ordering=True)
    
    # Step 2: Extract experiment_info from the appropriate sheet, and query check if experiment already exists with the citation
    # and metadata info
    
    pg_dict, columns, values = convert_entry(exp_info)

    sql = '''
        INSERT INTO experiment_info (%s) 
        VALUES %s
        ON CONFLICT (citation_type, meta) DO UPDATE
        SET (%s) = %s
        RETURNING exp_id

        '''
    tup = (AsIs(','.join(columns)), tuple(values), AsIs(','.join(columns)), tuple(values))

    exp_id = pg_query(sql, tup) #assign exp_id
    
    # Step 3: Extract table information for all solution tables from SOLUTION_MAKEUP template
    print(solution_makeup)
#     pg_dict, columns, values = convert_entry(solution_makeup)
#     print(solution_makeup)
    
    pass

# insert_from_template(fpath)

In [92]:
for key in solution_makeup.keys():
    print(solution_makeup[key])

{'entity_type': 'solution', 'concentration': 4}
{'entity_type': 'solvent', 'iupac_name': 'toluene', 'pubchem_cid': 1140, 'vol_frac': 1}
{'entity_type': 'polymer', 'common_name': 'DPP-DTT', 'iupac_name': 'poly[2,5-(2-octyldodecyl)-3,6-diketopyrrolopyrrole-alt-5,5-(2,5-di(thien-2-yl)thieno [3,2-b]thiophene)]', 'wt_frac': 1}


In [23]:
pg_dict, columns, values = convert_entry(exp_info)

sql = '''
    INSERT INTO experiment_info (%s) 
    VALUES %s
    ON CONFLICT (citation_type, meta) DO UPDATE
    SET (%s) = %s
    RETURNING exp_id
    
    '''
tup = (AsIs(','.join(columns)), tuple(values), AsIs(','.join(columns)), tuple(values))

exp_id = pg_query(sql, tup)
exp_id



Connecting to the PostgreSQL database...
Connection successful
Error: duplicate key value violates unique constraint "experiment_info_pkey"
DETAIL:  Key (exp_id)=(5) already exists.



UnboundLocalError: local variable 'fetched' referenced before assignment

In [21]:
def assign_exp_id(exp_info_dict):
    """
    Parameters
    ----------
    exp_info_dict : DataFrame
        The information read from the template, in the form of a dict

    Returns
    -------
    exp_id : int
        The pk of the Experiment_Info table if there is a matching entry
    """
    
    
    
    return exp_id