## Python Code to Extract Data From Template and Transfer to PostGRE SQL
#### Authors : Aaron Liu, Rahul Venkatesh, Jessica Bonsu, Myeongyeon Lee 
##### Date Edited : 06-07-2023

In [1]:
## Required Packages

import pandas as pd
import numpy as np
import psycopg2 as pg

import os
from psycopg2.extras import Json
from psycopg2.extensions import AsIs
import functools
import json
import sys

import requests
# import bibtexparser
import pprint



In [2]:
## Required Functions To Extract Information from Template

# Function to remove rows that have no value (NaN) in the second column
def remove_emptyrows(df):
    nan_mask = ~df.iloc[:,1].isna() 
    return df[nan_mask]

# Function to convert a sheet into dictionary data type
def read_sheet(filepath, sheet_name, ordering=False, usecols="A,B,D", meas=False):

    ## NOTE: ADD AN ARGUMENT TO DECIDE WHETHER OR NOT TO BRACKET THE SHEET
    ## NOTE : The argument "ordering" is used for sheets like solution processing or substrate pretreatmant where the order of the processing step matters
    ## NOTE : The argument "usecols" is to store information from particular columns in the excel sheet
    ## NOTE : The argument "meas" is used to 
    
    ## Read Sheet Information
    df = pd.read_excel(
        filepath,
        sheet_name=sheet_name,
        usecols=usecols
    )
    
    # Call Function To Remove empty rows
    df_ = remove_emptyrows(df)
    
    # Create an empty dictionary
    sheet_dict = dict()

    # To account for sheets where processing order is important
    if ordering==True:
        df_list = split_df(df_) #calls function split_df
        for i, df in enumerate(df_list):
            sheet_dict[i] = table_to_dict(df) #adds each table to the dictionary
    else:
        sheet_dict = table_to_dict(df_)
    
    return sheet_dict #returns a dataframe

def split_df(df_):
    #For sheets where processing order is important, this function finds tables with '#' in the name of the first column title and turns it into a df
    
    split_idx_mask = df_.iloc[:,0].str.contains('#') #Find the object splits
    w = df_[split_idx_mask].index.values
    
    df_list = []
    
    for i in range(len(w)-1):
        next_df = df_.loc[w[i]+1:w[i+1]-1,:]
        df_list.append(next_df)    
    
    return df_list

def table_to_dict(df_):
    
    main_mask = pd.isna(df_.JSON) # it flags rows that dont have a value for JSON column
    step_dict = dict(df_[main_mask].iloc[:,:2].values) # Stores rows that have "NaN" for JSON column in df_ as dict

    
    
    for json_field in pd.unique(df_.JSON): #read through unique JSON types (e.g. NaN, meta or data)

        if pd.isna(json_field): #ignore fields with JSON type as NaN
            continue
            
        # dictionary to store information with JSON type "data"
        elif json_field=='data':
            data_mask = df_.JSON=='data'
            
            # lump key:value pairs into a second nested data dict
            step_dict['data'] = dict()
            
            for i, s in df_[data_mask].iterrows():
                step_dict['data'][s[s.index[0]]] = s['value':'error_type'].dropna().to_dict()
        else:
            json_mask = df_.JSON==json_field
            step_dict[json_field] = dict(df_[json_mask].iloc[:,:2].values) # creates a new key for JSON types like meta and params and adds its corresponding values to it 

    return step_dict

# f = pd.ExcelFile(fpath)


### Reading and Extracting Data From Sheets in Template

In [66]:
#Reading Data From Sheets in Template

fpath = r'..\db_feed\v6_example.xlsx' #Add path for template file

#Storing each sheet in the template file as a dictionary
exp_info = read_sheet(fpath, 'Data Origin')
solution_makeup = read_sheet(fpath, 'Solution Makeup', ordering=True)
solution_processing = read_sheet(fpath, 'Solution Treatment', ordering=True)
device_fab = read_sheet(fpath, 'Device Fabrication')
substrate_pretreat = read_sheet(fpath, 'Substrate Pretreat', ordering=True)
coating_process = read_sheet(fpath, 'Coating Process')
post_process = read_sheet(fpath, 'Post-Processing', ordering=True)
device_meas = read_sheet(fpath, 'Device Measurement', usecols="A:G", ordering=True)
other_meas = read_sheet(fpath, 'Other Measurements', usecols="A:G", ordering=True)

In [67]:
#Use this code block to check how each sheet has been converted to a dictionary
solution_makeup

{0: {'entity_type': 'solution', 'concentration': 4},
 1: {'entity_type': 'solvent',
  'iupac_name': 'toluene',
  'pubchem_cid': 1140,
  'vol_frac': 1,
  'meta': {'supplier': 'Sigma Aldrich', 'batch_number': 's1234'}},
 2: {'entity_type': 'polymer',
  'common_name': 'DPP-DTT',
  'iupac_name': 'poly[2,5-(2-octyldodecyl)-3,6-diketopyrrolopyrrole-alt-5,5-(2,5-di(thien-2-yl)thieno [3,2-b]thiophene)]',
  'mn': 55,
  'mw': 199,
  'dispersity': 3.62,
  'wt_frac': 1,
  'meta': {'supplier': 'Ossila', 'batch_number': 'M0311A2'}}}

### Transferring Information From Template To PostgreSQL

In [77]:
# Postgres python
from psycopg2.extras import Json 

# Adapters necessary for converting python data types to PostgreSQL compatible data types 
def addapt_numpy_float64(numpy_float64):
    return AsIs(numpy_float64)

def addapt_numpy_int64(numpy_int64):
    return AsIs(numpy_int64)

def nan_to_null(f,
        _NULL=AsIs('NULL'),
        _Float=pg.extensions.Float):
    if not np.isnan(f):
        return _Float(f)
    return _NULL

pg.extensions.register_adapter(np.float64, addapt_numpy_float64)
pg.extensions.register_adapter(np.int64, addapt_numpy_int64)
pg.extensions.register_adapter(float, nan_to_null)

param_dict = {
    "host"      : "127.0.0.1",
    "database"  : "ofetdb_testenv_RV",
    "user"      : "postgres",
    "password"  : "Rahul2411!",
    "port"      : "5432",
}

def connect(params_dict):
    """ Connect to the PostgreSQL database server """
    conn = None
    try:
        # connect to the PostgreSQL server
        print('Connecting to the PostgreSQL database...')
        conn = pg.connect(**params_dict)
    except (Exception, pg.DatabaseError) as error:
        print(error)
        sys.exit(1) 
    print("Connection successful")
    return conn

def pg_query(sql, tup):
    
    try:
        # Database connection
        conn = connect(param_dict)
        cur = conn.cursor()
        
        
        
        # Pass SQL query, using string and placeholders
        cur.execute(sql, tup)
        
        # Fetch result
        fetched = cur.fetchone()[0]
        
        # Commit result
        conn.commit()
        print("Operation Successful")

        cur.close()
        conn.close()
        
    except (Exception, pg.DatabaseError) as error:
        # If database connection unsuccessful, then close connection 
        print("Error: %s" % error)
        conn.rollback()
        cur.close()
        conn.close()
    
    return fetched #return query result

In [11]:
from psycopg2.extras import Json

def convert_entry(entry_dict):
    
    #This function reads a dictionary and extracts the column names and values from it
    
    pg_entry = entry_dict
    for key in pg_entry.keys():
        if type(pg_entry[key])==dict:
            pg_entry[key]=Json(pg_entry[key])
    columns = pg_entry.keys()
    values = [pg_entry[column] for column in columns]
    
    return pg_entry, columns, values


### 1.Checking and Storing Experiment Information

In [127]:
import psycopg2
from psycopg2 import _json

pg_entry, columns, values = convert_entry(exp_info)

# Access the JSON data directly
# citation_type_data = values[0]
# meta_data = values[1].adapted
# print(citation_type_data)
# print(meta_data)



In [158]:
print(type(pg_entry))
print(type(columns))
print(columns)
print(type(values))
print(values)

<class 'dict'>
<class 'dict_keys'>
dict_keys([0, 1, 2])
<class 'list'>
[<psycopg2._json.Json object at 0x00000288F9EC3BA8>, <psycopg2._json.Json object at 0x00000288FA0FFC50>, <psycopg2._json.Json object at 0x00000288F9EC3B38>]


In [114]:
sql = '''
    INSERT INTO experiment_info (%s) 
    VALUES %s
    ON CONFLICT (citation_type, meta) DO UPDATE
    SET (%s) = %s
    RETURNING exp_id
    
    '''
tup = (AsIs(','.join(columns)), tuple(values), AsIs(','.join(columns)), tuple(values))



exp_id = pg_query(sql, tup)
exp_id


Connecting to the PostgreSQL database...
Connection successful
Operation Successful


[(1,)]

In [27]:
###### Dont forget to assign the exp_id to sample table

### 2.Checking and Storing Solution Information (Polymer, Solvent, Solution)

In [130]:
import psycopg2
from psycopg2 import _json

pg_entry, columns, values = convert_entry(solution_makeup)


# Access the JSON data directly
solution_data = values[0]
solvent_data = values[1].adapted
polymer_data = values[2].adapted

print(solution_data)
print(solvent_data)
print(type(solvent_data))
print(polymer_data)

'{"entity_type": "solution", "concentration": 4}'
{'entity_type': 'solvent', 'iupac_name': 'toluene', 'pubchem_cid': 1140, 'vol_frac': 1, 'meta': <psycopg2._json.Json object at 0x00000288F9F467B8>}
<class 'dict'>
{'entity_type': 'polymer', 'common_name': 'DPP-DTT', 'iupac_name': 'poly[2,5-(2-octyldodecyl)-3,6-diketopyrrolopyrrole-alt-5,5-(2,5-di(thien-2-yl)thieno [3,2-b]thiophene)]', 'mn': 55, 'mw': 199, 'dispersity': 3.62, 'wt_frac': 1, 'meta': {'supplier': 'Ossila', 'batch_number': 'M0311A2'}}


###### 2.1 Solvent Information 

In [155]:
desired_keys = ['pubchem_cid', 'iupac_name','meta']

solvent_data = {key: solvent_data[key] for key in desired_keys if key in solvent_data}

print(solvent_data)
print(type(solvent_data))

{'pubchem_cid': 1140, 'iupac_name': 'toluene', 'meta': <psycopg2._json.Json object at 0x00000288F9F467B8>}
<class 'dict'>


In [156]:
pg_entry_solvent, solvent_columns, solvent_values = convert_entry(solvent_data)

print(solvent_columns)
print(type(solvent_columns))
print(solvent_values)
print(type(solvent_values))

# #we only need pubchem_cid and iupac name for columns
# solvent_columns = list(solvent_columns)
# solvent_columns = (solvent_columns[2],solvent_columns[1],solvent_columns[4]) #we only need 
# solvent_columns = list(solvent_columns)
# print(solvent_columns)

# #we only need pubchem_cid and iupac name values
# solvent_values =(solvent_values[2],solvent_values[1],solvent_values[4]) 
# solvent_values = list(solvent_values)
# print(solvent_values)

dict_keys(['pubchem_cid', 'iupac_name', 'meta'])
<class 'dict_keys'>
[1140, 'toluene', <psycopg2._json.Json object at 0x00000288F9F467B8>]
<class 'list'>


In [161]:
## Figure out how to do the on conflict thing

sql = '''
    INSERT INTO solvent (%s) 
    VALUES %s
    RETURNING pubchem_cid
    '''
tup = (AsIs(','.join(solvent_columns)), tuple(solvent_values))

pubchem_cid = pg_query(sql, tup)
#pubchem_cid

Connecting to the PostgreSQL database...
Connection successful
Operation Successful


In [85]:
import psycopg2

# Connect to the PostgreSQL database
conn = psycopg2.connect(
    host="127.0.0.1",
    database="ofetdb_testenv_RV",
    user="postgres",
    password="Rahul2411!",
    port ="5432"
)
cursor = conn.cursor()


# Insert data into the "solvent" table
def insert_into_solvent(columns, values):
    column_names = ', '.join(columns)
    placeholders = ', '.join(['%s'] * len(values))

    insert_query = f'''
        INSERT INTO solvent ({column_names})
        VALUES ({placeholders})
    '''
    cursor.execute(insert_query, values)
    conn.commit()

# Example usage:

insert_into_solvent(solvent_columns, solvent_values)

# Close the database connection
cursor.close()
conn.close()


In [111]:
sql = '''
    INSERT INTO solvent (%s) 
    VALUES %s
    ON CONFLICT (iupac_name, meta) DO UPDATE
    SET (%s) = %s
    RETURNING pubchem_cid
    
    '''
tup = (AsIs(','.join(solvent_columns)), tuple(solvent_values), AsIs(','.join(solvent_columns)), tuple(solvent_values))

#pubchem_cid = pg_query(sql, tup)
#pubchem_cid

In [113]:
tup[0]

<psycopg2.extensions.AsIs at 0x288f9f28be8>

#### 3. Checking and Storing Device Information

#### 4. Checking and Storing Film Deposition Information 

#### 5. Checking and Storing the subprocess recipes (Solution Treatment, Substrate Pretreatment, Post Process)

#### 6. Checking and Storing information to the OFET_PROCESS TABLE

#### 7. Checking and Storing information to the SAMPLE TABLE

#### 8. Checking and Storing the measurement information 