## Python Code to Extract Data From Template and Transfer to PostgreSQL
#### Authors : Aaron Liu, Rahul Venkatesh, Jessica Bonsu, Myeongyeon Lee 
##### Date Edited : 06-07-2023

In [35]:
## Required Packages

import pandas as pd
import numpy as np
import psycopg2 as pg

import os
from psycopg2.extras import Json
from psycopg2.extensions import AsIs
import functools
import json
import sys

import requests
# import bibtexparser
import pprint

In [36]:
## Required Functions To Extract Information from Template

# Function to remove rows that have no value (NaN) in the second column
def remove_emptyrows(df):
    """Return only the rows of ``df`` whose second column holds a value (is not NaN)."""
    has_value = df.iloc[:, 1].notna()
    return df.loc[has_value]

# Function to convert a sheet into dictionary data type
def read_sheet(filepath, sheet_name, ordering=False, usecols="A,B,D", meas=False):
    """Read one sheet of the Excel template and convert it into a dictionary.

    Parameters
    ----------
    filepath : path of the Excel template file.
    sheet_name : name of the sheet to read.
    ordering : set to True for sheets where the order of the processing steps
        matters (e.g. solution processing or substrate pretreatment). The sheet
        is then split with ``split_df`` and the result is keyed by table position.
    usecols : Excel columns to load, forwarded to ``pandas.read_excel``.
    meas : accepted for interface compatibility; not used by this function.

    Returns
    -------
    dict
        ``table_to_dict`` output for the whole sheet, or, when ``ordering`` is
        True, ``{position: table_to_dict(table)}`` for every table in the sheet.

    TODO: add an argument to decide whether or not to bracket the sheet.
    """
    raw = pd.read_excel(filepath, sheet_name=sheet_name, usecols=usecols)

    # Rows without a value in the second column carry no information
    populated = remove_emptyrows(raw)

    # Unordered sheets are converted as a single table
    if ordering != True:
        return table_to_dict(populated)

    # Ordered sheets: one dictionary per table, keyed by its position in the sheet
    return {
        position: table_to_dict(table)
        for position, table in enumerate(split_df(populated))
    }

def split_df(df_):
    """Split an ordered sheet into one DataFrame per '#'-delimited table.

    Rows whose first column contains '#' are treated as delimiters. Each
    returned frame holds the rows strictly between two consecutive delimiter
    rows, so rows after the final delimiter are not returned (the sheet is
    expected to end with a delimiter row).

    Parameters
    ----------
    df_ : DataFrame whose first column holds the row labels and whose index
        labels are integers (the slicing below does arithmetic on them).

    Returns
    -------
    list of DataFrame, in sheet order.
    """
    # na=False: empty or non-string cells in the first column are never
    # delimiters. Without it the mask contains NaN and boolean indexing raises.
    split_idx_mask = df_.iloc[:, 0].str.contains('#', na=False)
    w = df_[split_idx_mask].index.values

    df_list = []

    for i in range(len(w) - 1):
        # .loc slicing is label based and inclusive, hence the +1 / -1 to
        # exclude the delimiter rows themselves
        next_df = df_.loc[w[i] + 1:w[i + 1] - 1, :]
        df_list.append(next_df)

    return df_list

def table_to_dict(df_):
    """Convert one table of the template into a (possibly nested) dictionary.

    Rows with no entry in the ``JSON`` column become top-level key/value pairs
    taken from the first two columns. Rows tagged ``'data'`` are collected under
    ``result['data']``, keyed by the first column, each holding the non-empty
    cells from the ``value`` column through the ``error_type`` column. Rows with
    any other tag (e.g. meta, params) are collected under that tag as key/value
    pairs from the first two columns.
    """
    json_col = df_['JSON']
    name_col = df_.columns[0]

    # Untagged rows form the flat, top-level part of the dictionary
    untagged = df_[json_col.isna()]
    step_dict = dict(untagged.iloc[:, :2].values)

    for tag in pd.unique(json_col):
        # NaN marks the untagged rows, which are already handled
        if pd.isna(tag):
            continue

        tagged = df_[json_col == tag]

        if tag == 'data':
            # One nested record per row: value ... error_type, empty cells dropped
            step_dict['data'] = {
                row[name_col]: row['value':'error_type'].dropna().to_dict()
                for _, row in tagged.iterrows()
            }
        else:
            # Any other tag groups its rows as plain key/value pairs
            step_dict[tag] = dict(tagged.iloc[:, :2].values)

    return step_dict

# f = pd.ExcelFile(fpath)


### Reading and Extracting Data From Sheets in Template

In [80]:
#Reading Data From Sheets in Template

# Path of the template file. Built with os.path.join so the relative path also
# resolves on non-Windows systems (a literal '..\db_feed\...' only works on Windows).
fpath = os.path.join('..', 'db_feed', 'v6_example_1_real.xlsx') #Add path for template file

#Storing each sheet in the template file as a dictionary
# ordering=True is passed for sheets that read_sheet should split into
# position-keyed tables; the measurement sheets also load columns A through G.
exp_info = read_sheet(fpath, 'Data Origin')
solution_makeup = read_sheet(fpath, 'Solution Makeup', ordering=True)
solution_processing = read_sheet(fpath, 'Solution Treatment', ordering=True)
device_fab = read_sheet(fpath, 'Device Fabrication')
substrate_pretreat = read_sheet(fpath, 'Substrate Pretreat', ordering=True)
coating_process = read_sheet(fpath, 'Coating Process')
post_process = read_sheet(fpath, 'Post-Processing', ordering=True)
device_meas = read_sheet(fpath, 'Device Measurement', usecols="A:G", ordering=True)
other_meas = read_sheet(fpath, 'Other Measurements', usecols="A:G", ordering=True)

In [81]:
# Display one converted sheet to check how it was turned into a dictionary.
# Replace the name with any other sheet variable loaded above to inspect it.
solution_makeup

{0: {'entity_type': 'solution', 'concentration': 4},
 1: {'entity_type': 'solvent',
  'iupac_name': '1,2-dichlorobenzene',
  'pubchem_cid': 7239,
  'vol_frac': 1},
 2: {'entity_type': 'polymer',
  'common_name': 'DPP-DTT',
  'iupac_name': 'poly[2,5-(2-octyldodecyl)-3,6-diketopyrrolopyrrole-alt-5,5-(2,5-di(thien-2-yl)thieno [3,2-b]thiophene)]',
  'mn': 55,
  'mw': 199,
  'dispersity': 3.62,
  'wt_frac': 0.6},
 3: {'entity_type': 'polymer',
  'common_name': 'PS',
  'iupac_name': 'poly(styrene)',
  'mn': 2.18,
  'mw': 2.2,
  'dispersity': 1.01,
  'wt_frac': 0.4}}

### Transferring Information From Template To PostgreSQL

In [82]:
# Postgres python
from psycopg2.extras import Json 

# Adapters necessary for converting python data types to PostgreSQL compatible data types 
def addapt_numpy_float64(numpy_float64):
    """Adapt a ``numpy.float64`` so psycopg2 can pass it as a query parameter.

    NaN is rendered as SQL ``NULL``, matching what ``nan_to_null`` does for
    built-in floats; without this a NaN would be written into the statement as
    the bare word ``nan``. Every other value is rendered verbatim.
    """
    if np.isnan(numpy_float64):
        return AsIs('NULL')
    return AsIs(numpy_float64)

def addapt_numpy_int64(numpy_int64):
    """Adapt a ``numpy.int64`` by wrapping it in ``AsIs`` (rendered verbatim in SQL)."""
    sql_literal = AsIs(numpy_int64)
    return sql_literal

def nan_to_null(f,
        _NULL=AsIs('NULL'),
        _Float=pg.extensions.Float):
    """Adapt a float for psycopg2: NaN becomes SQL NULL, anything else a regular Float.

    ``_NULL`` and ``_Float`` are bound once as default arguments; callers are
    expected to pass only ``f``.
    """
    return _NULL if np.isnan(f) else _Float(f)

# Register the adapters defined above with psycopg2 so that numpy scalars and
# built-in floats can be passed directly as query parameters.
pg.extensions.register_adapter(np.float64, addapt_numpy_float64)  # numpy floats
pg.extensions.register_adapter(np.int64, addapt_numpy_int64)  # numpy integers
pg.extensions.register_adapter(float, nan_to_null)  # built-in floats: NaN -> NULL

# Connection parameters for the PostgreSQL server.
# Credentials must not be stored in the notebook: every value is read from the
# standard libpq environment variables, with the previous local-development
# values kept as defaults for the non-secret settings. The password has no
# default; set PGPASSWORD before starting the kernel.
# NOTE(review): an unset PGPASSWORD gives password=None here - confirm that the
# psycopg2 version in use drops None-valued connection parameters.
param_dict = {
    "host"      : os.environ.get("PGHOST", "127.0.0.1"),
    "database"  : os.environ.get("PGDATABASE", "ofetdb_testenv_RV"),
    "user"      : os.environ.get("PGUSER", "postgres"),
    "password"  : os.environ.get("PGPASSWORD"),
    "port"      : os.environ.get("PGPORT", "5432"),
}

def connect(params_dict):
    """Open a connection to the PostgreSQL server described by ``params_dict``.

    ``params_dict`` is forwarded to ``psycopg2.connect`` as keyword arguments.
    On failure the error is printed and the interpreter is asked to exit with
    status 1; on success the open connection is returned.
    """
    try:
        print('Connecting to the PostgreSQL database...')
        connection = pg.connect(**params_dict)
    except Exception as error:  # pg.DatabaseError is an Exception subclass
        print(error)
        sys.exit(1)
    else:
        print("Connection successful")
        return connection

def pg_query(sql, tup):
    """Run one SQL statement in its own connection and return its first result value.

    Parameters
    ----------
    sql : SQL text with psycopg2 ``%s`` placeholders.
    tup : tuple of values bound to the placeholders.

    Returns
    -------
    The first column of the first returned row (e.g. the id from a
    ``RETURNING`` clause), or ``None`` when the statement returns no rows.

    Raises
    ------
    Any error raised while executing the statement. The transaction is rolled
    back and the error is printed before it is re-raised. Previously the error
    was swallowed and the function then failed with an unrelated
    ``UnboundLocalError`` on ``fetched``.
    """
    # Database connection (connect() exits the process if it cannot connect)
    conn = connect(param_dict)
    cur = None

    try:
        cur = conn.cursor()

        # Pass SQL query, using string and placeholders
        cur.execute(sql, tup)

        # Fetch result; cur.description is None for statements that produce no
        # result set, in which case there is nothing to fetch
        row = cur.fetchone() if cur.description is not None else None
        fetched = row[0] if row is not None else None

        # Commit result
        conn.commit()
        print("Operation Successful")
        return fetched

    except (Exception, pg.DatabaseError) as error:
        # Undo the failed transaction, report, and let the caller see the real error
        print("Error: %s" % error)
        conn.rollback()
        raise

    finally:
        # Always release the cursor and the connection, on success and on failure
        if cur is not None:
            cur.close()
        conn.close()

In [83]:
from psycopg2.extras import Json

def convert_entry(entry_dict):
    """Split a dictionary into the column names and values for an SQL insert.

    Nested dictionaries are wrapped in psycopg2's ``Json`` so they can be bound
    as JSON parameters. The input dictionary is left untouched: the conversion
    is done on a new dictionary (previously the caller's dictionary was
    modified in place).

    Parameters
    ----------
    entry_dict : dict mapping column names to values.

    Returns
    -------
    pg_entry : dict
        Copy of ``entry_dict`` with nested dicts wrapped in ``Json``.
    columns : dict_keys
        The keys of ``pg_entry``.
    values : list
        The values of ``pg_entry``, in the same order as ``columns``.
    """
    pg_entry = {
        key: Json(value) if isinstance(value, dict) else value
        for key, value in entry_dict.items()
    }
    columns = pg_entry.keys()
    values = list(pg_entry.values())

    return pg_entry, columns, values


###### Doubt 1 : 

I made a new database. We were not able to add any new records to the old database.

### 1.Checking and Storing Experiment Information

In [84]:
import psycopg2
from psycopg2 import _json

# Split the experiment-information dictionary into the converted entry, its
# column names and its values for the INSERT statement.
exp_pg_entry, exp_columns, exp_values = convert_entry(exp_info)

# Uncomment to inspect the converted entry:
#print(type(exp_pg_entry))
#print(type(exp_columns))
#print(exp_columns)
#print(type(exp_values))
#print(exp_values)

In [85]:
# Insert the experiment record. On a (citation_type, meta) conflict the same
# columns are updated with the same values, and exp_id is returned.
sql = '''
    INSERT INTO experiment_info (%s) 
    VALUES %s
    ON CONFLICT (citation_type, meta) DO UPDATE
    SET (%s) = %s
    RETURNING exp_id
    
    '''

# The column list and the value tuple fill two placeholder pairs each
# (INSERT ... VALUES and SET ... =), so the pair is simply repeated.
tup = (AsIs(','.join(exp_columns)), tuple(exp_values)) * 2

exp_id = pg_query(sql, tup)
exp_id


Connecting to the PostgreSQL database...
Connection successful
Operation Successful


1

In [86]:
###### Don't forget to assign the exp_id to the SAMPLE table

### 2.Checking and Storing Solution Information (Polymer, Solvent, Solution)

In [87]:
## Last stop - current code doesnt account for multiple polymers and solvents. work on that. Use the entity type.


import psycopg2
from psycopg2 import _json

def _first_entry_of_type(entries, entity_type):
    """Return the first makeup entry whose 'entity_type' equals ``entity_type``."""
    for entry in entries:
        if entry.get('entity_type') == entity_type:
            return entry
    raise ValueError("No '%s' entry found in the solution makeup" % entity_type)


pg_entry, columns, values = convert_entry(solution_makeup)


# Access the JSON data directly (.adapted is the dict wrapped by Json)
makeup_entries = [value.adapted for value in values]

# Select each entry by its entity_type instead of by position, so the result
# does not depend on the order of the tables in the sheet.
# TODO: only the first solvent and the first polymer are used; additional
# solvents/polymers in the sheet are still ignored by the cells below.
solution_data = _first_entry_of_type(makeup_entries, 'solution')
solvent_data = _first_entry_of_type(makeup_entries, 'solvent')
polymer_data = _first_entry_of_type(makeup_entries, 'polymer')

print(solution_data)
print(solvent_data)
#print(type(solvent_data))
print(polymer_data)

{'entity_type': 'solution', 'concentration': 4}
{'entity_type': 'solvent', 'iupac_name': '1,2-dichlorobenzene', 'pubchem_cid': 7239, 'vol_frac': 1}
{'entity_type': 'polymer', 'common_name': 'DPP-DTT', 'iupac_name': 'poly[2,5-(2-octyldodecyl)-3,6-diketopyrrolopyrrole-alt-5,5-(2,5-di(thien-2-yl)thieno [3,2-b]thiophene)]', 'mn': 55, 'mw': 199, 'dispersity': 3.62, 'wt_frac': 0.6}


###### 2.1 Storing Solvent Information in SOLVENT table

In [88]:
# Keep only the fields that belong in the SOLVENT table (missing ones are skipped)

solvent_desired_keys = ['pubchem_cid', 'iupac_name','meta']

SOLVENT_data = dict(
    (field, solvent_data[field])
    for field in solvent_desired_keys
    if field in solvent_data
)

print(SOLVENT_data, type(SOLVENT_data), sep='\n')

{'pubchem_cid': 7239, 'iupac_name': '1,2-dichlorobenzene'}
<class 'dict'>


In [89]:
# Column names and values of the SOLVENT record, ready for the INSERT statement

pg_entry_solvent, solvent_columns, solvent_values = convert_entry(SOLVENT_data)

print(
    solvent_columns,
    type(solvent_columns),
    solvent_values,
    type(solvent_values),
    sep='\n',
)
print(solvent_values[0])

dict_keys(['pubchem_cid', 'iupac_name'])
<class 'dict_keys'>
[7239, '1,2-dichlorobenzene']
<class 'list'>
7239


###### DOUBT 2 

Currently the SOLVENT table has a UNIQUE constraint, which means there can only be one chloroform entry. But what if we have two chloroforms from different vendors with different meta information? Shouldn't we store both and assign each one an ID?

In [90]:
# Insert the solvent record into the SOLVENT table. On an (iupac_name, meta)
# conflict the same columns are updated, and pubchem_cid is returned.

sql = '''
    INSERT INTO SOLVENT (%s) 
    VALUES %s
    ON CONFLICT (iupac_name, meta) DO UPDATE
    SET (%s) = %s
    RETURNING pubchem_cid
    
    '''

# Column list + value tuple, repeated for the INSERT and the SET placeholders
tup = (AsIs(','.join(solvent_columns)), tuple(solvent_values)) * 2

pubchem_cid = pg_query(sql, tup)
pubchem_cid

Connecting to the PostgreSQL database...
Connection successful
Operation Successful


7239

###### 2.2 Storing polymer Information in POLYMER table

In [91]:
# Keep only the fields that belong in the POLYMER table (missing ones are skipped)

polymer_desired_keys = ['common_name', 'iupac_name','mw','mn','dispersity','meta']

POLYMER_data = dict(
    (field, polymer_data[field])
    for field in polymer_desired_keys
    if field in polymer_data
)

print(POLYMER_data, type(POLYMER_data), sep='\n')

{'common_name': 'DPP-DTT', 'iupac_name': 'poly[2,5-(2-octyldodecyl)-3,6-diketopyrrolopyrrole-alt-5,5-(2,5-di(thien-2-yl)thieno [3,2-b]thiophene)]', 'mw': 199, 'mn': 55, 'dispersity': 3.62}
<class 'dict'>


In [92]:
# Column names and values of the POLYMER record, ready for the INSERT statement

pg_entry_polymer, polymer_columns, polymer_values = convert_entry(POLYMER_data)

print(
    polymer_columns,
    type(polymer_columns),
    polymer_values,
    type(polymer_values),
    sep='\n',
)

dict_keys(['common_name', 'iupac_name', 'mw', 'mn', 'dispersity'])
<class 'dict_keys'>
['DPP-DTT', 'poly[2,5-(2-octyldodecyl)-3,6-diketopyrrolopyrrole-alt-5,5-(2,5-di(thien-2-yl)thieno [3,2-b]thiophene)]', 199, 55, 3.62]
<class 'list'>


In [54]:
# Insert the polymer record into the POLYMER table. On a conflict over the
# listed columns the same columns are updated, and polymer_id is returned.

sql = '''
INSERT INTO POLYMER (%s) VALUES %s
ON CONFLICT(common_name,iupac_name,Mn,Mw,dispersity,meta) DO UPDATE
SET (%s) = %s
RETURNING polymer_id
'''

# Column list + value tuple, repeated for the INSERT and the SET placeholders
tup = (AsIs(','.join(polymer_columns)), tuple(polymer_values)) * 2

polymer_id = pg_query(sql, tup)
polymer_id

Connecting to the PostgreSQL database...
Connection successful
Operation Successful


1

###### 2.3 Storing solution Information in SOLUTION table

In [65]:
# Keep only the fields that belong in the SOLUTION table (missing ones are skipped)

solution_desired_keys = ['concentration']

SOLUTION_data = dict(
    (field, solution_data[field])
    for field in solution_desired_keys
    if field in solution_data
)

print(SOLUTION_data, type(SOLUTION_data), sep='\n')

{'concentration': 4}
<class 'dict'>


In [66]:
# Column names and values of the SOLUTION record, ready for the INSERT statement

pg_entry_solution, solution_columns, solution_values = convert_entry(SOLUTION_data)

print(
    solution_columns,
    type(solution_columns),
    solution_values,
    type(solution_values),
    sep='\n',
)

dict_keys(['concentration'])
<class 'dict_keys'>
[4]
<class 'list'>


In [60]:
# Insert the solution record into the SOLUTION table and keep the new solution_id.
# There is no ON CONFLICT clause here, so only one placeholder pair is filled.

sql = '''
INSERT INTO SOLUTION (%s) VALUES %s
RETURNING solution_id
'''

column_list = ','.join(solution_columns)
tup = AsIs(column_list), tuple(solution_values)
del column_list  # helper name only; keep the notebook namespace unchanged

solution_id = pg_query(sql, tup)
solution_id

Connecting to the PostgreSQL database...
Connection successful
Operation Successful


1

###### 2.4 Storing Solvent Information in SOLUTION_MAKEUP_SOLVENT table

In [67]:
# Keep only the volume fraction, the solvent field stored in SOLUTION_MAKEUP_SOLVENT

solution_makeup_solvent_desired_keys = ['vol_frac']

SOLUTION_MAKEUP_SOLVENT_data = dict(
    (field, solvent_data[field])
    for field in solution_makeup_solvent_desired_keys
    if field in solvent_data
)

print(SOLUTION_MAKEUP_SOLVENT_data, type(SOLUTION_MAKEUP_SOLVENT_data), sep='\n')

{'vol_frac': 1}
<class 'dict'>


In [69]:
# Column names and values of the SOLUTION_MAKEUP_SOLVENT record

pg_entry_solution_makeup_solvent, solution_makeup_solvent_columns, solution_makeup_solvent_values = convert_entry(SOLUTION_MAKEUP_SOLVENT_data)

print(
    solution_makeup_solvent_columns,
    type(solution_makeup_solvent_columns),
    solution_makeup_solvent_values,
    type(solution_makeup_solvent_values),
    sep='\n',
)

dict_keys(['vol_frac'])
<class 'dict_keys'>
[1]
<class 'list'>


In [71]:
# Insert into SOLUTION_MAKEUP_SOLVENT table

import psycopg2

# Link the solution to its solvent in SOLUTION_MAKEUP_SOLVENT.
#
# The ids are the ones returned by the SOLUTION and SOLVENT inserts above
# (solution_id, pubchem_cid). Previously this cell ran "SELECT ... FROM <table>"
# without any filter and took whichever row came back first, which links the
# wrong records as soon as either table holds more than one row, and it also
# overwrote solution_id with that arbitrary value.

# Define the record values to be inserted
vol_frac = solution_makeup_solvent_values[0]
solvent_id = pubchem_cid

# Define the SQL query to insert a record into the SOLUTION_MAKEUP_SOLVENT table
insert_query = '''
            INSERT INTO SOLUTION_MAKEUP_SOLVENT (solution_id, solvent_id, vol_frac)
            VALUES (%s, %s, %s)
        '''

# Establish a connection to the PostgreSQL database
conn = pg.connect(**param_dict)
try:
    # "with conn" commits on success and rolls back on error;
    # "with conn.cursor()" closes the cursor when the block ends
    with conn:
        with conn.cursor() as cur:
            cur.execute(insert_query, (solution_id, solvent_id, vol_frac))
finally:
    # The connection itself is not closed by "with conn", so close it here
    conn.close()


###### 2.5 Storing polymer Information in SOLUTION_MAKEUP_POLYMER table

In [73]:
# Keep only the weight fraction, the polymer field stored in SOLUTION_MAKEUP_POLYMER

solution_makeup_polymer_desired_keys = ['wt_frac']

SOLUTION_MAKEUP_POLYMER_data = dict(
    (field, polymer_data[field])
    for field in solution_makeup_polymer_desired_keys
    if field in polymer_data
)

print(SOLUTION_MAKEUP_POLYMER_data, type(SOLUTION_MAKEUP_POLYMER_data), sep='\n')

{'wt_frac': 1}
<class 'dict'>


In [74]:
# Column names and values of the SOLUTION_MAKEUP_POLYMER record

pg_entry_solution_makeup_polymer, solution_makeup_polymer_columns, solution_makeup_polymer_values = convert_entry(SOLUTION_MAKEUP_POLYMER_data)

print(
    solution_makeup_polymer_columns,
    type(solution_makeup_polymer_columns),
    solution_makeup_polymer_values,
    type(solution_makeup_polymer_values),
    sep='\n',
)

dict_keys(['wt_frac'])
<class 'dict_keys'>
[1]
<class 'list'>


In [75]:
import psycopg2

# Link the solution to its polymer in SOLUTION_MAKEUP_POLYMER.
#
# The ids are the ones returned by the SOLUTION and POLYMER inserts above
# (solution_id, polymer_id). Previously this cell ran "SELECT ... FROM <table>"
# without any filter and took whichever row came back first, which links the
# wrong records as soon as either table holds more than one row, and it also
# overwrote solution_id and polymer_id with those arbitrary values.

# Define the record values to be inserted
wt_frac = solution_makeup_polymer_values[0]

# Define the SQL query to insert a record into the SOLUTION_MAKEUP_POLYMER table
insert_query = '''
            INSERT INTO SOLUTION_MAKEUP_POLYMER (solution_id, polymer_id, wt_frac)
            VALUES (%s, %s, %s)
        '''

# Establish a connection to the PostgreSQL database
conn = pg.connect(**param_dict)
try:
    # "with conn" commits on success and rolls back on error;
    # "with conn.cursor()" closes the cursor when the block ends
    with conn:
        with conn.cursor() as cur:
            cur.execute(insert_query, (solution_id, polymer_id, wt_frac))
finally:
    # The connection itself is not closed by "with conn", so close it here
    conn.close()


### 3. Checking and Storing Device Information

In [77]:
import psycopg2
from psycopg2 import _json

# Column names and values of the DEVICE_FABRICATION record
device_fab_pg_entry, device_fab_columns, device_fab_values = convert_entry(device_fab)

# Optional extra checks, left disabled:
#print(type(device_fab_pg_entry))
#print(type(device_fab_columns))
print(
    device_fab_columns,
    type(device_fab_values),
    device_fab_values,
    sep='\n',
)

<class 'dict'>
<class 'dict_keys'>
dict_keys(['params', 'meta'])
<class 'list'>
[<psycopg2._json.Json object at 0x000002A1122A7DA0>, <psycopg2._json.Json object at 0x000002A1122A7D68>]


In [78]:
# Insert the device-fabrication record. On a (params, meta) conflict the same
# columns are updated, and device_fab_id is returned.
sql = '''
    INSERT INTO DEVICE_FABRICATION (%s) 
    VALUES %s
    ON CONFLICT (params, meta) DO UPDATE
    SET (%s) = %s
    RETURNING device_fab_id
    
    '''

# Column list + value tuple, repeated for the INSERT and the SET placeholders
tup = (AsIs(','.join(device_fab_columns)), tuple(device_fab_values)) * 2

device_fab_id = pg_query(sql, tup)
device_fab_id

Connecting to the PostgreSQL database...
Connection successful
Operation Successful


1

### 4. Checking and Storing Film Deposition Information 

In [79]:
import psycopg2
from psycopg2 import _json

# Column names and values of the FILM_DEPOSITION record
coating_process_pg_entry, coating_process_columns, coating_process_values = convert_entry(coating_process)

# Optional extra checks, left disabled:
#print(type(coating_process_pg_entry))
#print(type(coating_process_columns))
print(
    coating_process_columns,
    type(coating_process_values),
    coating_process_values,
    sep='\n',
)

dict_keys(['deposition_type'])
<class 'list'>
['spin']


In [None]:
## Doubt: in the "create OFET DB" code, why is film_deposition_id part of the UNIQUE constraint?

# Insert the film-deposition record. On a (deposition_type, params, meta)
# conflict the same columns are updated.
# The statement now returns film_deposition_id, the id this cell stores; it
# previously asked for device_fab_id, copied over from the DEVICE_FABRICATION
# insert.
# NOTE(review): confirm the id column name against the FILM_DEPOSITION table definition.
sql = '''
    INSERT INTO FILM_DEPOSITION (%s) 
    VALUES %s
    ON CONFLICT (deposition_type, params, meta) DO UPDATE
    SET (%s) = %s
    RETURNING film_deposition_id
    
    '''
tup = (AsIs(','.join(coating_process_columns)), tuple(coating_process_values), AsIs(','.join(coating_process_columns)), tuple(coating_process_values))



film_deposition_id = pg_query(sql, tup)
film_deposition_id

### 5. Checking and Storing the subprocess recipes (Solution Treatment, Substrate Pretreatment, Post Process)

### 6. Checking and Storing information to the OFET_PROCESS TABLE

### 7. Checking and Storing information to the SAMPLE TABLE

### 8. Checking and Storing the measurement information 