## Python Code to Extract Data From Template and Transfer to PostgreSQL
#### Authors : Aaron Liu, Rahul Venkatesh, Jessica Bonsu, Myeongyeon Lee 
##### Date Edited : 09-27-2023

In [1]:
## Required Packages

import pandas as pd
import numpy as np
import psycopg2 as pg

import os
from psycopg2.extras import Json
from psycopg2.extensions import AsIs
import functools
import json
import sys

import requests
# import bibtexparser
import pprint



In [2]:
## Required Functions To Extract Information from Template

# Function to remove rows that have no value (NaN) in the second column
def remove_emptyrows(df):
    """Return only the rows of ``df`` whose second column holds a value (is not NaN)."""
    has_value = df.iloc[:, 1].notna()
    return df.loc[has_value]

# Function to convert a sheet into dictionary data type
def read_sheet(filepath, sheet_name, ordering=False, usecols="A,B,D", meas=False):
    """Read one sheet of the Excel template and convert it to a dictionary.

    Parameters
    ----------
    filepath : str
        Path of the template workbook, handed to ``pd.read_excel``.
    sheet_name : str
        Name of the sheet to read.
    ordering : bool, default False
        When true, the sheet is cut into consecutive tables with ``split_df``
        and each table is converted separately; the result is keyed by the
        table position (0, 1, ...). Used for sheets such as solution
        processing or substrate pretreatment where the step order matters.
    usecols : str, default "A,B,D"
        Excel columns to load from the sheet.
    meas : bool, default False
        Currently unused; kept so that existing calls keep working.

    Returns
    -------
    dict
        ``table_to_dict`` output for the whole sheet, or a dict of such
        outputs keyed by table position when ``ordering`` is true.
    """
    ## TODO: add an argument to decide whether or not to bracket the sheet

    sheet_df = pd.read_excel(
        filepath,
        sheet_name=sheet_name,
        usecols=usecols
    )

    # Rows without a value in the second column carry no information.
    filled_df = remove_emptyrows(sheet_df)

    if ordering:
        # One dictionary per table, keyed by the order in which it appears.
        return {
            position: table_to_dict(table_df)
            for position, table_df in enumerate(split_df(filled_df))
        }

    return table_to_dict(filled_df)

def split_df(df_):
    """Split a sheet into the tables delimited by '#' marker rows.

    A marker row is any row whose first-column cell contains '#'. The rows
    strictly between two consecutive markers form one table.

    The first column is converted to text before searching, so numeric or
    empty cells in that column are treated as "not a marker" instead of
    producing a NaN mask that cannot be used for indexing.

    Note: rows that follow the last marker are not returned, because tables
    are only built between pairs of markers. The slicing also relies on the
    index being integer labels in ascending order.

    Returns
    -------
    list of DataFrame
        One DataFrame per table, in sheet order (empty list when fewer than
        two markers are present).
    """
    marker_mask = df_.iloc[:, 0].astype(str).str.contains('#', regex=False)
    w = df_[marker_mask].index.values  # index labels of the marker rows

    return [df_.loc[w[i] + 1:w[i + 1] - 1, :] for i in range(len(w) - 1)]

def table_to_dict(df_):
    """Convert one template table into a dictionary.

    Rows whose ``JSON`` cell is empty become top-level ``key: value`` pairs
    taken from the first two columns. Rows tagged with any other JSON label
    (e.g. ``params`` or ``meta``) are grouped into a nested dictionary stored
    under that label. Rows tagged ``data`` are nested one level deeper: each
    row becomes ``{first-column value: {non-empty cells from 'value' through
    'error_type'}}``.
    """
    json_col = df_['JSON']

    # Untagged rows go straight into the top level of the result.
    step_dict = dict(df_.loc[json_col.isna()].iloc[:, :2].values)

    for json_field in json_col.unique():
        if pd.isna(json_field):
            continue

        tagged_rows = df_[json_col == json_field]

        if json_field == 'data':
            # One nested dict per measured quantity.
            step_dict['data'] = {
                row[row.index[0]]: row['value':'error_type'].dropna().to_dict()
                for _, row in tagged_rows.iterrows()
            }
        else:
            step_dict[json_field] = dict(tagged_rows.iloc[:, :2].values)

    return step_dict

# f = pd.ExcelFile(fpath)


### Reading and Extracting Data From Sheets in Template

In [17]:
#Reading Data From Sheets in Template

#fpath = r'..\db_feed\P3HT_PMMA_Nanoporous_1_mms.xlsx' #Add path for template file
#fpath = r'..\db_feed\v6_example.xlsx' #Add path for template file
#fpath = r'..\db_feed\v6_example_blend.xlsx' #Add path for template file
#fpath = r'..\db_feed\v6_example_blend_2.xlsx' #Add path for template file
#fpath = r'..\db_feed\v6_example_4.xlsx' #Add path for template file

# Directory that holds the filled-in template. Set OFETDB_TEMPLATE_DIR to point
# the notebook at another machine's folder without editing this cell; the
# original location is kept as the default.
directory_path = os.environ.get(
    'OFETDB_TEMPLATE_DIR',
    r'C:\Users\Aaron\Dropbox (GaTech)\OFET DB Data Entry\OFETDB-DataManagement\OFET-DB Seed data Edit\1 - remove old add new using template',
)

# Excel file name (override with OFETDB_TEMPLATE_FILE)
excel_file_name = os.environ.get('OFETDB_TEMPLATE_FILE', '338.xlsx')

# Full file path
fpath = os.path.join(directory_path, excel_file_name)


# Storing each sheet in the template file as a dictionary.
# Sheets read with ordering=True come back keyed by table position (0, 1, ...).
exp_info = read_sheet(fpath, 'Data Origin')
solution_makeup = read_sheet(fpath, 'Solution Makeup', ordering=True)
solution_processing = read_sheet(fpath, 'Solution Treatment', ordering=True)
device_fab = read_sheet(fpath, 'Device Fabrication')
substrate_pretreat = read_sheet(fpath, 'Substrate Pretreat', ordering=True)
coating_process = read_sheet(fpath, 'Coating Process')
post_process = read_sheet(fpath, 'Post-Processing', ordering=True)
# Measurement sheets need the wider column range A:G.
device_meas = read_sheet(fpath, 'Device Measurement', usecols="A:G", ordering=True)
other_meas = read_sheet(fpath, 'Other Measurements', usecols="A:G", ordering=True)

In [4]:
#Use this code block to check how each sheet has been converted to a dictionary
solution_processing 

{0: {'treatment_type': 'mixing',
  'process_step': 1,
  'params': {'environment': 'air', 'temperature': 50}}}

### Transferring Information From Template To PostgreSQL

In [5]:
# Postgres python
from psycopg2.extras import Json 

# Adapters necessary for converting python data types to PostgreSQL compatible data types 
def addapt_numpy_float64(numpy_float64):
    """Adapt ``numpy.float64`` for psycopg2.

    NaN is sent as SQL NULL, matching ``nan_to_null`` for built-in floats;
    without this the value would be rendered as the bare token ``nan``,
    which is not a valid SQL literal. Every other value is passed through
    unchanged.
    """
    if np.isnan(numpy_float64):
        return AsIs('NULL')
    return AsIs(numpy_float64)

def addapt_numpy_int64(numpy_int64):
    """Adapt ``numpy.int64`` for psycopg2 by emitting its text form as-is."""
    return AsIs(numpy_int64)

def nan_to_null(f,
        _NULL=AsIs('NULL'),
        _Float=pg.extensions.Float):
    """Adapt a built-in ``float``: NaN becomes SQL NULL, anything else a normal float.

    ``_NULL`` and ``_Float`` are bound as defaults so they are looked up once,
    at definition time.
    """
    if not np.isnan(f):
        return _Float(f)
    return _NULL

# Register the adapters with psycopg2 for the corresponding Python/numpy types.
pg.extensions.register_adapter(np.float64, addapt_numpy_float64)
pg.extensions.register_adapter(np.int64, addapt_numpy_int64)
pg.extensions.register_adapter(float, nan_to_null)

# Connection settings are read from the environment so that no credentials are
# stored in the notebook. The defaults target the local test database.
# To use the hosted database, set for example:
#     OFETDB_HOST=chbe-ofet-db.postgres.database.azure.com
#     OFETDB_NAME=ofetdb_v1
#     OFETDB_USER=mg200_ofetdb
#     OFETDB_PASSWORD=<password>
# NOTE: passwords that were previously written in this cell should be rotated.
param_dict = {
    'database': os.environ.get('OFETDB_NAME', 'ofetdb_testenv'),
    'user': os.environ.get('OFETDB_USER', 'postgres'),
    # When OFETDB_PASSWORD is unset this is None; psycopg2 drops None-valued
    # connection arguments, so libpq falls back to PGPASSWORD / ~/.pgpass.
    'password': os.environ.get('OFETDB_PASSWORD'),
    'host': os.environ.get('OFETDB_HOST', 'localhost'),
    'port': os.environ.get('OFETDB_PORT', '5432'),
}

def connect(params_dict):
    """Open and return a psycopg2 connection built from ``params_dict``.

    Any connection error is printed and the interpreter is asked to exit with
    status 1 (``sys.exit``), so a returned value is always a live connection.
    """
    print('Connecting to the PostgreSQL database...')
    try:
        conn = pg.connect(**params_dict)
    except (Exception, pg.DatabaseError) as error:
        print(error)
        sys.exit(1) 
    print("Connection successful")
    return conn

def pg_query(sql, tup):
    """Run one statement on a fresh connection and return the first column of its first row.

    Parameters
    ----------
    sql : str
        Statement with psycopg2 ``%s`` placeholders; it must produce a row
        (e.g. ``INSERT ... RETURNING id``).
    tup : tuple
        Values bound to the placeholders.

    Returns
    -------
    The first column of the first result row, or ``None`` when anything
    failed. On failure the error is printed and the transaction is rolled
    back; nothing is raised to the caller.
    """
    fetched = None
    conn = None
    cur = None

    try:
        conn = connect(param_dict)
        cur = conn.cursor()

        # Pass SQL query, using string and placeholders
        cur.execute(sql, tup)
        fetched = cur.fetchone()[0]

        conn.commit()
        print("Operation Successful")

    except Exception as error:
        print("Error: %s" % error)
        # Do not report an id for work that was not committed.
        fetched = None
        if conn is not None:
            conn.rollback()

    finally:
        # Single cleanup path; either object may be missing if setup failed.
        if cur is not None:
            cur.close()
        if conn is not None:
            conn.close()

    return fetched

In [6]:
from psycopg2.extras import Json

def convert_entry(entry_dict):
    """Prepare a dictionary for insertion with psycopg2.

    Every value that is a ``dict`` is wrapped in psycopg2's ``Json`` so it can
    be bound to a JSON/JSONB column; all other values are kept as they are.
    The conversion is done on a copy, so ``entry_dict`` itself is left
    untouched and the function can be called again on the same input.

    Returns
    -------
    pg_entry : dict
        Converted copy of ``entry_dict``.
    columns : dict_keys
        Keys of ``pg_entry`` (the column names), in insertion order.
    values : list
        Values of ``pg_entry`` in the same order as ``columns``.
    """
    pg_entry = {
        key: Json(value) if isinstance(value, dict) else value
        for key, value in entry_dict.items()
    }
    columns = pg_entry.keys()
    values = [pg_entry[column] for column in columns]

    return pg_entry, columns, values


In [7]:
import requests
import json
import pandas as pd
import bibtexparser
from pprint import pprint

# Given a valid doi string, return a dictionary of digital object information.
def doi2dict(doi):
    """Resolve ``doi`` to BibTeX through DOI content negotiation and return the first entry.

    The request asks the resolver for ``application/x-bibtex``; the response
    text is parsed with bibtexparser and the first parsed entry (a dict) is
    returned.

    Raises
    ------
    requests.HTTPError
        If the resolver answers with an error status (e.g. unknown DOI).
    requests.Timeout
        If no answer arrives within 30 seconds.
    IndexError
        If the response contains no BibTeX entry.
    """
    url = "https://doi.org/" + doi.strip()
    headers = {"accept": "application/x-bibtex"}
    # A timeout keeps the notebook from hanging on an unreachable resolver.
    response = requests.get(url, headers=headers, timeout=30)
    # Fail loudly instead of feeding an HTML error page to the BibTeX parser.
    response.raise_for_status()
    bibdata = bibtexparser.bparser.BibTexParser().parse(response.text)
    return bibdata.entries[0]


In [8]:
import psycopg2

# Define a list of tables and their primary key columns (only considering tables that have a primary key _id as type SERIAL)
tables = [
    {'name': 'EXPERIMENT_INFO', 'pk_column': 'exp_id'},
    {'name': 'SOLUTION', 'pk_column': 'solution_id'},
    {'name': 'POLYMER', 'pk_column': 'polymer_id'},
    {'name': 'SOLUTION_TREATMENT', 'pk_column': 'solution_treatment_id'},
    {'name': 'SOLUTION_TREATMENT_STEP', 'pk_column': 'solution_treatment_step_id'},
    {'name': 'DEVICE_FABRICATION', 'pk_column': 'device_fab_id'},
    {'name': 'SUBSTRATE_PRETREAT', 'pk_column': 'substrate_pretreat_id'},
    {'name': 'SUBSTRATE_PRETREAT_STEP', 'pk_column': 'substrate_pretreat_step_id'},
    {'name': 'FILM_DEPOSITION', 'pk_column': 'film_deposition_id'},
    {'name': 'POSTPROCESS', 'pk_column': 'postprocess_id'},
    {'name': 'POSTPROCESS_STEP', 'pk_column': 'postprocess_step_id'},
    {'name': 'OFET_PROCESS', 'pk_column': 'process_id'},
    {'name': 'SAMPLE', 'pk_column': 'sample_id'},
    {'name': 'MEASUREMENT', 'pk_column': 'measurement_id'}
]

# Connect to the PostgreSQL database
conn = pg.connect(**param_dict)

try:
    # Iterate over each table and move its serial past the largest stored key.
    # Table and column names come only from the hard-coded list above, so
    # formatting them into the statements is safe here.
    for table in tables:
        table_name = table['name']
        pk_column = table['pk_column']

        with conn.cursor() as cursor:
            # Get the current maximum primary key value from the table
            cursor.execute(f"SELECT MAX({pk_column}) FROM {table_name}")
            result = cursor.fetchone()
            max_id = result[0]

            # MAX() is NULL for an empty table; start such sequences at 1.
            next_id = 1 if max_id is None else max_id + 1

            # Update the serial before entering the next record
            cursor.execute(f"ALTER SEQUENCE {table_name.lower()}_{pk_column}_seq RESTART WITH {next_id}")

        conn.commit()
finally:
    # Close the database connection even if one of the statements failed
    conn.close()


### 1.Checking and Storing Experiment Information

In [None]:
exp_info

In [None]:
# fixing the sample_date field for literature field

import json
import datetime  # Import the datetime module

def custom_encoder(obj):
    """``default`` hook for ``json.dumps``.

    ``datetime.datetime`` values are written as ``MM/DD/YYYY`` strings and
    psycopg2 ``Json`` wrappers are replaced by the object they wrap. Any other
    type raises ``TypeError``.
    """
    if isinstance(obj, datetime.datetime):
        encoded = obj.strftime('%m/%d/%Y')
    elif isinstance(obj, psycopg2.extras.Json):
        encoded = obj.adapted
    else:
        raise TypeError("Object of type {} is not JSON serializable".format(type(obj)))
    return encoded

# Convert the dictionary to JSON using the custom encoder.
# Guarded so that re-running this cell does not JSON-encode an already
# encoded string a second time.
if not isinstance(exp_info, str):
    exp_info = json.dumps(exp_info, default=custom_encoder)

print(exp_info)

In [None]:
#converting exp_info back to dict

try:
    # Parse the JSON string back into a dictionary. If exp_info is already a
    # dictionary (cell re-run), it is left as it is.
    if isinstance(exp_info, str):
        exp_info = json.loads(exp_info)

    # Now exp_info is a dictionary
    print(exp_info)
except json.JSONDecodeError as e:
    print("Error decoding JSON:", str(e))

In [None]:
# using the doi to extract additional information if citation type is 'literature'

# For literature entries, the DOI stored under exp_info['meta'] is resolved with
# doi2dict and the returned fields are merged into that same 'meta' dictionary
# (existing keys with the same name are overwritten by dict.update).
# Other citation types are left unchanged.
if exp_info['citation_type'] == 'literature':
    doi = exp_info['meta']['doi']
    doi_info = doi2dict(doi)  # Fetch additional information using DOI

    # Add the additional information to the existing dictionary
    exp_info['meta'].update(doi_info)

# Print the updated dictionary
pprint(exp_info)

#### Note
- make sure to replace first_name, last_name, email with data_contributor
- potential schema for phase 3 DB. use a user_ID

In [None]:
import psycopg2
from psycopg2 import _json

# convert_entry wraps nested dictionaries in psycopg2 Json and returns the
# entry together with its column names and the values in matching order.
exp_pg_entry, exp_columns, exp_values = convert_entry(exp_info)

#print(type(pg_entry))
#print(type(columns))
#print(exp_columns)
#print(type(values))
#print(values)

# Display the column names that will be used in the INSERT below.
exp_columns

In [None]:
# Upsert into experiment_info: insert the row, or on a (citation_type, meta)
# conflict overwrite the same columns with the same values, and return exp_id.
# NOTE(review): the column list is spliced into the statement with AsIs, i.e.
# unquoted, so column names must come from a trusted template.
sql = '''
    INSERT INTO experiment_info (%s) 
    VALUES %s
    ON CONFLICT (citation_type, meta) DO UPDATE
    SET (%s) = %s
    RETURNING exp_id
    
    '''
# Placeholders in order: column list, value tuple, column list, value tuple.
tup = (AsIs(','.join(exp_columns)), tuple(exp_values), AsIs(','.join(exp_columns)), tuple(exp_values))



# pg_query returns the exp_id, or None if the statement failed.
exp_id = pg_query(sql, tup)
exp_id


### 2.Checking and Storing Solution Information (Polymer, Solvent, Solution)

In [None]:
solution_makeup

In [None]:
import psycopg2
from psycopg2 import _json

# solution_makeup was read with ordering=True, so it is a dict of per-table
# dicts; convert_entry therefore wraps every one of its values in psycopg2 Json.
pg_entry_solution_makeup, columns_solution_makeup, values_solution_makeup = convert_entry(solution_makeup)

print(values_solution_makeup)
print(type(values_solution_makeup))

In [None]:
# Storing Solution data

# .adapted gives back the Python dict wrapped by the Json object.
# NOTE(review): assumes the first table of the 'Solution Makeup' sheet is the
# solution entry (the one carrying 'concentration') - confirm against the template.
solution_data = values_solution_makeup[0].adapted

solution_data

In [None]:
# Storing Solvent data - accounting for multiple solvents
# Keep only the tables whose "entity_type" is "solvent".
solvent_data_filtered = [json_obj for json_obj in values_solution_makeup if json_obj.adapted.get("entity_type") == "solvent"]

# Unwrap the psycopg2 Json objects: .adapted is the wrapped Python dict
# (not a JSON string).
solvent_data = [json_obj.adapted for json_obj in solvent_data_filtered]

solvent_data

In [None]:
# Storing Polymer data - accounting for multiple polymers
# Keep only the tables whose "entity_type" is "polymer".
polymer_data_filtered = [json_obj for json_obj in values_solution_makeup if json_obj.adapted.get("entity_type") == "polymer"]

# Unwrap the psycopg2 Json objects: .adapted is the wrapped Python dict
# (not a JSON string).
polymer_data = [json_obj.adapted for json_obj in polymer_data_filtered]

polymer_data

In [None]:
## if there is no value for PDI or Mn for the Insulating polymer you can add empty values in this step. if there is do nothing.

# Cells left empty in the template are dropped when the sheet is read, so the
# corresponding keys may be absent. The extraction step below indexes
# 'mn', 'mw' and 'dispersity' directly, so each of them must exist; missing
# ones are set to None and existing values are left untouched.
for data in polymer_data:
    if data['entity_type'] == 'polymer':
        for optional_key in ('mn', 'mw', 'dispersity'):
            data.setdefault(optional_key, None)

print (polymer_data)

In [None]:
# Storing Solution Makeup data

# Positional layout relied on by the cells below:
#   [0] solution dict, [1] list of solvent dicts, [2] list of polymer dicts
solution_makeup_data = []
solution_makeup_data.append(solution_data)
solution_makeup_data.append(solvent_data)
solution_makeup_data.append(polymer_data)
solution_makeup_data

In [None]:
# Extract solution information
solution_data = solution_makeup_data[0]
concentration = solution_data['concentration']

# Extract solvent information
# solvent_ids holds (pubchem_cid, iupac_name) pairs; vol_fracs holds the
# volume fraction of the solvent at the same position.
solvent_data = solution_makeup_data[1]
solvent_ids = []
vol_fracs = []
for solvent in solvent_data:
    pubchem_cid = solvent['pubchem_cid']
    iupac_name = solvent['iupac_name']
    vol_frac = solvent['vol_frac']
    solvent_ids.append((pubchem_cid, iupac_name))
    vol_fracs.append(vol_frac)

# Extract polymer information
# polymer_ids holds (common_name, iupac_name, mn, mw, dispersity, meta) tuples
# with meta serialised to a JSON string; wt_fracs holds the matching weight
# fractions. Every key is indexed directly, so a missing key raises KeyError.
polymer_data = solution_makeup_data[2]
polymer_ids = []
wt_fracs = []
for polymer in polymer_data:
    common_name = polymer['common_name']
    iupac_name = polymer['iupac_name']
    mn = polymer['mn']
    mw = polymer['mw']
    dispersity = polymer['dispersity']
    wt_frac = polymer['wt_frac']
    meta = json.dumps(polymer['meta'])
    polymer_ids.append((common_name, iupac_name, mn, mw, dispersity, meta))
    wt_fracs.append(wt_frac)
    
# Display the collected solvent identifiers.
solvent_ids

##### Inserting into POLYMER, SOLVENT, SOLUTION, SOLUTION_MAKEUP_POLYMER, SOLUTION_MAKEUP_SOLVENT tables

This code should handle multiple solvents each with a vol_frac and multiple polymers each with a wt_frac, and it will check for the existence of a unique combination of concentration, polymer IDs, solvent IDs, wt_fracs, and vol_fracs. If the combination exists, it will assign the existing solution_id in all tables; otherwise, it will create a new solution_id.

In [None]:
import json
import psycopg2

# Establish a connection to the database
connection = psycopg2.connect(**param_dict)

# No cursor is created here: the transaction block below opens (and closes)
# its own cursor, so an extra one would never be used or closed.

# Extract solution information
solution_data = solution_makeup_data[0]
concentration = solution_data['concentration']

# Extract solvent information:
# (pubchem_cid, iupac_name) pairs plus the volume fraction at the same position
solvent_data = solution_makeup_data[1]
solvent_ids = []
vol_fracs = []
for solvent in solvent_data:
    pubchem_cid = solvent['pubchem_cid']
    iupac_name = solvent['iupac_name']
    vol_frac = solvent['vol_frac']
    solvent_ids.append((pubchem_cid, iupac_name))
    vol_fracs.append(vol_frac)

# Extract polymer information:
# (common_name, iupac_name, mn, mw, dispersity, meta-as-JSON-string) tuples
# plus the weight fraction at the same position
polymer_data = solution_makeup_data[2]
polymer_ids = []
wt_fracs = []
for polymer in polymer_data:
    common_name = polymer['common_name']
    iupac_name = polymer['iupac_name']
    mn = polymer['mn']
    mw = polymer['mw']
    dispersity = polymer['dispersity']
    wt_frac = polymer['wt_frac']
    meta = json.dumps(polymer['meta'])
    polymer_ids.append((common_name, iupac_name, mn, mw, dispersity, meta))
    wt_fracs.append(wt_frac)

# Start transaction
with connection:
    with connection.cursor() as cursor:
        try:
            # Check if the unique combination exists
            # NOTE(review): the row-wise IN comparison on the polymer tuple
            # cannot match rows whose mn/mw/dispersity are NULL, and the
            # solvent x polymer join repeats each fraction in ARRAY_AGG when a
            # solution has several solvents or polymers, so this lookup may
            # miss an existing solution in those cases - to be revisited.
            select_solution_id_sql = '''
                SELECT sm.solution_id
                FROM SOLUTION_MAKEUP_SOLVENT sms
                JOIN SOLUTION_MAKEUP_POLYMER smp ON sms.solution_id = smp.solution_id
                JOIN SOLVENT s ON sms.solvent_id = s.pubchem_cid
                JOIN POLYMER p ON smp.polymer_id = p.polymer_id
                JOIN SOLUTION sm ON sms.solution_id = sm.solution_id
                WHERE sm.concentration = %s
                AND (s.pubchem_cid, s.iupac_name) IN %s
                AND (p.common_name, p.iupac_name, p.mn, p.mw, p.dispersity, p.meta) IN %s
                GROUP BY sm.solution_id
                HAVING COUNT(DISTINCT smp.polymer_id) = %s
                AND COUNT(DISTINCT sms.solvent_id) = %s
                AND ARRAY_AGG(sms.vol_frac) = %s::double precision[]
                AND ARRAY_AGG(smp.wt_frac) = %s::double precision[]
            '''

            cursor.execute(select_solution_id_sql, (concentration, tuple(solvent_ids), tuple(polymer_ids), len(polymer_ids), len(solvent_ids), vol_fracs, wt_fracs))
            existing_solution = cursor.fetchone()

            # Checking if there is existing solution
            if existing_solution:
                solution_id = existing_solution[0]
            else:
                # Insert into SOLUTION table
                insert_solution_sql = '''
                    INSERT INTO SOLUTION (concentration)
                    VALUES (%s)
                    RETURNING solution_id
                '''
                cursor.execute(insert_solution_sql, (concentration,))
                solution_id = cursor.fetchone()[0]

            # Reading Solvent data
            for solvent_id, vol_frac in zip(solvent_ids, vol_fracs):
                pubchem_cid, iupac_name = solvent_id

                # Check if the solvent exists (matched on its IUPAC name)
                select_solvent_id_sql = '''
                    SELECT pubchem_cid
                    FROM SOLVENT
                    WHERE iupac_name = %s
                '''
                cursor.execute(select_solvent_id_sql, (iupac_name,))
                existing_solvent = cursor.fetchone()

                if existing_solvent:
                    solvent_id = existing_solvent[0]
                else:
                    # Insert into SOLVENT table
                    insert_solvent_sql = '''
                        INSERT INTO SOLVENT (pubchem_cid, iupac_name)
                        VALUES (%s, %s)
                        RETURNING pubchem_cid
                    '''
                    cursor.execute(insert_solvent_sql, (pubchem_cid, iupac_name))
                    solvent_id = cursor.fetchone()[0]

                # Insert or update SOLUTION_MAKEUP_SOLVENT table
                # (on conflict the row is rewritten with its own values, i.e. kept)
                insert_solution_makeup_solvent_sql = '''
                    INSERT INTO SOLUTION_MAKEUP_SOLVENT (solution_id, solvent_id, vol_frac)
                    VALUES (%s, %s, %s)
                    ON CONFLICT (solution_id, solvent_id, vol_frac) DO UPDATE
                    SET solution_id = SOLUTION_MAKEUP_SOLVENT.solution_id,
                        solvent_id = SOLUTION_MAKEUP_SOLVENT.solvent_id,
                        vol_frac = SOLUTION_MAKEUP_SOLVENT.vol_frac
                '''
                cursor.execute(insert_solution_makeup_solvent_sql, (solution_id, solvent_id, vol_frac))

            # Reading the polymer data
            for polymer_id, wt_frac in zip(polymer_ids, wt_fracs):
                common_name, iupac_name, mn, mw, dispersity, meta = polymer_id

                # Check if the polymer exists.
                # mn, mw and dispersity may be None (sent as NULL). "= NULL" is
                # never true in SQL, so those columns are compared with
                # IS NOT DISTINCT FROM, which treats two NULLs as equal;
                # otherwise a polymer with a missing value would be inserted
                # again on every run.
                select_polymer_id_sql = '''
                    SELECT polymer_id
                    FROM POLYMER
                    WHERE common_name = %s
                    AND iupac_name = %s
                    AND mn IS NOT DISTINCT FROM %s
                    AND mw IS NOT DISTINCT FROM %s
                    AND dispersity IS NOT DISTINCT FROM %s
                    AND meta = %s::jsonb
                '''
                cursor.execute(select_polymer_id_sql, (common_name, iupac_name, mn, mw, dispersity, meta))
                existing_polymer = cursor.fetchone()

                if existing_polymer:
                    polymer_id = existing_polymer[0]
                else:
                    # Insert into POLYMER table
                    insert_polymer_sql = '''
                        INSERT INTO POLYMER (common_name, iupac_name, mn, mw, dispersity, meta)
                        VALUES (%s, %s, %s, %s, %s, %s::jsonb)
                        RETURNING polymer_id
                    '''
                    cursor.execute(insert_polymer_sql, (common_name, iupac_name, mn, mw, dispersity, meta))
                    polymer_id = cursor.fetchone()[0]

                # Insert or update SOLUTION_MAKEUP_POLYMER table
                # (on conflict the row is rewritten with its own values, i.e. kept)
                insert_solution_makeup_polymer_sql = '''
                    INSERT INTO SOLUTION_MAKEUP_POLYMER (solution_id, polymer_id, wt_frac)
                    VALUES (%s, %s, %s)
                    ON CONFLICT (solution_id, polymer_id, wt_frac) DO UPDATE
                    SET solution_id = SOLUTION_MAKEUP_POLYMER.solution_id,
                        polymer_id = SOLUTION_MAKEUP_POLYMER.polymer_id,
                        wt_frac = SOLUTION_MAKEUP_POLYMER.wt_frac
                '''
                cursor.execute(insert_solution_makeup_polymer_sql, (solution_id, polymer_id, wt_frac))

            connection.commit()

            print("Solution makeup saved successfully!")
            print(solution_id)
        except Exception as e:
            # Undo everything written in this cell and report the problem
            connection.rollback()
            print("An error occurred:", str(e))

# Close the database connection
connection.close()


### 3. Checking and Storing Device Information

In [None]:
device_fab

In [None]:
import psycopg2
from psycopg2 import _json

# Wrap nested dictionaries of the 'Device Fabrication' sheet in psycopg2 Json
# and collect its column names and values for the INSERT further below.
device_fab_pg_entry, device_fab_columns, device_fab_values = convert_entry(device_fab)

#print(type(device_fab_pg_entry))
print(type(device_fab_columns))
print(device_fab_columns)
print(type(device_fab_values))
print(device_fab_values)

In [None]:
# If meta information is missing, add an empty 'meta' column.
device_fab_columns_list = list(device_fab_columns)  # Convert dict_keys to a list

# Rebuild the values from the converted entry so that columns and values always
# line up, even when this cell is run more than once (appending to the previous
# list would add one extra value per run).
device_fab_values = [device_fab_pg_entry[column] for column in device_fab_columns_list]

if 'meta' not in device_fab_columns_list:
    device_fab_columns_list.append('meta')
    device_fab_values.append({})

# Plain dictionaries are sent as JSON text; everything else is passed as is.
device_fab_values = [json.dumps(value) if isinstance(value, dict) else value for value in device_fab_values]

In [None]:
# Upsert into DEVICE_FABRICATION: insert the row, or on a (params, meta)
# conflict overwrite the same columns with the same values, and return
# device_fab_id.
# NOTE(review): the column list is spliced into the statement with AsIs, i.e.
# unquoted, so column names must come from a trusted template.
sql = '''
    INSERT INTO DEVICE_FABRICATION (%s) 
    VALUES %s
    ON CONFLICT (params, meta) DO UPDATE
    SET (%s) = %s
    RETURNING device_fab_id
    
    '''
# Placeholders in order: column list, value tuple, column list, value tuple.
tup = (AsIs(','.join(device_fab_columns_list)), tuple(device_fab_values), AsIs(','.join(device_fab_columns_list)), tuple(device_fab_values))



# pg_query returns the device_fab_id, or None if the statement failed.
device_fab_id = pg_query(sql, tup)
device_fab_id

### 4. Checking and Storing Film Deposition Information 

In [None]:
import psycopg2
from psycopg2 import _json

# Wrap nested dictionaries of the 'Coating Process' sheet in psycopg2 Json
# and collect its column names and values for the INSERT further below.
coating_process_pg_entry, coating_process_columns, coating_process_values = convert_entry(coating_process)

#print(type(coating_process_pg_entry))
print(type(coating_process_columns))
print(coating_process_columns)
print(type(coating_process_values))
print(coating_process_values)

In [None]:
# If meta information is missing, add an empty 'meta' column.
coating_process_columns_list = list(coating_process_columns)  # Convert dict_keys to a list

# Rebuild the values from the converted entry so that columns and values always
# line up, even when this cell is run more than once (appending to the previous
# list would add one extra value per run).
coating_process_values = [coating_process_pg_entry[column] for column in coating_process_columns_list]

if 'meta' not in coating_process_columns_list:
    coating_process_columns_list.append('meta')
    coating_process_values.append({})

# Plain dictionaries are sent as JSON text; everything else is passed as is.
coating_process_values = [json.dumps(value) if isinstance(value, dict) else value for value in coating_process_values]

In [None]:
coating_process_values

In [None]:

# Upsert into FILM_DEPOSITION: insert the row, or on a
# (deposition_type, params, meta) conflict overwrite the same columns with the
# same values, and return film_deposition_id.
# NOTE(review): the column list is spliced into the statement with AsIs, i.e.
# unquoted, so column names must come from a trusted template.
sql = '''
    INSERT INTO FILM_DEPOSITION (%s) 
    VALUES %s
    ON CONFLICT (deposition_type, params, meta) DO UPDATE
    SET (%s) = %s
    RETURNING film_deposition_id
    
    '''
# Placeholders in order: column list, value tuple, column list, value tuple.
tup = (AsIs(','.join(coating_process_columns_list)), tuple(coating_process_values), AsIs(','.join(coating_process_columns_list)), tuple(coating_process_values))



# pg_query returns the film_deposition_id, or None if the statement failed.
film_deposition_id = pg_query(sql, tup)
film_deposition_id

### 5. Checking and Storing the subprocess recipes (Solution Treatment, Substrate Pretreatment, Post Process)

###### 5.1 SOLUTION TREATMENT

In [None]:
# ### SQL QUERY TO INSERT NEW TABLE SOLUTION_TREATMENT_SEQUENCE AND INSERT VALUES INTO IT
# -- Create the SOLUTION_TREATMENT_SEQUENCE table
# CREATE TABLE IF NOT EXISTS SOLUTION_TREATMENT_SEQUENCE (
#     solution_treatment_id SERIAL PRIMARY KEY,
#     solution_treatment_sequence integer[],
#     FOREIGN KEY (solution_treatment_id) REFERENCES SOLUTION_TREATMENT (solution_treatment_id)
# );

# -- Insert data into the SOLUTION_TREATMENT_SEQUENCE table
# INSERT INTO SOLUTION_TREATMENT_SEQUENCE (solution_treatment_id, solution_treatment_sequence)
# SELECT
#     solution_treatment_id,
#     ARRAY_AGG(solution_treatment_step_id) AS solution_treatment_sequence
# FROM
#     SOLUTION_TREATMENT_ORDER
# GROUP BY
#     solution_treatment_id;

In [18]:
solution_processing

{0: {'treatment_type': 'mixing',
  'process_step': 1,
  'params': {'environment': 'air', 'temperature': 50}},
 1: {'treatment_type': 'sonication',
  'process_step': 2,
  'params': {'environment': 'air', 'time': 2.5},
  'meta': {'equipment_model': 'Bransonic 2510',
   'frequency': 40,
   'intensity': 130}}}

In [20]:
import json
import psycopg2

# Function to insert data into SOLUTION_TREATMENT_STEP table
def insert_into_solution_treatment_step(cur, treatment_type, params, meta):
    """Return the id of the matching SOLUTION_TREATMENT_STEP row, inserting it if absent.

    ``params`` and ``meta`` are JSON strings; both statements cast them to
    jsonb. The lookup matches on treatment_type, params and meta together.
    Nothing is committed here - that is left to the caller.
    """
    step_values = (treatment_type, params, meta)

    # Look for an identical step first
    cur.execute(
        "SELECT solution_treatment_step_id FROM SOLUTION_TREATMENT_STEP WHERE treatment_type = %s AND params = %s::jsonb AND meta = %s::jsonb",
        step_values
    )
    step_row = cur.fetchone()

    if not step_row:
        # No such step yet: create it and read back the generated id
        cur.execute(
            "INSERT INTO SOLUTION_TREATMENT_STEP (treatment_type, params, meta) VALUES (%s, %s::jsonb, %s::jsonb) RETURNING solution_treatment_step_id",
            step_values
        )
        step_row = cur.fetchone()

    return step_row[0]

# Function to insert data into SOLUTION_TREATMENT, SOLUTION_TREATMENT_ORDER and SOLUTION_TREATMENT_SEQUENCE tables
def insert_into_solution_treatment_order_and_sequence(cur, step_ids, process_order_list):
    """Return the solution_treatment_id for this sequence of steps, creating it if needed.

    If SOLUTION_TREATMENT_SEQUENCE already holds exactly ``step_ids``, its
    solution_treatment_id is reused. Otherwise a new SOLUTION_TREATMENT row is
    created, one SOLUTION_TREATMENT_ORDER row is written per
    (step id, process order) pair, and the sequence itself is stored.
    Nothing is committed here - that is left to the caller.
    """
    # Is this exact sequence of step ids already known?
    cur.execute(
        """
        SELECT solution_treatment_id
        FROM SOLUTION_TREATMENT_SEQUENCE
        WHERE solution_treatment_sequence = %s
        """,
        (step_ids,)
    )
    known_sequence = cur.fetchone()

    if not known_sequence:
        # New recipe: draw a fresh id from the sequence of SOLUTION_TREATMENT
        cur.execute(
            """
            INSERT INTO SOLUTION_TREATMENT (solution_treatment_id)
            VALUES (nextval('solution_treatment_solution_treatment_id_seq'))
            RETURNING solution_treatment_id;
            """
        )
        solution_treatment_id = cur.fetchone()[0]

        # One ordering row per step, pairing each step id with its position
        order_rows = []
        for step_id, process_order in zip(step_ids, process_order_list):
            order_rows.append((solution_treatment_id, step_id, process_order))
        cur.executemany(
            """
            INSERT INTO SOLUTION_TREATMENT_ORDER (solution_treatment_id, solution_treatment_step_id, process_order)
            VALUES (%s, %s, %s);
            """,
            order_rows
        )

        # Remember the whole sequence so the lookup above finds it next time
        cur.execute(
            """
            INSERT INTO SOLUTION_TREATMENT_SEQUENCE (solution_treatment_id, solution_treatment_sequence)
            VALUES (%s, %s);
            """,
            (solution_treatment_id, step_ids)
        )
    else:
        # Reuse the id of the existing recipe
        solution_treatment_id = known_sequence[0]

    return solution_treatment_id

# Establish a connection to the PostgreSQL database
conn = psycopg2.connect(**param_dict)

# Create a cursor object to interact with the database
cur = conn.cursor()

# Initialize solution_treatment_id
solution_treatment_id = None

# Initialize lists to store step IDs and process orders for this combination
step_ids = []
process_order_list = []

try:
    # Only touch the database when the sheet contained at least one step
    if solution_processing:
        for treatment in solution_processing.values():
            # Convert params and meta to JSON format (missing ones become {})
            params_json = json.dumps(treatment.get('params', {}))
            meta_json = json.dumps(treatment.get('meta', {}))
            treatment_type = treatment['treatment_type']

            # Insert data into SOLUTION_TREATMENT_STEP table
            solution_treatment_step_id = insert_into_solution_treatment_step(cur, treatment_type, params_json, meta_json)

            # Append step ID and process order to the lists
            step_ids.append(solution_treatment_step_id)
            process_order_list.append(treatment['process_step'])

        # Insert data into SOLUTION_TREATMENT, SOLUTION_TREATMENT_ORDER and SOLUTION_TREATMENT_SEQUENCE table
        solution_treatment_id = insert_into_solution_treatment_order_and_sequence(cur, step_ids, process_order_list)

        # Commit first, then report success
        conn.commit()
        print("Solution treatment saved successfully with id:", solution_treatment_id)

    else:
        print("solution_processing is empty. No database operations performed.")

except Exception:
    # Leave the database unchanged and let the error surface in the notebook
    conn.rollback()
    raise

finally:
    # Close the cursor and connection, also when a statement failed
    cur.close()
    conn.close()


Solution treatment saved successfully with id: 11


###### 5.2 SUBSTRATE PRETREATMENT

In [None]:
# # ### SQL QUERY TO INSERT NEW TABLE SUBSTRATE_PRETREAT_SEQUENCE AND INSERT VALUES INTO IT
# -- Create the SUBSTRATE_PRETREAT_SEQUENCE table
# CREATE TABLE IF NOT EXISTS SUBSTRATE_PRETREAT_SEQUENCE (
#     substrate_pretreat_id SERIAL PRIMARY KEY,
#     substrate_pretreat_sequence integer[],
#     FOREIGN KEY (substrate_pretreat_id) REFERENCES SUBSTRATE_PRETREAT (substrate_pretreat_id)
# );

# -- Insert data into the SUBSTRATE_PRETREAT_SEQUENCE table
# INSERT INTO SUBSTRATE_PRETREAT_SEQUENCE (substrate_pretreat_id, substrate_pretreat_sequence)
# SELECT
#     substrate_pretreat_id,
#     ARRAY_AGG(substrate_pretreat_step_id) AS substrate_pretreat_sequence
# FROM
#     SUBSTRATE_PRETREAT_ORDER
# GROUP BY
#     substrate_pretreat_id;


In [21]:
substrate_pretreat

{0: {'treatment_type': 'chemical_treat',
  'process_step': 1,
  'params': {'environment': 'air', 'iupac_name': 'piranha solution'},
  'meta': {'description': '4 part sulfuric acid : 1 part hydrogen peroxide'}},
 1: {'treatment_type': 'chemical_treat',
  'process_step': 2,
  'params': {'environment': 'air', 'iupac_name': 'acetone'},
  'meta': {'description': 'Ultrasonic bath'}},
 2: {'treatment_type': 'chemical_treat',
  'process_step': 3,
  'params': {'environment': 'air', 'iupac_name': 'isopropanol'},
  'meta': {'description': 'Ultrasonic bath'}},
 3: {'treatment_type': 'chemical_treat',
  'process_step': 4,
  'params': {'environment': 'air', 'iupac_name': 'water'},
  'meta': {'description': 'Ultrasonic bath'}},
 4: {'treatment_type': 'drying',
  'process_step': 5,
  'params': {'environment': 'air', 'temperature': 150, 'time': 0.5}}}

In [23]:
import json
import psycopg2

# Function to insert data into SUBSTRATE_PRETREAT_STEP table
def insert_into_substrate_pretreat_step(cur, treatment_type, params, meta):
    """Return the id of a SUBSTRATE_PRETREAT_STEP row, inserting it if needed.

    Args:
        cur: open database cursor; only ``execute`` and ``fetchone`` are used.
        treatment_type: value matched against / stored in ``treatment_type``.
        params: value bound with a ``::jsonb`` cast for the ``params`` column.
        meta: value bound with a ``::jsonb`` cast for the ``meta`` column.

    Returns:
        The ``substrate_pretreat_step_id`` of the matching row if one exists,
        otherwise the id returned by the INSERT.

    Nothing is committed here; the caller owns the transaction.
    """
    # The same three values drive both the lookup and the insert.
    bind_values = (treatment_type, params, meta)

    cur.execute(
        "SELECT substrate_pretreat_step_id FROM SUBSTRATE_PRETREAT_STEP WHERE treatment_type = %s AND params = %s::jsonb AND meta = %s::jsonb",
        bind_values
    )
    match = cur.fetchone()
    if match:
        # An identical step is already stored: reuse its id.
        return match[0]

    # No identical step yet: create it and hand back the generated id.
    cur.execute(
        "INSERT INTO SUBSTRATE_PRETREAT_STEP (treatment_type, params, meta) VALUES (%s, %s::jsonb, %s::jsonb) RETURNING substrate_pretreat_step_id",
        bind_values
    )
    return cur.fetchone()[0]

# Function to insert data into SUBSTRATE_PRETREAT, SUBSTRATE_PRETREAT_ORDER and SUBSTRATE_PRETREAT_SEQUENCE tables
def insert_into_substrate_pretreat_order_and_sequence(cur, step_ids, process_order_list):
    """Resolve the substrate_pretreat_id for an ordered list of step ids.

    The lookup compares ``step_ids`` with ``=`` against the stored
    ``substrate_pretreat_sequence`` column; ``process_order_list`` takes no
    part in the lookup. When no row matches, a new SUBSTRATE_PRETREAT row is
    created, one SUBSTRATE_PRETREAT_ORDER row is written per
    (step id, process order) pair, and the sequence itself is recorded in
    SUBSTRATE_PRETREAT_SEQUENCE.

    Args:
        cur: open database cursor (``execute``, ``executemany``, ``fetchone``).
        step_ids: list of substrate_pretreat_step_id values.
        process_order_list: process order values, paired positionally with
            ``step_ids`` via ``zip`` (extra items on the longer list are ignored).

    Returns:
        The existing or newly generated ``substrate_pretreat_id``.

    Nothing is committed here; the caller owns the transaction.
    """
    cur.execute(
        """
        SELECT substrate_pretreat_id
        FROM SUBSTRATE_PRETREAT_SEQUENCE
        WHERE substrate_pretreat_sequence = %s
        """,
        (step_ids,)
    )
    known_row = cur.fetchone()

    if not known_row:
        # Unknown sequence: allocate a fresh parent id first.
        cur.execute(
            """
            INSERT INTO SUBSTRATE_PRETREAT (substrate_pretreat_id)
            VALUES (nextval('substrate_pretreat_substrate_pretreat_id_seq'))
            RETURNING substrate_pretreat_id;
            """
        )
        pretreat_id = cur.fetchone()[0]

        # One ordering row per step, all pointing at the new parent id.
        order_rows = []
        for step_id, process_order in zip(step_ids, process_order_list):
            order_rows.append((pretreat_id, step_id, process_order))
        cur.executemany(
            """
            INSERT INTO SUBSTRATE_PRETREAT_ORDER (substrate_pretreat_id, substrate_pretreat_step_id, process_order)
            VALUES (%s, %s, %s);
            """,
            order_rows
        )

        # Record the whole sequence so the lookup above finds it next time.
        cur.execute(
            """
            INSERT INTO SUBSTRATE_PRETREAT_SEQUENCE (substrate_pretreat_id, substrate_pretreat_sequence)
            VALUES (%s, %s);
            """,
            (pretreat_id, step_ids)
        )
    else:
        # The exact sequence is already stored: reuse its id.
        pretreat_id = known_row[0]

    return pretreat_id

# Store the substrate pretreatment steps and resolve substrate_pretreat_id.
#
# Reads:  param_dict (connection keyword arguments), substrate_pretreat (dict of steps)
# Writes: substrate_pretreat_id (stays None when substrate_pretreat is empty),
#         step_ids, process_order_list

# Establish a connection to the PostgreSQL database
conn = psycopg2.connect(**param_dict)

# Create a cursor object to interact with the database
cur = conn.cursor()

# Initialize substrate_pretreat_id
substrate_pretreat_id = None

# Initialize lists to store step IDs and process orders for this combination
step_ids = []
process_order_list = []

try:
    # Only touch the database when there is at least one pretreatment step
    if substrate_pretreat:
        for treatment in substrate_pretreat.values():
            # Convert params and meta to JSON text; a missing key is stored as an empty object
            params_json = json.dumps(treatment.get('params', {}))
            meta_json = json.dumps(treatment.get('meta', {}))
            treatment_type = treatment['treatment_type']

            # Insert (or reuse) the row in the SUBSTRATE_PRETREAT_STEP table
            substrate_pretreat_step_id = insert_into_substrate_pretreat_step(cur, treatment_type, params_json, meta_json)

            # Keep step id and process order aligned by position
            step_ids.append(substrate_pretreat_step_id)
            process_order_list.append(treatment['process_step'])

        # Insert data into SUBSTRATE_PRETREAT, SUBSTRATE_PRETREAT_ORDER and SUBSTRATE_PRETREAT_SEQUENCE tables
        substrate_pretreat_id = insert_into_substrate_pretreat_order_and_sequence(cur, step_ids, process_order_list)

        # Commit first so the success message is only printed for data that was really saved
        conn.commit()
        print("Substrate pretreat saved successfully with id:", substrate_pretreat_id)

    else:
        print("substrate_pretreat_processing is empty. No database operations performed.")
except Exception:
    # Discard the partial transaction, then let the error surface in the notebook
    conn.rollback()
    raise
finally:
    # Always release the cursor and connection, even when an insert fails
    cur.close()
    conn.close()


Substrate pretreat saved successfully with id: 2


###### 5.3 POST PROCESSING TREATMENT

In [None]:
# ### SQL QUERY TO CREATE THE POSTPROCESS_SEQUENCE TABLE AND POPULATE IT FROM POSTPROCESS_ORDER
# -- Create the POSTPROCESS_SEQUENCE table
# CREATE TABLE IF NOT EXISTS POSTPROCESS_SEQUENCE (
#     postprocess_id SERIAL PRIMARY KEY,
#     postprocess_sequence integer[],
#     FOREIGN KEY (postprocess_id) REFERENCES POSTPROCESS (postprocess_id)
# );

# -- Insert data into the POSTPROCESS_SEQUENCE table
# INSERT INTO POSTPROCESS_SEQUENCE (postprocess_id, postprocess_sequence)
# SELECT
#     postprocess_id,
#     ARRAY_AGG(postprocess_step_id) AS postprocess_sequence
# FROM
#     POSTPROCESS_ORDER
# GROUP BY
#     postprocess_id;


In [24]:
# Inspect the post-processing steps; an empty dict makes the insert cell below
# skip all database work and leave postprocess_id as None.
post_process

{}

In [26]:
import json
import psycopg2

# Function to insert data into POSTPROCESS_STEP table
def insert_into_postprocess_step(cur, treatment_type, params, meta):
    """Return the id of a POSTPROCESS_STEP row, inserting it if needed.

    Args:
        cur: open database cursor; only ``execute`` and ``fetchone`` are used.
        treatment_type: value matched against / stored in ``treatment_type``.
        params: value bound with a ``::jsonb`` cast for the ``params`` column.
        meta: value bound with a ``::jsonb`` cast for the ``meta`` column.

    Returns:
        The ``postprocess_step_id`` of the matching row if one exists,
        otherwise the id returned by the INSERT.

    Nothing is committed here; the caller owns the transaction.
    """
    # The same three values drive both the lookup and the insert.
    bind_values = (treatment_type, params, meta)

    cur.execute(
        "SELECT postprocess_step_id FROM POSTPROCESS_STEP WHERE treatment_type = %s AND params = %s::jsonb AND meta = %s::jsonb",
        bind_values
    )
    match = cur.fetchone()
    if match:
        # An identical step is already stored: reuse its id.
        return match[0]

    # No identical step yet: create it and hand back the generated id.
    cur.execute(
        "INSERT INTO POSTPROCESS_STEP (treatment_type, params, meta) VALUES (%s, %s::jsonb, %s::jsonb) RETURNING postprocess_step_id",
        bind_values
    )
    return cur.fetchone()[0]

# Function to insert data into POSTPROCESS_ORDER and POSTPROCESS_SEQUENCE tables
def insert_into_postprocess_order_and_sequence(cur, step_ids, process_order_list):
    """Resolve the postprocess_id for an ordered list of step ids.

    The lookup compares ``step_ids`` with ``=`` against the stored
    ``postprocess_sequence`` column; ``process_order_list`` takes no part in
    the lookup. When no row matches, a new POSTPROCESS row is created, one
    POSTPROCESS_ORDER row is written per (step id, process order) pair, and
    the sequence itself is recorded in POSTPROCESS_SEQUENCE.

    Args:
        cur: open database cursor (``execute``, ``executemany``, ``fetchone``).
        step_ids: list of postprocess_step_id values.
        process_order_list: process order values, paired positionally with
            ``step_ids`` via ``zip`` (extra items on the longer list are ignored).

    Returns:
        The existing or newly generated ``postprocess_id``.

    Nothing is committed here; the caller owns the transaction.
    """
    cur.execute(
        """
        SELECT postprocess_id
        FROM POSTPROCESS_SEQUENCE
        WHERE postprocess_sequence = %s
        """,
        (step_ids,)
    )
    known_row = cur.fetchone()

    if not known_row:
        # Unknown sequence: allocate a fresh parent id first.
        cur.execute(
            """
            INSERT INTO POSTPROCESS (postprocess_id)
            VALUES (nextval('postprocess_postprocess_id_seq'))
            RETURNING postprocess_id;
            """
        )
        new_postprocess_id = cur.fetchone()[0]

        # One ordering row per step, all pointing at the new parent id.
        order_rows = []
        for step_id, process_order in zip(step_ids, process_order_list):
            order_rows.append((new_postprocess_id, step_id, process_order))
        cur.executemany(
            """
            INSERT INTO POSTPROCESS_ORDER (postprocess_id, postprocess_step_id, process_order)
            VALUES (%s, %s, %s);
            """,
            order_rows
        )

        # Record the whole sequence so the lookup above finds it next time.
        cur.execute(
            """
            INSERT INTO POSTPROCESS_SEQUENCE (postprocess_id, postprocess_sequence)
            VALUES (%s, %s);
            """,
            (new_postprocess_id, step_ids)
        )
        resolved_id = new_postprocess_id
    else:
        # The exact sequence is already stored: reuse its id.
        resolved_id = known_row[0]

    return resolved_id

# Store the post-processing steps and resolve postprocess_id.
#
# Reads:  param_dict (connection keyword arguments), post_process (dict of steps)
# Writes: postprocess_id (stays None when post_process is empty),
#         step_ids, process_order_list

# Establish a connection to the PostgreSQL database
conn = psycopg2.connect(**param_dict)

# Create a cursor object to interact with the database
cur = conn.cursor()

# Initialize postprocess_id
postprocess_id = None

# Initialize lists to store step IDs and process orders for this combination
step_ids = []
process_order_list = []

try:
    # Only touch the database when there is at least one post-processing step
    if post_process:
        for treatment in post_process.values():
            # Convert params and meta to JSON text; a missing key is stored as an empty object
            params_json = json.dumps(treatment.get('params', {}))
            meta_json = json.dumps(treatment.get('meta', {}))
            treatment_type = treatment['treatment_type']

            # Insert (or reuse) the row in the POSTPROCESS_STEP table
            postprocess_step_id = insert_into_postprocess_step(cur, treatment_type, params_json, meta_json)

            # Keep step id and process order aligned by position
            step_ids.append(postprocess_step_id)
            process_order_list.append(treatment['process_step'])

        # Insert data into POSTPROCESS, POSTPROCESS_ORDER and POSTPROCESS_SEQUENCE tables
        postprocess_id = insert_into_postprocess_order_and_sequence(cur, step_ids, process_order_list)

        # Commit first so the success message is only printed for data that was really saved
        conn.commit()
        print("Post-process saved successfully with id:", postprocess_id)

    else:
        # The message names the variable actually checked (it previously said solution_processing)
        print("post_process is empty. No database operations performed.")
except Exception:
    # Discard the partial transaction, then let the error surface in the notebook
    conn.rollback()
    raise
finally:
    # Always release the cursor and connection, even when an insert fails
    cur.close()
    conn.close()


solution_processing is empty. No database operations performed.


### 6. Checking and Storing information to the OFET_PROCESS TABLE and generating process_id

In [None]:
# Report every id that goes into an OFET_PROCESS row (same order and wording as before)
for _template, _value in (
    ("solution_id is : {}", solution_id),
    ("device_fab_id is : {}", device_fab_id),
    ("solution_treatment_id is : {}", solution_treatment_id),
    ("substrate_pretreat_id is : {}", substrate_pretreat_id),
    ("film_deposition_id is : {}", film_deposition_id),
    ("postprocess_id is : {}", postprocess_id),
):
    print(_template.format(_value))

# Column names and their values, kept in matching positions for the INSERT built in the next cell
ofet_process_columns = [
    'solution_id',
    'solution_treatment_id',
    'device_fab_id',
    'substrate_pretreat_id',
    'film_deposition_id',
    'postprocess_id',
]
ofet_process_values = [
    solution_id,
    solution_treatment_id,
    device_fab_id,
    substrate_pretreat_id,
    film_deposition_id,
    postprocess_id,
]



In [None]:
# Upsert one OFET_PROCESS row and capture what pg_query returns for RETURNING process_id.
# The column list and the value row are each needed twice: once for INSERT, once for DO UPDATE SET.
# NOTE(review): substrate_pretreat_id / postprocess_id are None when their sheets are empty;
# a NULL in a conflict column presumably never matches an existing row, so re-runs may add
# duplicate rows -- confirm against the ofet_process unique constraint.
_ofet_column_csv = AsIs(','.join(ofet_process_columns))
_ofet_value_row = tuple(ofet_process_values)

sql = '''
    INSERT INTO ofet_process (%s) 
    VALUES %s
    ON CONFLICT (solution_id, solution_treatment_id, device_fab_id, substrate_pretreat_id, film_deposition_id, postprocess_id) DO UPDATE
    SET (%s) = %s
    RETURNING process_id
    
    '''
tup = (_ofet_column_csv, _ofet_value_row, _ofet_column_csv, _ofet_value_row)

process_id = pg_query(sql, tup)
process_id


### 7. Checking and Storing information to the SAMPLE TABLE and generating sample_id

In [None]:
# Report the two ids that identify a SAMPLE row
for _template, _value in (
    ("exp_id is : {}", exp_id),
    ("process_id is : {}", process_id),
):
    print(_template.format(_value))


In [None]:
# Columns of the SAMPLE row; meta is sent as the literal text '{}'
sample_columns = ['exp_id', 'process_id', 'meta']

# Any dict among the values is serialised to JSON text; everything else passes through unchanged
sample_values = [
    json.dumps(item) if isinstance(item, dict) else item
    for item in (exp_id, process_id, '{}')
]

# The column list and the value row are each needed twice: once for INSERT, once for DO UPDATE SET
_sample_column_csv = AsIs(','.join(sample_columns))
_sample_value_row = tuple(sample_values)

sql = '''
    INSERT INTO sample (%s) 
    VALUES %s
    ON CONFLICT (exp_id, process_id, meta) DO UPDATE
    SET (%s) = %s
    RETURNING sample_id
    
    '''
tup = (_sample_column_csv, _sample_value_row, _sample_column_csv, _sample_value_row)

sample_id = pg_query(sql, tup)
sample_id

### 8. Checking and Storing the measurement information 

#### 8.1 Storing Device Measurement Information 

In [None]:
# Inspect the device measurement data; the next cell converts only device_meas[0].
device_meas

In [None]:
import psycopg2
from psycopg2 import _json

# Convert the first device measurement entry into its three parts.
# NOTE(review): only device_meas[0] is converted here; confirm no further entries are expected.
_device_meas_converted = convert_entry(device_meas[0])
device_meas_pg_entry, device_meas_columns, device_meas_values = _device_meas_converted

# Show the column names, then the matching values
for _part in (device_meas_columns, device_meas_values):
    print(_part)

In [None]:


# Put sample_id in front of both the column names and the values.
# The value list is modified in place, so re-running this cell without re-running the
# convert_entry cell above would prepend sample_id a second time.
device_meas_columns_list = ['sample_id'] + list(device_meas_columns)
device_meas_values.insert(0, sample_id)

# The measurement row always carries a meta column; default it to an empty object
if 'meta' not in device_meas_columns_list:
    device_meas_columns_list.append('meta')
    device_meas_values.append({})

# Any dict among the values is serialised to JSON text; everything else passes through unchanged
device_meas_values = [
    json.dumps(item) if isinstance(item, dict) else item
    for item in device_meas_values
]


In [None]:
# Upsert the device measurement row and capture what pg_query returns for RETURNING measurement_id.
# The column list and the value row are each needed twice: once for INSERT, once for DO UPDATE SET.
_meas_column_csv = AsIs(','.join(device_meas_columns_list))
_meas_value_row = tuple(device_meas_values)

sql = '''
    INSERT INTO measurement (%s) 
    VALUES %s
    ON CONFLICT (sample_id,measurement_type,data,meta) DO UPDATE
    SET (%s) = %s
    RETURNING measurement_id
    
    '''
tup = (_meas_column_csv, _meas_value_row, _meas_column_csv, _meas_value_row)

measurement_id = pg_query(sql, tup)
measurement_id

#### 8.2 Storing Other Measurement Information 

In [None]:
# Inspect the remaining measurement data; the next cell stores one measurement row per key.
other_meas

In [None]:
# Store every entry of other_meas as its own measurement row linked to sample_id
for meas_key in other_meas:
    other_meas_pg_entry, other_meas_columns, other_meas_values = convert_entry(other_meas[meas_key])

    # Put sample_id in front of both the column names and the values
    other_meas_columns_list = ['sample_id'] + list(other_meas_columns)
    other_meas_values.insert(0, sample_id)

    # The measurement row always carries a meta column; default it to an empty object
    if 'meta' not in other_meas_columns_list:
        other_meas_columns_list.append('meta')
        other_meas_values.append({})

    # Any dict among the values is serialised to JSON text; everything else passes through unchanged
    other_meas_values = [
        json.dumps(item) if isinstance(item, dict) else item
        for item in other_meas_values
    ]

    sql = '''
    INSERT INTO measurement (%s) 
    VALUES %s
    ON CONFLICT (sample_id,measurement_type,data,meta) DO UPDATE
    SET (%s) = %s
    RETURNING measurement_id
    
    '''
    # The column list and the value row are each needed twice: INSERT and DO UPDATE SET
    column_csv = AsIs(','.join(other_meas_columns_list))
    value_row = tuple(other_meas_values)
    tup = (column_csv, value_row, column_csv, value_row)

    measurement_id = pg_query(sql, tup)
    print(measurement_id)