# Import libraries

In [11]:
# ########################################################## #
#                                                            #
# Name: KEV:Constant Evaluator                               #
# Author: GGamov                                             #
# Date: 2019                                                 #
#                                                            #
# ########################################################## #

# import libraries -------------------------------------------

import math
import numpy as np
from copy import copy, deepcopy
import pandas as pd
from collections import Counter
from openpyxl import load_workbook
import re
import io

# Functions

## Load data

In [12]:
# basic input ------------------------------------------------

def eq_scripts_load(sep = ';', subdir = r"", file = r"file.xlsx"):
    
    # if specific file selected it should be XLSX one
    if file != "":
        
        if subdir != '':
            subdir = '/' + subdir
        subdir = '../../input' + subdir + '/'

        file = subdir + file
        
        # open excel file
        with open(file, "rb") as f:
            inmemory_file = io.BytesIO(f.read())
        wb = load_workbook(inmemory_file, read_only = True)
        
        # read data
        r = re.compile(r'^(input\_)*stoich(iometric)*\_coefficients*$')
        st_coeff_data = pd.read_excel(file, sheet_name = list(filter(r.search, wb.sheetnames))[0])
        
        r = re.compile(r'^(input\_)*concentrations*$')
        con_data = pd.read_excel(file, sheet_name = list(filter(r.search, wb.sheetnames))[0], header = 1)
        
        r = re.compile(r'^(input\_)*concentrations*$')
        type_con = pd.read_excel(file, sheet_name = list(filter(r.search, wb.sheetnames))[0]
                                 , header = None, nrows = 1).iloc[0,:]
        
        r = re.compile(r'^(input\_)*k\_constants\_log10$')
        lg_k_data = pd.read_excel(file, sheet_name = list(filter(r.search, wb.sheetnames))[0])
        
        r = re.compile(r'^(particle|component)_names*$')
        component_name_for_yields = pd.read_excel(file, sheet_name = list(filter(r.search, wb.sheetnames))[0]
                                                  , header = None).iat[0, 0]
        
    # use a bunch of plain text files instead
    else:
        raise FileNotFoundError('CSV input not yet implemented')

    return st_coeff_data, lg_k_data, con_data, type_con, component_name_for_yields

## Preprocessing

In [13]:
# basic preprocessing ----------------------------------------
    
def eq_preproc(st_coeff_data, con_data, type_con, lg_k_data, component_name_for_yields):
    
    #checking if there are several series
    
    if 'series' not in con_data.columns:        
        con_data['series'], type_con[np.shape(st_coeff_data)[1]] = '', ''

    # series variables
    
    ser_info = con_data['series'].to_numpy()
    ser_unique = np.unique(ser_info)
    ser_num = np.shape(np.unique(ser_info))[0]

    # matrix of stoich coeff with formal reactions added
    st_coeff_matrix = st_coeff_data.to_numpy()
    formal_matrix = np.eye(np.shape(st_coeff_matrix)[1], dtype = int)
    st_coeff_matrix = np.vstack((formal_matrix, st_coeff_matrix))
    
    # list of products and reagents names for further using in output data
    
    st_coeff_data['prod_names'] = ''
        
    for cl in st_coeff_data.drop('prod_names', axis = 1):
        
        st_coeff_data['prod_names'] = np.where(st_coeff_data[cl] > 0,
                                               st_coeff_data['prod_names'] + '+' + st_coeff_data[cl].apply(str) + cl,
                                               st_coeff_data['prod_names'])
        
        st_coeff_data['prod_names'] = st_coeff_data['prod_names'].replace({r'(\+)1([a-zA-Z])' : r'\1\2'}, regex = True)
        st_coeff_data['prod_names'] = st_coeff_data['prod_names'].replace(to_replace = r'^\+', value = '', regex = True)
        
    # product names lists : full and base components only
    
    prod_names_con = list(con_data.drop('series', axis = 1))
    prod_names = prod_names_con + st_coeff_data['prod_names'].tolist()
    
    # creating the vector of equilibrium constants including the formal reactions
    lg_k = (np.vstack((np.zeros((np.shape(st_coeff_matrix)[1], 1)), lg_k_data.to_numpy())))
    
    # checking the consistency of reagent names in different sheets    
    if prod_names_con != list(st_coeff_data.drop('prod_names', axis = 1)):
        print('Check the consistency of reagent names!')
    
    # split concentrations matrix
    con_matrix = [g for _, g in con_data.groupby(['series'])]
        
    for cnm_index, cnm in enumerate(con_matrix):
        con_matrix[cnm_index] = cnm.drop('series', axis = 1).to_numpy()
    
    ser_counts = con_data.groupby(['series']).size().tolist();
    
    # creating vector of indices of components with predetermined concentrations
    ign_indices = np.array(type_con.index[type_con == 'eq'])
    
    if component_name_for_yields not in prod_names:
        print('The component name for partition should be among those of basis components')
        
    idx, = np.where(component_name_for_yields == np.array(prod_names_con))
        
    return ser_num, st_coeff_matrix, prod_names, lg_k, prod_names_con, con_matrix, ign_indices, idx, ser_counts, ser_info # add returned values

## Main calculations function (Newton method)

In [80]:
def eq_calc(max_iter, eps, component_name_for_yields, ser_num, st_coeff_matrix, prod_names, lg_k,
            prod_names_con, con_matrix, ign_indices, idx, ser_counts, ser_info):
    
    prod_eq_con_matrix = [0] * np.shape(st_coeff_matrix)[0] # create equilibrium concentration matrix
    reag_tot_con_matrix_calc = [0] * np.shape(st_coeff_matrix)[1] # create total concentrations of reagents calculated
    
    jac_matrix = [[0] * np.shape(st_coeff_matrix)[1] for j in range(np.shape(st_coeff_matrix)[1])] # create Jacobi matrix
    error = [0] * np.shape(st_coeff_matrix)[1] # create vector of errors
    
    prev = [0] * np.shape(st_coeff_matrix)[1] # create vector of previous estimation of equilibrium reagents concentrations
    yields = [0] * np.shape(st_coeff_matrix)[0] #create yields vector
    
    conc_size = (np.shape(con_matrix)[0], np.shape(con_matrix)[1])
    
    results_conc = np.zeros(conc_size)
    results_yields = np.zeros(conc_size)
    g_res = np.zeros(np.shape(con_matrix)[0], np.shape(con_matrix)[1])
    
    print(results_conc);
    
    lg_k_copy, st_coeff_matrix_copy, con_matrix_copy = deepcopy(lg_k), deepcopy(st_coeff_matrix), deepcopy(con_matrix)
    
    for s in range(ser_num):
    
        for k in range(np.shape(con_matrix[s])[0]):
        
            # if some equilibrium concentrations are set
            if np.shape(ign_indices)[0] > 0:
            
                lg_k_app, st_coeff_matrix_app, con_matrix_app = lg_k, st_coeff_matrix, con_matrix
            
                for i in range(np.shape(lg_k)[0]):
                    for j in range(np.shape(ign_indices)[0]):
                        lg_k_app[i] = np.array(lg_k[i]) + np.array(st_coeff_matrix[i][ign_indices[j]]) * np.log(con_matrix[s][k][ign_indices[j]]) / math.log(10)
    
                lg_k_app = np.delete(lg_k_app, ign_indices, axis = 0)
            
                con_matrix_app[s] = np.delete(con_matrix_app[s], ign_indices, axis = 1)
                st_coeff_matrix_app = np.delete(st_coeff_matrix_app, ign_indices, axis = 0)
                st_coeff_matrix_app = np.delete(st_coeff_matrix_app, ign_indices, axis = 1)
                
                lg_k, st_coeff_matrix, con_matrix = lg_k_app, st_coeff_matrix_app, con_matrix_app
                
            reag_eq_con_matrix = deepcopy(con_matrix[s][k]) # initial estimation of equilibrium concentrations of reagents 
            
            # start of iterative procedure
            for it in range(max_iter):
        
                # caclulating the equilibrium concentrations of products
                prod_eq_con_matrix = np.exp(np.transpose(np.array(math.log(10) * np.array(lg_k))) + np.dot(st_coeff_matrix, np.log(np.array(reag_eq_con_matrix))))
            
                # calculating the total concentrations of reagents
                reag_tot_con_matrix_calc = np.transpose(np.dot(np.transpose(st_coeff_matrix), np.transpose(prod_eq_con_matrix)))
            
                # calculating the residuals
                g_res[s][k] = np.array(reag_tot_con_matrix_calc) - np.array(con_matrix[s][k])
                            
                # calculating the Jacobi matrices
                jac_matrix = np.dot(np.transpose(st_coeff_matrix), (np.array(st_coeff_matrix) * np.transpose(prod_eq_con_matrix)))
                prod_eq_con_matrix = np.transpose(prod_eq_con_matrix)
            
                # new estimation of equilibrium concentrations of reagents
                prev = np.log(reag_eq_con_matrix)
                reag_eq_con_matrix = np.exp(prev - np.transpose(np.dot(np.linalg.inv(jac_matrix), np.transpose(g_res[s][k]))))
                reag_eq_con_matrix = reag_eq_con_matrix[0]
                error = abs(np.log(reag_eq_con_matrix) - prev)
                
                # checking the convergence
                if np.max(error) < eps:
            
                    # if some equilibrium concentrations are set
                    if np.shape(ign_indices)[0] > 0:
                        lg_k, st_coeff_matrix, con_matrix = deepcopy(lg_k_copy), deepcopy(st_coeff_matrix_copy), deepcopy(con_matrix_copy)
                        for j in range(np.shape(ign_indices)[0]):
                            prod_eq_con_matrix = np.insert(np.transpose(prod_eq_con_matrix)[0], ign_indices[j], con_matrix[s][k][ign_indices[j]])
                            prod_eq_con_matrix = prod_eq_con_matrix.reshape((len(prod_eq_con_matrix), 1))

                    # calculating the yields                
                    for i in range(np.shape(prod_eq_con_matrix)[0]):
                        yields[i] = np.transpose(prod_eq_con_matrix[i]) * st_coeff_matrix[i][idx[0]] * 100 / con_matrix[s][k][idx[0]]
            
                    # it is just a crooked nail aimed to solve the problem with array dimensions. Got no idea why it happens.
                    if len(np.shape(np.array(yields))) > 1:
                        for i in range(len(yields)):
                            yields[i] = float(yields[i])
                        
                    results_conc[s][k] = np.transpose(prod_eq_con_matrix)[0]
                    results_yields[s][k] = np.array(yields)    
                    break
 
        g_res[s] = np.array(g_res[s]).reshape((np.shape(con_matrix[s])[0], np.shape(con_matrix[s])[1] - np.shape(ign_indices)[0]))
    
        # if some equilibrium concentrations are set
        if np.shape(ign_indices)[0] > 0:
            for j in range(np.shape(ign_indices)[0]):
                g_res[s] = np.insert(g_res[s], ign_indices[j], np.zeros((1, np.shape(con_matrix[s])[0]), dtype = int), axis = 1)

    return results_conc, results_yields, g_res

## Postprocessing

In [61]:
def eq_postproc(results_conc, results_yields, g_res, ser_num, ser_info, ser_counts, lg_k_data, con_data, 
                st_coeff_data, con_matrix, prod_names, prod_names_con, component_name_for_yields, type_con):
    
    # making the last preparation to the following section
    c_res_out, c_yie_out, g_res_out = [], [], []
    
    for s in range(ser_num):
        c_res_out += results_conc[s]
        c_yie_out += results_yields[s]
        g_res_out += list(g_res[s])
    
    c_inp_out = con_data.to_numpy()

    prod_names_3 = prod_names_con
    p_comp = (-np.log(c_res_out) / math.log(10))[:, idx[0]]
    
    if 'series' in con_data.columns:
        c_res_out = np.hstack((c_res_out, ser_info.reshape((len(ser_info), 1))))
        c_yie_out = np.hstack((c_yie_out, ser_info.reshape((len(ser_info), 1))))
        g_res_out = np.hstack((g_res_out, ser_info.reshape((len(ser_info), 1))))
        
        prod_names = prod_names + ['series']
        prod_names_con = prod_names_con + ['series']

    y_prod_names = ['p(' + component_name_for_yields + ')'] + prod_names
    y_indexes = [str('S_' + str(i+1)) for i in range(np.shape(con_data)[0])]

    c_yie_out = np.hstack((p_comp.reshape((len(p_comp), 1)), c_yie_out))

    # preparing data for output
    c_res_out = pd.DataFrame(data=np.array(c_res_out), columns = prod_names)
    c_yie_out = pd.DataFrame(data=np.array(c_yie_out), columns = y_prod_names, index = y_indexes)
    c_yie_out = c_yie_out.loc[:, (c_yie_out != 0).any(axis=0)]
    c_inp_out = np.vstack((prod_names_con, c_inp_out))
    c_inp_out = np.vstack((type_con, c_inp_out))
    c_inp_out = pd.DataFrame(data=np.array(c_inp_out))
    g_res_out = pd.DataFrame(data=np.array(g_res_out), columns = prod_names_con)
    comp_name_res = pd.DataFrame(data=np.array(component_name_for_yields).reshape((1, 1)))
    
    return c_inp_out, c_res_out, c_yie_out, g_res_out, comp_name_res 

## Writing the results to excel

In [62]:
def eq_output(sep_out, subdir_out, file_out,  results_stoich_coeff, results_lg_k, c_res_out, c_inp_out,
             c_yie_out, component_name_for_yields, g_res_out, comp_name_res):
       
    if file_out != "":
        
        if subdir_out != '':
            subdir_out = '/' + subdir_out
        subdir_out = '../../output' + subdir_out + '/'
        
    file_out = subdir_out + file_out
    
    # output
    with pd.ExcelWriter(file_out, mode = "w") as output: # specify the path!
        results_stoich_coeff.to_excel(output, sheet_name = 'input_stoich_coefficients', index = False)
        results_lg_k.to_excel(output, sheet_name = 'input_k_constants_log10', index = False)
        c_res_out.to_excel(output, sheet_name = 'input_concentrations', header = None, index = False)
        c_inp_out.to_excel(output, sheet_name = 'equilibrium_concentrations', index = False)
        c_yie_out.to_excel(output, sheet_name = component_name_for_yields + '_fractions', index_label = ['rn'])
        g_res_out.to_excel(output, sheet_name = 'percent_error', index = False)
        comp_name_res.to_excel(output, sheet_name = 'component_names', header = None, index = False)

# Run

In [81]:
# run --------------------------------------------------------

# define variables -----------

_subdir = "concentrations/ds.5p.ser"
_sep = ";"
_file = "big_ser_test.xlsx"
subdir_out = "concentrations/ds.5p.ser"
sep_out = ";"
file_out = "big_ser_test_res.xlsx"
max_iter, eps = 1000, 0.0000001
    
# run loading function ------

st_coeff_data, lg_k_data, con_data, type_con, component_name_for_yields  = eq_scripts_load(sep = _sep, subdir = _subdir, file = _file)

# run preprocessing function ------

ser_num, st_coeff_matrix, prod_names, lg_k, prod_names_con, con_matrix, ign_indices, idx, ser_counts, ser_info = eq_preproc(st_coeff_data, con_data, type_con, lg_k_data, component_name_for_yields)

con_matrix = np.concatenate(con_matrix, axis=0)

# run calculations ------

results_conc, results_yields, g_res = eq_calc(max_iter, eps, component_name_for_yields, ser_num, st_coeff_matrix, prod_names, lg_k, prod_names_con, con_matrix, ign_indices, idx, ser_counts, ser_info)

# run postprocessing subroutine ------

c_inp_out, c_res_out, c_yie_out, g_res_out, comp_name_res = eq_postproc(results_conc, results_yields, g_res, ser_num, ser_info, ser_counts, lg_k_data, con_data, st_coeff_data, con_matrix, prod_names, prod_names_con, component_name_for_yields, type_con)

# run writing to excel subroutine ------

eq_output(sep_out, subdir_out, file_out, st_coeff_data, lg_k_data, c_inp_out, c_res_out, c_yie_out, component_name_for_yields, g_res_out, comp_name_res)

TypeError: data type not understood

In [34]:
print("\n Input Concentrations");
print(c_inp_out);
print("\n Output Concentrations");
print(c_res_out);
print("\n Fractions");
print(c_yie_out);
print("\n ???");
print(g_res_out);
print("\n Component Name");
print(comp_name_res);


 Input Concentrations
         0        1          2          3        4       5
0      tot      tot        tot        tot      tot     NaN
1        H      PO4       Hydr         Cu      DNA  series
2   0.1056   0.0888  0.0001016  9.923e-06    1e-19       a
3   0.1012  0.08851  0.0001016  1.985e-05    1e-19       a
4   0.0968   0.0814  0.0001016  2.977e-05    1e-19     ass
5   0.0924   0.0777  0.0001016  3.969e-05    1e-19     ass
6    0.088    0.074  0.0001016  4.962e-05    1e-19       b
7   0.0836   0.0703  0.0001016  5.954e-05    1e-19       b
8   0.0792   0.0666  0.0001016  6.946e-05    1e-19       b
9   0.0748   0.0629  0.0001016  7.938e-05    1e-19       b
10  0.0704   0.0592  0.0001016  8.931e-05    1e-19       c
11   0.066   0.0555  0.0001016  9.923e-05  8.5e-05       c
12   0.066   0.0555     0.0001      5e-05  8.5e-05       c

 Output Concentrations
              H          PO4         Hydr           Cu          DNA  \
0   5.47193e-08  4.35648e-06  8.25952e-05  1.08121e-09  

In [69]:
print(type(ser_num));
print(ser_num);
print(type(con_matrix));
print(con_matrix);
for p in con_matrix:
    print("\n");
    print(p);
print("\n\n");
print(np.concatenate(con_matrix, axis=0));

<class 'int'>
4
<class 'numpy.ndarray'>
[[1.056e-01 8.880e-02 1.016e-04 9.923e-06 1.000e-19]
 [1.012e-01 8.851e-02 1.016e-04 1.985e-05 1.000e-19]
 [9.680e-02 8.140e-02 1.016e-04 2.977e-05 1.000e-19]
 [9.240e-02 7.770e-02 1.016e-04 3.969e-05 1.000e-19]
 [8.800e-02 7.400e-02 1.016e-04 4.962e-05 1.000e-19]
 [8.360e-02 7.030e-02 1.016e-04 5.954e-05 1.000e-19]
 [7.920e-02 6.660e-02 1.016e-04 6.946e-05 1.000e-19]
 [7.480e-02 6.290e-02 1.016e-04 7.938e-05 1.000e-19]
 [7.040e-02 5.920e-02 1.016e-04 8.931e-05 1.000e-19]
 [6.600e-02 5.550e-02 1.016e-04 9.923e-05 8.500e-05]
 [6.600e-02 5.550e-02 1.000e-04 5.000e-05 8.500e-05]]


[1.056e-01 8.880e-02 1.016e-04 9.923e-06 1.000e-19]


[1.012e-01 8.851e-02 1.016e-04 1.985e-05 1.000e-19]


[9.680e-02 8.140e-02 1.016e-04 2.977e-05 1.000e-19]


[9.240e-02 7.770e-02 1.016e-04 3.969e-05 1.000e-19]


[8.800e-02 7.400e-02 1.016e-04 4.962e-05 1.000e-19]


[8.360e-02 7.030e-02 1.016e-04 5.954e-05 1.000e-19]


[7.920e-02 6.660e-02 1.016e-04 6.946e-05 1.000e-19

In [75]:
print(range(np.shape(con_matrix)[0]))

range(0, 11)


In [79]:
np.zeros((3,2))

array([[0., 0.],
       [0., 0.],
       [0., 0.]])