#### GAT model using the electron transition density atomic contribution matrix
$$
\huge \tilde{\gamma}^{[l,m]}_{AA^{\prime}}
$$

###### Check out the paper describing this work <br>
##### *X-ray absorption spectroscopy reveals charge transfer in π-stacked aromatic amino acids*:<br> https://doi.org/10.1039/D4CP04615C

____________________________

#### Functions for processing data

In [None]:
def load_dict_data(filename, index_col_condition='num-1'):
    """
    Reads from a two-column file the hash and the file and
     stores the information of the csv file into a dictionary.
    Format file to be read:
     f00,(path)../newFY_1-26/resA_MOcore_AB_2.5A_1-26.csv
     f01,(path)../newFY_1-26/resA_MOcore_AB_2.6A_1-26.csv
     f02,(path)../newFY_1-26/resA_MOcore_AB_2.7A_1-26.csv
     f03,(path)../newFY_1-26/resA_MOcore_AB_2.8A_1-26.csv
      ...
     f61,(path)../newFY_1-26/resA_MOcore_AB_11.0A_1-26.csv
    Blank lines (for instance a trailing empty line) are skipped.
    Args:
     filename (str): two-column file filename having
      a hash for each path file.
     index_col_condition (str): the name of the index
      column to read the csv as pandas structure.
      Default as 'num-1'
    Output (dict): keys are the hashes and values of the dictionary
     are the pandas frame for each csv file. Entries follow the order
     of the lines in the file; a repeated hash keeps the last csv read.
    Raises:
     IndexError: if a non-empty line has no comma separating the
      hash from the csv path.
    """
    dict_raw = {}
    with open(filename, 'r') as file:
        for line_number, line in enumerate(file, start=1):
            line = line.strip()
            if not line:
                # an empty line carries no hash/path pair
                continue
            # split only once per line: fields[0] is the hash, fields[1] the csv path
            fields = line.split(',')
            if len(fields) < 2:
                raise IndexError(
                    f"{filename}, line {line_number}: expected 'hash,path' "
                    f"but found {line!r}"
                )
            dict_raw[fields[0]] = pd.read_csv(fields[1],
                                              delimiter=',',
                                              index_col=index_col_condition)
    return dict_raw

In [None]:
def save_ETDAC_matrix(data_dict, data_set_name="data_etdac_matrix.h5"):
    """
    Write every ETDAC matrix of data_dict into a single H5PY file.
    For each entry a group called "sample_<hash>" is created; it holds
    a gzip-compressed dataset "ETDAC_matrix" and a "hash" attribute.
    The target file is opened in 'w' mode.
    NOTE(review): a later cell defines save_ETDAC_matrix again; whichever
     cell ran last is the one in effect.
    Args:
    data_dict (dict) contains a hash (key) and
     the ETDAC matrix (value) of each molecule.
    data_set_name (str, optional) is the name of the H5PY file to be
     created. By default that file is called "data_etdac_matrix.h5".
    """
    with h5py.File(data_set_name, 'w') as h5_out:
        # one group per molecule, named after its hash
        for mol_hash, etdac in data_dict.items():
            group = h5_out.create_group(f"sample_{mol_hash}")
            group.create_dataset("ETDAC_matrix", data=etdac, compression="gzip")
            group.attrs["hash"] = mol_hash

#### Functions for the algebra operations

In [None]:
def selecting_atm_matrix(df, atoms_list):
    """
    Return the rows of the MO matrix (df) whose index label appears in
    atoms_list. Rows come out in the order given by atoms_list; labels
    of atoms_list that are not in the index of df are ignored.
    """
    available = df.index.tolist()  # labels of the index column
    wanted_rows = [atom for atom in atoms_list if atom in available]
    return df.loc[wanted_rows]
#atoms_core=resa_mocore_GFG9Y_raw.index.tolist() #all C-core atoms
#[i for i in atoms_a if i in atoms_core] #atoms_a intersection atoms_core

In [None]:
def crop_by_loewdin_p(df,pop):
    """
    Return the MO matrix (df) reduced to the columns 'sym' and 'lvl' plus
    every other column whose column sum is strictly greater than (100*pop).
    The first two entries of the column sums are left out of the test.
    pop is a number between 0 and 1.
    """
    threshold = pop*100
    totals = df.sum()[2:]  # column sums without the first two columns
    kept = ['sym', 'lvl']
    for name, total in totals.items():
        if total > threshold:
            kept.append(name)
    return df[kept]

In [None]:
def nonzero_mo_matrix(df):
    """
    Return the MO matrix (df) without the columns made only of zeros.
    A column is kept as soon as one of its elements compares different from 0.
    """
    has_population = df.ne(0).any(axis=0)  # True for columns with a non-zero entry
    return df.loc[:, has_population]

In [None]:
def cropping_matrix(df, df1, df2):
    """
    Return df cropped to the columns named in df1 and the rows named in df2.
    The column names of df1 (from the third one on) select columns of df;
    the column names of df2 (from the third one on), converted to int,
    select rows of df. Names missing from df are dropped from the
    selection when the direct lookup raises KeyError.
    """
    col_labels = df1.axes[1][2:].tolist()
    try:
        dff = df[col_labels]
    except KeyError:
        # keep only the requested columns that df really has
        known_cols = df.axes[1][0:].tolist()
        dff = df[[label for label in col_labels if label in known_cols]]

    row_labels = df2.axes[1][2:].tolist()
    try:
        return dff.loc[[int(label) for label in row_labels]]
    except KeyError:
        # keep only the requested rows that are present in the index
        known_rows = dff.index.tolist()
        surviving = [label for label in row_labels if int(label) in known_rows]
        return dff.loc[[int(label) for label in surviving]]
# the first two elements in df2.axes[1] and df1.axes[1] are "sym" and "lvl", that's why I used df.axes[1][2:]

In [None]:
def remove_noncontrb(dict_data_raw):
    """
    Apply nonzero_mo_matrix() to every pd.frame stored in a dictionary,
    so that non-contributing elements do not spread through the
    following linear algebra operations.
    This function depends on the nonzero_mo_matrix()
     to work.
    Args:
    dict_data_raw (dict): hash -> pd.frame, as loaded from the csv files.
    Output (dict): new dictionary with the same keys (same order) whose
     values are the frames returned by nonzero_mo_matrix().
    """
    return {
        mol_hash: nonzero_mo_matrix(frame)
        for mol_hash, frame in dict_data_raw.items()
    }

#### Main functions: ETDAC matrix calculation

##### Building heatmaps of $\tilde{\gamma}^{[l,m]}_{AA^{\prime}}$

In [None]:
def ts_psb_acore_bvirt(acore, bvirt, abcorevirt, atm_to_virtmo=False):
    """
    Do a matrix multiplication between the core MO matrix and the
    core-to-virt transition probabilities matrix, then the resulting matrix
    is multiplied by the virtual MO matrix.

    Both products are accumulated one column pair at a time: a pair
    contributes only when the two column labels are equal, because
    pandas' DataFrame.dot raises ValueError for misaligned labels and
    those pairs are skipped.
    Step 1: column i of acore times column j of abcorevirt; the sum is
     divided by 100 (rows: acore index, columns: abcorevirt index as str).
    Step 2: column i of the step-1 matrix times column j of bvirt
     (rows: acore index, columns: bvirt index).

    Args:
    acore (pd.frame): core MO matrix, one column per core MO.
    bvirt (pd.frame): virtual MO matrix, one column per virtual MO.
    abcorevirt (pd.frame): core-to-virt transition matrix. Its index is
     used as str so it can match the column labels of bvirt; the frame
     passed by the caller is left untouched.
    atm_to_virtmo (bool): if True, the two resulting matrices of the
     multiplication process are returned. By default, just the last
     matrix is returned.
    Output: the step-2 pd.frame, or the tuple (step-1, step-2) when
     atm_to_virtmo is True.
    """
    # to make possible the dot product in pandas, the indexes involved in each
    # product have to "match" in name and type; the relabelling is done on a
    # shallow copy so the caller's frame keeps its original index
    abcorevirt = abcorevirt.copy(deep=False)
    abcorevirt.index = abcorevirt.index.astype('str')

    dff = pd.DataFrame(np.zeros((acore.shape[0], abcorevirt.shape[0])),
                       index=acore.index, columns=abcorevirt.index)
    for j in range(abcorevirt.shape[1]):
        trans_row = abcorevirt.iloc[:, j:j+1].T  # 1 x n row labelled by column j
        for i in range(acore.shape[1]):
            try:
                product = acore.iloc[:, i:i+1].dot(trans_row)
            except ValueError:
                # labels differ ("matrices are not aligned"): no contribution
                continue
            dff += product

    dff = dff/100

    dff2 = pd.DataFrame(np.zeros((dff.shape[0], bvirt.shape[0])),
                        index=dff.index, columns=bvirt.index)
    for j in range(bvirt.shape[1]):
        virt_row = bvirt.iloc[:, j:j+1].T
        for i in range(dff.shape[1]):
            try:
                product = dff.iloc[:, i:i+1].dot(virt_row)
            except ValueError:
                # labels differ: no contribution
                continue
            dff2 += product

    if atm_to_virtmo:
        return dff, dff2
    return dff2

In [None]:
def heatmap_ETDAC(core_MO, virt_MO, fosce_mo_trans):
    """
    Build the electron transition density atomic contribution (ETDAC)
     matrix of every sample by calling ts_psb_acore_bvirt().
    For each hash the first two columns of the core and of the
     virtual MO frames are left out (the frame is transposed, the
     first two rows dropped, and transposed back) before the call.
    Args:
    core_MO (dict): core MO and atom population matrices
     obtained by load_dict_data() and remove_noncontrb()
    virt_MO (dict): virtual MO and atom population matrices
     obtained by load_dict_data()
    fosce_mo_trans (dict): electronic transition
     (oscillator strength) MO matrices obtained by the pipeline
     (github.com/caraortizmah/x-ray_scripting_out) and
     formated by load_dict_data()
    Output:
    heatmap_raw (dict): hash -> ETDAC matrix in pd.frame format.
    """
    # WSM case
    # The hashes are taken from virt_MO; the three dictionaries are
    # expected to share the same keys.
    return {
        mol_hash: ts_psb_acore_bvirt(
            core_MO[mol_hash].T[2:].T,
            virt_MO[mol_hash].T[2:].T,
            fosce_mo_trans[mol_hash],
        )
        for mol_hash in virt_MO
    }

In [None]:
def crop_heatmap_byatm(etdac_m, row_cond, col_cond):
    """
    Crop the electron transition density atomic
     contribution matrix with one predicate for the row
     labels (core space) and one for the column labels
     (virtual space).
    Args:
    etdac_m (pd.frame): Electron transition density atomic
     contribution matrix.
    row_cond (callable): called with each index label; the
     row is kept when it returns a true value.
    col_cond (callable): called with each column label; the
     column is kept when it returns a true value.
    Outputs:
     etdac_m restricted to the selected rows and columns.
    """
    selected_rows = [label for label in etdac_m.index.values if row_cond(label)]
    selected_cols = [label for label in etdac_m.columns.values if col_cond(label)]
    return etdac_m.loc[selected_rows, selected_cols]

______________________________________________

In [None]:
#Data in .csv format

In [None]:
# Core MO population matrices of resA, one pd.frame per hash,
#  indexed by the 'num-1' column: $$\boldsymbol{\text{N}}_{A,i} $$
resa_mocore_raw = load_dict_data('resA_MOcore_list.log',
                                 index_col_condition='num-1')

In [None]:
# Virtual MO population matrices of resB, one pd.frame per hash,
#  indexed by the 'num-1' column: $$\boldsymbol{\text{N}}_{A^{\prime},a} $$
resb_movirt_raw = load_dict_data('resB_MOvirt_list.log',
                                 index_col_condition='num-1')

In [None]:
# Transition density state matrix $$ \rho^{n}_{ia} $$

In [None]:
# Transition density state matrix (core/virt MO)
#  as function of the number of transitions
# The index column is literally named virt\core; a raw string keeps the
#  backslash without relying on the invalid escape sequence "\c".
corevirtMO_raw = load_dict_data('corevirtMO_list.log', r'virt\core')

In [None]:
# Transition density state matrix (core/virt MO)
#  as function of the transitions averaged probability
# The index column is literally named virt\core; a raw string keeps the
#  backslash without relying on the invalid escape sequence "\c".
corevirtMO_tspb_raw = load_dict_data('corevirtMO_tspb_list.log', r'virt\core')

In [None]:
# Transition density state matrix (core/virt MO)
#  as function of the oscillator strength using electronic dipole moment as operator
#  only presented by weighted (WSM): fosce
#  $$ \gamma^{[l,m]}_{ia} $$
# The index column is literally named virt\core; a raw string keeps the
#  backslash without relying on the invalid escape sequence "\c".
corevirtMO_fosce_raw = load_dict_data('corevirt_fosce_AB_list.log', r'virt\core')

In [None]:
# Reducing Löwdin Population MO matrices by removing non-contributing MO

In [None]:
# Core MO of resA: every frame of resa_mocore_raw goes through
#  remove_noncontrb(); the raw dictionary itself is kept as loaded.
resa_mocore = remove_noncontrb(resa_mocore_raw)

In [None]:
# Virtual MO of resB: every frame of resb_movirt_raw goes through
#  remove_noncontrb(); the raw dictionary itself is kept as loaded.
resb_movirt = remove_noncontrb(resb_movirt_raw)

___________________________

### Results

In [None]:
"""
Final calculation: $$ \huge \tilde{\gamma}^{[l,m]}_{AA^{\prime}} $$
"""

In [None]:
# WSM case, it is the official representation by the domain-knowledge.
# The core MO come from the reduced dictionary, the virtual MO from the
#  raw one, and the transitions from the oscillator-strength matrices.
heatmap_raw = heatmap_ETDAC(
    core_MO=resa_mocore,
    virt_MO=resb_movirt_raw,
    fosce_mo_trans=corevirtMO_fosce_raw,
)

In [None]:
# Name both axes of every ETDAC matrix: rows are core atoms,
# columns are virtual atoms.
for key, etdac in heatmap_raw.items():
    etdac.index.names = ['core-atom']
    etdac.columns.names = ['virtual-atom']

In [None]:
# Numeric copy of every ETDAC matrix, rows sorted by ascending index
heatmap = {
    key: etdac.apply(pd.to_numeric).sort_index(ascending=True)
    for key, etdac in heatmap_raw.items()
}

In [None]:
# Saving data: every matrix of `heatmap` is written to the H5PY file
#  "data_etdac_matrix.h5", which is the same file read back below
#  by load_etdac_h5py()
save_ETDAC_matrix(heatmap, data_set_name="data_etdac_matrix.h5")

In [1]:
import h5py

In [10]:
import numpy as np
import pandas as pd

In [3]:
## Technical functions

In [4]:
def load_etdac_h5py(h5file, hash_mol):
    """
    Read one ETDAC matrix back from a h5py file.
    The group "sample_<hash_mol>" of h5file is opened, its
    "ETDAC_matrix" records are turned into a pd.frame, and the index
    column and the column-axis name are restored from the group
    attributes "index_name" and "column_name".
    The file-level attributes are printed as "Global params:".
    Returns a two-element list: [hash attribute of the group, pd.frame].
    """
    group_name = f"sample_{hash_mol}"
    with h5py.File(h5file, 'r') as h5_in:
        # file-level attributes, only shown for information
        file_attrs = dict(h5_in.attrs)
        print("Global params:", file_attrs)

        group = h5_in[group_name]
        records = group["ETDAC_matrix"][:]
        etdac = pd.DataFrame.from_records(records)
        # the row labels were stored as a regular field of the records
        etdac = etdac.set_index(group.attrs["index_name"])
        etdac.columns.name = group.attrs["column_name"]
        stored_hash = group.attrs["hash"]
        return [stored_hash, etdac]

In [None]:
def save_ETDAC_matrix(data_dict, data_set_name="data_etdac_matrix.h5"):
    """
    Write every ETDAC matrix of data_dict into a single H5PY file.
    For each entry a group called "sample_<hash>" is created holding:
     - dataset "ETDAC_matrix": the frame converted with
       to_records(index=True), gzip-compressed;
     - attributes "hash", "column_name" and "index_name", the last two
       taken from the names of the frame's column and row axes.
    The target file is opened in 'w' mode.
    Args:
    data_dict (dict) contains a hash (key) and
     the ETDAC matrix (value) of each molecule.
    data_set_name (str, optional) is the name of the H5PY file to be
     created. By default that file is called "data_etdac_matrix.h5".
    """
    with h5py.File(data_set_name, 'w') as h5_out:
        for mol_hash, etdac in data_dict.items():
            group = h5_out.create_group(f"sample_{mol_hash}")
            # records keep the index as a field next to the columns
            records = etdac.to_records(index=True)
            group.create_dataset("ETDAC_matrix",
                                 data=records,
                                 compression="gzip")

            group.attrs["hash"] = mol_hash
            group.attrs["column_name"] = etdac.columns.name
            group.attrs["index_name"] = etdac.index.name

In [11]:
def load_conditions(conditions_file):
    """
    Load conditions, those are pre-defined by the user
    in an external file and return a dictionary having this
    information.
    Every non-empty line has the form "name: expression". Only the
    first ':' separates the name from the expression, so the expression
    itself may contain ':' (a dictionary literal, for instance).
    Blank lines are skipped.
    Args:
    conditions_file (str): path of the utf-8 text file to read.
    Output (dict): name -> value obtained by evaluating the expression.
    Raises:
     IndexError: if a non-empty line contains no ':'.
    """
    parameters = {}
    with open(conditions_file, 'r', encoding="utf-8") as params:
        for line_number, param in enumerate(params, start=1):
            param = param.strip()
            if not param:
                continue
            name, separator, expression = param.partition(':')
            if not separator:
                raise IndexError(
                    f"{conditions_file}, line {line_number}: expected "
                    f"'name: expression' but found {param!r}"
                )
            # SECURITY: eval() runs whatever Python expression the file
            # contains. Only load conditions files you trust; if the values
            # are plain literals, ast.literal_eval would be a safer choice.
            parameters[name.strip()] = eval(expression.strip())
    return parameters

In [12]:
def load_molecules(file_list):
    """
    Read a comma-separated list file and return its content as a
    numpy array: one row per line of the file, one element per
    comma-separated field (the hash and the pdb file name).
    The array uses the h5py variable-length utf-8 string dtype.
    """
    with open(file_list, 'r', encoding="utf-8") as listpdb:
        rows = [np.array(entry.strip().split(',')) for entry in listpdb]
    return np.array(rows, dtype=h5py.string_dtype(encoding='utf-8'))

In [None]:


import csv as csv
import matplotlib.pyplot as plt
import seaborn as sns

In [5]:
# Read the ETDAC matrix of sample 'f05' back from the H5PY file;
# etdac_data is the list [hash, pd.frame] returned by load_etdac_h5py()
etdac_data = load_etdac_h5py('data_etdac_matrix.h5', 'f05')

Global params: {}


In [13]:
# Load the parameters that will be used in the node feature extraction.
# 'conditions.txt' holds one "name: expression" entry per line and is
# evaluated by load_conditions(), so it must be a trusted file.
parameters = load_conditions('conditions.txt')

In [14]:
# Load the hash and molecule name that will be used in the node/edge feature extraction.
# Each line of the list file is split on ',' by load_molecules().
file_list = "list_100pdb_files.txt"
molecules_labels = load_molecules(file_list)

In [None]:
# Save all the molecule list in H5PY format
# NOTE(review): save_nodesnedges_h5py() is defined in a later cell of this
# notebook, so that cell has to be executed before this one.
h5file = "data_100mol_test.h5"
save_nodesnedges_h5py(molecules_labels, parameters, data_set_name=h5file)

In [None]:
def save_nodesnedges_h5py(molecules_info, mol_params, data_set_name="data_metadata.h5"):
    """
    Compute the node features, the edge list and the edge features of
    every molecule and store them in one H5PY file (opened in 'w' mode).
    The entries of mol_params are written once as file attributes.
    Each molecule gets a group "sample_<hash>" with the gzip-compressed
    datasets "Node_features", "Edge_list" and "Edge_features" and the
    attributes "name" and "hash".
    Args:
    molecules_info (np.array) contains a hash (element 0) and
    the name of each molecule (element 1, used as pdb file path).
    mol_params (dictionary) are the global parameters
    for that set of molecules in molecules_info. The keys 'aa_1',
    'aa_2', 'backbone_1', 'backbone_2', 'sp2_1', 'sp2_2' and 'cols'
    are read here.
    data_set_name (str, optional) is the name of the H5PY file to be
    created. By default that file is called "data_metadata.h5".
    """
    with h5py.File(data_set_name, 'w') as h5_out:
        # global parameters, stored a single time for the whole file
        for param_name, param_value in mol_params.items():
            h5_out.attrs[param_name] = param_value

        for molecule in molecules_info:
            mol_hash, pdb_path = molecule[0], molecule[1]

            # array 1: node features
            node_frame = featuring_pi_stack(
                pdb_path,
                mol_params['aa_1'],
                mol_params['aa_2'],
                mol_params['backbone_1'],
                mol_params['backbone_2'],
                mol_params['sp2_1'],
                mol_params['sp2_2'],
                mol_params['cols'])
            node_feats = np.array(node_frame)

            # array 2: edge list, from the structure read from the pdb file
            atoms = read(pdb_path, format="proteindatabank")
            edge_list, pairwise_feat = default_edge_feature(atoms)

            # array 3: edge features, from the molecule loaded with RdKit
            mol = Chem.MolFromPDBFile(pdb_path, removeHs=False)
            edge_features = stack_categorical_edge_feats(mol, pairwise_feat, edge_list)

            # one group per molecule; arrays are compressed to reduce file size
            mol_group = h5_out.create_group(f"sample_{mol_hash}")
            for dataset_name, array in (("Node_features", node_feats),
                                        ("Edge_list", edge_list),
                                        ("Edge_features", edge_features)):
                mol_group.create_dataset(dataset_name, data=array, compression="gzip")

            # individual attributes of the molecule
            mol_group.attrs["name"] = pdb_path
            mol_group.attrs["hash"] = mol_hash

In [None]:
def featuring_pi_stack(pdb_file, aa1_code, aa2_code, feat1_aa1,
                       feat1_aa2, feat2_aa1, feat2_aa2, cols):
    """
    Extract node features, strictly based on the geometrical
     representation of the molecule.
    The pdb file is loaded with pandas_molecule() and then passed, in
     this order, through atom_categorical_feat(), res_categorical_feat(),
     chemgroup_categorical_feat(), hybridtype_categorical_feat() and
     cross_feat_pi_stack(); drop_off_corecolumns() produces the result.
    Args:
    pdb_file is the pdb file
    aa1_code and aa2_code are 3-letter code strings representing
     the name of the target amino acid, e.g.: 'PHE', 'GLY'...
    feat1_aa1 and feat1_aa2 are atom positions of the first type
     of feature defined by user, e.g.: backbone position atoms.
     They are given to chemgroup_categorical_feat().
    feat2_aa1 and feat2_aa2 are atom positions of the second type
     of feature defined by user, e.g.: sp2 position atoms.
     They are given to hybridtype_categorical_feat().
    cols is an array with the name of the columns of each pdb_file
     since this part of code use pandas; it is given to
     drop_off_corecolumns().
    """
    pandas_mol = pandas_molecule(pdb_file)
    with_atoms = atom_categorical_feat(pandas_mol)
    with_residues = res_categorical_feat(with_atoms)
    with_chemgroups = chemgroup_categorical_feat(
        with_residues, aa1_code, aa2_code, feat1_aa1, feat1_aa2)
    with_hybridtypes = hybridtype_categorical_feat(
        with_chemgroups, aa1_code, aa2_code, feat2_aa1, feat2_aa2)
    node_feats = cross_feat_pi_stack(with_hybridtypes)
    return drop_off_corecolumns(node_feats, cols)

In [None]:
# NOTE(review): this cell repeats the body of save_nodesnedges_h5py() at top
# level. data_set_name, mol_params and molecules_info are parameters of that
# function and are not assigned by any cell shown here, so running the cell
# as it stands presumably fails with NameError - confirm whether it can be
# removed.
with h5py.File(data_set_name, 'w') as f:
        # Save the global parameters as attributes once for the file
        for key, value in mol_params.items():
            f.attrs[key] = value

        # Get node/edge features for the list of molecules
        for molecule in molecules_info:

            # Get array 1: node features
            node_feats = np.array(
                featuring_pi_stack(
                    molecule[1],
                    mol_params['aa_1'],
                    mol_params['aa_2'],
                    mol_params['backbone_1'],
                    mol_params['backbone_2'],
                    mol_params['sp2_1'],
                    mol_params['sp2_2'],
                    mol_params['cols'])
            )
            # Load structural information
            atoms = read(molecule[1], format="proteindatabank")
            # Get array 2: edge list
            edge_list, pairwise_feat = default_edge_feature(atoms)

            # Load molecule using RdKit
            mol = Chem.MolFromPDBFile(molecule[1], removeHs=False)
            # Get array 3: edge features
            edge_features = stack_categorical_edge_feats(mol, pairwise_feat, edge_list)
            
            # Create a group for the sample molecule
            mol_group = f.create_group(f"sample_{molecule[0]}")
            
            # Save arrays (with compression to reduce file size)
            mol_group.create_dataset("Node_features", data=node_feats, compression="gzip")
            mol_group.create_dataset("Edge_list", data=edge_list, compression="gzip")
            mol_group.create_dataset("Edge_features", data=edge_features, compression="gzip")
            
            # Save individual attributes
            mol_group.attrs["name"] = molecule[1]
            mol_group.attrs["hash"] = molecule[0]

In [None]:
## test purposes
# Largest matrix element of each sample in positions 10..57 of the
# key order, then the extremes of those maxima.
tmp_keys = list(heatmap.keys())[10:58] #from 3.5 to 9.0 A
rel_list = [heatmap[ii].max().max() for ii in tmp_keys]
relmax = max(rel_list)
relmin = min(rel_list)
relmin, relmax

##### Plot electron transition density atomic contribution (ETDAC) matrices

In [None]:
# Min-max scaled ETDAC matrix of sample 'f13', drawn on a 0..1 colour scale
fig, ax = plt.subplots(figsize=(17,10))
f13_matrix = heatmap['f13']
f13_lowest = f13_matrix.min().min()
f13_highest = f13_matrix.max().max()
test = (f13_matrix - f13_lowest)/(f13_highest - f13_lowest)
sns.heatmap(test, annot=False, cmap='Oranges', vmin=0, vmax=1, ax=ax)

#### Definition of amino acids using range of atoms
##### Example is the same Phe --- Tyr

##### Set atoms

In [None]:
# Atom-label boundaries of the two residues. They are compared against the
# row/column labels of the ETDAC matrices when the inter-residue terms are
# summed: labels <= atomAf on one side, labels >= atomBi on the other.
# NOTE(review): presumably residue A spans atomAi..atomAf and residue B spans
# atomBi..atomBf; only atomAf and atomBi are read by the cells below - confirm.
atomAi = 0
atomAf = 22
atomBi = 23
atomBf = 46

In [None]:
atomAi, atomAf, atomBi, atomBf

#### Delimitation of the atoms of the aromatic rings

In [None]:
# Atom-label boundaries used to select the aromatic-ring atoms in the
# ETDAC matrices: atomFi..atomFf is one inclusive range, atomYi..atomYf is
# another inclusive range, and atomYf2 is one extra single label added to
# the second selection.
# NOTE(review): F and Y presumably stand for the Phe and Tyr rings - confirm.
atomFi = 6
atomFf = 11
atomYi = 29
atomYf = 33
atomYf2 = 35

#### Calculating the 4 (more) terms of the transition intensities

In [None]:
# Transition-intensity terms of every sample, one list entry per hash:
#  inter_fosce       sum of the two off-diagonal (inter-residue) blocks
#  intra_fosce       total minus the inter-residue part
#  all_fosce         sum of the whole ETDAC matrix
#  FYpi_inter_fosce  block with rows in the F ring and columns in the Y ring
#  YFpi_inter_fosce  block with rows in the Y ring and columns in the F ring
#  aropi_inter_fosce sum of the two ring-to-ring blocks
inter_fosce = []
all_fosce = []
FYpi_inter_fosce = []
YFpi_inter_fosce = []
keys = []


# The selectors do not depend on the sample, so they are defined once.
def _rows_after_a(i):
    """Row label placed after the last atom of residue A."""
    return i > atomAf


def _cols_before_b(i):
    """Column label placed before the first atom of residue B."""
    return i < atomBi


def _rows_up_to_a(i):
    """Row label up to (and including) the last atom of residue A."""
    return i <= atomAf


def _cols_from_b(i):
    """Column label from the first atom of residue B on."""
    return i >= atomBi


def _in_ring_f(i):
    """Label inside the inclusive range atomFi..atomFf."""
    return atomFi <= i <= atomFf


def _in_ring_y(i):
    """Label inside the inclusive range atomYi..atomYf, or equal to atomYf2."""
    return atomYi <= i <= atomYf or i == atomYf2


for key, etdac in heatmap.items():
    keys.append(key)
    inter_fosce.append(
        crop_heatmap_byatm(etdac, _rows_after_a, _cols_before_b).sum().sum() +
        crop_heatmap_byatm(etdac, _rows_up_to_a, _cols_from_b).sum().sum()
    )
    FYpi_inter_fosce.append(
        crop_heatmap_byatm(etdac, _in_ring_f, _in_ring_y).sum().sum()
    )
    YFpi_inter_fosce.append(
        crop_heatmap_byatm(etdac, _in_ring_y, _in_ring_f).sum().sum()
    )
    all_fosce.append(etdac.sum().sum())

# Derived lists are built once, after all samples have been processed
aropi_inter_fosce = [fy + yf for fy, yf in zip(FYpi_inter_fosce, YFpi_inter_fosce)]
intra_fosce = [total - inter for total, inter in zip(all_fosce, inter_fosce)]

In [None]:
max(inter_fosce), max(intra_fosce), max(all_fosce)

In [None]:
# Summary table with one row per sample. The three *_pi columns are divided
# by the largest inter-residue intensity; 'abs_pi_pi' keeps the raw value.
# The maximum is computed once instead of once per element; the default only
# matters when there are no samples, in which case every column is empty.
inter_max = max(inter_fosce, default=float('nan'))
dfftotal_fosce = pd.DataFrame({
    'hash': list(keys),
    'inter_fosce': list(inter_fosce),
    'intra_fosce': list(intra_fosce),
    'all_fosce': list(all_fosce),
    'FY_pi': [value/inter_max for value in FYpi_inter_fosce],
    'YF_pi': [value/inter_max for value in YFpi_inter_fosce],
    'pi_pi': [value/inter_max for value in aropi_inter_fosce],
    'abs_pi_pi': list(aropi_inter_fosce),
})

In [None]:
dfftotal_fosce

#### Data to be saved

In [None]:
# Line plot of the intensity terms for the rows labelled 10 to 55
# (label-based slice, both ends included) against the sample hash.
plt.rc('font', size=26)
plotted_terms = ["inter_fosce","intra_fosce","abs_pi_pi", "all_fosce"]
plotted_rows = dfftotal_fosce.loc[10:55,:]
ax = plotted_rows.plot(x="hash", y=plotted_terms, kind="line", figsize=(16, 12))
ax.set_xlabel('Sample unique identificator')
ax.set_ylabel('Transition intensity')

plt.show()

#### That's it :)