In [None]:
# Inject CSS so the notebook container spans 95% of the browser width.
from IPython.core.display import HTML
display(HTML("<style>.container { width:95% !important; }</style>"))   
# Load the autoreload extension and select mode 1.
# NOTE(review): no %aimport registrations are visible in this notebook;
# confirm that mode 1 actually reloads the intended modules.
%load_ext autoreload
%autoreload 1

In [None]:
from ercollect import molecule as mol
from ercollect.molecule import molecule
from ercollect import rxn_syst
from ercollect.rxn_syst import reaction, get_RS
import numpy as np
import random
from rdkit.Chem import Draw
from rdkit.Chem import AllChem as Chem
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem.Draw.MolDrawing import MolDrawing, DrawingOptions
from IPython.display import clear_output
from ercollect import SABIO_IO

Author: Andrew Tarzia

Date Created: 15 Nov 2018

Distributed under the terms of the MIT License.

# Notebook to clean up collected SABIO entries
This notebook contains some fixes for problems that occurred due to the SABIO database and could not be integrated into the actual code base.

# PROBLEM 1:
## 15/11/18
- found that SABIO does not report different UniProt accession numbers for mutants of a sequence.
- as it is not clear which sequence should be used in those cases, I have decided to throw out those reactions

## Step 1:
Determine how many of existing RS have a mutant sequence and set rs.etype attribute

In [None]:
# Directory of reaction-system (RS) files that the following cells read and
# save back into; the commented line is the alternative result set.
rs_dir = '/home/atarzia/psp/screening_results/new_reactions_sabio_wcharge/'
# rs_dir = '/home/atarzia/psp/screening_results/biomin_search_sabio_wcharge/'

In [None]:
# Tally how many reaction systems (RS) in rs_dir belong to a non-wildtype
# enzyme, annotating each RS with an `etype` attribute along the way.
#
# RS that already carry an `etype` are counted from the stored value instead of
# being skipped, so re-running this cell reports the same totals rather than
# only the RS annotated during the current run. SABIO is queried (and the RS
# re-saved) only when the annotation is missing.
count_mutant = 0
count_total = 0
for rs in rxn_syst.yield_rxn_syst(output_dir=rs_dir):
    etype = getattr(rs, 'etype', None)
    needs_annotation = etype is None
    if needs_annotation:
        result = SABIO_IO.get_rxnID_from_eID(eID=rs.DB_ID)
        # element 4 of the lookup result is what gets stored as the enzyme type
        etype = result[4]
        rs.etype = etype
    count_total += 1
    if 'wildtype' not in etype:
        print(rs.DB_ID, rs.pkl)
        print(etype)
        count_mutant += 1
    if needs_annotation:
        # persist the new attribute; already-annotated RS are left untouched
        rs.save_object(rs_dir+rs.pkl)

In [None]:
# Report mutant count, total count and the mutant percentage.
# count_total is 0 when the tally loop counted nothing, so guard the division
# instead of raising ZeroDivisionError.
if count_total > 0:
    print(count_mutant, count_total, count_mutant/count_total * 100)
else:
    print(count_mutant, count_total, 'no reaction systems counted')

## Step 2:
For all RS with 'wildtype' not in the rs.etype attribute, set rs.skip_rxn = True and record the reason in rs.skip_reason

In [None]:
# Mark every RS whose enzyme type is not wildtype as skipped, recording why,
# and write every RS (flagged or not) back to rs_dir.
for rs in rxn_syst.yield_rxn_syst(output_dir=rs_dir):
    is_wildtype = 'wildtype' in rs.etype
    if not is_wildtype:
        print(rs.DB_ID, rs.pkl)
        setattr(rs, 'skip_rxn', True)
        setattr(rs, 'skip_reason', 'SABIO E-ID is for mutant')
    out_path = rs_dir+rs.pkl
    rs.save_object(out_path)

# PROBLEM 2:
## 15/11/18
- found inaccuracies in the API for modifiers of reactions:
    - in some cases the wrong ion is reported compared to the website, or an ion is reported where the website reports none
- as of 23/11/18, we are removing all 'modifier' compounds

## Step 1:
Determine the molecules present in modifier roles

In [None]:
# Directory of reaction-system (RS) files used by the PROBLEM 2 cells below;
# the commented line is the alternative result set.
# rs_dir = '/home/atarzia/psp/screening_results/new_reactions_sabio_wcharge/'
rs_dir = '/home/atarzia/psp/screening_results/biomin_search_sabio_wcharge/'

In [None]:
# Collect every component whose role contains 'modifier' across all
# non-skipped RS in rs_dir.
#   mod_list  - (DB_ID, RS pkl, molecule name, role) per modifier found
#   mod_names - the molecule names only (duplicates kept)
mod_list = []
mod_names = []
for rs in rxn_syst.yield_rxn_syst(output_dir=rs_dir):
    if rs.skip_rxn is True:
        continue
    # components can be None (the PROBLEM 3 search guards for this too);
    # iterating None would raise TypeError and abort the scan
    if rs.components is None:
        continue
    for m in rs.components:
        if 'modifier' in m.role:  # m.role != 'reactant' and m.role != 'product':
            print(rs.DB_ID, rs.pkl, m.name, m.role)
            mod_list.append((rs.DB_ID, rs.pkl, m.name, m.role))
            mod_names.append(m.name)

In [None]:
# Show how many distinct modifier names were found, then the names themselves.
unique_mod_names = list(set(mod_names))
print(len(unique_mod_names))
print(unique_mod_names)

## Step 2:
- Remove components from RS if:
    - role has: 'inhibitor', 'activator' or 'unknown'
    - UPDATE (23/11/18): role has 'modifier' in it.
- save RS  

In [None]:
# Drop every component whose role contains 'modifier' from each non-skipped RS
# and save the RS back to rs_dir.
for rs in rxn_syst.yield_rxn_syst(output_dir=rs_dir):
    if rs.skip_rxn is True:
        continue
    # components can be None (the PROBLEM 3 search guards for this too);
    # there is nothing to filter, and iterating None would raise TypeError
    if rs.components is None:
        continue
    kept = []
    for m in rs.components:
        if 'modifier' in m.role:
            # report what is being removed
            print(rs.DB_ID, rs.pkl, m.name, m.role)
        else:
            kept.append(m)
    rs.components = kept
    rs.save_object(rs_dir+rs.pkl)

## Step 3:
- Set all 
    - rs.max_comp_size = None
    - rs.all_fit = None
    - rs.delta_comp = None
    - rs.delta_sa = None
    - rs.max_XlogP = None
    - rs.min_XlogP = None
    - rs.max_logP = None
    - rs.min_logP = None
    - rs.p_max_comp = None
    - rs.r_max_comp = None
    - rs.p_max_sa = None
    - rs.r_max_sa = None
- rerun rxn_syst.py analysis (external)

In [None]:
# Attributes that are cleared on every RS, listed in the order they are set.
reset_attrs = (
    'max_comp_size', 'all_fit', 'delta_comp', 'delta_sa',
    'max_logP', 'max_XlogP', 'min_logP', 'min_XlogP',
    'p_max_comp', 'p_max_sa', 'r_max_comp', 'r_max_sa',
)
# Set each listed attribute to None on every RS in rs_dir and save it back.
for rs in rxn_syst.yield_rxn_syst(output_dir=rs_dir):
    for attr_name in reset_attrs:
        setattr(rs, attr_name, None)
    rs.save_object(rs_dir+rs.pkl)

## Step 4:
- Delete all reaction files with skip_rxn = True
    - this was done because I could not guarantee that the skips were caused by 'modifiers' or not
- rerun rxn_syst.py collection (external)

In [None]:
import os

In [None]:
# Delete the file of every RS in rs_dir that is flagged skip_rxn = True.
# os.remove is used instead of shelling out to `rm` with a concatenated
# command string, so the path is never interpreted by a shell.
for rs in rxn_syst.yield_rxn_syst(output_dir=rs_dir):
    if rs.skip_rxn is True:
        print(rs.pkl)
        try:
            os.remove(rs_dir+rs.pkl)
        except FileNotFoundError:
            # `rm` only reported a missing file and carried on; do the same
            print('file not found:', rs_dir+rs.pkl)

# PROBLEM 3:
## 24/11/18
- Needed to manually set up the oxidized ABTS molecule as the conversion from InChI to RDKit does not behave well
- Having done that to the molecule pkl file, I now need to replace this component with the new pkl for all RS that contain it
    - the simplest way to do this would be to find all RS with this component and set the attributes of this component to None and rs.mol_collected to False so that when the analysis is done, the molecule is replaced

## Step 1:
Find all RS with this component

In [None]:
# Directory of reaction-system (RS) files used by the PROBLEM 3 cells below;
# the commented lines are the alternative result sets.
rs_dir = '/home/atarzia/psp/screening_results/new_reactions_sabio_wcharge/'
# rs_dir = '/home/atarzia/psp/screening_results/biomin_search_sabio_wcharge/'
# rs_dir = '/home/atarzia/psp/screening_results/new_reactions_kegg_atlas/'

In [None]:
# Identifiers of the oxidized ABTS molecule: its name and its molecule-DB pkl.
abts_name = "Oxidized 2,2'-azino-bis(3-ethylbenzthiazoline-6-sulfonic acid)"
abts_pkl = "/home/atarzia/psp/molecule_DBs/atarzia/ATRS_5399.gpkl"

# Record the pkl of every RS that has at least one component matching either
# identifier; only the first matching component of each RS is reported.
pkls_to_mod = []
for rs in rxn_syst.yield_rxn_syst(output_dir=rs_dir):
    if rs.components is None:
        continue
    first_hit = next(
        (c for c in rs.components if c.name == abts_name or c.pkl == abts_pkl),
        None,
    )
    if first_hit is not None:
        print(first_hit.name, '------', first_hit.pkl)
        pkls_to_mod.append(rs.pkl)

In [None]:
# Display the RS pkl names collected by the search above.
pkls_to_mod

## Step 2:
For all RS collected:
- replace the target component with the one manually updated from the molecule DB using the existing search function

In [None]:
from ercollect.molecule import molecule, read_molecule_lookup_file

In [None]:
# RS pkls already processed by the replacement loop below; kept in its own cell
# so that re-running the loop skips entries finished earlier.
pkls_done = []

In [None]:
# For every RS listed in pkls_to_mod (and not yet in pkls_done), replace the
# oxidized ABTS component with the molecule returned by the molecule-DB lookup,
# then save the RS.
#
# The component list is rebuilt and assigned back to rs.components: rebinding
# the loop variable alone (`m = new_m`) never changes the list, so the RS would
# otherwise be saved with the old component whose SMILES/InChi were just set
# to None.
abts_name = "Oxidized 2,2'-azino-bis(3-ethylbenzthiazoline-6-sulfonic acid)"
lookup_file = '/home/atarzia/psp/molecule_DBs/atarzia/lookup.txt'
N = len(pkls_to_mod)
for i, PKL in enumerate(pkls_to_mod):
    if PKL in pkls_done:
        continue
    print(i, '======', N)
    rs = get_RS(rs_dir+PKL, output_dir=rs_dir, verbose=True)
    updated_components = []
    for m in rs.components:
        if m.name == abts_name:
            print(m.SMILES, m.InChi)
            # clear the structure identifiers before calling get_compound
            m.SMILES = None
            m.InChi = None
            molecule_dataset = read_molecule_lookup_file(lookup_file=lookup_file)
            new_m = m.get_compound(dataset=molecule_dataset)
            print(new_m.SMILES, new_m.InChi)
            # NOTE(review): assumes new_m carries the right `role` for this
            # RS - confirm against molecule.get_compound
            m = new_m
        updated_components.append(m)
    rs.components = updated_components
    rs.save_object(rs_dir+rs.pkl)
    pkls_done.append(PKL)