In [184]:
import pandas as pd
import oracledb
import xml.etree.ElementTree as ET
import xmltodict
from deepdiff import DeepDiff
import json

In [185]:
un = 'interpro'
cs = 'ora-dlvm-119.ebi.ac.uk:1521/IPREAD'
pw = "olymp"

db = oracledb.connect(user=un, password=pw, dsn=cs)
cursor = db.cursor()

In [186]:
# Parse the XML file
tree = ET.parse("match_test.xml")
root = tree.getroot()

# Find all <lcn> elements and remove the 'representative' attribute if present
for lcn in root.findall('.//lcn'):
    if 'representative' in lcn.attrib:
        del lcn.attrib['representative']

# Find all <ipr> elements and remove them
# Iterate over a copy of the list of <ipr> elements to avoid modifying the list while iterating
for ipr in root.findall('.//ipr'):
    # Get the parent of <ipr> by iterating over the tree
    for parent in root.iter():
        if ipr in parent:
            parent.remove(ipr)
            break  # Exit the loop once the <ipr> element is removed
        
# Write the modified tree back to the XML file (or a new file)
tree.write("processed.xml", encoding="utf-8", xml_declaration=True)

In [187]:
tree = ET.parse("processed.xml") 
root = tree.getroot()

# Extract protein IDs
protein_ids = [protein.get("id") for protein in root.findall('protein')]
proteins_sql = ["\'" + protein  + "\'" for protein in protein_ids]

In [188]:
len(protein_ids)

996

In [189]:
sql = f"""

WITH 

proteins AS (
    SELECT PROTEIN_AC, NAME, DBCODE, CRC64, LEN,
           TO_CHAR(TIMESTAMP, 'YYYY-MM-DD') AS TIMESTAMP,
           FRAGMENT, TO_CHAR(TAX_ID) AS TAX_ID
    FROM INTERPRO.PROTEIN
    WHERE PROTEIN_AC IN ({','.join(proteins_sql)})
    ORDER BY PROTEIN_AC
),

matches AS (
    -- Limit the number of rows from the MATCH and FEATURE_MATCH tables
    SELECT PROTEIN_AC, METHOD_AC, MODEL_AC, POS_FROM, POS_TO, FRAGMENTS, SCORE, DBCODE, EVIDENCE, STATUS
    FROM INTERPRO.MATCH
)

SELECT P.PROTEIN_AC, P.NAME, M.DBCODE, P.CRC64, P.LEN, P.TIMESTAMP, P.FRAGMENT, P.TAX_ID,
       M.METHOD_AC, M.MODEL_AC, M.POS_FROM, M.POS_TO, M.FRAGMENTS, M.SCORE, MN.DESCRIPTION, M.STATUS,
       DB.DBSHORT, CE.ABBREV, CT.ABBREV

FROM proteins P
LEFT OUTER JOIN matches M
ON P.PROTEIN_AC = M.PROTEIN_AC

LEFT OUTER JOIN INTERPRO.METHOD MN
ON M.METHOD_AC = MN.METHOD_AC

LEFT OUTER JOIN INTERPRO.CV_DATABASE DB
ON M.DBCODE = DB.DBCODE

LEFT OUTER JOIN CV_EVIDENCE CE
ON M.EVIDENCE = CE.CODE

LEFT OUTER JOIN CV_ENTRY_TYPE CT
ON MN.SIG_TYPE = CT.CODE

ORDER BY P.PROTEIN_AC

"""


In [190]:
protein_data = cursor.execute(sql)
protein_data = [[str(value) if value is not None else '' for value in row] for row in protein_data]

In [191]:
len(protein_data)

6124

In [192]:
def create_xml(protein_data):

    # Convert the protein_data list into a pandas DataFrame
    df = pd.DataFrame(protein_data, columns=[
        'protein_id', 'name', 'dbcode', 'crc64', 'length', 'timestamp', 'fragment', 'tax_id',
        'method_ac', 'model_ac', 'pos_from', 'pos_to', 'fragments', 'score', 'method_desc', 'status', 'dbname', 'evd', 'sig_type'
    ])
    
    # Group by protein_id and method_ac, then create a nested dictionary
    grouped = {}

    # Iterate over each row and populate the nested dictionary
    for _, row in df.iterrows():

        protein_id = row['protein_id']

        if (not(protein_id in grouped.keys())):
            grouped[protein_id] = {
                "info": {
                "id": row["protein_id"],
                "name": row["name"],
                "length": row["length"],
                "crc64": row["crc64"],
                }
        }
            
    
        match_id = row['method_ac']
        match_id = match_id + "||" + row["model_ac"] if row["model_ac"] else match_id
        location = {
            'start': row['pos_from'],
            'end': row['pos_to'],
            'fragments': row['fragments'],
            'score': int(float(row["score"])) if row["score"][-2:] == ".0" else row["score"]
        }

        if (match_id != ""):
            # Add the match_id and its location under the protein_id
            if (not(match_id in grouped[protein_id].keys())):
                grouped[protein_id][match_id] = {
                    "id": match_id,
                    "name": row["method_desc"],
                    "dbname": row["dbname"],
                    "status": row["status"],
                    "model": row["model_ac"],
                    "evd": row["evd"], 
                    "type": row["sig_type"],
                    "locations": [location]
                }
            else:
                grouped[protein_id][match_id]["locations"].append(location)

    # Create the root element for XML
    root = ET.Element("proteins")

    # Iterate through the grouped data to create XML structure
    for protein_id, protein_data in grouped.items():
        # Extract the info for the protein
        info = protein_data["info"]
        
        # Create a protein element
        protein_elem = ET.SubElement(root, "protein", 
                                     id=info["id"], 
                                     name=info["name"], 
                                     length=str(info["length"]), 
                                     crc64=info["crc64"])
        
        # Iterate over matches under this protein
        for match_id, match_data in protein_data.items():
            if match_id == "info":
                continue  # Skip the info entry
            
            # Create a match element under the protein
            match_elem = ET.SubElement(protein_elem, "match", 
                                       id=match_data["id"].split("||")[0], 
                                       name=match_data["name"], 
                                       dbname=match_data["dbname"],
                                       status=match_data["status"], 
                                       model=match_data["model"],
                                       type=match_data["type"],
                                       evd=match_data["evd"])
            
            print(match_elem.attrib)
            # Create lcn elements for each location in the match
            for loc in match_data["locations"]:
                
                frag_str = ""

                if (loc["fragments"]):
                    frag_list = sorted(loc['fragments'].split(","), key=lambda x: (x.split("-")[2]))
                    frag_list = sorted(loc['fragments'].split(","), key=lambda x: (int(x.split("-")[0])))
                    frag_str = ','.join(frag_list)
                else:
                     frag_str = '-'.join([loc["start"], loc["end"], "S"])

                lcn_elem = ET.SubElement(match_elem, "lcn", 
                                         start=str(loc['start']), 
                                         end=str(loc['end']),
                                         fragments=frag_str,
                                         score=str(loc['score']))
    
    # Create an XML tree and write it to a file
    tree = ET.ElementTree(root)
    tree.write("output.xml", encoding="utf-8", xml_declaration=True)

    return grouped

def standardize():

    # Load the XML file
    tree = ET.parse("output.xml")
    root = tree.getroot()

    # Iterate over each protein and sort its matches
    for protein in root.findall('protein'):
        # Get all match elements
        matches = list(protein.findall('match'))

        # Clear the original match elements from the protein
        for match in matches:

            locations = list(match.findall('lcn'))
            sorted_lcsn = sorted(locations, key=lambda x: int(x.get('start')))

            # Clear the original match elements from the protein
            for lcn in locations:
                match.remove(lcn)

            # Append the sorted match elements back to the protein
            for lcn in sorted_lcsn:
                match.append(lcn)


        # Sort matches by the 'score' attribute (convert score to integer for sorting)

        matches = sorted(matches, key=lambda x: int(x.find("lcn").get("start")))
        sorted_matches = sorted(matches, key=lambda x: x.get('id'))

        for match in matches:
            protein.remove(match)
        
        # Append the sorted match elements back to the protein
        for match in sorted_matches:
            protein.append(match)


    # Save the modified XML back to a file
    tree.write("output.xml", encoding='utf-8', xml_declaration=True)

def get_differences(new_path, original_path):
    new = xmltodict.parse(open(new_path, "r").read(), attr_prefix = "", process_namespaces=True)
    original = xmltodict.parse(open(original_path, "r").read(), attr_prefix = "", process_namespaces=True)
    return new, original, DeepDiff(new, original)

In [193]:
xml_data = create_xml(protein_data)

{'id': 'G3DSA:3.40.640.10', 'name': 'Type I PLP-dependent aspartate aminotransferase-like (Major domain)', 'dbname': 'CATHGENE3D', 'status': 'T', 'model': '3a2bA02', 'type': 'Homologous_superfamily', 'evd': 'HMMPfam'}
{'id': 'G3DSA:3.90.1150.10', 'name': 'Aspartate Aminotransferase, domain 1', 'dbname': 'CATHGENE3D', 'status': 'T', 'model': '3a2bA01', 'type': 'Homologous_superfamily', 'evd': 'HMMPfam'}
{'id': 'TIGR01821', 'name': '5-aminolevulinate synthase', 'dbname': 'NCBIFAM', 'status': 'T', 'model': 'TIGR01821', 'type': 'Family', 'evd': 'HMMPfam'}
{'id': 'PF00155', 'name': 'Aminotransferase class I and II', 'dbname': 'PFAM', 'status': 'T', 'model': 'PF00155', 'type': 'Domain', 'evd': 'HMMPfam'}
{'id': 'SSF53383', 'name': 'PLP-dependent transferases', 'dbname': 'SSF', 'status': 'T', 'model': '0046747', 'type': 'Homologous_superfamily', 'evd': 'HMMPfam'}
{'id': 'cd06454', 'name': '', 'dbname': 'CDD', 'status': 'T', 'model': 'cd06454', 'type': 'Domain', 'evd': 'RPS-BLAST'}
{'id': 'PTH

In [194]:
standardize()

In [199]:
n, o, diffs = get_differences("processed.xml", "output.xml")

In [200]:
diffs

{}

In [197]:
open("differences.json", "w").write(json.dumps(d))

2