In [84]:
import os
import xml.etree.ElementTree as ET
from collections import defaultdict
from getpass import getpass

import oracledb
import pandas as pd


# Connection settings are read from the environment so that no credential is
# stored in the notebook. User and DSN fall back to the values that used to be
# hard-coded here; the password has no fallback value on purpose.
un = os.environ.get('INTERPRO_DB_USER', 'interpro')
cs = os.environ.get('INTERPRO_DB_DSN', 'ora-dlvm-119.ebi.ac.uk:1521/IPREAD')
# Prompt only when INTERPRO_DB_PASSWORD is unset or empty; the prompt needs an
# interactive kernel, so export the variable for unattended runs.
pw = os.environ.get('INTERPRO_DB_PASSWORD') or getpass(f'Password for {un}@{cs}: ')

# Module-level connection and cursor shared by the query cells.
db = oracledb.connect(user=un, password=pw, dsn=cs)
cursor = db.cursor()

In [85]:
# Accession of the protein to export. It is passed to Oracle as a bind
# variable instead of being spliced into the SQL text.
PROTEIN_AC = 'A0A009GKR3'

sql = """WITH limited_protein AS (
    -- Restrict the PROTEIN table to the requested accession
    SELECT PROTEIN_AC, NAME, DBCODE, CRC64, LEN,
           TO_CHAR(TIMESTAMP, 'YYYY-MM-DD') AS TIMESTAMP,
           FRAGMENT, TO_CHAR(TAX_ID) AS TAX_ID
    FROM INTERPRO.PROTEIN
    WHERE PROTEIN_AC = :protein_ac
    ORDER BY PROTEIN_AC
),
limited_matches AS (
    -- Columns needed from the MATCH table, filtered by the join below
    SELECT PROTEIN_AC, METHOD_AC, MODEL_AC, POS_FROM, POS_TO, FRAGMENTS, SCORE
    FROM INTERPRO.MATCH
)
SELECT P.PROTEIN_AC, P.NAME, P.DBCODE, P.CRC64, P.LEN, P.TIMESTAMP, P.FRAGMENT, P.TAX_ID,
       M.METHOD_AC, M.MODEL_AC, M.POS_FROM, M.POS_TO, M.FRAGMENTS, M.SCORE, MN.NAME
FROM limited_protein P
INNER JOIN limited_matches M
ON P.PROTEIN_AC = M.PROTEIN_AC
INNER JOIN INTERPRO.METHOD MN
ON M.METHOD_AC = MN.METHOD_AC
ORDER BY P.PROTEIN_AC
"""

# Run the query, then materialise every row as a list of strings: NULLs become
# '' and every other value goes through str(), so the rows can be written
# straight into XML attributes later on.
cursor.execute(sql, {"protein_ac": PROTEIN_AC})
protein_data = [
    ['' if value is None else str(value) for value in row]
    for row in cursor
]

In [121]:
def create_xml(protein_data, output_path="output.xml"):
    """Group protein/match rows and serialise them as a ``<proteins>`` document.

    Rows are grouped by protein accession and then by signature (method)
    accession. Every row contributes one ``<lcn>`` element to its ``<match>``.
    Proteins and matches appear in order of first occurrence in the input.

    Parameters
    ----------
    protein_data : list of rows
        Each row holds 15 values in the order given by ``columns`` below.
        Values that are written without ``str()`` (ids, names, crc64, dbcode,
        model) must already be strings, otherwise serialisation fails.
    output_path : str or path-like, default "output.xml"
        File the document is written to (UTF-8, with an XML declaration).

    Returns
    -------
    str
        The serialised document, without the XML declaration.

    Side effects
    ------------
    Writes ``output_path`` and prints the intermediate grouped dictionary.
    """
    columns = [
        'protein_id', 'name', 'dbcode', 'crc64', 'length', 'timestamp', 'fragment', 'tax_id',
        'method_ac', 'model_ac', 'pos_from', 'pos_to', 'fragments', 'score', 'method_mn'
    ]
    # The DataFrame only attaches column names (and rejects rows of the wrong
    # width); plain record dicts avoid building a Series per row as
    # iterrows() does.
    records = pd.DataFrame(protein_data, columns=columns).to_dict(orient="records")

    # protein_id -> {"info": {...}, <method_ac>: {..., "locations": [...]}}
    grouped = {}
    for row in records:
        protein_id = row['protein_id']
        match_id = row['method_ac']
        location = {
            'start': row['pos_from'],
            'end': row['pos_to'],
            'fragments': row['fragments'],
            'score': row['score']
        }

        if protein_id not in grouped:
            grouped[protein_id] = {
                "info": {
                    "id": protein_id,
                    "name": row["name"],
                    "length": row["length"],
                    "crc64": row["crc64"],
                }
            }
        entry = grouped[protein_id]

        if match_id not in entry:
            entry[match_id] = {
                "id": match_id,
                "name": row["method_mn"],
                # NOTE(review): 'dbcode' sits among the protein columns of the
                # row, yet it ends up as the match's dbname attribute --
                # confirm this is the intended source database code.
                "dbcode": row["dbcode"],
                "model": row["model_ac"],
                "locations": [],
            }
        entry[match_id]["locations"].append(location)

    root = ET.Element("proteins")
    for entry in grouped.values():
        info = entry["info"]
        protein_elem = ET.SubElement(root, "protein",
                                     id=info["id"],
                                     name=info["name"],
                                     length=str(info["length"]),
                                     crc64=info["crc64"])

        for key, match in entry.items():
            if key == "info":
                continue  # protein metadata, not a match

            match_elem = ET.SubElement(protein_elem, "match",
                                       id=match["id"],
                                       name=match["name"],
                                       dbname=match["dbcode"],
                                       model=match["model"])

            for loc in match["locations"]:
                ET.SubElement(match_elem, "lcn",
                              start=str(loc['start']),
                              end=str(loc['end']),
                              fragments=str(loc['fragments']),
                              score=str(loc['score']))

    ET.ElementTree(root).write(output_path, encoding="utf-8", xml_declaration=True)

    # Kept so the calling cell still shows the grouped structure.
    print(grouped)

    return ET.tostring(root, encoding="unicode")



In [122]:
# Generate the XML report for the rows fetched above; called this way,
# create_xml writes the document to output.xml in the working directory.
xml_data = create_xml(protein_data)

{'A0A009GKR3': {'info': {'id': 'A0A009GKR3', 'name': 'A0A009GKR3_9GAMM', 'length': '289', 'crc64': '6EE55E7A2BDC8AD1'}, 'G3DSA:2.60.40.10': {'id': 'G3DSA:2.60.40.10', 'name': '', 'dbcode': 'T', 'model': '5irbA02', 'locations': [{'start': '56', 'end': '155', 'fragments': '56-155-S', 'score': '4.8e-06'}, {'start': '161', 'end': '260', 'fragments': '161-260-S', 'score': '2.9e-06'}]}, 'PS00018': {'id': 'PS00018', 'name': 'EF_HAND_1', 'dbcode': 'T', 'model': 'PS00018', 'locations': [{'start': '273', 'end': '285', 'fragments': '273-285-S', 'score': '0.0'}]}}}
