In [4]:
import os
import sys
import copy
import json
import argparse
import subprocess

import mysql.connector

from itertools import chain, combinations

In [5]:
sys.path.insert(0, "../")

In [6]:
from definitions import *
import api

In [7]:
CHESSDB_HOST = "localhost"
CHESSDB_USER = "chess_master"
CHESSDB_PASSWORD = "qwerty"
CHESSDB_PORT = "3306"
CHESSDB_NAME = "CHESS_DB"

chessApi = None
try:
    chessApi = api.CHESS_DB_API(CHESSDB_HOST, CHESSDB_USER, CHESSDB_PASSWORD, CHESSDB_NAME, CHESSDB_PORT)
    chessApi.connect()
except:
    print("Failed to connect to database")
    exit(1)

Connected to MySQL database


In [51]:
####################
# Functionality Under Test
####################


""""
*** Data from all sources are being: ordered by tid --> ordered by attribute name --> then order by priority listed in the data_sources list.
*** The first row for each attribute name in each tid grouping is selected as the row to keep

Hypothesis:
I don't believe the SQL query accounts for cases where there is an attribute in a rank that is not 1, and this attribute is not in the others
This line needs to be changed:  WHERE rn = 1 AND (source_name = 'CHESS.3.0' OR source_name = 'RefSeq' OR source_name = 'MANE')

***Update: I was wrong. The query does account for this. The query is selecting the first row for each attribute name in each attribute grouping.


Database to create for testing:
- Should contain data from three sources: MANE, RefSeq, and CHESS.3.0
- Should contain attributes that are in all three sources, and attributes that are in only one or two sources

Questions:
- Are SQL queries I run on the database making permanent changes to the database? (e.g. if I DELETE a table, will it be gone in subsequent tests?)
"""

# sourceId
# 1 --> MANE, 2 --> RefSeq, 3 --> GENCODE, 4 --> CHESS.3.0

# Attributes in source 1 and 2
common_attr_name_two_sources = '''SELECT COUNT(DISTINCT name)
FROM Attributes
WHERE sourceId = '1'
AND name IN (SELECT DISTINCT name FROM Attributes WHERE sourceId = '2')
'''
# Attributes in MANE
single_source_attr_names_MANE= '''SELECT DISTINCT name FROM Attributes WHERE sourceId = '1' '''
# Attributes in GENCODE
single_source_attr_names_GENCODE= '''SELECT DISTINCT name FROM Attributes WHERE sourceId = '3' '''


# Attributes in source 1 but not 2 
diff_attr_name_two_sources = '''SELECT DISTINCT A1.name
FROM Attributes A1
LEFT JOIN Attributes A2 ON A1.name = A2.name AND A2.sourceId = '4'
WHERE A1.sourceId = '2' AND A2.name IS NULL'''

tid_source_name_cds = '''SELECT tx.tid, s.name, tx.cds
FROM TxDBXREF tx
JOIN Sources s ON tx.sourceID = s.sourceID
WHERE s.name IN ('CHESS.3.0', 'MANE', 'RefSeq')'''


# attribute "anticodon" is uniquely in RefSeq but still in the GFF output
# --> attributes 
cds_output = chessApi.execute_query(tid_source_name_cds)
cds_dict = {}
for tid, source, cds in cds_output:
    cds_dict[(tid, source)] = cds

print(cds_dict)

{(1, 'CHESS.3.0'): '47393-48447,48940-49050,49129-49237,49501-49557', (2, 'CHESS.3.0'): '158699-158714,163308-163453,166787-166819,178933-179037,180236-180339,192842-192900,196637-196767,197616-197696,198047-198132,199202-199316,202880-202945,203098-203190,204564-204692,209971-210031,210386-210493,211133-211281', (3, 'CHESS.3.0'): '214629-214921,215429-215504,216486-216633,218886-218969,223440-223505,224084-224179,224924-224994,225089-225139,225337-225403,226801-226901,246324-246455,247849-247957,252539-252612,254273-254355,259180-259275,259682-259730,260186-260304,264026-264092,265303-265363,265457-265530,267966-268019', (4, 'CHESS.3.0'): '320048-320064,321662-321807,331668-331777,333007-333143,334742-335230,346295-347341,348065-348163,357400-357522,480707-480757,500508-500514', (5, 'CHESS.3.0'): '580409-580924', (6, 'CHESS.3.0'): '618001-618106,619213-619361,624865-625032,627097-627529,633298-633435,641327-641541,644910-645097,649898-649898', (7, 'CHESS.3.0'): '657743-657947,659641-6

In [33]:
####################
# Testing Functions
####################

# WORKS FOR OLD FUNCTION (before CHESS Database was written as the source name in the GTF/GFF)
# Check if the right data sources are in the file
def parse_gtf_and_check_data_sources(file_path, data_sources):
    sources_found = set()

    with open(file_path, 'r') as file:
        for line in file:
            if line.startswith('#'):
                continue  # Skip comment lines
            columns = line.strip().split('\t')
            source_name = columns[1]
            sources_found.add(source_name)

    # Check if all expected sources are found and there are no extra sources
    all_sources_present = all(source in sources_found for source in data_sources)
    no_extra_sources = all(source in data_sources for source in sources_found)
    return all_sources_present and no_extra_sources, sources_found

# Check if attributes distinct to a source are in the file 
def find_missing_attributes_keys(source1_attr, source2_attr, file_path):
    # Convert attribute tuples to sets for easy comparison
    source1_set = set(attr[0] for attr in source1_attr)
    source2_set = set(attr[0] for attr in source2_attr)

    # Find attributes unique to GENCODE and MANE
    unique_to_source1 = source1_set - source2_set
    unique_to_source2 = source2_set - source1_set

    missing_attributes = set()

    # Parse the GTF file and check for missing attributes
    with open(file_path, 'r') as file:
        for line in file:
            if line.startswith('#'):
                continue  # Skip comment lines
            columns = line.strip().split('\t')
            attributes = columns[-1].split(';')

            for attribute in attributes:
                key_value = attribute.split(' ')[0]
                if key_value not in source1_set and key_value not in source2_set:
                    missing_attributes.add(key_value)

    
    return unique_to_source1, unique_to_source2, missing_attributes





In [None]:
####################
# Calling Tests
####################

####################
# CREATING THE GTF FILE
####################
# data_sources = ['GENCODE','MANE']
# keys = []
# values = []
# to_gtf_with_inputs(chessApi,data_sources,keys,values,"RefSeq","gtf","test1")


####################
# CHECKING THE OUTPUT
####################
correct_sources = ['MANE','GENCODE']
parse_gtf_and_check_data_sources("test1.gtf",correct_sources)

# Checking if all unique attribute keys of GENCODE and MANE are in the output
mane_attr = chessApi.execute_query(single_source_attr_names_MANE)
gencode_attr = chessApi.execute_query(single_source_attr_names_GENCODE)
find_missing_attributes_keys(gencode_attr,mane_attr,"test1.gtf")

# more tests...


In [9]:
# CALLING THE FUNCTION
data_sources = ['CHESS.3.0','RefSeq']
keys = []
values = []

to_gtf_with_inputs(chessApi,data_sources,keys,values,"GENBANK","gtf","another_ordered_list")

In [10]:
#####################################################
############  EXTRACT GTF WITH INPUTS    ############
#####################################################
from collections import defaultdict

"""
  @brief: Returns a GTF/GFF file from the database using the given inputs

  @param: self: CHESS_DB_API object
  @param: data_sources: list of data sources to extract from the database (in order of priority)
  @param: keys: list of attribute keys/names
  @param: values: list of attribute values
  @param: nomenclature: nomenclature of the sequenceID
  @param: output_file_type: output file type ("gff" or "gtf")
  @param: output_file_name: output file name
  @return: GTF/GFF file that meets input criteria
"""
def to_gtf_with_inputs(self,data_sources:list,keys:list, values:list, nomenclature:str, output_file_type: str, output_file_name:str):
    
    ####################
    # SQL Query
    ####################

    # Create SQL clause for data sources
    sources_condition = " OR ".join([f"source_name = '{source}'" for source in data_sources])
    
    # Create SQL clause for attributes
    where_clause = ""
    if keys and values:
        conditions = []
        for key, value in zip(keys, values):
            conditions.append(f"(name = '{key}' AND value = '{value}')")
        where_clause = "WHERE " + " OR ".join(conditions)

    # SQL query
    query = f'''SELECT na.*, t.sequenceID, t.start, t.end, t.strand, tx.score, t.exons, tx.cds, t.assemblyName
    FROM (
        SELECT *
        FROM (
            SELECT a.*, s.name AS source_name,
                ROW_NUMBER() OVER(PARTITION BY a.tid, a.name ORDER BY a.tid, a.name,
                    CASE {''.join([f"WHEN s.name = '{source}' THEN {index} " for index, source in enumerate(data_sources)])}
                    ELSE {len(data_sources)}  -- Place other sources at the end
                END) AS rn
            FROM Attributes a
            JOIN Sources s ON a.sourceID = s.sourceID
        ) AS ranked
        WHERE rn = 1 AND ({sources_condition})
    ) na 
    JOIN TxDBXREF tx ON na.tid = tx.tid AND na.sourceID = tx.sourceID AND na.transcript_id = tx.transcript_id 
    JOIN Transcripts t ON tx.tid = t.tid
    {where_clause}
    '''

    # Execute Query
    select_res = self.execute_query(query)
    
    
    # Find sequenceId for a given nomenclature
    # Execute the SQL query
    query = "SELECT assemblyName, sequenceID, nomenclature, alternativeID FROM SequenceIDMap"
    sid = chessApi.execute_query(query)

    # Create a dictionary mapping assemblyName, sequenceId, and nomenclature to alternativeID
    sequence_id_map = {}
    for row in sid:
        key = (row[0], row[1], row[2])  # Use assemblyName, sequenceID, and nomenclature as the key
        value = row[3]  # alternativeID as the value
        if key not in sequence_id_map:
            sequence_id_map[key] = []
        sequence_id_map[key].append(value)


    # Query to get cds
    tid_source_name_cds = f'''SELECT tx.tid, s.name, tx.cds
    FROM TxDBXREF tx
    JOIN Sources s ON tx.sourceID = s.sourceID
    WHERE s.name IN {str(tuple(data_sources))}'''


    # create a dictionary mapping a tid and source_name to the cds
    cds_output = chessApi.execute_query(tid_source_name_cds)
    cds_dict = {}
    for tid, source, cds in cds_output:
        cds_dict[(tid, source)] = cds


    # dictionary mapping transcript id (tid) to the row in the SQL query output
    transcript_data = defaultdict(list) # automatically initializes a list for each key
    for row in select_res:
        transcript_id = row[0]  # Assuming transcript_id is in the first position
        transcript_data[transcript_id].append(row)

    # Dictionary to store distinct sources contributing attributes to each tid
    distinct_sources = {}

    # Iterate through the dictionary
    for idx, values in transcript_data.items():
        sources_set = set()  # Create a set to collect distinct sources for each index
        for data_tuple in values:
            source = data_tuple[5]  # Assuming index 5 contains the source information
            sources_set.add(source)
        distinct_sources[idx] = list(sources_set) 

    # order the lists in the dictionary
    order_sources = {source: index for index, source in enumerate(data_sources)}
    def get_order(item):
        return order_sources.get(item[5], float('inf'))  # If source not found, put it at the end

    # Sort the lists of tuples in the dictionary based on the fifth element of each tuple
    transcript_data = {key: sorted(value, key=get_order) for key, value in transcript_data.items()}

    ####################
    # Create GFF/GTF file
    ####################

    # Writing the GTF/GFF file
    outfname = f"{output_file_name}.{output_file_type}"

    with open(outfname, "w") as outFP:
        # For each transcript id (actually tid) and all the rows associated with it
        for transcript_id, rows in transcript_data.items():
            output_str = ""
            # Construct transcript line
            attribute_list = [(row[3], row[4].replace("\t", "")) for row in rows if row[3] not in ["transcript_id", "transcriptId"]]
            # Custom attribute
            attribute_list.append(("CHESS Database Transcript Sources", ", ".join(distinct_sources[transcript_id])))

            if (output_file_type=="gtf"):
                attributes = "; ".join(f"{attr[0]} \"{attr[1]}\"" for attr in attribute_list)

            if(output_file_type=="gff"):
                attributes = ";".join(f"{attr[0]}={attr[1]}" for attr in attribute_list)

            # print(attributes + "\n")
            strand = "+" if rows[0][10] == 1 else "-"

            # find sequenceId (assemblyId, sequenceId, nomencalture --> alternativeID)
            sequence_id = sequence_id_map[(rows[0][14], rows[0][7], nomenclature)][0]

            # main line for constucting the gtf/gff file
            if (output_file_type=="gtf"):
                output_str = f"{sequence_id}\tCHESS Database\ttranscript\t{rows[0][8]}\t{rows[0][9]}\t.\t{strand}\t.\ttranscript_id \"{rows[0][2]}\"; {attributes}\n"

            if(output_file_type=="gff"):
                output_str = f"{sequence_id}\tCHESS Database\ttranscript\t{rows[0][8]}\t{rows[0][9]}\t.\t{strand}\t.\ttranscript_id={rows[0][2]};{attributes}\n"

            outFP.write(output_str)

            # construct exon lines
            output_str = ""

            # print exons
            if (rows[0][12] != None):
                for exon in rows[0][12].split(","):
                    exon_start, exon_end = [str(int(v)) for v in exon.split("-")]
                    output_str += sequence_id+"\t"+"CHESS Database"+"\texon\t"+exon_start+"\t"+exon_end+"\t.\t"+strand+"\t.\ttranscript_id \""+str(rows[0][2])+"\";\n"
                
            # print CDS
            cds_list = cds_dict[(rows[0][0], rows[0][5])]
            if (cds_list != None):
                for cds in cds_list.split(","):
                    cds_start, cds_end = [str(int(v)) for v in cds.split("-")]
                    output_str += sequence_id+"\t"+"CHESS Database"+"\tCDS\t"+cds_start+"\t"+cds_end+"\t.\t"+strand+"\t.\ttranscript_id \""+str(rows[0][2])+"\";\n"

            outFP.write(output_str)
    return




#####################################################
############  EXTRACT GTF WITH INPUTS    ############
#####################################################





In [53]:
#####################################################
############  EXTRACT GTF WITH INPUTS    ############
#####################################################
from collections import defaultdict

"""
  @brief: Returns a GTF/GFF file from the database using the given inputs

  @param: self: CHESS_DB_API object
  @param: data_sources: list of data sources to extract from the database (in order of priority)
  @param: keys: list of attribute keys/names
  @param: values: list of attribute values
  @param: nomenclature: nomenclature of the sequenceID
  @param: output_file_type: output file type ("gff" or "gtf")
  @param: output_file_name: output file name
  @return: GTF/GFF file that meets input criteria
"""
def old_to_gtf_with_inputs(self,data_sources:list,keys:list, values:list, nomenclature:str, output_file_type: str, output_file_name:str):
    
    ####################
    # SQL Query
    ####################

    # Create SQL clause for data sources
    sources_condition = " OR ".join([f"source_name = '{source}'" for source in data_sources])
    
    # Create SQL clause for attributes
    where_clause = ""
    if keys and values:
        conditions = []
        for key, value in zip(keys, values):
            conditions.append(f"(name = '{key}' AND value = '{value}')")
        where_clause = "WHERE " + " OR ".join(conditions)

    # SQL query
    query = f'''SELECT na.*, t.sequenceID, tx.start, tx.end, t.strand, tx.score, t.exons, tx.cds, t.assemblyName
    FROM (
        SELECT *
        FROM (
            SELECT a.*, s.name AS source_name,
                ROW_NUMBER() OVER(PARTITION BY a.tid, a.name ORDER BY a.tid, a.name,
                    CASE {''.join([f"WHEN s.name = '{source}' THEN {index} " for index, source in enumerate(data_sources)])}
                    ELSE {len(data_sources)}  -- Place other sources at the end
                END) AS rn
            FROM Attributes a
            JOIN Sources s ON a.sourceID = s.sourceID
        ) AS ranked
        WHERE rn = 1 AND ({sources_condition})
    ) na 
    JOIN TxDBXREF tx ON na.tid = tx.tid AND na.sourceID = tx.sourceID AND na.transcript_id = tx.transcript_id 
    JOIN Transcripts t ON tx.tid = t.tid
    {where_clause}
    '''

    # Execute Query
    select_res = self.execute_query(query)
    

    
    # Find sequenceId for a given nomenclature
    # Execute the SQL query
    query = "SELECT assemblyName, sequenceID, nomenclature, alternativeID FROM SequenceIDMap"
    sid = chessApi.execute_query(query)

    # Create a dictionary mapping assemblyName, sequenceId, and nomenclature to alternativeID
    sequence_id_map = {}
    for row in sid:
        key = (row[0], row[1], row[2])  # Use assemblyName, sequenceID, and nomenclature as the key
        value = row[3]  # alternativeID as the value
        if key not in sequence_id_map:
            sequence_id_map[key] = []
        sequence_id_map[key].append(value)


    ####################
    # Create GFF/GTF file
    ####################

    # dictionary mapping transcript id to the row in the SQL query output
    transcript_data = defaultdict(list) # automatically initializes a list for each key
    for row in select_res:
        transcript_id = row[0]  # Assuming transcript_id is in the first position
        transcript_data[transcript_id].append(row)

    # Writing the GTF/GFF file
    outfname = f"{output_file_name}.{output_file_type}"

    with open(outfname, "w") as outFP:
        for transcript_id, rows in transcript_data.items():
            output_str = ""
            # Construct transcript line
            attribute_list = [(row[3], row[4].replace("\t", "")) for row in rows if row[3] not in ["transcript_id", "transcriptId"]]

            if (output_file_type=="gtf"):
                attributes = "; ".join(f"{attr[0]} \"{attr[1]}\"" for attr in attribute_list)

            if(output_file_type=="gff"):
                attributes = ";".join(f"{attr[0]}={attr[1]}" for attr in attribute_list)

            # print(attributes + "\n")
            strand = "+" if rows[0][10] == 1 else "-"

            # find sequenceId (assemblyId, sequenceId, nomencalture --> alternativeID)
            sequence_id = sequence_id_map[(rows[0][14], rows[0][7], nomenclature)][0]

            # main line for constucting the gtf/gff file
            if (output_file_type=="gtf"):
                output_str = f"{sequence_id}\t{rows[0][5]}\ttranscript\t{rows[0][8]}\t{rows[0][9]}\t.\t{strand}\t.\ttranscript_id \"{rows[0][2]}\"; {attributes}\n"

            if(output_file_type=="gff"):
                output_str = f"{sequence_id}\t{rows[0][5]}\ttranscript\t{rows[0][8]}\t{rows[0][9]}\t.\t{strand}\t.\ttranscript_id={rows[0][2]};{attributes}\n"

            outFP.write(output_str)

            # construct exon lines
            output_str = ""

            # print exons
            if (rows[0][12] != None):
                for exon in rows[0][12].split(","):
                    exon_start, exon_end = [str(int(v)) for v in exon.split("-")]
                    output_str += sequence_id+"\t"+str(rows[0][5])+"\texon\t"+exon_start+"\t"+exon_end+"\t.\t"+strand+"\t.\ttranscript_id \""+str(rows[0][2])+"\";\n"
                
            # print CDS
            if (rows[0][13] != None):
                for cds in rows[0][13].split(","):
                    cds_start, cds_end = [str(int(v)) for v in cds.split("-")]
                    output_str += sequence_id+"\t"+str(rows[0][5])+"\tCDS\t"+cds_start+"\t"+cds_end+"\t.\t"+strand+"\t.\ttranscript_id \""+str(rows[0][2])+"\";\n"

            outFP.write(output_str)
    return




#####################################################
############  EXTRACT GTF WITH INPUTS    ############
#####################################################
