In [37]:
import os
import sys
import copy
import json
import argparse
import subprocess

import mysql.connector

from itertools import chain, combinations

In [38]:
# Put the parent directory at the front of the module search path so the
# project-local imports in the next cell resolve.
# NOTE(review): presumably `definitions` and `api` live in "../" -- confirm.
# The path is relative, so this depends on the notebook's working directory.
sys.path.insert(0, "../")

In [39]:
from definitions import *
import api

In [40]:
# Connection settings for the CHESS MySQL database.
# Each value can be overridden with an environment variable of the same name;
# the literals are only fallbacks that keep the previous behaviour.
# NOTE(review): a real password should not live in a notebook -- export
# CHESSDB_PASSWORD instead of relying on the fallback below.
CHESSDB_HOST = os.environ.get("CHESSDB_HOST", "localhost")
CHESSDB_USER = os.environ.get("CHESSDB_USER", "chess_master")
CHESSDB_PASSWORD = os.environ.get("CHESSDB_PASSWORD", "qwerty")
CHESSDB_PORT = os.environ.get("CHESSDB_PORT", "3306")
CHESSDB_NAME = os.environ.get("CHESSDB_NAME", "CHESS_DB")

# `chessApi` is the shared handle used by every later cell.
chessApi = None
try:
    chessApi = api.CHESS_DB_API(CHESSDB_HOST, CHESSDB_USER, CHESSDB_PASSWORD, CHESSDB_NAME, CHESSDB_PORT)
    chessApi.connect()
except Exception as exc:
    # Catch Exception (not a bare except) so Ctrl-C / SystemExit still propagate,
    # and report the underlying cause instead of hiding it.
    print(f"Failed to connect to database: {exc}")
    # sys.exit is the programmatic form; the bare `exit` helper is meant for
    # interactive sessions only.
    sys.exit(1)

Connected to MySQL database


In [None]:
# Goal of the SQL below, illustrated on a toy attribute table:
#
# tid     source            akey   avalue
# 1       "CHESS"        "key1"      "val4"
# 2       "CHESS"        "key1"      "val4"
# 1       "MANE"        "key1"      "val2"
# 1       "CHESS"        "key1"      "val4"
# 1       "MANE"        "key1"      "val2"
# 2       "CHESS"        "key1"      "val4"
# 1       "MANE"        "key1"      "val2"
# 1       "CHESS"        "key1"      "val4"
# 2       "CHESS"        "key1"      "val4"
#
# Pseudo-SQL: for every (tid, akey) keep only the first row once the rows are
# ordered by source priority, e.g.
#   select first(*) from table order by tid, akey, FIELD(source, 'MANE', 'CHESS', 'RefSeq');
#---------------------------------------------------------------------------------------------------

from collections import defaultdict

# Hand-written reference version of the query with the three sources spelled
# out. It is not executed in this cell; `test_query` below is the generated
# equivalent that actually runs.
query = '''SELECT na.*, t.sequenceID, t.start, t.end, t.strand, tx.score, t.exons, tx.cds
FROM (
    SELECT *
    FROM (
        SELECT a.*, s.name AS source_name,
            ROW_NUMBER() OVER(PARTITION BY a.tid, a.name ORDER BY a.tid, a.name,
                CASE
                    WHEN s.name = 'CHESS.3.0' THEN 0
                    WHEN s.name = 'RefSeq' THEN 1
                    WHEN s.name = 'MANE' THEN 2
                    ELSE 3  -- Place other sources at the end
                END) AS rn
        FROM Attributes a
        JOIN Sources s ON a.sourceID = s.sourceID
    ) AS ranked
    WHERE rn = 1 AND (source_name = 'CHESS.3.0' OR source_name = 'RefSeq' OR source_name = 'MANE')
) na 
JOIN TxDBXREF tx ON na.tid = tx.tid AND na.sourceID = tx.sourceID AND na.transcript_id = tx.transcript_id 
JOIN Transcripts t ON tx.tid = t.tid
'''

# Inputs for the generated query.
# The position of a source in `data_sources` is its priority (index 0 wins).
data_sources = ['CHESS.3.0', 'RefSeq', 'MANE']
keys = ["gene_type"]
values = ["protein_coding"]

# "source_name = 'A' OR source_name = 'B' ..." -- restricts the winning rows
# to the listed sources.
sources_condition = " OR ".join(f"source_name = '{source}'" for source in data_sources)

# Optional attribute filter: keys[i] is paired with values[i]; the pairs are OR-ed.
where_clause = ""
if keys and values:
    conditions = [f"(name = '{key}' AND value = '{value}')" for key, value in zip(keys, values)]
    where_clause = "WHERE " + " OR ".join(conditions)

# One "WHEN s.name = '<source>' THEN <rank> " branch per source; any source not
# listed gets the rank len(data_sources), i.e. it sorts last.
priority_cases = ''.join(f"WHEN s.name = '{source}' THEN {index} " for index, source in enumerate(data_sources))
fallback_rank = len(data_sources)

# ROW_NUMBER() numbers the rows of each (tid, attribute name) partition by
# source priority; `rn = 1` keeps the single best row per partition.
test_query = f'''SELECT na.*, t.sequenceID, t.start, t.end, t.strand, tx.score, t.exons, tx.cds
    FROM (
        SELECT *
        FROM (
            SELECT a.*, s.name AS source_name,
                ROW_NUMBER() OVER(PARTITION BY a.tid, a.name ORDER BY a.tid, a.name,
                    CASE {priority_cases}
                    ELSE {fallback_rank}  -- Place other sources at the end
                END) AS rn
            FROM Attributes a
            JOIN Sources s ON a.sourceID = s.sourceID
        ) AS ranked
        WHERE rn = 1 AND ({sources_condition})
    ) na 
    JOIN TxDBXREF tx ON na.tid = tx.tid AND na.sourceID = tx.sourceID AND na.transcript_id = tx.transcript_id 
    JOIN Transcripts t ON tx.tid = t.tid
    {where_clause}
    '''

# Literal form of the filter that `where_clause` produces for the inputs above
# (kept for reference; not used by the query).
add_to_the_end = """WHERE (name = 'gene_type' AND value = 'protein_coding')"""

# Run the generated query and show the raw result as the cell output.
select_res = chessApi.execute_query(test_query)
select_res


In [30]:
# Column layout of each row in `select_res`, following the select list of the
# query (na.* = Attributes columns, then source_name and rn, then the joined
# transcript columns):
#   index  0    1         2      3         4           5            6   7      8      9    10      11     12     13
#   value  tid  sourceID  tx_id  atr_name  attr_value  source_name  rn  seqId  start  end  strand  score  exons  cds

# Group the query rows by tid (column 0): one list of attribute rows per transcript.
transcript_data = defaultdict(list)  # automatically initializes a list for each key
for row in select_res:
    transcript_data[row[0]].append(row)

# Output settings
output_file_type = "gff"
output_file_name = "test"

# Fail before the file is created rather than with a NameError half-way through.
if output_file_type not in ("gtf", "gff"):
    raise ValueError(f"Unsupported output_file_type {output_file_type!r}; expected 'gtf' or 'gff'")
is_gtf = output_file_type == "gtf"

outfname = f"{output_file_name}.{output_file_type}"

with open(outfname, "w") as outFP:
    for transcript_id, rows in transcript_data.items():
        # Every row of a transcript repeats the same transcript-level columns,
        # so the first row is used for coordinates, strand, exons and CDS.
        head = rows[0]

        # Attribute (name, value) pairs; tabs are stripped because tab is the
        # column separator of the output. transcript_id is written explicitly below.
        attribute_list = [(row[3], row[4].replace("\t", "")) for row in rows if row[3] != "transcript_id"]

        # The two formats differ only in how column 9 is spelled:
        #   GTF: key "value"; key "value"      GFF: key=value;key=value
        if is_gtf:
            attributes = "; ".join(f"{attr[0]} \"{attr[1]}\"" for attr in attribute_list)
            transcript_attrs = f"transcript_id \"{head[2]}\"; {attributes}"
            child_attrs = f"transcript_id \"{head[2]}\";"
        else:
            attributes = ";".join(f"{attr[0]}={attr[1]}" for attr in attribute_list)
            transcript_attrs = f"transcript_id={head[2]};{attributes}"
            # Exon/CDS lines of a GFF file must use key=value syntax as well.
            child_attrs = f"transcript_id={head[2]}"

        # A stored strand value of 1 is written as "+", anything else as "-".
        strand = "+" if head[10] == 1 else "-"

        # Columns 1-2 (sequence, source) are shared by all lines of the transcript.
        line_prefix = f"{head[7]}\t{head[5]}"

        # Transcript line
        outFP.write(f"{line_prefix}\ttranscript\t{head[8]}\t{head[9]}\t.\t{strand}\t.\t{transcript_attrs}\n")

        # Exon lines, then CDS lines. Both columns hold comma-separated
        # "start-end" ranges; int() normalises each coordinate.
        output_str = ""
        for feature, ranges in (("exon", head[12]), ("CDS", head[13])):
            if ranges is None:
                continue
            for chunk in ranges.split(","):
                range_start, range_end = [str(int(v)) for v in chunk.split("-")]
                output_str += f"{line_prefix}\t{feature}\t{range_start}\t{range_end}\t.\t{strand}\t.\t{child_attrs}\n"

        outFP.write(output_str)

# chessApi.disconnect() # Perform when done working with the notebook

In [42]:
#####################################################
############  EXTRACT GTF WITH INPUTS    ############
#####################################################
from collections import defaultdict


def to_gtf_with_inputs(self,data_sources:list,keys:list, values:list, nomenclature:str, output_file_type: str, output_file_name:str):
    """
    @brief: Writes a GTF/GFF file from the database using the given inputs

    For every transcript the attribute rows are taken from the
    highest-priority source that provides them, then one transcript line plus
    its exon and CDS lines are written to "<output_file_name>.<output_file_type>".

    @param: self: CHESS_DB_API object (only its execute_query(sql) method is used)
    @param: data_sources: non-empty list of data sources to extract from the
            database (in order of priority, first entry wins)
    @param: keys: list of attribute keys/names
    @param: values: list of attribute values; keys[i] is paired with values[i]
            (surplus entries of the longer list are ignored). The pairs are
            OR-ed; no filter is applied when either list is empty.
    @param: nomenclature: nomenclature of the sequenceID used in column 1
    @param: output_file_type: output file type ("gff" or "gtf")
    @param: output_file_name: output file name, without extension
    @return: None; the GTF/GFF file is written to disk
    @raise: ValueError if output_file_type is not "gff"/"gtf" or data_sources is empty
    @raise: KeyError if SequenceIDMap has no entry for a transcript's
            (assemblyName, sequenceID, nomenclature)
    """
    # Validate up front so no query runs and no file is created for bad input.
    if output_file_type not in ("gtf", "gff"):
        raise ValueError(f"Unsupported output_file_type {output_file_type!r}; expected 'gtf' or 'gff'")
    if not data_sources:
        # An empty list would generate invalid SQL ("AND ()").
        raise ValueError("data_sources must contain at least one source name")
    is_gtf = output_file_type == "gtf"

    ####################
    # SQL Query
    ####################
    # NOTE(review): source names, keys and values are interpolated into the SQL
    # text as-is. Only pass trusted values, or switch to bound parameters if
    # execute_query supports them.

    # Create SQL clause for data sources
    sources_condition = " OR ".join(f"source_name = '{source}'" for source in data_sources)

    # One CASE branch per source: the list position is the priority rank.
    priority_cases = "".join(
        f"WHEN s.name = '{source}' THEN {index} " for index, source in enumerate(data_sources)
    )

    # Create SQL clause for attributes
    where_clause = ""
    if keys and values:
        conditions = [f"(name = '{key}' AND value = '{value}')" for key, value in zip(keys, values)]
        where_clause = "WHERE " + " OR ".join(conditions)

    # ROW_NUMBER() ranks the rows of each (tid, attribute name) by source
    # priority; rn = 1 keeps the best one.
    # Result columns, by index:
    #   0 tid, 1 sourceID, 2 transcript_id, 3 attribute name, 4 attribute value,
    #   5 source_name, 6 rn, 7 sequenceID, 8 start, 9 end, 10 strand, 11 score,
    #   12 exons, 13 cds, 14 assemblyName
    query = f'''SELECT na.*, t.sequenceID, t.start, t.end, t.strand, tx.score, t.exons, tx.cds, t.assemblyName
    FROM (
        SELECT *
        FROM (
            SELECT a.*, s.name AS source_name,
                ROW_NUMBER() OVER(PARTITION BY a.tid, a.name ORDER BY a.tid, a.name,
                    CASE {priority_cases}
                    ELSE {len(data_sources)}  -- Place other sources at the end
                END) AS rn
            FROM Attributes a
            JOIN Sources s ON a.sourceID = s.sourceID
        ) AS ranked
        WHERE rn = 1 AND ({sources_condition})
    ) na 
    JOIN TxDBXREF tx ON na.tid = tx.tid AND na.sourceID = tx.sourceID AND na.transcript_id = tx.transcript_id 
    JOIN Transcripts t ON tx.tid = t.tid
    {where_clause}
    '''
    select_res = self.execute_query(query)

    # Map (assemblyName, sequenceID, nomenclature) --> alternativeID.
    # Use `self`, not a notebook global, so the function works on any API object.
    # When several alternative IDs exist for a key, the first one returned wins.
    sid = self.execute_query("SELECT assemblyName, sequenceID, nomenclature, alternativeID FROM SequenceIDMap")
    sequence_id_map = {}
    for map_row in sid:
        sequence_id_map.setdefault((map_row[0], map_row[1], map_row[2]), map_row[3])

    ####################
    # Create GFF/GTF file
    ####################

    # Group the rows by tid (column 0): one list of attribute rows per transcript.
    transcript_data = defaultdict(list)
    for row in select_res:
        transcript_data[row[0]].append(row)

    def format_attributes(pairs):
        # Column 9 syntax: GTF is `key "value"; ...`, GFF is `key=value;...`.
        if is_gtf:
            return "; ".join(f"{name} \"{value}\"" for name, value in pairs)
        return ";".join(f"{name}={value}" for name, value in pairs)

    outfname = f"{output_file_name}.{output_file_type}"

    with open(outfname, "w") as outFP:
        for tid, rows in transcript_data.items():
            # Every row of a transcript repeats the transcript-level columns,
            # so the first row supplies coordinates, strand, exons and CDS.
            head = rows[0]
            tx_name = head[2]

            # Distinct sources that contributed rows, in first-seen order
            # (deterministic, unlike iterating a set).
            contributing_sources = list(dict.fromkeys(row[5] for row in rows))

            # transcript_id first, then the stored attributes (tabs stripped,
            # since tab separates the output columns), then the sources summary.
            attribute_list = [("transcript_id", tx_name)]
            attribute_list.extend(
                (row[3], row[4].replace("\t", ""))
                for row in rows
                if row[3] not in ["transcript_id", "transcriptId"]
            )
            attribute_list.append(("CHESS Database Transcript Sources", ", ".join(contributing_sources)))
            transcript_attrs = format_attributes(attribute_list)

            # Exon/CDS lines carry only the transcript id, in the syntax of the
            # chosen format.
            child_attrs = f"transcript_id \"{tx_name}\";" if is_gtf else f"transcript_id={tx_name}"

            # A stored strand value of 1 is written as "+", anything else as "-".
            strand = "+" if head[10] == 1 else "-"

            # find sequenceId (assemblyName, sequenceId, nomenclature --> alternativeID)
            map_key = (head[14], head[7], nomenclature)
            try:
                sequence_id = sequence_id_map[map_key]
            except KeyError:
                raise KeyError(
                    f"No SequenceIDMap entry for assembly {map_key[0]!r}, "
                    f"sequence {map_key[1]!r}, nomenclature {map_key[2]!r}"
                ) from None

            line_prefix = f"{sequence_id}\tCHESS Database"

            # Transcript line
            outFP.write(f"{line_prefix}\ttranscript\t{head[8]}\t{head[9]}\t.\t{strand}\t.\t{transcript_attrs}\n")

            # Exon lines, then CDS lines. Both columns hold comma-separated
            # "start-end" ranges; int() normalises each coordinate.
            for feature, ranges in (("exon", head[12]), ("CDS", head[13])):
                if ranges is None:
                    continue
                for chunk in ranges.split(","):
                    range_start, range_end = [str(int(v)) for v in chunk.split("-")]
                    outFP.write(f"{line_prefix}\t{feature}\t{range_start}\t{range_end}\t.\t{strand}\t.\t{child_attrs}\n")
    return




#####################################################
############  EXTRACT GTF WITH INPUTS    ############
#####################################################





In [43]:
# Example run: export every transcript (no attribute filter) to "NORMAL.gtf",
# naming sequences by their GENBANK identifiers.
data_sources = ['CHESS.3.0', 'RefSeq', 'MANE']  # priority order, first wins
keys = []    # empty keys/values --> no WHERE filter on attributes
values = []

to_gtf_with_inputs(
    chessApi,
    data_sources=data_sources,
    keys=keys,
    values=values,
    nomenclature="GENBANK",
    output_file_type="gtf",
    output_file_name="NORMAL",
)

In [17]:
# Add on: stand-alone preview of the sequence-ID lookup table.

# Fetch every row of SequenceIDMap.
query = "SELECT assemblyName, sequenceID, nomenclature, alternativeID FROM SequenceIDMap"
sid = chessApi.execute_query(query)

# (assemblyName, sequenceID, nomenclature) --> list of alternativeIDs,
# in the order the rows were returned.
sequence_id_map = {}
for row in sid:
    lookup_key = (row[0], row[1], row[2])
    sequence_id_map.setdefault(lookup_key, []).append(row[3])

print(sequence_id_map)


{('CHM13', 'chr18', 'UCSC'): ['chr18'], ('CHM13', 'chr18', 'GENBANK'): ['CP068260.2'], ('CHM13', 'chr18', 'RefSeq'): ['NC_060942.1'], ('CHM13', 'chr19', 'UCSC'): ['chr19'], ('CHM13', 'chr19', 'GENBANK'): ['CP068259.2'], ('CHM13', 'chr19', 'RefSeq'): ['NC_060943.1'], ('GRCh38', 'chr18', 'UCSC'): ['chr18'], ('GRCh38', 'chr18', 'GENBANK'): ['CM000680.2'], ('GRCh38', 'chr18', 'RefSeq'): ['NC_000018.10'], ('GRCh38', 'chr19', 'UCSC'): ['chr19'], ('GRCh38', 'chr19', 'GENBANK'): ['CM000681.2'], ('GRCh38', 'chr19', 'RefSeq'): ['NC_000019.10']}
