In [5]:
import os
import sys
import copy
import json
import argparse
import subprocess

import mysql.connector

from itertools import chain, combinations

In [6]:
sys.path.insert(0, "../")

In [7]:
from definitions import *
import api

In [8]:
CHESSDB_HOST = "localhost"
CHESSDB_USER = "chess_master"
CHESSDB_PASSWORD = "qwerty"
CHESSDB_PORT = "3306"
CHESSDB_NAME = "CHESS_DB"

chessApi = None
try:
    chessApi = api.CHESS_DB_API(CHESSDB_HOST, CHESSDB_USER, CHESSDB_PASSWORD, CHESSDB_NAME, CHESSDB_PORT)
    chessApi.connect()
except:
    print("Failed to connect to database")
    exit(1)

Connected to MySQL database


In [22]:
####################
# Functionality Under Test
####################


""""
*** Data from all sources are being: ordered by tid --> ordered by attribute name --> then order by priority listed in the data_sources list.
*** The first row for each attribute name in each tid grouping is selected as the row to keep

Hypothesis:
I don't believe the SQL query accounts for cases where there is an attribute in a rank that is not 1, and this attribute is not in the others
This line needs to be changed:  WHERE rn = 1 AND (source_name = 'CHESS.3.0' OR source_name = 'RefSeq' OR source_name = 'MANE')

***Update: I was wrong. The query does account for this. The query is selecting the first row for each attribute name in each attribute grouping.


Database to create for testing:
- Should contain data from three sources: MANE, RefSeq, and CHESS.3.0
- Should contain attributes that are in all three sources, and attributes that are in only one or two sources

Questions:
- Are SQL queries I run on the database making permanent changes to the database? (e.g. if I DELETE a table, will it be gone in subsequent tests?)
"""

# sourceId
# 1 --> MANE, 2 --> RefSeq, 3 --> GENCODE, 4 --> CHESS.3.0

# Attributes in source 1 and 2
common_attr_name_two_sources = '''SELECT COUNT(DISTINCT name)
FROM Attributes
WHERE sourceId = '1'
AND name IN (SELECT DISTINCT name FROM Attributes WHERE sourceId = '2')
'''
# Attributes in MANE
single_source_attr_names_MANE= '''SELECT DISTINCT name FROM Attributes WHERE sourceId = '1' '''
# Attributes in GENCODE
single_source_attr_names_GENCODE= '''SELECT DISTINCT name FROM Attributes WHERE sourceId = '3' '''


# Attributes in source 1 but not 2 
diff_attr_name_two_sources = '''SELECT DISTINCT A1.name
FROM Attributes A1
LEFT JOIN Attributes A2 ON A1.name = A2.name AND A2.sourceId = '4'
WHERE A1.sourceId = '2' AND A2.name IS NULL'''

# attribute "anticodon" is uniquely in RefSeq but still in the GFF output
# --> attributes 

mane_attr = chessApi.execute_query(single_source_attr_names_MANE)
gencode_attr = chessApi.execute_query(single_source_attr_names_GENCODE)

In [31]:
gencode_attr

[('ccdsid',),
 ('gene_id',),
 ('gene_name',),
 ('gene_type',),
 ('havana_gene',),
 ('havana_transcript',),
 ('hgnc_id',),
 ('level',),
 ('protein_id',),
 ('tag',),
 ('transcript_id',),
 ('transcript_name',),
 ('transcript_support_level',),
 ('transcript_type',),
 ('ont',)]

In [33]:
####################
# Testing Functions
####################

# Check if the right data sources are in the file
def parse_gtf_and_check_data_sources(file_path, data_sources):
    sources_found = set()

    with open(file_path, 'r') as file:
        for line in file:
            if line.startswith('#'):
                continue  # Skip comment lines
            columns = line.strip().split('\t')
            source_name = columns[1]
            sources_found.add(source_name)

    # Check if all expected sources are found and there are no extra sources
    all_sources_present = all(source in sources_found for source in data_sources)
    no_extra_sources = all(source in data_sources for source in sources_found)
    return all_sources_present and no_extra_sources, sources_found

# Check if attributes distinct to a source are in the file 
def find_missing_attributes_keys(source1_attr, source2_attr, file_path):
    # Convert attribute tuples to sets for easy comparison
    source1_set = set(attr[0] for attr in source1_attr)
    source2_set = set(attr[0] for attr in source2_attr)

    # Find attributes unique to GENCODE and MANE
    unique_to_source1 = source1_set - source2_set
    unique_to_source2 = source2_set - source1_set

    missing_attributes = set()

    # Parse the GTF file and check for missing attributes
    with open(file_path, 'r') as file:
        for line in file:
            if line.startswith('#'):
                continue  # Skip comment lines
            columns = line.strip().split('\t')
            attributes = columns[-1].split(';')

            for attribute in attributes:
                key_value = attribute.split(' ')[0]
                if key_value not in source1_set and key_value not in source2_set:
                    missing_attributes.add(key_value)

    
    return unique_to_source1, unique_to_source2, missing_attributes





In [34]:
####################
# Calling Tests
####################

####################
# CREATING THE GTF FILE
####################
# data_sources = ['GENCODE','MANE']
# keys = []
# values = []
# to_gtf_with_inputs(chessApi,data_sources,keys,values,"RefSeq","gtf","test1")


####################
# CHECKING THE OUTPUT
####################
correct_sources = ['MANE','GENCODE']
parse_gtf_and_check_data_sources("test1.gtf",correct_sources)

# Checking if all unique attribute keys of GENCODE and MANE are in the output
find_missing_attributes_keys(gencode_attr,mane_attr,"test1.gtf")

# more tests...


({'ccdsid',
  'havana_gene',
  'havana_transcript',
  'hgnc_id',
  'level',
  'ont',
  'transcript_support_level'},
 {'CDS_db_xref', 'db_xref'},
 {''})

In [None]:
# Main Query (for reference; without attribute selection in it)


query = '''SELECT na.*, t.sequenceID, t.start, t.end, t.strand, tx.score, t.exons 
FROM (
    SELECT *
    FROM (
        SELECT a.*, s.name AS source_name,
            ROW_NUMBER() OVER(PARTITION BY a.tid, a.name ORDER BY a.tid, a.name,
                CASE
                    WHEN s.name = 'CHESS.3.0' THEN 0
                    WHEN s.name = 'RefSeq' THEN 1
                    WHEN s.name = 'MANE' THEN 2
                    ELSE 3  -- Place other sources at the end
                END) AS rn
        FROM Attributes a
        JOIN Sources s ON a.sourceID = s.sourceID
    ) AS ranked
    WHERE rn = 1 AND (source_name = 'CHESS.3.0' OR source_name = 'RefSeq' OR source_name = 'MANE')
) na 
JOIN TxDBXREF tx ON na.tid = tx.tid AND na.sourceID = tx.sourceID AND na.transcript_id = tx.transcript_id 
JOIN Transcripts t ON tx.tid = t.tid'''

select_res = chessApi.execute_query(a)
select_res

In [None]:
# Subsetting the database
# Remove all datapoints that are not in CHESS.3.0, RefSeq or MANE


In [3]:
#####################################################
############  EXTRACT GTF WITH INPUTS    ############
#####################################################
from collections import defaultdict

"""
  @brief: Returns a GTF/GFF file from the database using the given inputs

  @param: self: CHESS_DB_API object
  @param: data_sources: list of data sources to extract from the database (in order of priority)
  @param: keys: list of attribute keys/names
  @param: values: list of attribute values
  @param: nomenclature: nomenclature of the sequenceID
  @param: output_file_type: output file type ("gff" or "gtf")
  @param: output_file_name: output file name
  @return: GTF/GFF file that meets input criteria
"""
def to_gtf_with_inputs(self,data_sources:list,keys:list, values:list, nomenclature:str, output_file_type: str, output_file_name:str):
    
    ####################
    # SQL Query
    ####################

    # Create SQL clause for data sources
    sources_condition = " OR ".join([f"source_name = '{source}'" for source in data_sources])
    
    # Create SQL clause for attributes
    where_clause = ""
    if keys and values:
        conditions = []
        for key, value in zip(keys, values):
            conditions.append(f"(name = '{key}' AND value = '{value}')")
        where_clause = "WHERE " + " OR ".join(conditions)

    # SQL query
    query = f'''SELECT na.*, t.sequenceID, tx.start, tx.end, t.strand, tx.score, t.exons, tx.cds, t.assemblyName
    FROM (
        SELECT *
        FROM (
            SELECT a.*, s.name AS source_name,
                ROW_NUMBER() OVER(PARTITION BY a.tid, a.name ORDER BY a.tid, a.name,
                    CASE {''.join([f"WHEN s.name = '{source}' THEN {index} " for index, source in enumerate(data_sources)])}
                    ELSE {len(data_sources)}  -- Place other sources at the end
                END) AS rn
            FROM Attributes a
            JOIN Sources s ON a.sourceID = s.sourceID
        ) AS ranked
        WHERE rn = 1 AND ({sources_condition})
    ) na 
    JOIN TxDBXREF tx ON na.tid = tx.tid AND na.sourceID = tx.sourceID AND na.transcript_id = tx.transcript_id 
    JOIN Transcripts t ON tx.tid = t.tid
    {where_clause}
    '''

    # Execute Query
    select_res = self.execute_query(query)
    

    
    # Find sequenceId for a given nomenclature
    # Execute the SQL query
    query = "SELECT assemblyName, sequenceID, nomenclature, alternativeID FROM SequenceIDMap"
    sid = chessApi.execute_query(query)

    # Create a dictionary mapping assemblyName, sequenceId, and nomenclature to alternativeID
    sequence_id_map = {}
    for row in sid:
        key = (row[0], row[1], row[2])  # Use assemblyName, sequenceID, and nomenclature as the key
        value = row[3]  # alternativeID as the value
        if key not in sequence_id_map:
            sequence_id_map[key] = []
        sequence_id_map[key].append(value)


    ####################
    # Create GFF/GTF file
    ####################

    # dictionary mapping transcript id to the row in the SQL query output
    transcript_data = defaultdict(list) # automatically initializes a list for each key
    for row in select_res:
        transcript_id = row[0]  # Assuming transcript_id is in the first position
        transcript_data[transcript_id].append(row)

    # Writing the GTF/GFF file
    outfname = f"{output_file_name}.{output_file_type}"

    with open(outfname, "w") as outFP:
        for transcript_id, rows in transcript_data.items():
            output_str = ""
            # Construct transcript line
            attribute_list = [(row[3], row[4].replace("\t", "")) for row in rows if row[3] not in ["transcript_id", "transcriptId"]]

            if (output_file_type=="gtf"):
                attributes = "; ".join(f"{attr[0]} \"{attr[1]}\"" for attr in attribute_list)

            if(output_file_type=="gff"):
                attributes = ";".join(f"{attr[0]}={attr[1]}" for attr in attribute_list)

            # print(attributes + "\n")
            strand = "+" if rows[0][10] == 1 else "-"

            # find sequenceId (assemblyId, sequenceId, nomencalture --> alternativeID)
            sequence_id = sequence_id_map[(rows[0][14], rows[0][7], nomenclature)][0]

            # main line for constucting the gtf/gff file
            if (output_file_type=="gtf"):
                output_str = f"{sequence_id}\t{rows[0][5]}\ttranscript\t{rows[0][8]}\t{rows[0][9]}\t.\t{strand}\t.\ttranscript_id \"{rows[0][2]}\"; {attributes}\n"

            if(output_file_type=="gff"):
                output_str = f"{sequence_id}\t{rows[0][5]}\ttranscript\t{rows[0][8]}\t{rows[0][9]}\t.\t{strand}\t.\ttranscript_id={rows[0][2]};{attributes}\n"

            outFP.write(output_str)

            # construct exon lines
            output_str = ""

            # print exons
            if (rows[0][12] != None):
                for exon in rows[0][12].split(","):
                    exon_start, exon_end = [str(int(v)) for v in exon.split("-")]
                    output_str += sequence_id+"\t"+str(rows[0][5])+"\texon\t"+exon_start+"\t"+exon_end+"\t.\t"+strand+"\t.\ttranscript_id \""+str(rows[0][2])+"\";\n"
                
            # print CDS
            if (rows[0][13] != None):
                for cds in rows[0][13].split(","):
                    cds_start, cds_end = [str(int(v)) for v in cds.split("-")]
                    output_str += sequence_id+"\t"+str(rows[0][5])+"\tCDS\t"+cds_start+"\t"+cds_end+"\t.\t"+strand+"\t.\ttranscript_id \""+str(rows[0][2])+"\";\n"

            outFP.write(output_str)
    return




#####################################################
############  EXTRACT GTF WITH INPUTS    ############
#####################################################
