In [None]:
import os
import sys
import copy
import json
import argparse
import subprocess

import mysql.connector

from itertools import chain, combinations

In [None]:
# Put the parent directory first on the import path (presumably where the
# `definitions` and `api` modules imported in the next cell live — TODO confirm).
# The guard keeps the cell idempotent: re-running it does not stack duplicate
# "../" entries, while "../" is still guaranteed to take precedence.
if sys.path[:1] != ["../"]:
    sys.path.insert(0, "../")

In [None]:
from definitions import *
import api

In [None]:
# Connection settings for the CHESS database.
# Each value can be overridden with an environment variable of the same name so
# that real credentials do not have to be stored in the notebook; the fallbacks
# are the notebook's original literals, so behavior is unchanged when nothing is set.
CHESSDB_HOST = os.environ.get("CHESSDB_HOST", "localhost")
CHESSDB_USER = os.environ.get("CHESSDB_USER", "chess_master")
CHESSDB_PASSWORD = os.environ.get("CHESSDB_PASSWORD", "qwerty")
CHESSDB_PORT = os.environ.get("CHESSDB_PORT", "3306")  # kept as a string, as in the original
CHESSDB_NAME = os.environ.get("CHESSDB_NAME", "CHESS_DB")

# Open the connection used by the following cells. `chessApi` stays None if the
# API object cannot even be constructed.
chessApi = None
try:
    chessApi = api.CHESS_DB_API(CHESSDB_HOST, CHESSDB_USER, CHESSDB_PASSWORD, CHESSDB_NAME, CHESSDB_PORT)
    chessApi.connect()
except Exception as err:
    # Report the underlying cause and re-raise so that "Run All" stops here with a
    # full traceback, instead of hiding the error behind a bare `except:` and
    # shutting the kernel down with exit(1).
    print(f"Failed to connect to database: {err}")
    raise

In [None]:
# Pseudocode for the SQL query
# tid     source            akey   avalue
# 1       "CHESS"        "key1"      "val4"
# 2       "CHESS"        "key1"      "val4"
# 1       "MANE"        "key1"      "val2"
# 1       "CHESS"        "key1"      "val4"
# 1       "MANE"        "key1"      "val2"
# 2       "CHESS"        "key1"      "val4"
# 1       "MANE"        "key1"      "val2"
# 1       "CHESS"        "key1"      "val4"
# 2       "CHESS"        "key1"      "val4"
# select first(*) per (tid, akey) from table order by tid, akey, FIELD(source, 'MANE', 'CHESS', 'RefSeq');
#---------------------------------------------------------------------------------------------------



from collections import defaultdict

# Source names reported in the "no transcripts" message below. The same three
# names, in the same priority order, are hard-coded in the SQL text.
source_names = ['CHESS.3.0', 'RefSeq', 'MANE']

# For every (tid, attribute name) pair the inner query ranks the attribute rows by
# source priority (CHESS.3.0, then RefSeq, then MANE, then anything else) and keeps
# only the top-ranked row, provided it comes from one of the three listed sources.
# The surviving rows are joined to TxDBXREF and Transcripts to pick up the
# coordinates, strand, score and exon chain of each transcript.
query = '''SELECT na.*, t.sequenceID, t.start, t.end, t.strand, tx.score, t.exons 
FROM (
    SELECT *
    FROM (
        SELECT a.*, s.name AS source_name,
            ROW_NUMBER() OVER(PARTITION BY a.tid, a.name ORDER BY a.tid, a.name,
                CASE
                    WHEN s.name = 'CHESS.3.0' THEN 0
                    WHEN s.name = 'RefSeq' THEN 1
                    WHEN s.name = 'MANE' THEN 2
                    ELSE 3  -- Place other sources at the end
                END) AS rn
        FROM Attributes a
        JOIN Sources s ON a.sourceID = s.sourceID
    ) AS ranked
    WHERE rn = 1 AND (source_name = 'CHESS.3.0' OR source_name = 'RefSeq' OR source_name = 'MANE')
) na 
JOIN TxDBXREF tx ON na.tid = tx.tid AND na.sourceID = tx.sourceID AND na.transcript_id = tx.transcript_id 
JOIN Transcripts t ON tx.tid = t.tid'''

# Execute Query
select_res = chessApi.execute_query(query)

# Column positions relied on below:
#   0 tid | 1 sourceID | 2 transcript_id | 3 attribute name | 4 attribute value
#   5 source_name | 6 (see note) | 7 sequenceID | 8 start | 9 end | 10 strand
#   11 score | 12 exons
# NOTE(review): positions 0-4 assume the Attributes table has exactly these five
# columns in this order — TODO confirm against the schema. Under that assumption
# position 6 is the `rn` rank produced by the query, not a second sourceID.

# Group the result rows by tid (first column). A None or empty result is treated
# as "no rows" so that the fallback branch below is reached instead of failing
# with a TypeError while iterating over None.
transcript_data = defaultdict(list)  # automatically initializes a list for each key
for row in select_res or ():
    transcript_data[row[0]].append(row)

# Writing the GTF file
# Output File Name
outfname = "new_results_gtf"
with open(outfname, "w") as outFP:
    if not transcript_data:
        print(f"No transcripts found for assembles {source_names}.")
        # write a dummy transcript to avoid errors downstream
        outFP.write("\t\ttranscript\t0\t0\t.\t+\t.\ttranscript_id \"nan\";\n")
        outFP.write("\t\texon\t0\t0\t.\t+\t.\ttranscript_id \"nan\";\n")
    else:
        for tid, rows in transcript_data.items():
            # Coordinates, source and transcript id are taken from the first row
            # of the group; the remaining rows only contribute attributes.
            first = rows[0]
            seqid, source, tx_id = first[7], first[5], first[2]
            strand = "+" if first[10] == 1 else "-"

            # Construct transcript line
            # NOTE(review): attribute values are emitted unquoted and without a
            # terminating ';', unlike the transcript_id attribute — confirm that
            # downstream GTF consumers accept this.
            attributes = "; ".join(f"{row[3]} {row[4]}" for row in rows)
            outFP.write(
                f"{seqid}\t{source}\ttranscript\t{first[8]}\t{first[9]}\t.\t{strand}\t.\t"
                f"transcript_id \"{tx_id}\"; {attributes}\n"
            )

            # Construct exon lines: the exon chain is a comma-separated list of
            # "start-end" pairs; int() validates and normalizes each coordinate.
            exon_lines = []
            for exon in first[12].split(","):
                exon_start, exon_end = (str(int(v)) for v in exon.split("-"))
                exon_lines.append(
                    f"{seqid}\t{source}\texon\t{exon_start}\t{exon_end}\t.\t{strand}\t.\t"
                    f"transcript_id \"{tx_id}\";\n"
                )
            outFP.write("".join(exon_lines))

# chessApi.disconnect() # Perform when done working with the notebook