In [None]:
# creation of database file and connection etc, imports
import sqlite3
import requests, json
from requests import ConnectionError
import itertools, time
from tqdm import tqdm
import re
from subprocess import Popen, PIPE
from collections import Counter
import numpy as np
from math import log10
from gprofiler import GProfiler
from upsetplot import from_contents
from upsetplot import plot as uplot
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import make_axes_locatable
from scipy.stats import mannwhitneyu
import colorsys
# Open the SQLite database file that the later cells read from and write to
# through the shared `db` connection and `cursor`.
db = sqlite3.connect('./finalDupCompDatabase')
# isolation_level = None switches the sqlite3 module to autocommit mode: no
# transaction is opened implicitly, so each statement takes effect immediately.
db.isolation_level = None
cursor = db.cursor()

# Ohnolog dataset generation from macrosynteny segments

In [None]:
#create list of segment coordinates to obtain gene lists with current assembly (lifting GrCH37 to 38) and nc genes
#
# For every line of segments/map/HUMAN.txt (tab-separated: segment, chromosome,
# GRCh37 start, GRCh37 end) this cell:
#   1. lifts the coordinates to GRCh38 with the Ensembl REST /map endpoint,
#   2. fetches the genes overlapping the lifted region (requests of at most ~500 kb),
#   3. drops same-strand genes that sit inside the previous gene or that overlap both
#      the previous and the next gene (readthroughs),
#   4. writes the ordered gene lists to genesInSeg_all.txt / genesInSeg_coding.txt.
segCoordDict38 = {} #segment -> (chrom, start, end) in GRCh38
segGenesAll = {} #segment -> ordered IDs of all retained genes
segGenesProt = {} #segment -> ordered IDs of retained protein-coding genes
excludedGenes = [] #IDs removed by the overlap filter, across all segments
headers = {'Content-Type':'application/json'}
chunkSize = 500000 #largest span sent to the overlap endpoint in one request


def _ensembl_get_json(url):
    """GET `url` from the Ensembl REST API and return the decoded JSON body.

    Raises requests.HTTPError on an error status, so that an error payload is
    never processed as if it were a list of mappings or gene records.
    """
    r = requests.get(url, headers=headers)
    r.raise_for_status()
    return r.json()


def _genes_in_region(chrom, start, end):
    """Return the gene records overlapping chrom:start-end.

    Regions longer than chunkSize are requested piecewise. The same gene can be
    returned by two neighbouring requests, so the result may contain duplicates.
    """
    base = "http://rest.ensembl.org/overlap/region/human/"
    genes = []
    nextStart = start
    nextEnd = start + chunkSize
    while nextEnd < end:
        genes.extend(_ensembl_get_json(base + chrom + ":" + str(nextStart) + "-" + str(nextEnd) + "?feature=gene"))
        nextStart = nextEnd+1
        nextEnd = nextStart + chunkSize #move on to next subsegment
    #last (or only) request: whatever remains up to the end of the region
    genes.extend(_ensembl_get_json(base + chrom + ":" + str(nextStart) + "-" + str(end) + "?feature=gene"))
    return genes


def _filter_overlapping_genes(geneRecords):
    """Order one segment's gene records and apply the same-strand overlap filter.

    Returns (allGenes, protGenes, excluded): ordered IDs of every retained gene,
    ordered IDs of the retained protein_coding genes, and the IDs filtered out.

    Duplicate records are removed BEFORE filtering. Previously they were only
    skipped when reached, so a gene could be tested against its own duplicate as
    the "next gene", which made the readthrough condition trivially true.
    """
    ordered = sorted(geneRecords, key=lambda x: min([x['start'],x['end']]))
    seen = set()
    uniqueGenes = []
    for gene in ordered:
        if gene['id'] not in seen:
            seen.add(gene['id'])
            uniqueGenes.append(gene)

    allGenes, protGenes, excluded = [], [], []
    lastEnd = 0
    lastStrand = ''
    for pos, gene in enumerate(uniqueGenes):
        s, e, strand = gene['start'], gene['end'], gene['strand']
        nested = False
        readthrough = False
        if strand == lastStrand: #only genes on the same strand as the previous one are tested
            #intronic/other sitch where it's fully inside another gene
            nested = s <= lastEnd and e <= lastEnd
            #readthrough: overlaps the last and the next gene (not testable for the final gene)
            if not nested and pos+1 < len(uniqueGenes):
                readthrough = s <= lastEnd and e >= uniqueGenes[pos+1]['start']
        if nested or readthrough:
            excluded.append(gene['id'])
        else:
            allGenes.append(gene['id'])
            if gene['biotype'] == 'protein_coding':
                protGenes.append(gene['id'])
        #the reference point moves on whether or not the gene was kept
        lastEnd = e
        lastStrand = strand
    return allGenes, protGenes, excluded


with open('segments/map/HUMAN.txt') as file:
    for line in tqdm(file):
        line = line.strip('\n').split('\t')
        if line == ['']: #blank line, e.g. at the end of the file
            continue
        seg, chrom, s37, e37 = line
        print(seg)
        print('Converting to GrCh38...')
        #convert to GrCh38
        query1 = "http://rest.ensembl.org/map/human/GRCh37/"+chrom +":"+ s37 + ".." + e37 + ":1/GRCh38?"
        outputMap = _ensembl_get_json(query1)['mappings']
        if not outputMap: #nothing lifted over: there are no coordinates to query genes with
            print(seg, ': no GRCh38 mapping returned, segment skipped')
            continue
        outputMap.sort(key= lambda x: x['mapped']['start']) #fairly sure the segs are in order already but anyways
        #set start as start of first mapped region, end as end of last mapped region, should work for single mappings too
        s = outputMap[0]['mapped']['start']
        e = outputMap[-1]['mapped']['end']
        segCoordDict38[seg] = (chrom,s,e)

        geneOutput = _genes_in_region(chrom, int(s), int(e))
        print('Moved on to filtering genes...')
        allGenes, protGenes, dropped = _filter_overlapping_genes(geneOutput)
        excludedGenes.extend(dropped)
        #plain lists, not sets: the gene order along the segment is needed downstream
        segGenesAll[seg] = allGenes
        segGenesProt[seg] = protGenes

#one line per segment: name, tab, comma-separated gene IDs in coordinate order
with open('genesInSeg_all.txt','w') as file:
    for seg in segGenesAll:
        line = seg + '\t' + ','.join(segGenesAll[seg]) + '\n'
        file.write(line)
with open('genesInSeg_coding.txt','w') as file:
    for seg in segGenesProt:
        line = seg + '\t' + ','.join(segGenesProt[seg]) + '\n'
        file.write(line)

In [None]:
#ohnolog assignment: read the per-segment coding gene lists
t = time.time()
print('Putting together input segments')
#segment name -> ordered gene IDs (each input line: name, tab, comma-separated IDs)
segGenes = {}
with open('genesInSeg_coding_fixed_readthroughs.txt','r') as file:
    for row in file:
        fields = row.strip('\n').split('\t')
        segGenes[fields[0]] = fields[1].split(',')
#flat list of every gene that sits on a segment; used to filter the paralog pairs
incGenes = [geneID for members in segGenes.values() for geneID in members]
#synteny may be reversed (e.g. inversion) and ohnologs would then be missed,
#so each segment also gets a reversed copy stored under <name>R
orSegs = list(segGenes.keys())
for segName in orSegs:
    segGenes[segName+'R'] = segGenes[segName][::-1]
def convert_prot_gene(prot_ID):
    """Return the gene ID that an Ensembl protein ID belongs to.

    Uses the Ensembl REST lookup endpoint twice: the 'Parent' of the protein
    record is its transcript, and the 'Parent' of the transcript record is the gene.

    Raises:
        ValueError: if a lookup returns an error payload (ID not found). The
            transcript lookup previously had no such check and failed with a
            bare KeyError on 'Parent' instead.
    """
    import requests #local import kept so the function does not rely on the notebook's import cell
    headers = {'Content-Type':'application/json'}

    def parent_of(stable_id):
        #decode the reply once and reuse it for both the error test and the result
        record = requests.get('http://rest.ensembl.org/lookup/' + stable_id + '?', headers = headers).json()
        if 'error' in record.keys(): #id not found
            raise ValueError('id deprecated')
        return record['Parent']

    transID = parent_of(prot_ID)
    geneID = parent_of(transID)

    return geneID
def get_segment(gene_id):
    """Return the names of all segments in `segGenes` that contain `gene_id`.

    An ID containing 'ENSP' is treated as a protein ID and converted to its gene
    ID with convert_prot_gene first. The result is a list in `segGenes` order;
    it is empty when the gene is on no segment, so callers can test its truthiness.
    (The old `except IndexError` fallback to None could never trigger, because
    the list comprehension does no indexing that can fail.)
    """
    if 'ENSP' in gene_id: #id is actually a prot id
        gene_id = convert_prot_gene(gene_id)
    return [seg for seg in segGenes if gene_id in segGenes[seg]]

#paralog pairs (first two tab-separated columns) where both genes sit on a segment
incGeneSet = set(incGenes) #set lookup; scanning the incGenes list for every line of the file was needlessly slow
paralogs = []
with open('ensemblParalogsv99.txt','r') as file:
    file.readline() #header
    for line in file:
        line = line.strip('\n').split('\t')
        if line[0] != '' and line[1] != '' and line[0] in incGeneSet and line[1] in incGeneSet:
            paralogs.append((line[0],line[1]))
#state shared with the block search below
possOhnos = []
segCount = 0
# gapAllowed = 5
lastGene = ''
print('Prep done, let\'s go!')
print('Prep took:', round(time.time()-t,2), 'seconds!')

#iterate over genes in segments
#
# Every gene with a paralog on a different segment seeds a candidate block. The block is
# grown to the left and then to the right along both segments, allowing up to gapAllowed
# unmatched genes, and its pairs are kept if it reaches minBlockSize matches. One output
# file is written per gapAllowed value.
#
# Changes relative to the previous version of this cell:
#   * paralog tests use a set and a partner dict instead of scanning the `paralogs` list;
#   * the right-hand search now `continue`s after the "pre-WGD tandem" match like the
#     left-hand search does; without it the same pair fell through to the ordinary match
#     test, was appended twice, counted as two matches and both positions moved by two;
#   * reversed segment copies are recognised as <name>R of an existing segment rather
#     than by "contains the letter R";
#   * output de-duplication uses a set; elapsed minutes are rounded after the division.
# NOTE(review): lastGene is never reset when a new search starts, so the first tandem test
# of a search can use a gene left over from an earlier block - confirm this is intended.
# NOTE(review): the post-WGD tandem branch resets noMatch1/noMatch2 in the left search but
# not in the right search - confirm which one is intended.
t = time.time()
paralogSet = set(paralogs)
paraPartners = {} #gene -> set of genes it is paired with (either column of a pair)
for id1, id2 in paralogs:
    paraPartners.setdefault(id1, set()).add(id2)
    paraPartners.setdefault(id2, set()).add(id1)


def _are_paralogs(geneA, geneB):
    """True if the two genes are listed as a paralog pair, in either order."""
    return (geneA, geneB) in paralogSet or (geneB, geneA) in paralogSet


for gapAllowed in [8,10]:
    minBlockSize = 3
    possOhnos = []
    for seg in segGenes:
        #no need to iterate over the reversed segments, they just need to be queried
        if seg.endswith('R') and seg[:-1] in segGenes:
            continue
        print(seg)
        geneList = segGenes[seg]
        geneCount = 0 #index of the focal gene within geneList
        for gene in tqdm(geneList):
            paras = paraPartners.get(gene)
            #if it's a singleton, continue
            if not paras:
                geneCount += 1
                continue
            for para in paras:
                if para == '': #deal with empty strings from (I think) trailing commas with .split()
                    continue
                added = False
                blockBroken = False
                blockSize = 0
                #find segments the paralogs are on
                segList = get_segment(para) #adjusted to return seg and it's reverse
                if segList: #if the paralog is in a segment
                    for seg2 in segList:
                        if seg2 != seg and seg2 != seg+'R': #paralog in segment and it's not this one or it's reversal

                            #fetch the location of this paralog in its own segment
                            pos = segGenes[seg2].index(para)

                            #look at genes either side of this focal pair in the two segments
                            #left search
                            order1 = 1 #vars defining where we are in the current search: starts one above or below focal position
                            order2 = 1
                            noMatch1 = 0 #vars tracking size of gap currently open in each seg
                            noMatch2 = 0

                            #start a new synteny block
                            blockBroken = False
                            #vars for tracking the size of the current block to see if it meets minimum before it breaks
                            currentPairs = [[gene,para]]
                            currentMatches = 1 #not the same as number of pairs because one focal to two para (tandem dup) is only 1 syntenic match really

                            while blockBroken == False:
                                #explicit bounds checks: a negative index would silently wrap round to the other end
                                if geneCount - order1 < 0: #focal seg left side exhausted, break
                                    if currentMatches >= minBlockSize:
                                        possOhnos.extend(currentPairs)
                                    break

                                elif pos - order2 < 0 and noMatch1 == gapAllowed: #para seg left end reached, no more focal genes left to test
                                    if currentMatches >= minBlockSize:
                                        possOhnos.extend(currentPairs)
                                    break
                                elif pos - order2 < 0 and noMatch2 == 0: #the last gene was an ohnolog, so we'd have to move on to the next para gene, which doesn't exist
                                    if currentMatches >= minBlockSize:
                                        possOhnos.extend(currentPairs)

                                    break
                                elif pos - order2 < 0: #if more focal genes to test, skip to the next one that doesn't have a match
                                    order1 += 1
                                    order2 = 1
                                    noMatch2 = 0
                                    noMatch1 += 1
                                    continue
                                else: #define the test genes if no edge case issues to be dealt with
                                    testGene1 = segGenes[seg][geneCount-order1]
                                    testGene2 = segGenes[seg2][pos-order2]
                                #trying to catch post-WGD tandem duplication, could otherwise create an artificial gap
                                #should only come into play if last gene was classed an ohnolog, lastGene should otherwise be ''
                                #if it is a paralog of the last gene then add the pair to the ohno list and increment the para position
                                if _are_paralogs(lastGene, testGene2):
                                    #is this a post or pre WGD tandem duplication?
                                    #case where there's a sort of square relationship set between 4 genes
                                    if _are_paralogs(testGene1, testGene2): #also a paralog of the current test gene, possibly a pre-WGD dup
                                        currentPairs.append([testGene1,testGene2])
                                        currentMatches += 1
                                        #move on to next pair, current focal test becomes 'lastGene' in testing for tandems
                                        order1 += 1
                                        order2 += 1
                                        noMatch1 = 0
                                        noMatch2 = 0
                                        lastGene = testGene1
                                        continue
                                    else:
                                        currentPairs.append([lastGene,testGene2])
                                        #no increment, this isn't a new match, it's a post-WGD duplication
                                        order2 +=1
                                        noMatch1 = 0
                                        noMatch2 = 0
                                        #move on to next para seg gene, lastGene remains the same as we assume a post-WGD tandem in para seg
                                        continue
                                #if there were tandems detected the loop won't reach this point: these are more straightforward
                                if _are_paralogs(testGene1, testGene2):
                                    #add to table, ohnolog pair
                                    currentPairs.append([testGene1,testGene2])
                                    currentMatches += 1
                                    order1 += 1
                                    order2 += 1
                                    noMatch1 = 0
                                    noMatch2 = 0
                                    #store current gene to test for paralogy in next loop so I don't have to do the edge handling twice
                                    lastGene = testGene1
                                else:
                                    noMatch2 += 1
                                    order2 += 1
                                    lastGene = '' #only look at expanding a tandem block if the last focal gene was an ohnolog
                                if noMatch2 >= gapAllowed + 1: #gap of over 'gap allowed' number of genes if continue along para seg, move on to new gene on focal seg
                                    order1 += 1
                                    order2 = order2 - noMatch2
                                    noMatch2 = 0
                                    noMatch1 += 1
                                if noMatch1 >= gapAllowed + 1: #all genes within allowed gap from initial match have been checked on this side
                                    blockBroken = True
                                    if currentMatches >= minBlockSize: #check on block size before adding pairs from current block
                                        possOhnos.extend(currentPairs)

                            #right search
                            order1 = 1
                            order2 = 1
                            noMatch1 = 0
                            noMatch2 = 0
                            blockBroken = False
                            #vars for tracking the size of the current block to see if it meets minimum before it breaks
                            currentPairs = [[gene,para]]
                            currentMatches = 1 #not the same as number of pairs because one focal to two para (tandem dup) is only 1 syntenic match really

                            while blockBroken == False:
                                #catch index errors: checking if we've fallen off the right end
                                try:
                                    testGene1 = segGenes[seg][geneCount+order1]
                                except IndexError: #focal seg right side exhausted, break
                                    if currentMatches >= minBlockSize:
                                        possOhnos.extend(currentPairs)

                                    break
                                try:
                                    testGene2 = segGenes[seg2][pos+order2]
                                except IndexError: #gone off the right end of the para seg, but if there are still focal genes to be tested can't just break out of the loop
                                    #check if there's anything left to test
                                    if noMatch1 == gapAllowed: #the allowed number of focal genes has been tested without a match and the para seg has run out
                                        if currentMatches >= minBlockSize:
                                            possOhnos.extend(currentPairs)

                                        break
                                    elif noMatch2 == 0: #last para seg gene was an ohnolog
                                        if currentMatches >= minBlockSize:
                                            possOhnos.extend(currentPairs)

                                        break
                                    else: #otherwise do normal change to next focal seg gene, go to next iteration
                                        order1 += 1
                                        order2 = order2 - noMatch2 #set para test position back to the last unmatched gene
                                        noMatch2 = 0
                                        noMatch1 += 1
                                        continue
                                #check for tandem, increment and continue if seems likely
                                if _are_paralogs(lastGene, testGene2):
                                    #is this a post or pre WGD tandem duplication?
                                    if _are_paralogs(testGene1, testGene2): #also a paralog of the current test gene, possibly a pre-WGD dup
                                        currentPairs.append([testGene1,testGene2])
                                        currentMatches += 1
                                        order1 += 1
                                        order2 += 1
                                        noMatch1 = 0
                                        noMatch2 = 0
                                        lastGene = testGene1
                                        #FIX: restart with fresh test genes, as in the left search; falling
                                        #through re-tested and re-counted this same pair
                                        continue
                                    else:
                                        currentPairs.append([lastGene,testGene2])
                                        order2 +=1
                                        continue
                                else:
                                    lastGene = '' #no tandem block detected, want to make sure we're not comparing this gene ten genes down the seg
                                if _are_paralogs(testGene1, testGene2):
                                    currentPairs.append([testGene1,testGene2])
                                    currentMatches += 1
                                    order1 += 1
                                    order2 += 1
                                    noMatch1 = 0
                                    noMatch2 = 0
                                    lastGene = testGene1
                                else:
                                    noMatch2 += 1
                                    order2 += 1
                                    lastGene = ''
                                if noMatch2 >= gapAllowed + 1: #gap of over 'gap allowed' genes if continue along para seg, move on to new gene on focal seg
                                    order1 += 1
                                    order2 = order2 - noMatch2
                                    noMatch2 = 0
                                    noMatch1 += 1
                                if noMatch1 >= gapAllowed + 1: #all genes within the allowed gap from initial match have been checked on this side
                                    blockBroken = True
                                    if currentMatches >= minBlockSize:
                                        possOhnos.extend(currentPairs)

                        else:
                            #paralog is on the focal segment itself (or its reversed copy): not a candidate
                            pass

            geneCount += 1
        segCount += 1
        print('Time elapsed:',round((time.time()-t)/60,2),'minutes!')
    #write each distinct [gene, paralog] pair once, in the order it was first found
    done = set()
    outFile = 'ohnologsWholeGenome_gapAllowed' + str(gapAllowed) + '_minBlock'+str(minBlockSize)+'.txt'
    with open(outFile,'w') as file:
        for x in possOhnos:
            if tuple(x) not in done:
                done.add(tuple(x))
                file.write('\t'.join(x)+'\n')
    print('Finished this round! Pairs written to: ',outFile)

# Data import and cleaning/processing

In [None]:
# reading in ohnolog sets
#
# Each table is created if missing and emptied before it is refilled, so the cell can be
# re-run, or run against an existing database file, without failing on an existing table
# (ohnologs_2020_8 used a plain CREATE TABLE) and without doubling the rows.
# Table names and column lists are fixed literals from this cell, which is why they can
# be spliced into the SQL text; the values themselves go through ? placeholders.
def _load_ohnolog_pairs(table, columns, path, sep, idCols, headerLines):
    """Refill `table` with two ID columns read from a delimited text file.

    table       -- name of the two-column table
    columns     -- column list used if the table has to be created, e.g. '(id1,id2)'
    path        -- input file
    sep         -- field separator of the input file
    idCols      -- indexes of the two fields holding the pair's IDs
    headerLines -- number of leading lines to skip
    """
    with open(path,'r') as file:
        for _ in range(headerLines):
            file.readline()
        pairs = []
        for line in file:
            line = line.strip('\n').split(sep)
            pairs.append((line[idCols[0]], line[idCols[1]]))
    #the file is read completely before the table is touched, so a missing or
    #malformed file cannot leave a previously filled table empty
    cursor.execute('CREATE TABLE IF NOT EXISTS ' + table + columns)
    cursor.execute('DELETE FROM ' + table)
    cursor.executemany('INSERT INTO ' + table + ' VALUES(?,?)', pairs)


#2010 set: comma-separated, two header lines, IDs in the 1st and 4th fields
_load_ohnolog_pairs('ohnologs_2010', '(id1,id2)', 'ohnologs_2010.csv', ',', (0,3), 2)
#Singh sets: tab-separated, one header line, IDs in the first two fields
_load_ohnolog_pairs('ohnologs_singh1', '(id1,id2)', 'hsapiens.Pairs.Relaxed.2R.txt', '\t', (0,1), 1)
_load_ohnolog_pairs('ohnologs_singh2', '(id1,id2)', 'hsapiens.Pairs.Intermediate.2R.txt', '\t', (0,1), 1)
_load_ohnolog_pairs('ohnologs_singh3', '(id1,id2)', 'hsapiens.Pairs.Strict.2R.txt', '\t', (0,1), 1)
#pairs written by the block search (gapAllowed 8, minBlockSize 3): no header
_load_ohnolog_pairs('ohnologs_2020_8', '(id1 TEXT, id2 TEXT)', 'ohnologsWholeGenome_gapAllowed8_minBlock3.txt', '\t', (0,1), 0)
db.commit()

In [None]:
# entry of all genes to be considered, the straightforward downloadable-from-Ensembl features
#
# Builds one gene_features row per gene from two tab-separated exports plus one REST
# request for the CDS (needed for GC3). Genes whose CDS cannot be fetched are skipped.

headers = {'Content-Type':'application/json'}
cursor.execute('DROP TABLE IF EXISTS gene_features')
cursor.execute('''CREATE TABLE IF NOT EXISTS gene_features(id TEXT, trans_id TEXT, prot_id TEXT, name TEXT,
                                                           chrom TEXT, start TEXT, end TEXT, strand TEXT, TSS INTEGER,
                                                           genLen INTEGER, cdsLen INTEGER, transLen INTEGER,
                                                           transCount INTEGER, intCount INTEGER, intCov REAL, intAvg REAL,
                                                           gc REAL, gc3 REAL, domains INTEGER, u_domains INTEGER)''')


def _read_feature_rows(path):
    """Group the lines of a tab-separated export by gene ID (first column).

    Prints the header fields and returns gene -> list of rows, where a row is the
    remaining columns of one line. A gene has several rows (multiple transcripts).
    """
    rowsByGene = {}
    with open(path,'r') as file:
        print(file.readline().strip('\n').split('\t'))
        for line in file:
            line = line.strip('\n').split('\t')
            rowsByGene.setdefault(line[0], []).append(line[1:])
    return rowsByGene


def _fetch_cds_record(transID):
    """Return the decoded REST reply for the CDS of `transID`, or None.

    Any exception (connection trouble, undecodable body) is printed and the
    request is repeated once after 5 seconds; None means both attempts failed.
    """
    url = 'https://rest.ensembl.org/sequence/id/'+transID+'?type=cds'
    for attempt in range(2):
        try:
            return requests.get(url,headers=headers).json()
        except Exception as e: #connection errors, try waiting a little and going again, but give up after that
            print(e)
            if attempt == 0:
                time.sleep(5)
    return None


geneMultiValDict = _read_feature_rows('ensemblFeaturesv99.txt')
geneMultiValDict2 = _read_feature_rows('ensemblFeaturesv99_2.txt')

insert = 'INSERT INTO gene_features VALUES(' + ','.join(['?' for x in range(20)]) + ')'
for gene in tqdm(geneMultiValDict):
#         row columns, file 1: trans, prot, chr, start, end, tss, tlen, strand, name, clen, tcount, erank, estart, eend
#         row columns, file 2: gc content, pfam domain id, protein id
    rows = geneMultiValDict[gene]
    firstRow = rows[0] #gene-level values are taken from the gene's first row
    chrom = firstRow[2]
    tss = firstRow[5]
    strand = firstRow[7]
    name = firstRow[8]
    transCount = firstRow[10]
    #get genomic length
    start = min([int(firstRow[4]),int(firstRow[3])])
    end = max([int(firstRow[4]),int(firstRow[3])])
    genLen = abs(end-start)
    #get protein for longest CDS
    transPlusLen = [(x[0],int(x[6])) for x in rows]
    protPlusLen = [(x[1],int(x[9])) for x in rows if x[1] != '']
    longestProt, longestProtLen = max(protPlusLen, key=lambda x: x[1])

    gc = geneMultiValDict2[gene][0][0]
    domainList = [x[1] for x in geneMultiValDict2[gene] if x[2] == longestProt]
    domainCount = len(domainList)
    uniqDomainCount = len(set(domainList))

# also any that need the API -gc3
    #Transcripts are tried from longest to shortest until one has a CDS sequence.
    #FIX: the old fallback compared (id, length) tuples with a list of ID strings, so it
    #never excluded anything and asked for the same missing transcript over and over.
    #dict() keeps one entry per transcript in first-appearance order; the stable sort keeps
    #that order among equal lengths (presumes tlen is identical on all rows of a transcript).
    candidates = sorted(dict(transPlusLen).items(), key=lambda x: x[1], reverse=True)
    seq = None
    for transID, transLen in candidates:
        res = _fetch_cds_record(transID)
        if res is None: #still no usable reply after the retry: give up on this gene
            print(gene,': failed to fetch sequence')
            break
        if 'seq' in res: #transcripts the service does not return a sequence for have no 'seq' key
            longestTrans, longestTransLen, seq = transID, transLen, res['seq']
            break
    if seq is None:
        continue
    pos3 = seq[2::3] #every third base, starting at the third
    if len(pos3) == 0: #fewer than three bases: GC3 is undefined
        continue
    gc3 = ((pos3.count('G') + pos3.count('C'))/len(pos3))*100

    #intron avg len and coverage for the transcript whose sequence was used
    #get all exon coords for that transcript, ordered by start
    exonList = sorted([(int(x[12]),int(x[13])) for x in rows if x[0] == longestTrans], key=lambda x: x[0])
    #intron length = distance from one exon's end to the next exon's start
    intLens = [abs(nextExon[0]-prevExon[1]) for prevExon, nextExon in zip(exonList, exonList[1:])]
    intCount = len(intLens)
    intCov = sum(intLens)/genLen
    #single-exon genes have no introns; nan is what np.mean([]) gave, minus the warning
    intAvg = np.mean(intLens) if intLens else float('nan')

    cursor.execute(insert,(gene,longestTrans,longestProt,name,chrom,start,end,strand,tss,genLen,longestProtLen,int(longestTransLen),
                        transCount,intCount,intCov,float(intAvg),float(gc),gc3,domainCount,uniqDomainCount))

    db.commit()

In [None]:
# Number of motif features overlapping each gene extended by 1 kb on both sides
# (Ensembl REST overlap endpoint, feature=motif), stored in gene_features.motif_number_1k.
# Only rows whose count is still NULL are requested, so an interrupted run can be resumed.
existingCols = [col[1].lower() for col in cursor.execute('PRAGMA table_info(gene_features)').fetchall()]
if 'motif_number_1k' not in existingCols: #ADD COLUMN raises if the column already exists (cell re-run)
    cursor.execute('''ALTER TABLE gene_features ADD COLUMN motif_number_1k INTEGER''')
cursor.execute('''SELECT id FROM gene_features WHERE motif_number_1K IS NULL''')
ids = [x[0] for x in cursor.fetchall()]

counter = 0
t = time.time()
headers = { "Content-Type" : "application/json"}

for i in ids:
    counter += 1
    cursor.execute('''SELECT chrom, start, end FROM gene_features WHERE id ==?''', (i,))
    l = cursor.fetchall()[0]
    #clamp the window start at 1 so a gene near the chromosome start does not produce a 0/negative coordinate
    loc = str(l[0]) + ':' + str(max(1, int(l[1])-1000)) + '-' + str(int(l[2])+1000)
    url = "http://rest.ensembl.org/overlap/region/human/" + loc + "?feature=motif"
    r = requests.get(url, headers=headers).json()
    #A debugging print(r)/break used to sit here; it stopped the loop after the first
    #request, so none of the UPDATE statements below could ever run.
    if not isinstance(r, list):
        #not a list of features (e.g. an error payload): leave the count NULL so it is
        #retried on the next run, instead of storing the payload's length as a count
        print(i, ': motif request failed:', r)
        continue
    cursor.execute('''UPDATE gene_features SET motif_number_1k =? WHERE id == ?''', (len(r), i)) #len([]) == 0 covers "no motifs"
#     time.sleep(0.07)
    if counter% 200 == 0:
        print(counter, time.time()-t, 'seconds since started')

In [None]:
#evolution rate
# Load the macaque ortholog table (five tab-separated columns after a header row) and
# store, per human gene, the mean dN/dS over its ortholog rows in gene_features.evolRate.
cursor.execute('CREATE TABLE IF NOT EXISTS macaqueOrthologs(id TEXT, m_id TEXT, orthoType TEXT, dN REAL, dS REAL)')
with open('macaqueOrthoV99.txt','r') as ortho_file:
    ortho_file.readline()  # header row
    for record in ortho_file:
        human_id, macaque_id, ortho_type, dn, ds = record.strip('\n').split('\t')
        cursor.execute('INSERT INTO macaqueOrthologs VALUES(?,?,?,?,?)',
                       (human_id, macaque_id, ortho_type, dn, ds))
db.commit()

cursor.execute('ALTER TABLE gene_features ADD COLUMN evolRate REAL')
# Only ortholog rows with dS >= 0.01 enter the average.
cursor.execute('''UPDATE gene_features 
                SET evolRate = 
               (SELECT AVG(dN/dS) FROM
                   macaqueOrthologs
                   WHERE gene_features.id == macaqueOrthologs.id AND dS >= 0.01)''')
db.commit()

In [None]:
# other features from papers
# expression - GTEX medians and other expression sources (HBDR and FANTOM)
def _add_tpm_columns(handle, wanted_ids, exp_dict, keep_column=None, header_strip=None):
    """Append (column name, value) pairs from one TPM table to exp_dict.

    The table is expected to have four preamble lines, then a tab-separated header,
    then one row per gene with the gene id in the first column and values from the
    third column onwards (empty cells are read as 0).

    handle       -- open text file positioned at its first line
    wanted_ids   -- container of gene ids to keep (a set keeps the lookup cheap)
    exp_dict     -- dict gene id -> list of (column name, value); updated in place
    keep_column  -- predicate on the header name selecting columns to keep;
                    None keeps every column from the third one onwards
    header_strip -- argument passed to str.strip for the header line
                    (None strips all surrounding whitespace)

    Returns the list of selected header indices, or None when keep_column is None.
    """
    for _ in range(4):
        handle.readline()
    header = handle.readline().strip(header_strip).split('\t')
    if keep_column is None:
        kept = None
    else:
        # enumerate instead of header.index(name): index() would return the first
        # position again for a repeated column name
        kept = [idx for idx, col_name in enumerate(header) if keep_column(col_name)]
    for raw in handle:
        fields = raw.strip('\n').split('\t')
        gene = fields[0]
        if gene not in wanted_ids:
            continue
        values = fields[0:2]
        values.extend([0 if x == '' else float(x) for x in fields[2:]])
        columns = kept if kept is not None else range(2, len(values))
        for idx in columns:
            exp_dict.setdefault(gene, []).append((header[idx], values[idx]))
    return kept

cursor.execute("SELECT id FROM gene_features")
# a set, because membership is tested once per row of every expression file
ids = set(x[0] for x in cursor.fetchall())
print('Reading in expression datasets')
with open('GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_median_tpm.gct', 'r') as med_file, \
    open('devExp.tpms.tsv','r') as dev_file, open('devExpFantom.tpms.tsv', 'r') as f_dev_file, \
    open('brainDev.tpms.tsv', 'r') as brain_file:
    exp_dict = {}
    # GTEx: two preamble lines, then the header; values start at the third column
    for _ in range(2):
        med_file.readline()

    header = med_file.readline().strip().split('\t')
    for line in med_file:
        line = line.strip().split('\t')
        gene = re.search(r'(.*)\.(.*)', line[0]).group(1) #remove version number on gene ID
        if gene in ids:
            # a later row with the same unversioned id replaces the earlier one
            exp_dict[gene] = [x for x in zip(header[2:],[float(x) for x in line[2:]])]

    # FANTOM table: only columns whose name contains 'fetal'
    f_list = _add_tpm_columns(f_dev_file, ids, exp_dict, keep_column=lambda name: 'fetal' in name)
    # brain development table: every value column
    _add_tpm_columns(brain_file, ids, exp_dict, header_strip='\n')
    # development table: only columns whose name contains 'conception'
    d_list = _add_tpm_columns(dev_file, ids, exp_dict, keep_column=lambda name: 'conception' in name)
    print(d_list)

for k in [x for x in exp_dict.keys()][:5]:
        print(k,exp_dict[k])
# For every gene with expression data, store the largest value seen (max_exp) and the
# name of the column it came from (max_tissue). Genes without data are left NULL.
count = 0
print('Updating table now! Expression values')
cursor.execute('ALTER TABLE gene_features ADD COLUMN max_exp REAL')
cursor.execute('ALTER TABLE gene_features ADD COLUMN max_tissue TEXT')
cursor.execute('''SELECT id FROM gene_features''')
ids = [row[0] for row in cursor.fetchall()]
timer = time.time()
for gene_id in ids:
    if gene_id in exp_dict:
        mT, mE = max(exp_dict[gene_id], key=lambda pair: pair[1])
        cursor.execute('''UPDATE gene_features SET max_exp = ?, max_tissue = ? WHERE id == ?''', (mE,mT,gene_id))
    # progress is reported for every gene, with or without expression data
    count += 1
    if count % 500 == 0:
        print(count, time.time()-timer)
db.commit()
# Specificity
def tau(expression_list):
    """Return the specificity score of an expression profile, or None if undefined.

    expression_list is a sequence of (label, value) pairs; only the value (index 1)
    is used. Each value is divided by the profile maximum and the score is
    sum(1 - value/max) / (number of values - 1), i.e. 0 when every value equals
    the maximum and 1 when a single value is non-zero.

    Returns None when the score cannot be computed: an empty profile, a maximum
    of zero, or a single-value profile (which is also printed for inspection).
    """
    values = [float(x[1]) for x in expression_list]
    if not values:
        # nothing to score; max() would raise on an empty list
        return(None)
    peak = max(values)
    try:
        scaled = [v/peak for v in values]
    except ZeroDivisionError:
        # all values are zero
        return(None)
    total = sum([1-v for v in scaled])
    try:
        res = total / (len(scaled)-1)
    except ZeroDivisionError:
        # a single value gives a denominator of zero
        print(scaled)
        return(None)
    return(res)
# Store tau() for every gene that reaches a value of at least 1 in some sample;
# genes that never reach 1 keep a NULL specificity.
cursor.execute('''ALTER TABLE gene_features ADD COLUMN specificity REAL''')
count = 0
timer = time.time()
for gene_id, samples in exp_dict.items():
    if any(float(sample[1]) >= 1 for sample in samples):
        cursor.execute('''UPDATE gene_features SET specificity = ? WHERE id == ?''', (tau(samples), gene_id))
    count += 1
    if count % 500 == 0:
        print(count,time.time()-timer)
db.commit()
print('Essentiality')
# CRISPR score - Wang et al
# essentiality: the lowest of the four per-cell-line scores is stored for each gene name
cursor.execute('ALTER TABLE gene_features ADD COLUMN ess REAL')
with open('Wang_CRISPR.csv', 'r') as crispr_file:
    crispr_file.readline()  # header row
    for record in crispr_file:
        fields = record.strip().split(',')
        geneName = fields[0]
        # columns 2, 4, 6 and 8 are read as the KBM7, K562, Jiyoye and Raji scores
        scoreList = [float(fields[col]) for col in (2, 4, 6, 8)]
        ess = min(scoreList)
        cursor.execute('''UPDATE gene_features SET ess = ? WHERE name == ?''', 
                       (ess, geneName))
        db.commit()
# PPIs: human interactome data HuRI_2_10_2019.tsv
# Load the interaction pairs, then count for each gene the rows naming it on either side.
print('PPIs')
cursor.execute('CREATE TABLE IF NOT EXISTS interactions(id1 TEXT,id2 TEXT)')
cursor.execute('ALTER TABLE gene_features ADD COLUMN PPIs REAL')
with open('HuRI_2_10_2019.tsv','r') as ppi_file:
    ppi_file.readline()  # header row
    for record in ppi_file:
        id1, id2 = record.strip('\n').split('\t')[:2]
        cursor.execute('INSERT INTO interactions VALUES(?,?)',(id1,id2))
db.commit()
cursor.execute('UPDATE gene_features SET PPIs = (SELECT COUNT(*) FROM interactions WHERE (id1 == id) OR (id2 == id))')
db.commit()
print('Pop based essentiality')
# # EvoTol
# One web request per gene name; the score is taken from the first row of the returned
# 'table', and stays NULL when the table is empty or the connection fails.
cursor.execute('ALTER TABLE gene_features ADD COLUMN EvoTol REAL')
cursor.execute('SELECT name from gene_features')
names = [row[0] for row in cursor.fetchall()]
for gene_name in names:
    url = 'http://www.evotol.co.uk/genes_json?genes=' + gene_name + '&ont=all&thresh=1'
    try:
        table = requests.get(url).json()['table']
        score = None if table == [] else table[0][1]
    except ConnectionError:
        score = None
    time.sleep(0.6)  # pause between requests
    cursor.execute('UPDATE gene_features SET EvoTol = ? WHERE name == ?',(score,gene_name))
db.commit()
# LoFTool
# comma-separated; gene name in column 0, the stored value in column 3
cursor.execute('ALTER TABLE gene_features ADD COLUMN loftool_percentile REAL')
with open('loftool_scores_table.csv','r') as loftool_file:
    loftool_file.readline()  # header row
    for record in tqdm(loftool_file):
        fields = record.strip('\n').split(',')
        name, score = fields[0], float(fields[3])
        cursor.execute('UPDATE gene_features SET loftool_percentile = ? WHERE name == ?',(score,name))
db.commit()
# Phi
# space-separated after eight preamble lines; gene name in column 0, score in column 1
cursor.execute('ALTER TABLE gene_features ADD COLUMN Phi REAL')
with open('phi_scores.txt','r') as phi_file:
    for _ in range(8):
        phi_file.readline()
    for record in phi_file:
        fields = record.strip('\n').split(' ')
        name, score = fields[0], fields[1]
        cursor.execute('UPDATE gene_features SET Phi = ? WHERE name == ?',(score,name))
db.commit()
# pLI and missense Z score in one file from gnomad
for new_col in ('syn_Z_score', 'mis_Z_score', 'lof_Z_score', 'pLI_score'):
    cursor.execute('ALTER TABLE gene_features ADD COLUMN ' + new_col + ' REAL')
headers = {'Content-Type':'application/json'}
# Column layout used below (0-based indices into the tab-separated row):
#   63         gene id
#   31, 32, 33 Z scores for synonymous, missense and LoF
#   20         pLI
with open('gnomad.v2.1.1.lof_metrics.by_gene.txt','r') as gnomad_file:
    gnomad_file.readline()  # header row
    for record in gnomad_file:
        fields = record.strip('\n').split('\t')
        gene = fields[63]
        z_syn, z_miss, z_lof, pli = fields[31], fields[32], fields[33], fields[20]

        cursor.execute('UPDATE gene_features SET syn_Z_score = ?, mis_Z_score = ?,lof_Z_score = ?, pLI_score = ? WHERE id == ?',(z_syn,z_miss,z_lof,pli,gene))
db.commit()
# Shet
# comma-separated; first two columns are gene name and s_het estimate
cursor.execute('ALTER TABLE gene_features ADD COLUMN s_het REAL')
with open('shet_estimates.csv','r') as shet_file:
    shet_file.readline()  # header row
    for record in shet_file:
        name, shet = record.strip('\n').split(',')[:2]
        cursor.execute('UPDATE gene_features SET s_het = ? WHERE name == ?',(shet,name))
db.commit()
# RVIS
# tab-separated; gene name in column 0, score in column 2; rows whose column 16 is 'Y' are skipped
cursor.execute('ALTER TABLE gene_features ADD COLUMN RVIS REAL')
with open('GenicIntolerance_v3_12Mar16.txt','r') as rvis_file:
    rvis_file.readline()  # header row
    for record in tqdm(rvis_file):
        fields = record.strip('\n').split('\t')
        name, score, excl = fields[0], fields[2], fields[16]
        if excl != 'Y':
            cursor.execute('UPDATE gene_features SET RVIS = ? WHERE name == ?',(score,name))
db.commit()
print('Done!')

In [None]:
#features requiring external calculations
# CAI
# Step 1: download the CDS of every gene's chosen transcript into cdsSequence.fa.
# The download is resumable: ids already present in the file are skipped and new
# records are appended.
cursor.execute('ALTER TABLE gene_features ADD COLUMN CAI REAL')
cursor.execute('SELECT id,trans_id FROM gene_features')
cList = cursor.fetchall()
h= {"Content-Type" : "text/plain"}
print('Fetching sequences')
done = set()
try:
    with open('cdsSequence.fa','r') as file:
        for line in file:
            if line.startswith('>'):
                done.add(line.strip('>').strip('\n'))
except FileNotFoundError:
    # first run: nothing has been downloaded yet
    pass
counter = 0
error = []
# append mode: opening with 'w' would truncate the file and throw away exactly the
# sequences that the `done` check then refuses to download again
with open('cdsSequence.fa','a') as file:
    for ID,transID in cList:
        if ID in done:
            continue
        url = "https://rest.ensembl.org/sequence/id/" + transID + "?type=cds;multiple_sequences=1"
        try:
            r = requests.get(url, headers=h).text
        except Exception as e:
            print("Error {}".format(e))
            error.append(ID)
            # without this the previous gene's response (or an undefined name)
            # would be written under this id
            continue

        #get longest CDS, multiple sequences are returned separated by newline
        cds = max(r.split('\n'), key=len)
        file.write('>'+ID+'\n')
        file.write(cds+'\n')
        counter += 1
        time.sleep(0.07)
        if counter%100 == 0:
            print(counter,'done')
print('IDs not completed:')
print(error)
# Step 2: run the external CAIcal script on cdsSequence.fa and copy its per-sequence
# result (second tab-separated column of totalCAI.txt) into gene_features.CAI.
print('Calculating CAI')
cmd = 'perl CAIcal_ECAI_v1.4_source/CAIcal_ECAI_v1.4.pl -f cdsSequence.fa -h CAIcal_ECAI_v1.4_source/human -o1 totalCAI.txt -o2 totalRandSeq.txt -o3 totalExpectCAI.txt'.split(' ')
p = Popen(cmd ,stdin=PIPE, stdout=PIPE, stderr=PIPE)
out, err = p.communicate()
if p.returncode != 0:
    # surface the script's own error output instead of failing later on a missing file
    print(err.decode(errors='replace'))
print('Updating table')
# ids that already HAVE a value are the ones to skip; selecting "CAI IS NULL" here
# skipped every row that still needed one, so nothing was ever written
cursor.execute('SELECT id FROM gene_features WHERE CAI IS NOT NULL')
doneIDs = set(x[0] for x in cursor.fetchall())
with open('totalCAI.txt','r') as file:
    file.readline()
    for line in tqdm(file):
        line = line.strip('\n').strip('>').split('\t')
        if line[0] in doneIDs:
            continue
        if 'Error' in line[1]:
            # keep a visible marker for sequences the script could not score
            line[1] = 'Error-check'
        cursor.execute('UPDATE gene_features SET CAI = ? WHERE id == ?',(line[1],line[0]))
db.commit()
print()
#IDRs
# For each gene: fetch the protein sequence of its transcript, run IUPred2A ('long' mode)
# on it, and store the fraction of positions scoring >= 0.5 in gene_features.IntDisProp.
cursor.execute('ALTER TABLE gene_features ADD COLUMN IntDisProp REAL')
cursor.execute('SELECT id,trans_id FROM gene_features WHERE IntDisProp IS NULL')
idList = list(cursor.fetchall())
print('Calculating intrinsic disorder')
for i,ID in idList:
    url = "https://rest.ensembl.org/sequence/id/" + ID + "?type=protein;multiple_sequences=1"
    r = requests.get(url, headers=h).text
    # several sequences may come back separated by newlines; keep the longest
    prot = max(r.split('\n'), key=len)
    protLen = len(prot)
    with open('tempProt.fa','w') as file:
        file.write('>' + i + '\n' + prot +'\n')
    cmd = 'python IUPRED/iupred2a.py tempProt.fa long'.split(' ')
    p = Popen(cmd,stdin=PIPE, stdout=PIPE, stderr=PIPE)
    out, err = p.communicate()
    # data rows are tab-separated with the score in the third column;
    # '#' comment rows and empty rows are ignored
    rows = [row.split('\t') for row in out.decode().split('\n') if not row.startswith('#')]
    disCount = sum(1 for fields in rows if fields != [''] and float(fields[2]) >= 0.5)
    cursor.execute('UPDATE gene_features SET IntDisProp = ? WHERE id == ?',(disCount/protLen,i))
# NOTE(review): this statement touches the paralogy table, which is created in a later
# cell of this notebook - confirm the intended execution order.
cursor.execute('UPDATE paralogy SET preVertebrata = "F" WHERE preVertebrata IS NULL')
db.commit()
print('Done!')

# Duplication category assignment

In [None]:
# creating of paralog table
# Every row of the paralog export becomes one paralogy row of one of three kinds:
#   - duplication dated to a pre-vertebrate node  -> preVertebrata = 'T'
#   - no paralog listed                           -> singleton = 'T'
#   - any other duplication                       -> plain pair row
cursor.execute('CREATE TABLE IF NOT EXISTS paralogy(id TEXT, para TEXT, age TEXT,dN REAL,dS REAL,singleton TEXT, retro TEXT, preVertebrata TEXT, ohno_2010 TEXT, ohno_singh_1 TEXT, ohno_singh2 TEXT, ohno_sing3 TEXT, ohno_2020 TEXT)')
old_tax_list = ['Opisthokonta','Animalia','Bilateria', 'Chordata'] 
new_tax_list = ['Vertebrata','Gnathostomata','Euteleostomi', 'Sarcopterygii', 'Tetrapoda', 
                'Amniota', 'Mammalia', 'Theria', 'Eutheria','Boreoeutheria',  
                'Euarchontoglires', 'Primates', 'Haplorrhini', 'Simiiformes',
                'Catarrhini', 'Hominoidea', 'Hominidae','Homininae', 'Homo sapiens']
insList, insListPreV, insListSing = [],[],[]
with open('ensemblParalogsv99_2.txt','r') as file:
    file.readline()
    for line in tqdm(file):
        line = line.strip('\n').split('\t')
        i, p, age,dn,ds = line
        if age in old_tax_list:
            insListPreV.append((i,p,age,dn,ds,'T')) # pre-vertebrate duplication
        elif p == '': #singleton - no paralog
            insListSing.append((i,'T'))
        else:
            insList.append((i,p,age,dn,ds))
print('Running inserts...')
cursor.executemany('INSERT INTO paralogy(id,para,age,dN,dS,preVertebrata) VALUES(?,?,?,?,?,?)',insListPreV)
cursor.executemany('INSERT INTO paralogy(id,singleton) VALUES (?,?)',insListSing)
cursor.executemany('INSERT INTO paralogy(id,para,age,dN,dS) VALUES(?,?,?,?,?)',insList)
# The category assignment later in this notebook compares preVertebrata against 'F', so
# the flag must not stay NULL for the rows inserted without it. Setting it here makes
# this cell self-sufficient; the statement is idempotent.
cursor.execute('UPDATE paralogy SET preVertebrata = "F" WHERE preVertebrata IS NULL')
db.commit()

In [None]:
#assign ohnolog status to each paralog pair in paralogy table
# For each ohnolog dataset, a paralogy row is flagged "T" when the dataset table contains
# the same pair in either orientation (id/para or para/id).
# NOTE(review): ohnologs_2020 is matched on columns o.id / o.id2 while the other four
# tables use o.id1 / o.id2 - confirm this matches the ohnologs_2020 schema.
cursor.execute('''UPDATE paralogy 
                   SET ohno_2020 == "T" 
                       WHERE EXISTS 
                           (SELECT * FROM ohnologs_2020 AS o 
                               WHERE (o.id == paralogy.id AND o.id2 == paralogy.para) OR (o.id == paralogy.para AND o.id2 == paralogy.id))''')
cursor.execute('''UPDATE paralogy 
                   SET ohno_2010 == "T" 
                       WHERE EXISTS 
                           (SELECT * FROM ohnologs_2010 AS o 
                           WHERE (o.id1 == paralogy.id AND o.id2 == paralogy.para) OR (o.id1 == paralogy.para AND o.id2 == paralogy.id))''')
cursor.execute('''UPDATE paralogy 
                   SET ohno_singh_1 == "T" 
                       WHERE EXISTS 
                           (SELECT * FROM ohnologs_singh1 AS o 
                               WHERE (o.id1 == paralogy.id AND o.id2 == paralogy.para) OR (o.id1 == paralogy.para AND o.id2 == paralogy.id))''')
cursor.execute('''UPDATE paralogy 
                   SET ohno_singh2 == "T" 
                       WHERE EXISTS 
                           (SELECT * FROM ohnologs_singh2 AS o 
                               WHERE (o.id1 == paralogy.id AND o.id2 == paralogy.para) OR (o.id1 == paralogy.para AND o.id2 == paralogy.id))''')
cursor.execute('''UPDATE paralogy 
                   SET ohno_sing3 == "T" 
                       WHERE EXISTS 
                           (SELECT * FROM ohnologs_singh3 AS o 
                               WHERE (o.id1 == paralogy.id AND o.id2 == paralogy.para) OR (o.id1 == paralogy.para AND o.id2 == paralogy.id))''')
#add in F for not ohnologous pairs because I don't want to re-write the queries that use that
# Every row still NULL after the passes above (including singleton rows) becomes "F",
# so each flag column ends up holding only "T" or "F".
cursor.execute('''UPDATE paralogy 
                   SET ohno_2020 == "F" 
                    WHERE ohno_2020 IS NULL''')
cursor.execute('''UPDATE paralogy 
                   SET ohno_2010 == "F" 
                    WHERE ohno_2010 IS NULL''')
cursor.execute('''UPDATE paralogy 
                   SET ohno_singh_1 == "F" 
                    WHERE ohno_singh_1 IS NULL''')
cursor.execute('''UPDATE paralogy 
                   SET ohno_singh2 == "F" 
                    WHERE ohno_singh2 IS NULL''')
cursor.execute('''UPDATE paralogy 
                   SET ohno_sing3 == "F" 
                    WHERE ohno_sing3 IS NULL''')

# assign retroduplication status for all pairs
# Each unclassified paralog pair gets paralogy.retro set to one of:
#   "T"  one gene has no introns and the other has at least three
#   "F"  neither gene is intronless, or the pair is low-intron but its neighbourhoods
#        contain a collinear pair of paralogs (local synteny)
#   "I"  low-intron pair without local synteny (inconclusive)
# Pairs whose genes are missing from gene_features are left NULL.

def _gene_order(rows, chrom):
    """Return the ids of the genes in rows that lie on chrom, ordered along the chromosome.

    rows are (id, start, end, chrom) tuples. start may be larger than end, so the
    smaller of the two is used as the sort key. chrom is compared as text so that
    an integer chromosome value read from the database still matches its string form.
    """
    located = [(gid, min(start, end)) for gid, start, end, rowChrom in rows if str(rowChrom) == chrom]
    return [gid for gid, _ in sorted(located, key=lambda x: x[1])]

def _flanking_genes(orderList, gene, flank=5):
    """Return up to `flank` genes on each side of gene in orderList (gene itself excluded).

    Slicing clips at both chromosome ends, so genes near an end simply get fewer
    neighbours. Raises ValueError if gene is not in orderList.
    """
    ind = orderList.index(gene)
    return orderList[max(0, ind-flank):ind] + orderList[ind+1:ind+1+flank]

def _has_collinear_match(neighbours1, neighbours2, matchList):
    """True if two adjacent genes of neighbours1 have paralogs adjacent in neighbours2.

    matchList holds the paralogous (a, b) pairs found between the two neighbourhoods.
    """
    # Built directly from matchList; the earlier version called a `flatten` helper that
    # is not defined in this part of the notebook (presumably a one-level flatten - confirm).
    matched = {g for pair in matchList for g in pair}
    for gene1, gene2 in zip(neighbours1, neighbours1[1:]):
        if not (gene1 in matched and gene2 in matched):
            continue
        # get all the matches for these two genes that sit in neighbours2 specifically;
        # genes in tandem can appear in both neighbourhoods, hence both orientations
        gene1Matches = [x[0] for x in matchList if (x[1] == gene1 and x[0] in neighbours2)]
        gene1Matches.extend([x[1] for x in matchList if (x[0] == gene1 and x[1] in neighbours2)])

        gene2Matches = [x[0] for x in matchList if (x[1] == gene2 and x[0] in neighbours2)]
        gene2Matches.extend([x[1] for x in matchList if (x[0] == gene2 and x[1] in neighbours2)])

        # are their matches next to each other in the other neighbourhood?
        for gene1Match, gene2Match in itertools.product(gene1Matches,gene2Matches):
            if abs(neighbours2.index(gene1Match) - neighbours2.index(gene2Match)) == 1:
                return True
    return False

cursor.execute('''SELECT id, para FROM paralogy WHERE (singleton != "T" OR singleton IS NULL) AND retro IS NULL''')
paraRes = cursor.fetchall()
print(len(paraRes))
# built once: rebuilding set(paraRes) for every candidate neighbour pair was the
# dominant cost of the synteny test
paraSet = set(paraRes)

cursor.execute('SELECT id, chrom FROM gene_features')
chromDict = dict(cursor.fetchall())
for key in chromDict:
    chromDict[key] = str(chromDict[key])

startTime = time.time()
doneCount = 0

# work by pair
syntenyCases = 0
for i, p in paraRes:

    doneCount += 1
    cursor.execute('SELECT intCount,id FROM gene_features WHERE id == ? OR id == ?',(i,p))
    res = cursor.fetchall()
    try:
        count1 = [x[0] for x in res if x[1] == i][0]
        count2 = [x[0] for x in res if x[1] == p][0]
    except IndexError:
        continue #missing id, likely non-coding paralog

#     check if meets the retro test off the bat: one has 0 introns and the other has at least 3
    if (count1 == 0 or count2 == 0) and (count1 >= 3 or count2 >= 3): 
        cursor.execute('UPDATE paralogy SET retro == "T" WHERE id == ? AND para == ?' ,(i,p))
        continue

#     if doesn't meet the retro requirement but needs more investigation due to low intron counts: synteny
    elif (count1 == 0 and count2 == 0) or ((count1 == 0 or count2 == 0) and (count1 < 3 or count2 < 3)):  
        # double zero pairs, or one zero and the other less than 3
        syntenyCases += 1
        try:
            c1 = chromDict[i]
            c2 = chromDict[p]
        except KeyError:
            continue

        cursor.execute('SELECT id, start, end, chrom FROM gene_features WHERE chrom == ? or chrom == ?',(c1,c2))
        res = cursor.fetchall()
        orderList1 = _gene_order(res, c1)
        orderList2 = _gene_order(res, c2) #ordered lists of genes on the relevant chromosomes

        try:
            # Neighbourhoods are computed fresh for every pair. Previously a gene within
            # five positions of a chromosome end only had one side assigned and silently
            # reused the other side from whichever pair was processed before it.
            neighbours1 = _flanking_genes(orderList1, i)
            neighbours2 = _flanking_genes(orderList2, p)
        except ValueError:
            # gene not found on its chromosome: report and leave the pair unclassified
            print(i, p, type(c1), doneCount)
            continue

        # paralogous pairs between the two neighbourhoods, each recorded once
        matchList = []
        for a,b in itertools.product(neighbours1,neighbours2):
            if (a,b) in paraSet or (b,a) in paraSet:
                if not((a,b) in matchList or (b,a) in matchList):
                    matchList.append((a,b))

        if _has_collinear_match(neighbours1, neighbours2, matchList):
            cursor.execute('UPDATE paralogy SET retro = "F" WHERE id == ? AND para == ?', (i,p))
        else:
#           doesn't meet retro requirement and lacks synteny, inconclusive
            cursor.execute('UPDATE paralogy SET retro = "I" WHERE id == ? AND para == ?', (i,p))
    else:
#         is a tandem pair: too many introns in both to call it anything else
        cursor.execute('UPDATE paralogy SET retro = "F" WHERE id == ? AND para == ?', (i,p))


    if doneCount % 1000 == 0:
        print(doneCount, round(time.time()-startTime,2))
        print(syntenyCases)
        db.commit()
db.commit()

In [None]:
# sorting into duplicate categories
# One category column per ohnolog dataset. For every gene the categories are derived from
# all of its paralogy rows:
#   "singleton"        the gene has a singleton row
#   "Singleton (Vert)" every duplication is pre-vertebrate and not an ohnolog pair
#   "WGD"              only ohnolog pairs remain after dropping pre-vertebrate non-ohnologs
#   "mix" / "mix-retros"  ohnolog and non-ohnolog pairs (the latter if any pair is retro "T")
#   "SSD" / "Retrogene" / "mix - no ohnos"  no ohnolog pairs; split by retro status
cursor.execute('ALTER TABLE gene_features ADD COLUMN dupCat_2010 TEXT')
cursor.execute('ALTER TABLE gene_features ADD COLUMN dupCat_singh1 TEXT')
cursor.execute('ALTER TABLE gene_features ADD COLUMN dupCat_singh2 TEXT')
cursor.execute('ALTER TABLE gene_features ADD COLUMN dupCat_singh3 TEXT')
cursor.execute('ALTER TABLE gene_features ADD COLUMN dupCat_2020 TEXT')

# cursor.execute('UPDATE gene_features SET dupCat_2010 = ?',(None,))
# cursor.execute('UPDATE gene_features SET dupCat_singh1 = ?',(None,))
# cursor.execute('UPDATE gene_features SET dupCat_singh2 = ?',(None,))
# cursor.execute('UPDATE gene_features SET dupCat_singh3 = ?',(None,))
# cursor.execute('UPDATE gene_features SET dupCat_2020 = ?',(None,))

count = 0
t = time.time()

# cursor.execute('CREATE INDEX gene ON gene_features(id)')
cursor.execute('SELECT id FROM gene_features WHERE dupCat_2020 IS NULL')
idList = [x[0] for x in cursor.fetchall()]
for i in idList:
    count += 1
    #fetch ohnolog/singleton/retro/preVert status
    cursor.execute('SELECT ohno_2010, ohno_singh_1, ohno_singh2, ohno_sing3, ohno_2020, singleton,retro, preVertebrata FROM paralogy WHERE id == ?', (i,))
    res = cursor.fetchall()
    
    # per dataset: one (ohnolog flag, preVertebrata flag) tuple per paralogy row of this gene
    ohno2010Res = [(x[0],x[7]) for x in res]
    ohnoS1Res = [(x[1],x[7]) for x in res]
    ohnoS2Res = [(x[2],x[7]) for x in res]
    ohnoS3Res = [(x[3],x[7]) for x in res]
    ohno2020Res = [(x[4],x[7]) for x in res]
    singRes = [x[5] for x in res]
    # retro status is shared by all datasets
    retroRes = [x[6] for x in res]
    
# Initial check for singletons and then classify depending on the types of duplications the gene is involved in

    if 'T' in set(singRes):
#         singleton
        cursor.execute('UPDATE gene_features SET dupCat_2010 = "singleton" WHERE id == ?',(i,))
        cursor.execute('UPDATE gene_features SET dupCat_singh1 = "singleton" WHERE id == ?',(i,))
        cursor.execute('UPDATE gene_features SET dupCat_singh2 = "singleton" WHERE id == ?',(i,))
        cursor.execute('UPDATE gene_features SET dupCat_singh3 = "singleton" WHERE id == ?',(i,))
        cursor.execute('UPDATE gene_features SET dupCat_2020 = "singleton" WHERE id == ?',(i,))
        
        continue

    
    for data, col in [(ohno2010Res,'dupCat_2010'),(ohnoS1Res,'dupCat_singh1'),
                      (ohnoS2Res,'dupCat_singh2'),(ohnoS3Res,'dupCat_singh3'),
                      (ohno2020Res, 'dupCat_2020')]:
        # note: all() is also True for a gene with no paralogy rows at all
        if all([x == ('F','T') for x in data]): #all dups are pre-Vert and not ohnos:
            cursor.execute('UPDATE gene_features SET '+col+' = "Singleton (Vert)" WHERE id== ?',(i,))
            continue
        #at this point I can probably just keep ohnologs and Vert lineage SSDs?
        # keep the ohnolog flag of rows that are ohnologs or vertebrate-lineage duplications;
        # this relies on preVertebrata holding 'F' (not NULL) for the latter
        data = [x[0] for x in data if (x[0] == 'T') or (x[1] == 'F')]
        if ('T' in set(data)) and ('F' in set(data)): # mix of ohno and SSD
            if 'T' in set(retroRes): #check if there are any retros in with the ohnologs before assigning as mixed
                cursor.execute('UPDATE gene_features SET '+col+' = "mix-retros" WHERE id== ?',(i,))
            else:
                cursor.execute('UPDATE gene_features SET '+col+' = "mix" WHERE id == ?',(i,))
        elif not ('F' in set(data)):
            # pure ohnolog
            cursor.execute('UPDATE gene_features SET '+col+' = "WGD" WHERE id == ?', (i,))
        # no ohnologs, check if any retro/inconclusive
        elif not ('T' in set(data)):
            if (not 'T' in set(retroRes)) and (not 'I' in set(retroRes)): #no retro dups or inconclusive ones
                cursor.execute('UPDATE gene_features SET '+col+' = "SSD" WHERE id == ?', (i,))
            elif (not 'F' in set(retroRes)) and (not 'I' in set(retroRes)): #all the duplications are retro dups
                cursor.execute('UPDATE gene_features SET '+col+' = "Retrogene" WHERE id == ?', (i,))
            else: #a mixture of SSDs and retro dups/inconclusive 
                cursor.execute('UPDATE gene_features SET '+col+' = "mix - no ohnos" WHERE id == ?', (i,))
      
    if count % 500 ==  0:
        print(count, round(time.time()-t,2))
        
db.commit() 

In [None]:
#integrating all datasets 2010, Singh relaxed and current
# dupCat_maj: the most frequent category across the three columns below
#             (Counter.most_common breaks ties by first occurrence, i.e. column order)
# dupCat_all: set only when all three columns agree
# NOTE(review): dupCat_2020_8 is not among the dupCat_* columns created in the category
# cell of this notebook - confirm it exists before running.
cursor.execute('ALTER TABLE gene_features ADD COLUMN dupCat_maj TEXT')
cursor.execute('ALTER TABLE gene_features ADD COLUMN dupCat_all TEXT')
incCols = ','.join(['dupCat_2010','dupCat_singh1','dupCat_2020_8'])
cursor.execute('SELECT id, ' + incCols + ' FROM gene_features')
for gene_id, *votes in cursor.fetchall():
    dupCats = tuple(votes)
    maj = Counter(dupCats).most_common(1)[0][0]
    cursor.execute('UPDATE gene_features SET dupCat_maj = ? WHERE id == ?',(maj,gene_id))
    if len(set(dupCats)) == 1:
        # unanimous
        cursor.execute('UPDATE gene_features SET dupCat_all = ? WHERE id == ?',(dupCats[0],gene_id))
db.commit()

# Duplicate age assignment

In [None]:
# age assignment - node and dS
# For every non-singleton gene, record the youngest and oldest duplication node among its
# paralogy rows (ranked with nodeDivAge) and the smallest and largest dS.

# node -> divergence age estimate; each value is the listed number divided by 50
nodeAges = [('Opisthokonta', 1105), ('Bilateria', 824), ('Chordata', 684),
            ('Vertebrata', 615), ('Gnathostomata', 473), ('Euteleostomi', 435),
            ('Sarcopterygii', 413), ('Tetrapoda', 352), ('Amniota', 312),
            ('Mammalia', 177), ('Theria', 159), ('Eutheria', 105),
            ('Boreoeutheria', 96), ('Euarchontoglires', 90), ('Primates', 74),
            ('Haplorrhini', 67), ('Simiiformes', 43), ('Catarrhini', 29.4),
            ('Hominoidea', 20.2), ('Hominidae', 16.8), ('Homininae', 9.1),
            ('Homo sapiens', 6.7)]
nodeDivAge = {node: age/50 for node, age in nodeAges}

# select all genes that have vertebrate duplications
cursor.execute('ALTER TABLE gene_features ADD COLUMN youngestNode TEXT')
cursor.execute('ALTER TABLE gene_features ADD COLUMN oldestNode TEXT')
cursor.execute('ALTER TABLE gene_features ADD COLUMN youngestdS REAL')
cursor.execute('ALTER TABLE gene_features ADD COLUMN oldestdS REAL')
cursor.execute('SELECT id FROM gene_features WHERE dupCat_2010 != "singleton"')
idList = [row[0] for row in cursor.fetchall()]
for gene_id in idList:
    cursor.execute('SELECT age,dS FROM paralogy WHERE id == ?',(gene_id,))
    rows = cursor.fetchall()
    dupAgeList = [row[0] for row in rows]
    dupDsList = [row[1] for row in rows if row[1] != '']
    if dupDsList:
        youngestDS, oldestDS = min(dupDsList), max(dupDsList)
    else:
        # no usable dS for this gene: only the nodes are recorded
        youngestDS = oldestDS = None
    youngestNode = min(dupAgeList, key=nodeDivAge.__getitem__)
    oldestNode = max(dupAgeList, key=nodeDivAge.__getitem__)
    cursor.execute('UPDATE gene_features SET youngestdS = ?, oldestdS = ?, youngestNode = ?, oldestNode = ? WHERE id == ?',(youngestDS, oldestDS, youngestNode,oldestNode,gene_id))
db.commit()

# Pairwise comparisons and plots

In [None]:
# function for grabbing data from database for each feature
def getData(feature, dupCat,dupData, transform=None, Filter = None):
    """Return the non-NULL values of one gene_features column for one duplicate category.

    feature   -- column (or SQL expression) to select
    dupCat    -- category as an SQL literal including its quotes, e.g. '"WGD"';
                 '"singleton"' also selects the "Singleton (Vert)" category
    dupData   -- name of the dupCat_* column to filter on
    transform -- None, 'log10', 'log10_zeroes' (adds half the smallest positive value
                 before taking log10), 'neg' or 'percent'
    Filter    -- optional extra SQL condition, appended with AND

    All arguments are concatenated into the query text, so they must come from
    trusted code, never from user input.

    Raises ValueError for an unknown transform.

    NOTE(review): the final filter keeps only truthy values, so results equal to 0
    (for example a raw count of 0, or log10 of 1) are dropped along with None -
    confirm that this is intended.
    """
    from math import log10
    if dupCat == '"singleton"':
        category = '(' + dupData + ' == "singleton" OR '+ dupData + ' == "Singleton (Vert)")'
    else:
        category = dupData + ' == ' + dupCat
    # NULLs are excluded in every case; the filtered non-singleton query used to omit
    # this, which let None reach the numeric transforms below
    query = 'SELECT ' + feature + ' FROM gene_features WHERE ' + category + ' AND NOT ' + feature + ' IS NULL'
    if Filter:
        query += ' AND ' + Filter
#     print(query)
    cursor.execute(query)
    res1 = [x[0] for x in cursor.fetchall()]
    if transform == 'log10':
        res = [log10(x) for x in res1]
    elif transform == 'log10_zeroes':
        # pseudo-count: half of the smallest positive value, computed once
        halfMin = min([y for y in res1 if y > 0])/2
        res = [log10(x+halfMin) for x in res1]
    elif transform == 'neg':
        res = [-x for x in res1]
    elif transform == 'percent':
        res = [x*100 for x in res1]
    elif not transform:
        res = res1
    else:
        raise ValueError('Transform "'+transform+'" is not defined')
    return [x for x in res if x]

In [None]:
#generic plotting code
# Layout prototype: four synthetic distributions drawn as coloured boxes with
# the pairwise-comparison brackets that the real figures place above them.
fig, ax = plt.subplots(1,1,figsize = (10,10))
data, data2, data3, data4 = [np.linspace(lo,hi,num=30) for lo, hi in [(1,100),(50,100),(1,75),(25,75)]]
box = ax.boxplot([data,data2, data3,data4],labels=['WGD','Mixed','SSD','Singleton'],patch_artist=True)
for axis in (ax.yaxis, ax.xaxis):
    axis.set_tick_params(labelsize=16)

# Three face colours used cyclically, so the fourth box reuses the first one
faceCycle = [(0.9,0.2,0,0.7),(0.9,0.3,0,0.4),(1,0.41,0,0.2)]
for position, patch in enumerate(box['boxes']):
    patch.set_facecolor(faceCycle[position % len(faceCycle)])
    patch.set_ec('k')
for median in box['medians']:
    median.set_lw(2)
    median.set_color('k')

for side in ('top','right','bottom'):
    ax.spines[side].set_visible(False)
ax.xaxis.set_ticks_position('none') 

# One (x start, x end, height) entry per comparison bracket
brackets = [(1,1.95,110),(2.05,2.95,110),(3.05,4,110),(1,3,120),(2,4,130),(1,4,140)]
dotX, dotY = [], []
for xStart, xEnd, height in brackets:
    ax.plot([xStart,xEnd],[height,height],lw=1,color='k',zorder=1)
    dotX.extend([xStart,xEnd])
    dotY.extend([height,height])
# red dots on both ends of every bracket, drawn above the lines
ax.scatter(dotX,dotY,color=(0.8,0,0,1),s=30,zorder=2)
plt.show()

In [None]:
#plotting: seq and structural features
#trying without the 'mixed' category
def scale_lightness(rgb, scale_l):
    """Return `rgb` with its HLS lightness multiplied by `scale_l`, capped at 1.

    `rgb` is a 3-item sequence unpacked into colorsys.rgb_to_hls; the result is
    the RGB tuple produced by colorsys.hls_to_rgb.
    """
    hue, lightness, saturation = colorsys.rgb_to_hls(*rgb)
    scaled = lightness * scale_l
    # lightness may not exceed 1
    capped = scaled if scaled < 1 else 1
    return colorsys.hls_to_rgb(hue, capped, s = saturation)
def formatPval(list1,list2,correction):
    """Format a corrected two-sided Mann-Whitney U p-value as a plot label.

    The p-value for list1 vs list2 is multiplied by `correction` (the number
    of tests being corrected for) and returned as:
      * 'ns' when the corrected value is >= 0.05,
      * the value rounded to 3 decimals when it is >= 0.001,
      * a mathtext string such as '1.23x 10$^{-5}$' otherwise.
    """
    pval = (mannwhitneyu(list1,list2,alternative='two-sided').pvalue)*correction
    if pval >= 0.05:
        return 'ns'
    if pval >= 0.001:
        return round(pval,3)
    if pval == 0:
        # The p-value underflowed to 0.0; '{:.2e}' would give '0.00e+00' and
        # the label would read '0.00x 10$^{-+00}$'. Report an upper bound.
        return '<1.00x 10$^{-300}$'
    mantissa, exponent = '{:.2e}'.format(pval).split('e')
    # the exponent is negative here; drop its sign and zero padding
    return mantissa+'x 10$^{-'+exponent.lstrip('-').lstrip('0')+'}$'
colUsed = 'dupCat_maj'
featList = ['genLen','cdsLen','intCount','intAvg',
                'intCov','CAI','IntDisProp','gc',
                'gc3','domains','u_domains']
# feature -> (x-axis label, y-axis label)
namesDict = {'genLen':('Genomic Length','log(bp)'),'cdsLen':('CDS Length','log(bp)'),'intCount':('Intron Count','log(count)'),
             'intAvg':('Mean intron length','log(bp)'),'intCov':('Intron coverage','%'),'CAI':('Codon optimisation','CAI score'),
             'IntDisProp':('Intrinsic Disorder','%'),'gc':('GC content','%'),'gc3':('GC3 content','%'),
             'domains':('Total domains','log(count)'),'u_domains':('Unique domains','count')}

# Per-feature getData options; features not listed are read untransformed / unfiltered
transformByFeature = {'genLen':'log10','cdsLen':'log10','intAvg':'log10',
                      'intCount':'log10_zeroes','domains':'log10_zeroes',
                      'IntDisProp':'percent','intCov':'percent'}
filterByFeature = {'CAI':'CAI != "Error-check"'}
# (box label, category literal handed to getData); the 'mix' category is left out on purpose
dupCategories = [('WGD','"WGD"'),('SSD','"SSD"'),('Singleton','"singleton"')]
boxLabels = [label for label, dupCat in dupCategories]

featDict = {}
for feature in featList:
    featDict[feature] = {}
    for label, dupCat in dupCategories:
        featDict[feature][label] = getData(feature, dupCat, colUsed,
                                           transform=transformByFeature.get(feature),
                                           Filter=filterByFeature.get(feature))

# (base rgb, lightness factor) for the WGD, SSD and Singleton boxes
boxColours = [((0.9,0.2,0),1.4),((0.9,0.3,0),1.8),((1,0.41,0),1.8)]
# multiple-testing factor passed to formatPval: three pairwise tests per feature
nTests = len(featList)*3

fig, axes = plt.subplots(4,3,figsize = (17,20))
flatAx = list(axes.flat)
for feature, ax in zip(featList,flatAx):
    dataList = [featDict[feature][label] for label in boxLabels]
    maxVal = max([max(x) for x in dataList])
    minVal = min([min(x) for x in dataList])
    valRange = maxVal-minVal
    # the annotations are placed in fifths of the data range above the largest value
    unit = valRange/5
    innerY = maxVal + 1.1*unit
    outerY = maxVal + 2.2*unit

    box = ax.boxplot(dataList,labels=boxLabels,patch_artist=True,flierprops={'ms':1})
    ax.yaxis.set_tick_params(labelsize=14)
    ax.xaxis.set_tick_params(labelsize=14)
    for patch, (rgb, factor) in zip(box['boxes'],boxColours):
        patch.set_facecolor(scale_lightness(rgb,factor))
        patch.set_ec('k')
    for median in box['medians']:
        median.set_lw(2)
        median.set_color('k')
    for side in ('top','right','bottom'):
        ax.spines[side].set_visible(False)
    ax.xaxis.set_ticks_position('none') 

    # comparison brackets: WGD-SSD and SSD-Singleton on the inner level, WGD-Singleton above
    for xStart, xEnd, height in [(0.95,1.95,innerY),(2.05,3.05,innerY),(0.95,3.05,outerY)]:
        ax.plot([xStart,xEnd],[height,height],lw=1,color='k',zorder=2)
    ax.scatter([0.95,1.95,2.05,3.05,0.95,3.05],[innerY]*4+[outerY]*2,color=(0.8,0,0,1),s=30,zorder=3)

    #pvals, written just above the bracket they belong to
    for xText, yText, first, second in [(1.45,maxVal + 1.3*unit,0,1),
                                        (2.55,maxVal + 1.3*unit,1,2),
                                        (2,maxVal + 2.4*unit,0,2)]:
        ax.text(xText,yText,formatPval(dataList[first],dataList[second],nTests),fontsize=14,ha='center')

    ax.set_xlabel(namesDict[feature][0],fontsize=16)
    ax.set_ylabel(namesDict[feature][1],fontsize=16)
    # head-room for the brackets and p-values
    ax.set_ylim(ymax=maxVal+3*unit)
    ax.set_facecolor((0,0,0,0.05))
    ax.grid(color='w',lw=2,zorder=0)
# 11 features on a 4x3 grid: hide the unused last panel
flatAx[-1].set_visible(False)
plt.savefig('BOXPLOTS_PAIRWISE/final_seqStruc'+colUsed+'.png',bbox_inches='tight')
plt.show()

In [None]:
#plotting: reg features , no mix
def scale_lightness(rgb, scale_l):
    """Return `rgb` with its HLS lightness multiplied by `scale_l`, capped at 1.

    `rgb` is a 3-item sequence unpacked into colorsys.rgb_to_hls; the result is
    the RGB tuple produced by colorsys.hls_to_rgb.
    """
    hue, lightness, saturation = colorsys.rgb_to_hls(*rgb)
    scaled = lightness * scale_l
    # lightness may not exceed 1
    capped = scaled if scaled < 1 else 1
    return colorsys.hls_to_rgb(hue, capped, s = saturation)
def formatPval(list1,list2,correction):
    """Format a corrected two-sided Mann-Whitney U p-value as a plot label.

    The p-value for list1 vs list2 is multiplied by `correction` (the number
    of tests being corrected for) and returned as:
      * 'ns' when the corrected value is >= 0.05,
      * the value rounded to 3 decimals when it is >= 0.001,
      * a mathtext string such as '1.23x 10$^{-5}$' otherwise.
    """
    pval = (mannwhitneyu(list1,list2,alternative='two-sided').pvalue)*correction
    if pval >= 0.05:
        return 'ns'
    if pval >= 0.001:
        return round(pval,3)
    if pval == 0:
        # The p-value underflowed to 0.0; '{:.2e}' would give '0.00e+00' and
        # the label would read '0.00x 10$^{-+00}$'. Report an upper bound.
        return '<1.00x 10$^{-300}$'
    mantissa, exponent = '{:.2e}'.format(pval).split('e')
    # the exponent is negative here; drop its sign and zero padding
    return mantissa+'x 10$^{-'+exponent.lstrip('-').lstrip('0')+'}$'
colUsed = 'dupCat_maj'
featList = ['max_exp','specificity','transCount','motif_number_1k','PPIs']
# feature -> (x-axis label, y-axis label)
namesDict = {'max_exp':('Maximal expression','log(TPM)'),'specificity':('Expression specificity','tau'),
             'transCount':('Number of isoforms','count'),'motif_number_1k':('Number of regulatory motifs','count'),
             'PPIs':('PPIs','log(count)')}

# Per-feature getData transform; features not listed are read untransformed
transformByFeature = {'max_exp':'log10_zeroes','PPIs':'log10_zeroes'}
# (box label, category literal handed to getData); the 'mix' category is left out on purpose
dupCategories = [('WGD','"WGD"'),('SSD','"SSD"'),('Singleton','"singleton"')]
boxLabels = [label for label, dupCat in dupCategories]

featDict = {}
for feature in featList:
    featDict[feature] = {}
    for label, dupCat in dupCategories:
        featDict[feature][label] = getData(feature, dupCat, colUsed,
                                           transform=transformByFeature.get(feature))

# (base rgb, lightness factor) for the WGD, SSD and Singleton boxes
boxColours = [((0.9,0.2,0),1.4),((0.9,0.3,0),1.8),((1,0.41,0),1.8)]
# multiple-testing factor passed to formatPval: three pairwise tests per feature
nTests = len(featList)*3

fig, axes = plt.subplots(2,3,figsize = (17,10))
flatAx = list(axes.flat)
for feature, ax in zip(featList,flatAx):
    dataList = [featDict[feature][label] for label in boxLabels]
    maxVal = max([max(x) for x in dataList])
    minVal = min([min(x) for x in dataList])
    valRange = maxVal-minVal
    # the annotations are placed in fifths of the data range above the largest value
    unit = valRange/5
    innerY = maxVal + 1.1*unit
    outerY = maxVal + 2.2*unit

    box = ax.boxplot(dataList,labels=boxLabels,patch_artist=True,flierprops={'ms':1})
    ax.yaxis.set_tick_params(labelsize=14)
    ax.xaxis.set_tick_params(labelsize=14)
    for patch, (rgb, factor) in zip(box['boxes'],boxColours):
        patch.set_facecolor(scale_lightness(rgb,factor))
        patch.set_ec('k')
    for median in box['medians']:
        median.set_lw(2)
        median.set_color('k')
    for side in ('top','right','bottom'):
        ax.spines[side].set_visible(False)
    ax.xaxis.set_ticks_position('none') 

    # comparison brackets: WGD-SSD and SSD-Singleton on the inner level, WGD-Singleton above.
    # The outer line runs 0.95-3.05 so that it ends exactly on its two end markers.
    for xStart, xEnd, height in [(0.95,1.95,innerY),(2.05,3.05,innerY),(0.95,3.05,outerY)]:
        ax.plot([xStart,xEnd],[height,height],lw=1,color='k',zorder=2)
    ax.scatter([0.95,1.95,2.05,3.05,0.95,3.05],[innerY]*4+[outerY]*2,color=(0.8,0,0,1),s=30,zorder=3)

    #pvals, written just above the bracket they belong to
    for xText, yText, first, second in [(1.45,maxVal + 1.3*unit,0,1),
                                        (2.55,maxVal + 1.3*unit,1,2),
                                        (2,maxVal + 2.4*unit,0,2)]:
        ax.text(xText,yText,formatPval(dataList[first],dataList[second],nTests),fontsize=14,ha='center')

    ax.set_xlabel(namesDict[feature][0],fontsize=16)
    ax.set_ylabel(namesDict[feature][1],fontsize=16)
    ax.set_facecolor((0,0,0,0.05))
    ax.grid(color='w',lw=2,zorder=0)
    # head-room for the brackets and p-values
    ax.set_ylim(ymax=maxVal+3*unit)
# 5 features on a 2x3 grid: hide the unused last panel
flatAx[-1].set_visible(False)
plt.savefig('BOXPLOTS_PAIRWISE/final_reg_no_mix_'+colUsed+'.png',bbox_inches='tight')
plt.show()

In [None]:
#plotting: constraint features, no mix
def scale_lightness(rgb, scale_l):
    """Return `rgb` with its HLS lightness multiplied by `scale_l`, capped at 1.

    `rgb` is a 3-item sequence unpacked into colorsys.rgb_to_hls; the result is
    the RGB tuple produced by colorsys.hls_to_rgb.
    """
    hue, lightness, saturation = colorsys.rgb_to_hls(*rgb)
    scaled = lightness * scale_l
    # lightness may not exceed 1
    capped = scaled if scaled < 1 else 1
    return colorsys.hls_to_rgb(hue, capped, s = saturation)
def formatPval(list1,list2,correction):
    """Format a corrected two-sided Mann-Whitney U p-value as a plot label.

    The p-value for list1 vs list2 is multiplied by `correction` (the number
    of tests being corrected for) and returned as:
      * 'ns' when the corrected value is >= 0.05,
      * the value rounded to 3 decimals when it is >= 0.001,
      * a mathtext string such as '1.23x 10$^{-5}$' otherwise.
    """
    pval = (mannwhitneyu(list1,list2,alternative='two-sided').pvalue)*correction
    if pval >= 0.05:
        return 'ns'
    if pval >= 0.001:
        return round(pval,3)
    if pval == 0:
        # The p-value underflowed to 0.0; '{:.2e}' would give '0.00e+00' and
        # the label would read '0.00x 10$^{-+00}$'. Report an upper bound.
        return '<1.00x 10$^{-300}$'
    mantissa, exponent = '{:.2e}'.format(pval).split('e')
    # the exponent is negative here; drop its sign and zero padding
    return mantissa+'x 10$^{-'+exponent.lstrip('-').lstrip('0')+'}$'
colUsed = 'dupCat_maj'
featList = ['evolRate','ess','mis_Z_score',
            'pLI_score','Phi','s_het',
            'EvoTol','loftool_percentile','RVIS']
# feature -> (x-axis label, y-axis label)
namesDict = {'evolRate':('Rate of evolution','dN/dS'),'ess':('Cellular essentiality','-(CRISPR score)'),
             'mis_Z_score':('Missense Z score','score'),'pLI_score':('pLI','score'),'Phi':('Phi','score'),
             's_het':('S$_{het}$','score'),'EvoTol':('EvoTol','score'),'loftool_percentile':('LoFTool','percentile'),
             'RVIS':('RVIS','score')}

# Per-feature getData options; features not listed are read untransformed / unfiltered
transformByFeature = {'ess':'neg'}
# SQL conditions that exclude the text placeholders stored in otherwise numeric columns
filterByFeature = {'pLI_score':'pLI_score != "NA"',
                   'RVIS':'RVIS != "NA"',
                   'EvoTol':'EvoTol != "Not expressed above threshold in this ontology"'}
# (box label, category literal handed to getData); the 'mix' category is left out on purpose
dupCategories = [('WGD','"WGD"'),('SSD','"SSD"'),('Singleton','"singleton"')]
boxLabels = [label for label, dupCat in dupCategories]

featDict = {}
for feature in featList:
    featDict[feature] = {}
    for label, dupCat in dupCategories:
        featDict[feature][label] = getData(feature, dupCat, colUsed,
                                           transform=transformByFeature.get(feature),
                                           Filter=filterByFeature.get(feature))

# (base rgb, lightness factor) for the WGD, SSD and Singleton boxes
boxColours = [((0.9,0.2,0),1.4),((0.9,0.3,0),1.8),((1,0.41,0),1.8)]
# multiple-testing factor passed to formatPval: three pairwise tests per feature
nTests = len(featList)*3

# 9 features fill the 3x3 grid, so no panel is hidden
fig, axes = plt.subplots(3,3,figsize = (17,15))
flatAx = list(axes.flat)
for feature, ax in zip(featList,flatAx):
    dataList = [featDict[feature][label] for label in boxLabels]
    maxVal = max([max(x) for x in dataList])
    minVal = min([min(x) for x in dataList])
    valRange = maxVal-minVal
    # the annotations are placed in fifths of the data range above the largest value
    unit = valRange/5
    innerY = maxVal + 1.1*unit
    outerY = maxVal + 2.2*unit

    box = ax.boxplot(dataList,labels=boxLabels,patch_artist=True,flierprops={'ms':1})
    ax.yaxis.set_tick_params(labelsize=14)
    ax.xaxis.set_tick_params(labelsize=14)
    for patch, (rgb, factor) in zip(box['boxes'],boxColours):
        patch.set_facecolor(scale_lightness(rgb,factor))
        patch.set_ec('k')
    for median in box['medians']:
        median.set_lw(2)
        median.set_color('k')
    for side in ('top','right','bottom'):
        ax.spines[side].set_visible(False)
    ax.xaxis.set_ticks_position('none') 

    # comparison brackets: WGD-SSD and SSD-Singleton on the inner level, WGD-Singleton above.
    # The outer line runs 0.95-3.05 so that it ends exactly on its two end markers.
    for xStart, xEnd, height in [(0.95,1.95,innerY),(2.05,3.05,innerY),(0.95,3.05,outerY)]:
        ax.plot([xStart,xEnd],[height,height],lw=1,color='k',zorder=2)
    ax.scatter([0.95,1.95,2.05,3.05,0.95,3.05],[innerY]*4+[outerY]*2,color=(0.8,0,0,1),s=30,zorder=3)

    #pvals, written just above the bracket they belong to
    for xText, yText, first, second in [(1.45,maxVal + 1.3*unit,0,1),
                                        (2.55,maxVal + 1.3*unit,1,2),
                                        (2,maxVal + 2.4*unit,0,2)]:
        ax.text(xText,yText,formatPval(dataList[first],dataList[second],nTests),fontsize=14,ha='center')

    ax.set_xlabel(namesDict[feature][0],fontsize=16)
    ax.set_ylabel(namesDict[feature][1],fontsize=16)
    ax.set_facecolor((0,0,0,0.05))
    ax.grid(color='w',lw=2,zorder=0)
    # head-room for the brackets and p-values
    ax.set_ylim(ymax=maxVal+3*unit)
plt.savefig('BOXPLOTS_PAIRWISE/final_constraint_no_mix_'+colUsed+'.png',bbox_inches='tight')
plt.show()

In [None]:
#expression comparisons by tissue
# Gene ids for each duplication category. The category values are bound as
# parameters instead of being written as double-quoted tokens, which SQLite
# only interprets as strings as a legacy fallback.
cursor.execute('SELECT id FROM gene_features WHERE dupCat_maj = ?', ('WGD',))
wList = [x[0] for x in cursor.fetchall()]
cursor.execute('SELECT id FROM gene_features WHERE dupCat_maj = ?', ('SSD',))
sList = [x[0] for x in cursor.fetchall()]
cursor.execute('SELECT id FROM gene_features WHERE dupCat_maj = ? OR dupCat_maj = ?',
               ('Singleton (Vert)', 'singleton'))
siList = [x[0] for x in cursor.fetchall()]

# Set copies of the id lists: membership is checked for every gene below, and
# scanning a list each time makes the grouping quadratic for no benefit.
wSet, sSet, siSet = set(wList), set(sList), set(siList)

# tissue name -> list of expression values, one dict per duplication category
expDictByTissueWGD = {}
expDictByTissueSSD = {}
expDictByTissueSing = {}
for gene in exp_dict:
    # A gene goes to the first category it matches (WGD, then SSD, then
    # singleton); genes in none of the three are left out.
    if gene in wSet:
        byTissue = expDictByTissueWGD
    elif gene in sSet:
        byTissue = expDictByTissueSSD
    elif gene in siSet:
        byTissue = expDictByTissueSing
    else:
        continue
    # exp_dict[gene] is iterated as (tissue, value) pairs
    for tissue, val in exp_dict[gene]:
        byTissue.setdefault(tissue, []).append(val)
# Three rows of boxplots (one row per tissue list below). Within a row every
# tissue gets a group of four boxes: Total, WGD, SSD and Singleton.
fig, axes = plt.subplots(3,1,figsize=(17,9),gridspec_kw={'hspace':0.45})

tissueList = [x for x in expDictByTissueWGD]

list1 = ['10 week post conception, hindbrain',
         '5 week post conception, kidney',
         'Brain - Cerebellar Hemisphere',
         'Whole Blood',
         'Brain - Substantia nigra',
         'Liver'] #largest and smallest differences

list2 = ['7 week post conception, forebrain',
         '9 week post conception, forebrain',
         '10 week post conception, forebrain',
         '13 week post conception, forebrain',
         '16 week post conception, forebrain',
         '18 week post conception, forebrain'] #changes over time in forebrain development

# NOTE(review): '5 week post conception, liver' appears twice below, so the
# same tissue is drawn in two neighbouring groups - confirm whether a
# different time point was intended for one of them.
list3 = ['4 week post conception, liver',
         '5 week post conception, liver',
         '5 week post conception, liver',
         '11 week post conception, liver',
         '16 week post conception, liver',
         '18 week post conception, liver'] #changes over time in liver development

# (rgb, lightness factor) handed to scale_lightness for the Total, WGD, SSD and
# Singleton boxes, in plotting order
boxColours = [((1,0.41,0),1.8),
              ((0.9,0.2,0),1.4),
              ((0.9,0.3,0),1.8),
              ((1,0.41,0),1.8)]
# y coordinate of the tissue caption in the first, second and third row
captionHeights = [5.9, 4.4, 5.6]

for tissues, ax, captionY in zip([list1,list2,list3],axes,captionHeights):
    startBox = 0.5
    for tissue in tissues:
        # only values above 1 are kept, so every plotted log10 value is positive
        wVals = [log10(x) for x in expDictByTissueWGD[tissue] if x > 1]
        sVals = [log10(x) for x in expDictByTissueSSD[tissue] if x > 1]
        siVals = [log10(x) for x in expDictByTissueSing[tissue] if x > 1]
        totVals = wVals + sVals + siVals

        boxes = ax.boxplot([totVals,wVals,sVals,siVals],
                           labels=['Total','WGD','SSD','Singleton'],
                           positions=[startBox,startBox+0.5,startBox+1,startBox+1.5],
                           patch_artist=True,flierprops={'ms':0.5})

        for b, (rgb, factor) in zip(boxes['boxes'], boxColours):
            b.set_facecolor(scale_lightness(rgb, factor))
            b.set_ec('k')
        for m in boxes['medians']:
            m.set_lw(2)
            m.set_color('k')

        # caption centred over the group of four boxes
        caption = tissue.replace('post conception','p.c.').replace('week','weeks')
        ax.text(startBox+0.75,captionY,caption,ha='center',fontsize=11.2)
        startBox = startBox + 2.5

    # Axis-level styling does not depend on the tissue, so it is applied once
    # per row after all boxes (and therefore all x ticks) exist.
    ax.set_ylabel('Log(TPM)',fontsize=11)
    ax.grid(color='w')
    ax.set_fc((0,0,0,0.05))
    if len(tissues) == 1:
        ax.set_xlim(axes[0].get_xlim())
    ax.yaxis.set_tick_params(labelsize=11)
    ax.xaxis.set_tick_params(labelsize=11.2,labelrotation=15)
plt.savefig('EXP_METH_BOXPLOTS/exp_finalRepresentativeSample.png')
plt.show()

# GO enrichment analysis

In [None]:
colUsed = 'dupCat_maj'


def fetchGeneIds(column, *categories):
    """Return the ids of gene_features rows whose `column` equals any of `categories`.

    The column name is spliced into the statement because SQL identifiers
    cannot be bound; the category values are passed as bound parameters.
    """
    condition = ' OR '.join(column + ' = ?' for _ in categories)
    cursor.execute('SELECT id FROM gene_features WHERE ' + condition, categories)
    return [row[0] for row in cursor.fetchall()]


def goTerms(geneList, underrepresented=False):
    """Query g:Profiler for `geneList` against the `total` background and keep
    only the results whose source contains 'GO'.

    underrepresented=True asks for depleted instead of enriched terms.
    """
    #default threshold of 0.05
    results = gp.profile(query=geneList, organism='hsapiens',
                         significance_threshold_method='fdr', background=total,
                         measure_underrepresentation=underrepresented)
    return [x for x in results if 'GO' in x['source']]


def writeTermTable(path, results):
    """Write one tab-separated line (term id, term name, p-value) per result to
    `path` and return the rows that were written."""
    rows = [(x['native'],x['name'],str(x['p_value'])) for x in results]
    with open(path,'w') as file:
        for row in rows:
            file.write('\t'.join(row)+'\n')
    return rows


wList = fetchGeneIds(colUsed, 'WGD')
sList = fetchGeneIds(colUsed, 'SSD')
siList = fetchGeneIds(colUsed, 'singleton', 'Singleton (Vert)')
miList = fetchGeneIds(colUsed, 'mix')
# background for every enrichment test: all categorised genes
total = wList + sList + siList + miList

gp = GProfiler()

# Same procedure for each category: enriched terms, depleted terms, then one
# output file for each. The label is the category part of the file names.
goResults = {}
for label, geneList in [('WGD', wList), ('SSD', sList),
                        ('Singleton', siList), ('Mix', miList)]:
    overRes = goTerms(geneList)
    underRes = goTerms(geneList, underrepresented=True)
    termNameOver = writeTermTable('overRepresented'+label+colUsed+'.txt', overRes)
    termNameUnder = writeTermTable('underRepresented'+label+colUsed+'.txt', underRes)
    goResults[label] = (overRes, underRes)

# Per-category names kept for the cells that follow.
overWgdRes, underWgdRes = goResults['WGD']
overSsdRes, underSsdRes = goResults['SSD']
overSingRes, underSingRes = goResults['Singleton']
overMixRes, underMixRes = goResults['Mix']

wOver = [x['native'] for x in overWgdRes]
wUnder = [x['native'] for x in underWgdRes]
sOver = [x['native'] for x in overSsdRes]
sUnder = [x['native'] for x in underSsdRes]
siOver = [x['native'] for x in overSingRes]
siUnder = [x['native'] for x in underSingRes]
miOver = [x['native'] for x in overMixRes]
miUnder = [x['native'] for x in underMixRes]

# Upset plot of how the enriched/depleted GO term sets overlap across categories
catContent = {'WGD enriched':wOver,'SSD enriched': sOver, 'Singleton enriched':siOver, 'Mixed enriched':miOver,
             'WGD depleted':wUnder, 'SSD depleted':sUnder, 'Singleton depleted':siUnder, 'Mixed depleted':miUnder}
uplot(from_contents(catContent),sort_by='cardinality',show_counts = True)
plt.savefig('UPSET_PLOTS/upsetPlot_go_'+colUsed+'.svg',bbox_inches='tight')

In [None]:
# Upset plot of GO term overlap between WGD, SSD and singleton genes (enriched and depleted)
catContent = {'WGD enriched':wOver,'SSD enriched': sOver, 'Singleton enriched':siOver,
             'WGD depleted':wUnder, 'SSD depleted':sUnder, 'Singleton depleted':siUnder}
plot = uplot(from_contents(catContent),sort_by='cardinality',show_counts = True)
plot['matrix'].set_yticklabels(plot['matrix'].get_yticklabels(),fontsize=12)
# Resize the intersection tick labels through tick_params. The previous call
# passed `labs`, which is only assigned in a commented-out line, so it failed
# on a fresh kernel or reused labels left over from another figure.
plot['intersections'].tick_params(axis='y',labelsize=12)
plot['intersections'].set_ylabel('Intersection size',fontsize=12)
# shift the totals panel left by 0.05 in figure coordinates
box = plot['totals'].get_position()
box.x0 = box.x0 - 0.05
box.x1 = box.x1 -0.05
plot['totals'].set_position(box)
# font size of the count annotations drawn on the bars of each panel
for panel, size in (('totals', 12), ('intersections', 11)):
    for t in plot[panel].texts:
        t.set_fontsize(size)
plt.savefig('UPSET_PLOTS/upsetPlot_go_WGD_SSD_Sing'+colUsed+'.svg',bbox_inches='tight')

In [None]:
# Upset plot of GO term overlap between WGD and singleton genes (enriched and depleted)
catContent = {'WGD enriched':wOver,'Singleton enriched':siOver,
             'WGD depleted':wUnder,'Singleton depleted':siUnder}
plot = uplot(from_contents(catContent),sort_by='cardinality',show_counts = True)
plot['matrix'].set_yticklabels(plot['matrix'].get_yticklabels(),fontsize=12)
# Resize the intersection tick labels through tick_params. The previous call
# passed `labs`, which is only assigned in a commented-out line, so it failed
# on a fresh kernel or reused labels left over from another figure.
plot['intersections'].tick_params(axis='y',labelsize=12)
plot['intersections'].set_ylabel('Intersection size',fontsize=12)
# shift the totals panel left by 0.05 in figure coordinates
box = plot['totals'].get_position()
box.x0 = box.x0 - 0.05
box.x1 = box.x1 -0.05
plot['totals'].set_position(box)
# font size of the count annotations drawn on the bars of each panel
for panel, size in (('totals', 12), ('intersections', 12)):
    for t in plot[panel].texts:
        t.set_fontsize(size)
plt.savefig('UPSET_PLOTS/upsetPlot_go_WGD_Sing'+colUsed+'.svg',bbox_inches='tight')

In [None]:
# Upset plot of GO term overlap between SSD and singleton genes (enriched and depleted)
catContent = {'SSD enriched':sOver,'Singleton enriched':siOver,
             'SSD depleted':sUnder,'Singleton depleted':siUnder}
plot = uplot(from_contents(catContent),sort_by='cardinality',show_counts = True)
plot['matrix'].set_yticklabels(plot['matrix'].get_yticklabels(),fontsize=12)
# Resize the intersection tick labels through tick_params. The previous call
# passed `labs`, which is only assigned in a commented-out line, so it failed
# on a fresh kernel or reused labels left over from another figure.
plot['intersections'].tick_params(axis='y',labelsize=12)
plot['intersections'].set_ylabel('Intersection size',fontsize=12)
# shift the totals panel left by 0.05 in figure coordinates
box = plot['totals'].get_position()
box.x0 = box.x0 - 0.05
box.x1 = box.x1 -0.05
plot['totals'].set_position(box)
# font size of the count annotations drawn on the bars of each panel
for panel, size in (('totals', 12), ('intersections', 11)):
    for t in plot[panel].texts:
        t.set_fontsize(size)
plt.savefig('UPSET_PLOTS/upsetPlot_go_SSD_Sing'+colUsed+'.svg',bbox_inches='tight')

In [None]:
# Upset plot of GO term overlap between WGD and SSD genes (enriched and depleted)
catContent = {'WGD enriched':wOver,'SSD enriched': sOver,
             'WGD depleted':wUnder, 'SSD depleted':sUnder,}
plot = uplot(from_contents(catContent),sort_by='cardinality',show_counts = True)
plot['matrix'].set_yticklabels(plot['matrix'].get_yticklabels(),fontsize=12)
# Resize the intersection tick labels through tick_params. The previous call
# passed `labs`, which is only assigned in a commented-out line, so it failed
# on a fresh kernel or reused labels left over from another figure.
plot['intersections'].tick_params(axis='y',labelsize=12)
plot['intersections'].set_ylabel('Intersection size',fontsize=12)
# shift the totals panel left by 0.05 in figure coordinates
box = plot['totals'].get_position()
box.x0 = box.x0 - 0.05
box.x1 = box.x1 -0.05
plot['totals'].set_position(box)
# font size of the count annotations drawn on the bars of each panel
for panel, size in (('totals', 12), ('intersections', 12)):
    for t in plot[panel].texts:
        t.set_fontsize(size)
plt.savefig('UPSET_PLOTS/upsetPlot_go_WGD_SSD'+colUsed+'.svg',bbox_inches='tight')