In [1]:
import os
import numpy as np
import pandas as pd
import pickle

import pyensembl

pd.set_option('display.max_columns', None)
os.chdir("/home/ylee/blast/result/")

In [2]:
genome = pyensembl.EnsemblRelease(100, "mus_musculus")

In [3]:
blast_Actg1 = pd.read_csv("ENSMUST00000071555.12.transcript.blastn.tsv.parsed.anno.final", sep="\t", header=0)

In [4]:
blast_Actg1

Unnamed: 0,query id,query gene_id,query gene_symbol,subject id,% identity,alignment length,q. start,q. end,q. strand,query length,s. start,s. end,subject strand,subject length,mismatches,gaps,evalue,bit score,feature,feat_start,feat_end,feat_strand,feat_ovl_peak,peak_ovl_feat,gene_id,gene_name,gene_type
0,ENSMUST00000071555.12,ENSMUSG00000062825,Actg1,chr10,88.93,1039,51,1085,-,1946,25053288,25054301,+,130694993,86,29,0.0,1254.0,"gene,gene,promoter_proximal,promoter_proximal",25053359250541400000000000000000,25054685250542200000000000000000,"+,+,+,+","0.93,0.09,0.85,0.08","0.72,1.0,0.43,0.04","ENSMUSG00000112597,ENSMUSG00000093282,ENSMUSG0...","AC153974.2,Gm22566,Gm22566,AC153974.2","processed_pseudogene,miRNA,miRNA,processed_pse..."
1,ENSMUST00000071555.12,ENSMUSG00000062825,Actg1,chr10,91.09,875,1081,1945,-,1946,33893385,33894256,-,130694993,65,13,0.0,1171.0,gene,33894093,33894256,-,0.19,1,ENSMUSG00000112534,AC153961.2,processed_pseudogene
2,ENSMUST00000071555.12,ENSMUSG00000062825,Actg1,chr10,92.13,635,1316,1946,-,1946,25054678,25055312,+,130694993,46,4,0.0,893.0,gene,25053359,25054685,+,0.02,0.01,ENSMUSG00000112597,AC153974.2,processed_pseudogene
3,ENSMUST00000071555.12,ENSMUSG00000062825,Actg1,chr11,82.16,953,1020,1946,-,1946,107191427,107192345,+,122082543,110,60,0.0,763.0,gene,107191094,107191630,+,0.23,0.38,ENSMUSG00000081360,Gm11718,processed_pseudogene
4,ENSMUST00000071555.12,ENSMUSG00000062825,Actg1,chr11,89.20,1871,108,1946,-,1946,12936134,12937970,-,122082543,136,66,0.0,2276.0,"gene,promoter_proximal",1293683412937930,1293793812939930,"-,-","0.61,0.02","1.0,0.02","ENSMUSG00000083859,ENSMUSG00000083859","Gm12003,Gm12003","processed_pseudogene,processed_pseudogene"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
256,ENSMUST00000071555.12,ENSMUSG00000062825,Actg1,chrX,86.11,36,155,190,-,1946,133210507,133210542,+,171031299,5,0,5.1,39.9,gene,133210486,133211520,+,1,0.04,ENSMUSG00000083858,Gm14979,processed_pseudogene
257,ENSMUST00000071555.12,ENSMUSG00000062825,Actg1,chrX,86.84,38,1820,1856,-,1946,155220562,155220596,-,171031299,1,4,5.1,39.9,,,,,,,,,
258,ENSMUST00000071555.12,ENSMUSG00000062825,Actg1,chrX,90.00,30,149,178,-,1946,133737859,133737888,-,171031299,3,0,5.1,39.9,,,,,,,,,
259,ENSMUST00000071555.12,ENSMUSG00000062825,Actg1,chrX,95.83,24,816,839,-,1946,117265226,117265249,-,171031299,1,0,5.1,39.9,,,,,,,,,


## Function

In [5]:
# function that return gene IDs in locus
def find_gene_in_locus(chrNum, start, end, strand):
    gene_ids_at_locus = genome.gene_ids_at_locus(contig=chrNum, position=start, end=end)   # genes_at_locus is also possible
    gene_ids_at_locus += genome.gene_ids_at_locus(contig=chrNum, position=start)
    gene_ids_at_locus += genome.gene_ids_at_locus(contig=chrNum, position=end)
    
    return gene_ids_at_locus

In [6]:
chromosome_sizes = {
    "1": 195471971,
    "2": 182113224,
    "3": 160039680,
    "4": 156508116,
    "5": 151834684,
    "6": 149736546,
    "7": 145441459,
    "8": 129401213,
    "9": 124595110,
    "10": 130694993,
    "11": 122082543,
    "12": 120129022,
    "13": 120421639,
    "14": 124902244,
    "15": 104043685,
    "16": 98207768,
    "17": 94987271,
    "18": 90702639,
    "19": 61431566,
    "X": 171031299,
    "Y": 91744698,
    "MT": 16299
}

In [7]:
# function that extends hit locus
def extend_hit(chrNum, start, end, strand, extendLength = 1000):
    chr_length = chromosome_sizes[chrNum]                              
    move_start = start - extendLength
    move_end = end + extendLength
    annotation = "both"

    if move_start < 1 and move_end > chr_length:
        return [1, chr_length, "last extension"]
    else:
        if move_start < 1: 
            move_start = 1
            annotation = "upstream"
        elif move_end > chr_length: 
            move_end = chr_length
            annotation = "downstream"
            
    return [move_start, move_end, annotation]

In [8]:
# gene의 TSS, and distance
def check_distance(gene_id, hit_locus):
    
    # create locus from gene_id
    gene = genome.gene_by_id(gene_id)
    gene_locus = pyensembl.Locus(contig=gene.contig, start=gene.start, end=gene.end, strand=gene.strand)
    if gene.strand == "+":
        TSS_point = gene.start
    else:    
        TSS_point = gene.end

    # calculate the distance
    # distance = hit_locus.distance_to_locus(gene_locus)              # if strand is different, (+/-) they calculate distance as infinity.
    if hit_locus.distance_to_interval(gene.start, gene.end) == 0:
        distance = 0
    else: 
        distance = hit_locus.distance_to_interval(TSS_point, TSS_point)
    
    return distance

## Main1 - using Carsten's' Gene annotation

In [9]:
similar_pcg_set = set()

i = 0
while i < len(blast_Actg1): 
    row = blast_Actg1.iloc[i]

    hit_chrNum = row['subject id'][3:]
    hit_start = int(row['s. start'])
    hit_end =  int(row['s. end'])
    hit_strand = row['subject strand']

    # create hit Locus object
    hit_locus = pyensembl.Locus(contig=hit_chrNum, start=hit_start, end=hit_end, strand=hit_strand)

    gene_id = []
    gene_type = []
    if pd.notna(row['gene_id']):
        gene_id = row['gene_id'].split(',')
    if pd.notna(row['gene_type']):
        gene_type = row['gene_type'].split(',')
    
    for j in range(0, len(gene_id)):
#         if check_distance(gene_id[j], hit_locus) < 2000 and gene_type[j] == 'protein_coding':   # same result
        if gene_type[j] == 'protein_coding':                                                      # same result
            similar_pcg_set.add(gene_id[j])

    i += 1

len(similar_pcg_set)

41

## Main2 - add 2kb distance at hit locus

In [78]:
similar_pcg_set = set()

i = 0
while i < len(blast_Actg1): 
    row = blast_Actg1.iloc[i]

    hit_chrNum = row['subject id'][3:]
    hit_start = int(row['s. start'])
    hit_end =  int(row['s. end'])
    hit_strand = row['subject strand']

    # create hit Locus object
    hit_locus = pyensembl.Locus(contig=hit_chrNum, start=hit_start, end=hit_end, strand=hit_strand)

    extend = "NA"
    while (extend != "false"):
        extend_hit_result = extend_hit(hit_chrNum, hit_start, hit_end, hit_strand, 2000)    # default extension
        hit_start = extend_hit_result[0]
        hit_end = extend_hit_result[1]
        gene_in_locus_list = find_gene_in_locus(hit_chrNum, hit_start, hit_end, hit_strand)             # find gene in locus
        if len(gene_in_locus_list) == 0:                                                                # if there is no gene in locus, extend hit length
            extend_hit_result = extend_hit(hit_chrNum, hit_start, hit_end, hit_strand, 1000)
            hit_start = extend_hit_result[0]
            hit_end = extend_hit_result[1]
            extend = extend_hit_result[2]
        else:                                                                                            # if there is gene in locus, print the gene id and distance
            # check distance between hit locus and gene
            info = ""
            for gene_id in gene_in_locus_list: 
                distance = check_distance(gene_id, hit_locus)
                gene = genome.gene_by_id(gene_id);
                info += f"{gene_id} - Distance: {distance}; "
                if distance < 2000 and gene.biotype == "protein_coding":
                    #################
                    if gene_id == "ENSMUSG00000032452" or gene_id == "ENSMUSG00000064115" or gene_id == "ENSMUSG00000092329" or gene_id == "ENSMUSG00000118215":
                        print("hit info: ", hit_locus.contig, hit_locus.start, hit_locus.end, hit_locus.strand,
                              "\ngene info: ", gene.start, gene.end, gene.strand, gene_id, distance)
                    #################
                    similar_pcg_set.add(gene_id)
            
            # Update DataFrame, add gene and extend information
            # blast_Actg1_local_ws4.at[i, 'genes'] = info
            # blast_Actg1_local_ws4.at[i, 'extend'] = extend
            extend = 'false'

    i += 1

print(len(similar_pcg_set))

hit info:  7 5128966 5129315 - 
gene info:  5129291 5149670 - ENSMUSG00000118215 0
hit info:  7 5128966 5129315 - 
gene info:  5129291 5149670 - ENSMUSG00000118215 0
hit info:  8 123893148 123893467 - 
gene info:  119910841 124345722 + ENSMUSG00000092329 0
hit info:  8 123893148 123893467 - 
gene info:  119910841 124345722 + ENSMUSG00000092329 0
hit info:  8 123893148 123893467 - 
gene info:  119910841 124345722 + ENSMUSG00000092329 0
hit info:  8 123892009 123892152 - 
gene info:  119910841 124345722 + ENSMUSG00000092329 0
hit info:  8 123892009 123892152 - 
gene info:  119910841 124345722 + ENSMUSG00000092329 0
hit info:  8 123892009 123892152 - 
gene info:  119910841 124345722 + ENSMUSG00000092329 0
hit info:  8 123892549 123892724 - 
gene info:  119910841 124345722 + ENSMUSG00000092329 0
hit info:  8 123892549 123892724 - 
gene info:  119910841 124345722 + ENSMUSG00000092329 0
hit info:  8 123892549 123892724 - 
gene info:  119910841 124345722 + ENSMUSG00000092329 0
hit info:  8 12

In [10]:
os.chdir("/home/ylee/blast/result/")

pcg_ids = [gene.gene_id for gene in genome.genes() if gene.biotype == 'protein_coding']
pcg_set = set(pcg_ids)
nonsimilar_pcg_set = pcg_set - similar_pcg_set

# Save the set as a pickle file
with open('similar_pcg_set_C.pkl', 'wb') as file:
    pickle.dump(similar_pcg_set, file)

with open('nonsimilar_pcg_set_C.pkl', 'wb') as file:
    pickle.dump(nonsimilar_pcg_set, file)

with open('pcg_set.pkl', 'wb') as file:
    pickle.dump(pcg_set, file)

## Check difference between result of J and K

In [30]:
with open('similar_pcg_set_Y.pkl', 'rb') as file:
    similar_pcg_set_Y = pickle.load(file)

with open('nonsimilar_pcg_set_Y.pkl', 'rb') as file:
    nonsimilar_pcg_set_Y = pickle.load(file)

with open('similar_pcg_set_C.pkl', 'rb') as file:
    similar_pcg_set_C = pickle.load(file)

with open('nonsimilar_pcg_set_C.pkl', 'rb') as file:
    nonsimilar_pcg_set_C = pickle.load(file)

with open('pcg_set.pkl', 'rb') as file:             
    pcg_set = pickle.load(file)                        # from pyensembl

In [31]:
similar_pcg_set_Y - similar_pcg_set_C

{'ENSMUSG00000008658',
 'ENSMUSG00000025812',
 'ENSMUSG00000032452',
 'ENSMUSG00000062825',
 'ENSMUSG00000064115',
 'ENSMUSG00000070979',
 'ENSMUSG00000092329',
 'ENSMUSG00000118215'}

In [58]:
similar_pcg_set_C - similar_pcg_set_Y

{'ENSMUSG00000023826'}

In [59]:
similar_pcg_set - similar_pcg_set_C

{'ENSMUSG00000032452',
 'ENSMUSG00000064115',
 'ENSMUSG00000092329',
 'ENSMUSG00000118215'}

In [75]:
similar_pcg_set_Y - similar_pcg_set

{'ENSMUSG00000008658',
 'ENSMUSG00000025812',
 'ENSMUSG00000062825',
 'ENSMUSG00000070979'}

## Codes from K *need modification

In [33]:
Carsten_annotated = pd.read_csv("ENSMUST00000071555.12.transcript.blastn.tsv.parsed.anno.final", sep="\t", header=0)

def check_overlapped(start_1, end_1, start_2, end_2):
    if end_2 >= start_1 or end_1 >= start_2:
        return True
    else:
        return False

final_ls = []
all_ls = [] 
other_ls = []
for i in range(len(Carsten_annotated)):
    chr_ = Carsten_annotated.iloc[i]['subject id']
    start = Carsten_annotated.iloc[i]['s. start'] 
    end = Carsten_annotated.iloc[i]['s. end']
    item = Carsten_annotated.iloc[i]['gene_id']
    if isinstance(item, str):
        ls = item.split(',')
        
        for gene_id in ls:
            gene = genome.gene_by_id(gene_id)
            g_start = gene.start  
            g_end = gene.end
            if check_overlapped(g_start, g_end, start, end): # if overlapped with gene body
                final_ls.append(gene_id)
            else: #check if overlapped with TSS to -2000bp upstream region
                if gene.strand == '-':
                    TSS = gene.end
                    if end - TSS <=2000 or start - TSS <=2000:
                        final_ls.append(gene_id)
                    else:
                        other_ls.append((gene_id, TSS, chr_, start, end))  
                    
                else:
                    TSS = gene.start
                    if TSS - end <=2000 or TSS - start<=2000:
                        final_ls.append(gene_id)
                    else:
                        other_ls.append((gene_id, TSS, chr_, start, end))  
                
          
            all_ls.append(gene_id)
            
protein_coding_ls = []
for gene_id in list(set(final_ls)): 
    if genome.gene_by_id(gene_id).biotype =='protein_coding':
        protein_coding_ls.append(gene_id)

similar_pcg_set_K = set(protein_coding_ls)
print(len(similar_pcg_set_K))

41
