# From the fasta and clustered fasta gene sequences, return a list of genes that are interesting to the study.

In [1]:
import pandas as pd
import numpy as np
import os
import glob
from tqdm import tnrange, tqdm_notebook
from Bio import SeqIO

In [2]:
def get_seq(row):
    """Return the transcript sequence for the contig named in *row*.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide a 'contig_name' entry.

    Returns
    -------
    str or None
        '-' when the contig name is the '-' placeholder; otherwise the
        sequence (as a string) of the first FASTA record whose id contains
        the contig name; None when no record matches.
    """
    name = row['contig_name']
    if name == '-':
        return '-'
    assembly = '/home/bay001/projects/kes_20160307/org/03_output/assembly/gene_sequences/kestrel_blast_clustered_transcripts.fa'
    # The "U" open mode was removed in Python 3.11; plain "r" is used instead.
    # The context manager closes the file on every exit path, including the
    # no-match case, which previously left the handle open.
    with open(assembly, "r") as handle:
        for record in SeqIO.parse(handle, "fasta"):
            # Substring (not exact) match against the record id, as before.
            if name in record.id:
                return str(record.seq)
    return None

In [3]:
# Collect every clustered per-gene FASTA, then (only if the list file does
# not exist yet) write one gene name per line under a 'Gene' header.
seqs = glob.glob(
    '/home/bay001/projects/kes_20160307/org/03_output/assembly/gene_sequences/*.clustered.fasta'
)
ensembl_genes = '/home/bay001/projects/kes_20160307/org/03_output/assembly/ensembl_genes.txt'
if not os.path.exists(ensembl_genes):
    with open(ensembl_genes, 'w') as f:
        f.write('Gene\n')
        # Gene name = file name with the '.clustered.fasta' suffix removed.
        f.writelines(
            os.path.basename(seq.replace('.clustered.fasta', '') + "\n")
            for seq in seqs
        )

In [4]:
# Tab-separated lookup table read from 'ensembl-genename-description.tsv'.
ensembl2desc = pd.read_csv(
    '/home/bay001/projects/kes_20160307/org/03_output/csvs/ensembl-genename-description.tsv',
    sep='\t',
)
ensembl2desc.head(2)
ensembl2desc.shape

(12756, 3)

In [5]:
# Load the tab-separated contig / gene-name table.
contig2ensembl = pd.read_csv(
    '/home/bay001/projects/kes_20160307/org/03_output/csvs/contig_and_gene_name.txt',
    sep='\t',
)
contig2ensembl.shape

(38289, 3)

In [6]:
# Left-join the contigs onto the gene name/description table, matching on
# both the Ensembl ID and the gene name so every contig row is kept.
contig2name = contig2ensembl.merge(
    ensembl2desc,
    how='left',
    left_on=['ensembl', 'name'],
    right_on=['Gene ID', 'Associated Gene Name'],
    copy=False,
)
# The left-hand key columns are treated as redundant after the merge.
contig2name = contig2name.drop(['ensembl', 'name'], axis=1)
contig2name.head(2)

Unnamed: 0,contig_name,Gene ID,Description,Associated Gene Name
0,unmapped-49-contig_list_contig_33037-0,ENSGALG00000012304,Gallus gallus nuclear factor of kappa light po...,NFKB1
1,EC-4AK111_TAGCTT_R1_(paired)_contig_2427-0,ENSGALG00000012304,Gallus gallus nuclear factor of kappa light po...,NFKB1


In [7]:
# Ensembl -> Entrez mapping (biomart output). Entrez IDs are read as
# strings, and missing values are replaced with '-'.
ens2entrez = pd.read_csv(
    '/home/bay001/projects/kes_20160307/org/00_data/references/biomart/ensembl2entrez.txt',
    sep='\t',
    dtype={'EntrezGene ID': str},
)
ens2entrez = ens2entrez.fillna('-')
ens2entrez.head()

Unnamed: 0,Gene ID,EntrezGene ID
0,ENSGALG00000041922,-
1,ENSGALG00000036956,-
2,ENSGALG00000032059,-
3,ENSGALG00000043598,-
4,ENSGALG00000040296,-


In [8]:
# Add the Entrez ID to every contig row via the shared 'Gene ID' column;
# rows without a match keep their data and get '-' in the empty fields.
merged = contig2name.merge(
    ens2entrez,
    how='left',
    left_on='Gene ID',
    right_on='Gene ID',
)
merged = merged.fillna('-')
merged.head()

Unnamed: 0,contig_name,Gene ID,Description,Associated Gene Name,EntrezGene ID
0,unmapped-49-contig_list_contig_33037-0,ENSGALG00000012304,Gallus gallus nuclear factor of kappa light po...,NFKB1,396033
1,EC-4AK111_TAGCTT_R1_(paired)_contig_2427-0,ENSGALG00000012304,Gallus gallus nuclear factor of kappa light po...,NFKB1,396033
2,unmapped-49-contig_list_contig_117584-0,ENSGALG00000012304,Gallus gallus nuclear factor of kappa light po...,NFKB1,396033
3,unmapped-49-contig_list_contig_288882-0,ENSGALG00000012304,Gallus gallus nuclear factor of kappa light po...,NFKB1,396033
4,unmapped-49-contig_list_contig_35316-0,ENSGALG00000012304,Gallus gallus nuclear factor of kappa light po...,NFKB1,396033


# Map to KEGG

In [9]:
# Pathway files written by the R script: everything prefixed 'gga' in the
# KEGG reference directory. Print how many were found and preview three.
kegg_pathways = glob.glob('/home/bay001/projects/kes_20160307/org/00_data/references/kegg/gga*')
print(len(kegg_pathways))
kegg_pathways[:3]

154


['/home/bay001/projects/kes_20160307/org/00_data/references/kegg/gga00650 Butanoate metabolism.txt',
 '/home/bay001/projects/kes_20160307/org/00_data/references/kegg/gga00591 Linoleic acid metabolism.txt',
 '/home/bay001/projects/kes_20160307/org/00_data/references/kegg/gga00380 Tryptophan metabolism.txt']

In [22]:
out_dir = '/home/bay001/projects/kes_20160307/org/03_output/kegg/'
assembly = '/home/bay001/projects/kes_20160307/org/03_output/assembly/gene_sequences/kestrel_blast_clustered_transcripts.fa'


def get_seq(row, assembly=assembly):
    """Return the transcript sequence for the contig named in *row*.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide a 'contig_name' entry.
    assembly : str
        Path of the FASTA file to search; defaults to the module-level
        ``assembly`` path as it was when this function was defined.

    Returns
    -------
    str or None
        '-' when the contig name is the '-' placeholder; otherwise the
        sequence (as a string) of the first FASTA record whose id contains
        the contig name; None when no record matches.
    """
    name = row['contig_name']
    if name == '-':
        return '-'
    # The "U" open mode was removed in Python 3.11; plain "r" is used instead.
    # The context manager closes the file on every exit path, including the
    # no-match case, which previously left the handle open.
    with open(assembly, "r") as handle:
        for record in SeqIO.parse(handle, "fasta"):
            # Substring (not exact) match against the record id, as before.
            if name in record.id:
                return str(record.seq)
    return None

def create_pathway_lists_and_seq(kegg_pathways, merged):
    """Write one annotated table (with sequences) per KEGG pathway file.

    For every path in *kegg_pathways* the file is read as a single string
    column (first line skipped), left-merged onto the 'EntrezGene ID' column
    of *merged*, missing values are filled with '-', a 'seq' column is added
    by applying ``get_seq`` to each row, and the result is written
    tab-separated into the module-level ``out_dir``.

    Parameters
    ----------
    kegg_pathways : list of str
        Paths of the pathway files to process.
    merged : pandas.DataFrame
        Annotation table; must contain 'EntrezGene ID' and 'contig_name'.

    Returns
    -------
    None
        Results are only written to disk.
    """
    # Managing the bar with `with` guarantees it is closed, even when a
    # pathway fails part-way through; it was previously never closed.
    with tnrange(len(kegg_pathways)) as progress:
        for pathway in kegg_pathways:
            # The file name doubles as the column name of the pathway table.
            header = os.path.basename(pathway)
            out_file = os.path.join(
                out_dir,
                header.replace('.txt', '.mapped2kestrel.txt').replace(' ', '_')
            )
            pathway_df = pd.read_table(
                pathway,
                names=[header],
                skiprows=1,
                dtype={header: str}
            )
            entrez2kegg = pd.merge(
                pathway_df,
                merged,
                how='left',
                left_on=[header],
                right_on=['EntrezGene ID']
            ).fillna('-')
            entrez2kegg['seq'] = entrez2kegg.apply(get_seq, axis=1)
            entrez2kegg.to_csv(
                out_file,
                sep='\t',
                index=False,
            )
            progress.update(1)
        
# create_pathway_lists_and_seq(kegg_pathways, merged)

# Convert KO -> Swiss-Prot -> KEGG gga -> Ensembl

In [14]:
# KO -> Swiss-Prot links: one KO entry maps to many Swiss-Prot entries.
ko2swiss = pd.read_csv(
    '/home/bay001/projects/kes_20160307/org/00_data/references/orthology_swiss-prot.list',
    sep='\t',
    names=['ko', 'sp', 'direction'],
)
# Strip the 'sp:' prefix from the Swiss-Prot column.
ko2swiss['sp'] = ko2swiss['sp'].str.replace('sp:', '')
ko2swiss.head()

Unnamed: 0,ko,sp,direction
0,ko:K00001,A1L4Y2,reverse
1,ko:K00001,B4M8Y0,reverse
2,ko:K00001,P00334,reverse
3,ko:K00001,P09369,reverse
4,ko:K00001,P09370,reverse


In [15]:
# Swiss-Prot -> gga links: one gga entry per Swiss-Prot entry.
gga2swiss = pd.read_csv(
    '/home/bay001/projects/kes_20160307/org/00_data/references/swiss_gga.list',
    sep='\t',
    names=['sp', 'gga', 'eq'],
)
# Strip the 'sp:' prefix from the Swiss-Prot column.
gga2swiss['sp'] = gga2swiss['sp'].str.replace('sp:', '')

gga2swiss.head()

Unnamed: 0,sp,gga,eq
0,A0A1D5PRR9,gga:100857997,equivalent
1,A0AVX7,gga:771113,equivalent
2,A0M8U1,gga:417770,equivalent
3,A1KXM5,gga:418848,equivalent
4,A3R064,gga:429960,equivalent


In [16]:
# gga -> Ensembl links, with the 'ensembl:' prefix stripped from the IDs.
gga2ensembl = pd.read_csv(
    '/home/bay001/projects/kes_20160307/org/00_data/references/gga_ensembl.list',
    sep='\t',
    names=['gga', 'ensembl', 'what'],
)
gga2ensembl['ensembl'] = gga2ensembl['ensembl'].str.replace('ensembl:', '')
gga2ensembl.head()

Unnamed: 0,gga,ensembl,what
0,gga:100008585,ENSGALG00000026850,original
1,gga:100034745,ENSGALG00000012396,original
2,gga:100049617,ENSGALG00000042215,original
3,gga:100049619,ENSGALG00000032986,original
4,gga:100113359,ENSGALG00000037735,original


In [17]:
def ko_to_ensembl():
    """Map KEGG ortholog IDs to chicken Ensembl gene IDs.

    Chains the module-level link tables: ``ko2swiss`` is right-joined to
    ``gga2swiss`` on 'sp' (so every gga/Swiss-Prot pair is kept), and the
    result is left-joined to ``gga2ensembl`` on 'gga'.

    Returns
    -------
    pandas.DataFrame
        The joined table.
    """
    ko_gga = ko2swiss.merge(gga2swiss, how='right', on='sp')
    return ko_gga.merge(gga2ensembl, how='left', on='gga')

In [18]:
# Build the KO -> Ensembl table, strip the 'ko:' prefix, keep only the four
# ID columns and give them descriptive names.
ko2ensembl_df = ko_to_ensembl()
ko2ensembl_df['ko'] = ko2ensembl_df['ko'].str.replace('ko:', '')
ko2ensembl_df = ko2ensembl_df[['ko', 'sp', 'gga', 'ensembl']].rename(
    columns={
        'ko': 'KEGG Orthology ID',
        'sp': 'Swissprot ID',
        'gga': 'KEGG Organism-specific ID',
        'ensembl': 'Ensembl ID',
    }
)
ko2ensembl_df.head()

Unnamed: 0,KEGG Orthology ID,Swissprot ID,KEGG Organism-specific ID,Ensembl ID
0,K00002,Q5ZK84,gga:424599,ENSGALG00000010244
1,K00012,Q5F3T9,gga:422792,ENSGALG00000041993
2,K00016,P00337,gga:373997,ENSGALG00000035836
3,K00016,P00340,gga:396221,ENSGALG00000006300
4,K00025,Q5ZME2,gga:421281,ENSGALG00000008858


In [19]:
# Add the KO / Swiss-Prot / gga columns to the main annotation table by
# matching 'Gene ID' against 'Ensembl ID'; unmatched fields become '-'.
merged_with_ko = merged.merge(
    ko2ensembl_df,
    how='left',
    left_on=['Gene ID'],
    right_on=['Ensembl ID'],
)
merged_with_ko = merged_with_ko.fillna('-')
merged_with_ko.head()

Unnamed: 0,contig_name,Gene ID,Description,Associated Gene Name,EntrezGene ID,KEGG Orthology ID,Swissprot ID,KEGG Organism-specific ID,Ensembl ID
0,unmapped-49-contig_list_contig_33037-0,ENSGALG00000012304,Gallus gallus nuclear factor of kappa light po...,NFKB1,396033,K02580,Q04861,gga:396033,ENSGALG00000012304
1,EC-4AK111_TAGCTT_R1_(paired)_contig_2427-0,ENSGALG00000012304,Gallus gallus nuclear factor of kappa light po...,NFKB1,396033,K02580,Q04861,gga:396033,ENSGALG00000012304
2,unmapped-49-contig_list_contig_117584-0,ENSGALG00000012304,Gallus gallus nuclear factor of kappa light po...,NFKB1,396033,K02580,Q04861,gga:396033,ENSGALG00000012304
3,unmapped-49-contig_list_contig_288882-0,ENSGALG00000012304,Gallus gallus nuclear factor of kappa light po...,NFKB1,396033,K02580,Q04861,gga:396033,ENSGALG00000012304
4,unmapped-49-contig_list_contig_35316-0,ENSGALG00000012304,Gallus gallus nuclear factor of kappa light po...,NFKB1,396033,K02580,Q04861,gga:396033,ENSGALG00000012304


In [29]:
# List the thyroid-related KO pathway files (R script output).
# NOTE: the all-lowercase pattern 'ko*thyroid*' matched 0 files. The other
# pathway files in this directory use capitalised titles (e.g.
# 'gga00650 Butanoate metabolism.txt') and glob is case-sensitive on POSIX
# file systems, so either capitalisation of the first letter is accepted.
kegg_pathways = glob.glob(
    '/home/bay001/projects/kes_20160307/org/00_data/references/kegg/ko*[Tt]hyroid*'
)
print(len(kegg_pathways))
out_dir = '/home/bay001/projects/kes_20160307/org/03_output/kegg_reference/'

def create_pathway_lists_and_seq(kegg_pathways, annotation_df):
    """Write one annotated table (with sequences) per KO pathway file.

    For every path in *kegg_pathways* the file is read as a single string
    column (first line skipped), left-merged onto the 'KEGG Orthology ID'
    column of *annotation_df*, missing values are filled with '-', a 'seq'
    column is added by applying ``get_seq`` to each row, and the result is
    written tab-separated into the module-level ``out_dir``.

    Parameters
    ----------
    kegg_pathways : list of str
        Paths of the pathway files to process.
    annotation_df : pandas.DataFrame
        Annotation table; must contain 'KEGG Orthology ID' and
        'contig_name'.

    Returns
    -------
    None
        The per-pathway frames are local to this function and are not
        returned; read the written files to inspect them.
    """
    # Managing the bar with `with` guarantees it is closed, even when a
    # pathway fails part-way through; it was previously never closed.
    with tnrange(len(kegg_pathways)) as progress:
        for pathway in kegg_pathways:
            # The file name doubles as the column name of the pathway table.
            header = os.path.basename(pathway)
            out_file = os.path.join(
                out_dir,
                header.replace('.txt', '.mapped2kestrel.txt').replace(' ', '_')
            )
            pathway_df = pd.read_table(
                pathway,
                names=[header],
                skiprows=1,
                dtype={header: str}
            )
            transcript2kegg = pd.merge(
                pathway_df,
                annotation_df,
                how='left',
                left_on=[header],
                right_on=['KEGG Orthology ID']
            ).fillna('-')
            transcript2kegg['seq'] = transcript2kegg.apply(get_seq, axis=1)
            transcript2kegg.to_csv(out_file, sep='\t', index=False)
            progress.update(1)
create_pathway_lists_and_seq(kegg_pathways, merged_with_ko)

0


In [26]:
transcript2kegg

NameError: name 'transcript2kegg' is not defined