# From the fasta and clustered fasta gene sequences, return a list of genes that are interesting to the study.

In [18]:
import pandas as pd
import numpy as np
import os
import glob
from Bio import SeqIO

In [30]:
# Directory that is searched for '*.clustered.fasta' files.
wd = '/home/bay001/projects/kes_20160307/org/03_output/assembly/gene_sequences'
# Build the glob pattern first, then collect every matching path into a list.
clustered_pattern = os.path.join(wd, '*.clustered.fasta')
all_genes = glob.glob(clustered_pattern)

In [31]:
# Reduce each FASTA path to its file name without the '.clustered.fasta'
# suffix; the remainder is used as the gene ID by the cells that follow.
genelist = [
    os.path.basename(fasta_path).replace('.clustered.fasta', '')
    for fasta_path in all_genes
]

In [32]:
# One-column table holding the gene IDs taken from the FASTA file names.
df = pd.DataFrame(genelist)
df.columns = ['Gene ID']

# Lookup table read from the biomart export; empty cells become 'NONAME'.
# The fill happens before the join, so a Gene ID that is absent from this
# table still comes out of the left join below with NaN annotations.
biomart_path = '/home/bay001/projects/kes_20160307/org/00_data/references/biomart/galgal4_biomart.txt'
ensembl2gene = pd.read_table(biomart_path)
ensembl2gene = ensembl2gene.fillna('NONAME')

# Left join: keep every gene that has a FASTA file and attach its annotation.
merged = df.merge(ensembl2gene, how='left', on=['Gene ID'])
merged_path = '/home/bay001/projects/kes_20160307/data/ensembl-genename-description.tsv'
merged.to_csv(merged_path, sep='\t', index=None)
merged

Unnamed: 0,Gene ID,Description,Associated Gene Name
0,ENSGALG00000012304,Gallus gallus nuclear factor of kappa light po...,NFKB1
1,ENSGALG00000027245,glyoxalase domain containing 4 [Source:HGNC Sy...,GLOD4
2,ENSGALG00000025732,Uncharacterized protein [Source:UniProtKB/TrE...,NONAME
3,ENSGALG00000027762,post-GPI attachment to proteins 2 [Source:HGNC...,PGAP2
4,ENSGALG00000028148,PHD finger protein 13 [Source:HGNC Symbol;Acc:...,PHF13
5,ENSGALG00000012623,"Gallus gallus cripto, FRL-1, cryptic family 1B...",CFC1
6,ENSGALG00000023842,Uncharacterized protein [Source:UniProtKB/TrE...,NONAME
7,ENSGALG00000009039,"neural proliferation, differentiation and cont...",NPDC1
8,ENSGALG00000006613,"Gallus gallus cathepsin D (CTSD), mRNA. [Sourc...",CTSD
9,ENSGALG00000004643,Gallus gallus protein-L-isoaspartate (D-aspart...,NONAME


# Look at the genes of interest
- Deiodinase 1 and 2, 
- thyroid stimulating hormone (TSH) (TSHB, TSHZ2, TSHZ1, TSHZ3), 
- EEF1A1, 
- ALB (Albumin), 
- HPRT1, 
- TATA-Box Binding Protein (TBP), 
- YWHAZ, 
- PPIA

In [33]:
# Ensembl gene IDs of the genes of interest.
goi_ids = [
    'ENSGALG00000010736', 'ENSGALG00000026005', 'ENSGALG00000002550',
    'ENSGALG00000007809', 'ENSGALG00000013672', 'ENSGALG00000004528',
    'ENSGALG00000015917', 'ENSGALG00000020180', 'ENSGALG00000006098',
    'ENSGALG00000011171', 'ENSGALG00000010725', 'ENSGALG00000028600',
]
# Wrap the IDs in a one-column frame so they can be joined on 'Gene ID'.
goi = pd.DataFrame({'Gene ID': goi_ids})

# Left join keeps all genes of interest, in the order listed, and pulls in
# whatever annotation `merged` has for each of them.
merged2 = goi.merge(merged, how='left', on='Gene ID')
goi_path = '/home/bay001/projects/kes_20160307/data/ensembl-genename-description-goi.tsv'
merged2.to_csv(goi_path, sep='\t', index=None)

In [34]:
# Display the annotated gene table again (the same `merged` frame that was
# written to ensembl-genename-description.tsv earlier in the notebook).
merged

Unnamed: 0,Gene ID,Description,Associated Gene Name
0,ENSGALG00000012304,Gallus gallus nuclear factor of kappa light po...,NFKB1
1,ENSGALG00000027245,glyoxalase domain containing 4 [Source:HGNC Sy...,GLOD4
2,ENSGALG00000025732,Uncharacterized protein [Source:UniProtKB/TrE...,NONAME
3,ENSGALG00000027762,post-GPI attachment to proteins 2 [Source:HGNC...,PGAP2
4,ENSGALG00000028148,PHD finger protein 13 [Source:HGNC Symbol;Acc:...,PHF13
5,ENSGALG00000012623,"Gallus gallus cripto, FRL-1, cryptic family 1B...",CFC1
6,ENSGALG00000023842,Uncharacterized protein [Source:UniProtKB/TrE...,NONAME
7,ENSGALG00000009039,"neural proliferation, differentiation and cont...",NPDC1
8,ENSGALG00000006613,"Gallus gallus cathepsin D (CTSD), mRNA. [Sourc...",CTSD
9,ENSGALG00000004643,Gallus gallus protein-L-isoaspartate (D-aspart...,NONAME


In [35]:

def get_seq(df, fasta_paths=None):
    """List every contig stored in the clustered FASTA files of the genes in ``df``.

    Each path is reduced to a gene ID by taking its base name and removing the
    '.clustered.fasta' suffix. Files whose gene ID does not occur in
    ``df['Gene ID']`` are skipped without being opened.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a 'Gene ID' column and an 'Associated Gene Name' column.
    fasta_paths : iterable of str, optional
        Paths of the '<gene id>.clustered.fasta' files to scan. When omitted,
        the notebook-level ``all_genes`` list is used.

    Returns
    -------
    pandas.DataFrame
        One row per FASTA record, with the columns 'ensembl', 'name' and
        'contig_name'. When no record is found the frame is empty but still
        carries these three columns.
    """
    if fasta_paths is None:
        # Fall back to the list of paths globbed at the top of the notebook.
        fasta_paths = all_genes

    gene_ids = df['Gene ID']
    # Built once so that the membership test is not repeated work per file.
    known_ids = set(gene_ids)

    records = []
    for gene in fasta_paths:
        ensg = os.path.basename(gene).replace('.clustered.fasta', '')
        if ensg not in known_ids:
            continue

        # Exact comparison: a substring/regex match would also select IDs that
        # merely contain `ensg`, and the name would then span several rows.
        # strip() removes any alignment padding to_string() may add.
        gene_name = (
            df.loc[gene_ids == ensg, 'Associated Gene Name']
            .to_string(index=False)
            .strip()
        )

        # The context manager closes the file even if parsing fails. The
        # default text mode is used; the 'U' flag is not accepted by newer
        # Python versions.
        with open(gene) as handle:
            for record in SeqIO.parse(handle, "fasta"):
                records.append((ensg, gene_name, record.id))

    # Naming the columns at construction keeps an empty result well-formed.
    return pd.DataFrame(records, columns=['ensembl', 'name', 'contig_name'])

In [36]:
# Build the contig table for every annotated gene, then write it out as a
# tab-separated file without the row index.
contig_table = get_seq(merged)
contig_table_path = '/home/bay001/projects/kes_20160307/org/03_output/csvs/contig_and_gene_name.txt'
contig_table.to_csv(contig_table_path, sep='\t', index=None)

In [37]:
# Shell escape: print the first lines of the contig/gene-name table that the
# previous cell wrote, as a quick sanity check of its contents.
! head /home/bay001/projects/kes_20160307/org/03_output/csvs/contig_and_gene_name.txt

ensembl	name	contig_name
ENSGALG00000012304	NFKB1	unmapped-49-contig_list_contig_33037-0
ENSGALG00000012304	NFKB1	EC-4AK111_TAGCTT_R1_(paired)_contig_2427-0
ENSGALG00000012304	NFKB1	unmapped-49-contig_list_contig_117584-0
ENSGALG00000012304	NFKB1	unmapped-49-contig_list_contig_288882-0
ENSGALG00000012304	NFKB1	unmapped-49-contig_list_contig_35316-0
ENSGALG00000027245	GLOD4	unmapped-49-contig_list_contig_44777-0
ENSGALG00000027245	GLOD4	EC-4AK111_TAGCTT_R1_(paired)_contig_2228-0
ENSGALG00000027245	GLOD4	unmapped-49-contig_list_contig_82939-0
ENSGALG00000025732	NONAME	unmapped-49-contig_list_contig_87832-0


In [None]:
# Show the first five clustered FASTA paths returned by the glob.
all_genes[:5]