In [57]:
from pyteomics import *
from Bio import SeqIO
from pyteomics.mass import *
from periodictable import H,C,N,O,S
import pandas as pd

# JGI protein IDs of candidate antioxidant enzymes, grouped by enzyme family.
# frag_* IDs are searched for in the Fracy1 FASTA, phae_* IDs in the Phatr2 FASTA.
frag_ccp = [159013, 177704, 209911, 278813]
frag_apx = [152257, 173923, 201562, 222358, 249103, 258192, 261588, 278270]
frag_catalase = [235174, 247027, 249423]
frag_cuznsod = [235627, 235657, 269494, 273091]
frag_nisod = [267154, 273517]
frag_mnfesod = [185706, 203256, 239458]

phae_ccp = [13174, 13244, 1023]
phae_apx = [18572, 46616, 47395, 54731, 1645]
phae_cat = [22418]
phae_cuznsod = [15852, 47310]
phae_mnfesod = [12583, 42832]

# (label, ids) pairs in concatenation order. The label lists and the flat ID
# lists are both derived from these pairs, so position i of one always
# describes position i of the other.
_phae_groups = [
    ("CCP", phae_ccp),
    ("APX", phae_apx),
    ("CAT", phae_cat),
    ("CuZnSOD", phae_cuznsod),
    ("MnFeSOD", phae_mnfesod),
]
_frag_groups = [
    ("CCP", frag_ccp),
    ("APX", frag_apx),
    ("CAT", frag_catalase),
    ("CuZnSOD", frag_cuznsod),
    ("NiSOD", frag_nisod),
    ("MnFeSOD", frag_mnfesod),
]

# One label per ID, repeated once for every member of the group.
phae_all_names = [label for label, ids in _phae_groups for gene_id in ids]
frag_all_names = [label for label, ids in _frag_groups for gene_id in ids]

# Flat ID lists, group after group.
phae_all = [gene_id for label, ids in _phae_groups for gene_id in ids]
frag_all = [gene_id for label, ids in _frag_groups for gene_id in ids]

# Wrap each ID in pipes so a substring search of a FASTA header such as
# "jgi|Fracy1|235174|..." only hits the complete ID field.
phae_all_str = [f"|{gene_id}|" for gene_id in phae_all]
frag_all_str = [f"|{gene_id}|" for gene_id in frag_all]

In [64]:

def make_summary_antiox_table(prot_list, prot_name_list, file_location):

    '''
    Collect the FASTA records whose name contains one of the supplied ID strings.

    Parameters
    ----------
    prot_list : list of str
        ID search strings, tested as substrings of each record name. The
        callers in this notebook wrap each ID in pipes (e.g. "|235174|").
    prot_name_list : list of str
        Antioxidant label for each entry of prot_list, in the same order.
    file_location : str
        Path of the protein FASTA file to scan.

    Returns
    -------
    pandas.DataFrame
        One row per matching record, in file order, with the columns
        'gene_id' (record name), 'anti' (label of the first matching entry of
        prot_list), 'sequence_lengths' (number of residues) and 'sequences'.
        Entries of prot_list that match no record produce no row.
    '''

    column_order = ['gene_id', 'anti', 'sequence_lengths', 'sequences']
    rows = []

    for seq_record in SeqIO.parse(file_location, 'fasta'):
        prot_name = seq_record.name

        # Position of the first ID string found in this record name, if any.
        match_index = next(
            (i for i, id_string in enumerate(prot_list) if id_string in prot_name),
            None,
        )
        if match_index is None:
            continue

        # Exactly one row per matching record: a repeated ID in prot_list must
        # not add extra labels, otherwise the columns drift out of alignment.
        prot_sequence = str(seq_record.seq)
        rows.append({
            'gene_id': prot_name,
            'anti': prot_name_list[match_index],
            'sequence_lengths': len(prot_sequence),
            'sequences': prot_sequence,
        })

    # Building from records keeps a per-column dtype (integer lengths) and the
    # explicit column list keeps the four columns even when nothing matched.
    return pd.DataFrame(rows, columns=column_order)

In [65]:
# Protein FASTA file scanned for each set of IDs.
phae_fasta = (
    "../data/protein_expression_data/phytoplankton_genomes/phaeo_genome/"
    "Phatr2_chromosomes_geneModels_FilteredModels2_cat_unmapped_GeneModels_FilteredModels1_aa.fasta"
)
frag_fasta = (
    "../data/protein_expression_data/phytoplankton_genomes/frag_genome/"
    "Fracy1_GeneModels_FilteredModels1_aa.fasta"
)

# One summary table per FASTA file: record name, antioxidant label, length, sequence.
phae_summary = make_summary_antiox_table(
    prot_list=phae_all_str,
    prot_name_list=phae_all_names,
    file_location=phae_fasta,
)
frag_summary = make_summary_antiox_table(
    prot_list=frag_all_str,
    prot_name_list=frag_all_names,
    file_location=frag_fasta,
)


In [66]:
# Show the Fracy1 summary table. Only IDs found in a FASTA record name produce
# a row, so the table can have fewer rows than there are entries in frag_all_str.
frag_summary

Unnamed: 0,gene_id,anti,sequence_lengths,sequences
0,jgi|Fracy1|235174|fgenesh2_pg.2_#_1282,CAT,1055,MIYIHIHIGTGTGIVIGIGIVIGIGIGTDTDTGIGIGIGIGIGTGI...
1,jgi|Fracy1|209911|estExt_Genewise1Plus.C_90733,CCP,324,MIFNYARFANRLAAPAVFGSTLALYSLQDVHAKEATVDMNKVRDAI...
2,jgi|Fracy1|185706|e_gw1.6.866.1,MnFeSOD,232,MSKTLAVGTALVVPALANAYELPDLPYPFEALEPFIDAPTMKIHHD...
3,jgi|Fracy1|239458|fgenesh2_pg.6_#_860,MnFeSOD,223,MSGGKIILDKALRSAVNLSSSKLPSEVAKSSAKLFSALSGTKFVLP...
4,jgi|Fracy1|177704|e_gw1.1.1902.1,CCP,272,MTYDVDAVRQEIRSLLNNPSWDDGSLAPVFLRLAWHSSGTYDAASG...
5,jgi|Fracy1|267154|estExt_fgenesh2_kg.C_10134,NiSOD,206,MQIFVKTLTGKTITLDVEPSDTIDNVKTKIQDKEGIPPDQQRLIFA...
6,jgi|Fracy1|247027|fgenesh2_pg.19_#_200,CAT,1105,MDGNGDDDDNYKAIANNNNDSYFVNLCDDNDNDNEEESWEQKQLKR...
7,jgi|Fracy1|261588|estExt_fgenesh2_pg.C_70253,APX,726,MADQSVKKCPFLHPGNGTTNSHWWKNNLNIKILNQNNERTDPMPSD...
8,jgi|Fracy1|269494|estExt_fgenesh2_kg.C_70325,CuZnSOD,163,MATCVCVFISEGSSGVTGSISLVQNQEDSPTVIEGQLRGLTPNQRH...
9,jgi|Fracy1|278270|estExt_fgenesh2_pm.C_490021,APX,688,MPSDFNYATEFGKLDLASLKNDLTTLMTDSQDFWPADYGHYGPFFI...


In [67]:
# Save both summary tables as CSV. With the default to_csv arguments the
# DataFrame index is written as an unnamed first column.
for summary_table, csv_path in (
    (phae_summary, '../data/protein_expression_data/phae_anti_prot_lengths.csv'),
    (frag_summary, '../data/protein_expression_data/frag_anti_prot_lengths.csv'),
):
    summary_table.to_csv(csv_path)