In [1]:
from pyteomics import *
from Bio import SeqIO
from pyteomics.mass import *
from periodictable import H,C,N,O,S
import pandas as pd

In [2]:

def get_stoich_antioxi(file_location,
                      protein_name):
    '''
    Build a per-protein composition table from a FASTA file.

    Every record in the file is read with ``SeqIO.parse``; its sequence, with
    any '*' characters removed, is handed to ``Composition`` and the resulting
    mapping becomes one row of the output table.

    Parameters
    ----------
    file_location
        FASTA source passed straight to ``SeqIO.parse``.
    protein_name
        Label written to the ``prot_name`` column of every row.

    Returns
    -------
    pandas.DataFrame
        One row per FASTA record. The leading columns are the keys of the
        ``Composition`` objects (presumably chemical elements -- confirm
        against the pyteomics documentation), followed by ``Gene_ID``
        (``record.name``), ``Sequence_Length``, ``Sequence`` and ``prot_name``.
        Keys absent from a given record are left as NaN by pandas.
    '''
    compositions = []
    sequences = []
    gene_ids = []

    # go through each sequence in the given file
    for seq_record in SeqIO.parse(file_location, 'fasta'):
        prot_sequence = str(seq_record.seq)
        # '*' is stripped only for the composition; the stored sequence and
        # its length below keep the record exactly as it appears in the file.
        compositions.append(Composition(prot_sequence.replace('*', '')))
        sequences.append(prot_sequence)
        gene_ids.append(seq_record.name)

    stoich_table = pd.DataFrame(compositions)
    stoich_table['Gene_ID'] = gene_ids
    # Length of the raw sequence, i.e. any '*' characters are counted.
    stoich_table['Sequence_Length'] = [len(sequence) for sequence in sequences]
    stoich_table['Sequence'] = sequences
    stoich_table['prot_name'] = protein_name

    return stoich_table

def get_multiple_antiox(file_location_list,
                        protein_name_list):

    '''
    Run ``get_stoich_antioxi`` on several FASTA files and stack the results.

    Parameters
    ----------
    file_location_list
        Sequence of FASTA locations, one per protein family.
    protein_name_list
        Sequence of labels; entry ``i`` is used for ``file_location_list[i]``.
        Must be at least as long as ``file_location_list`` (a shorter list
        raises ``IndexError``).

    Returns
    -------
    pandas.DataFrame
        The per-file tables concatenated in input order. Row labels are kept
        from each per-file table (no index reset). An empty
        ``file_location_list`` yields an empty DataFrame.
    '''

    frames = []
    for index, file_location in enumerate(file_location_list):
        # Progress output for every file after the first one.
        if index > 0:
            print(index)
            print(file_location)
            print(protein_name_list[index])
        frames.append(get_stoich_antioxi(file_location=file_location,
                                         protein_name=protein_name_list[index]))

    if not frames:
        return pd.DataFrame()

    # DataFrame.append was removed in pandas 2.0; a single concat gives the
    # same rows, columns and index without re-copying the table per file.
    return pd.concat(frames)
    

In [3]:
# df1 = cyto_test = get_stoich_antioxi('../data/antiox_gene_name_lists/cyto_c_gene_id.fasta',
#                       protein_name = 'cytochrome')
# df2 = get_stoich_antioxi('../data/antiox_gene_name_lists/cat_gene_id.fasta',
#                       protein_name = 'catalase')

# df3 = df1.append(df2)
# range(len(['../data/antiox_gene_name_lists/sod_gene_id.fasta',
#                                          '../data/antiox_gene_name_lists/APX_gene_id.fasta',
#                                          '../data/antiox_gene_name_lists/cat_gene_id.fasta',
#                                          '../data/antiox_gene_name_lists/gpx_gene_id.fasta',
#                                          '../data/antiox_gene_name_lists/cyto_c_gene_id.fasta',
#                                          '../data/antiox_gene_name_lists/prx_gene_id.fasta']) - 1)

>tr|B6DMH6|B6DMH6_9STRA Ferritin (Fragment) OS=Pseudo-nitzschia multiseries OX=37319 GN=FTN PE=1 SV=1
MKSPFFFLSALALTLRDSSPSFATAFRLAVTRCARQGIHAPSSSSSSSSRCLVASASALA
GPSEELLDLFNRQVTQEFTASQVYLSASIWFDQNDWEGMAAYMLAESAEEREHGLGFVDF
ANKRNIPIELQAVPAPVSCAEWSSPEDVWQSILELEQANTRSLLNLAEAASTCHDFAVMA
FLNPFHLQQVNEEDKIGSILAKVTDENRTPGLLRSLDVVSFLGPCLFRSV

In [6]:
# Each entry pairs a protein label with the FASTA file holding its sequences,
# so a label can never drift out of step with its file.
antiox_fasta_sources = [
    ('SOD', '../data/antiox_gene_name_lists/sod_gene_id.fasta'),
    ('APX', '../data/antiox_gene_name_lists/APX_gene_id.fasta'),
    ('CAT', '../data/antiox_gene_name_lists/cat_gene_id.fasta'),
    ('GPX', '../data/antiox_gene_name_lists/gpx_gene_id.fasta'),
    ('Cyto C', '../data/antiox_gene_name_lists/cyto_c_gene_id.fasta'),
    ('Prx', '../data/antiox_gene_name_lists/prx_gene_id.fasta'),
    ('Ferritin', '../data/pseudonitzschia_ferritin.fasta'),
]

all_antiox_stoich = get_multiple_antiox(
    file_location_list=[fasta_path for _, fasta_path in antiox_fasta_sources],
    protein_name_list=[label for label, _ in antiox_fasta_sources],
)

1
../data/antiox_gene_name_lists/APX_gene_id.fasta
APX
2
../data/antiox_gene_name_lists/cat_gene_id.fasta
CAT
3
../data/antiox_gene_name_lists/gpx_gene_id.fasta
GPX
4
../data/antiox_gene_name_lists/cyto_c_gene_id.fasta
Cyto C
5
../data/antiox_gene_name_lists/prx_gene_id.fasta
Prx
6
../data/pseudonitzschia_ferritin.fasta
Ferritin


In [7]:
# Persist the combined table as CSV. index=False is not passed, so the
# DataFrame index is written as the first (unnamed) column of the file.
all_antiox_stoich.to_csv("../data/tara_stoichiometry_antiox.csv")

echo
