In [6]:
import pandas as pd

In [7]:
def parse_dat_file(file_path):
    """Parse an ENZYME-style flat file (e.g. ``enzyme.dat``) into a DataFrame.

    Each line is dispatched on its two-letter prefix:

    * ``ID`` / ``DE`` / ``AN`` -- stored as a single stripped string; a later
      line of the same type within one record overwrites the earlier one.
    * ``CC`` / ``CA`` -- a line starting with ``'<code>   -!- '`` opens a new
      entry; any other line of that type continues the current entry and is
      joined to it with a single space.
    * ``//`` -- closes the current record.

    Parameters
    ----------
    file_path : str or path-like
        Path of the flat file to read.

    Returns
    -------
    pandas.DataFrame
        One row per ``//``-terminated record with the columns ``EC_ID``,
        ``EC_AN``, ``EC_DE``, ``EC_CA`` and ``EC_CC``.  Multiple ``CA``/``CC``
        entries of a record are joined with ``'-!-'``; a record without such
        lines gets an empty string.  ``EC_ID``/``EC_AN``/``EC_DE`` are ``None``
        when the record has no corresponding line.  A file without any
        terminated record yields an empty frame that still has all columns.
    """
    columns = ['EC_ID', 'EC_AN', 'EC_DE', 'EC_CA', 'EC_CC']

    def new_record():
        # Fresh dict (and fresh lists) per record so records never share state
        return {'EC_ID': None, 'EC_AN': None, 'EC_DE': None, 'EC_CA': [], 'EC_CC': []}

    def extend_field(current, text):
        # Join continuation text with one space, but do not emit a leading
        # space when the field is still empty (CA fields have no '-!-' opener,
        # so their first line always arrives here).
        if current:
            return current + ' ' + text
        return text

    data = []
    record = new_record()
    cc_line = ''
    ca_line = ''

    with open(file_path, 'r') as file:
        for line in file:
            if line.startswith('ID'):
                record['EC_ID'] = line[3:].strip()
            elif line.startswith('DE'):
                record['EC_DE'] = line[3:].strip()
            elif line.startswith('AN'):
                record['EC_AN'] = line[3:].strip()
            elif line.startswith('CC'):
                if line.startswith('CC   -!- '):
                    # A new CC entry begins: flush the previous one first
                    if cc_line:
                        record['EC_CC'].append(cc_line)
                    cc_line = line[9:].strip()
                else:
                    cc_line = extend_field(cc_line, line[3:].strip())
            elif line.startswith('CA'):
                if line.startswith('CA   -!- '):
                    # A new CA entry begins: flush the previous one first
                    if ca_line:
                        record['EC_CA'].append(ca_line)
                    ca_line = line[9:].strip()
                else:
                    ca_line = extend_field(ca_line, line[3:].strip())
            elif line.startswith('//'):
                # End of record: flush pending CC/CA text, store, and reset
                if cc_line:
                    record['EC_CC'].append(cc_line)
                if ca_line:
                    record['EC_CA'].append(ca_line)
                data.append(record)
                record = new_record()
                cc_line = ''
                ca_line = ''

    # Explicit columns keep the frame well-formed even when no record was read
    df = pd.DataFrame(data, columns=columns)
    # Join the 'EC_CC' and 'EC_CA' entries with '-!-' as the separator
    df['EC_CC'] = df['EC_CC'].apply(lambda entries: '-!-'.join(entries))
    df['EC_CA'] = df['EC_CA'].apply(lambda entries: '-!-'.join(entries))
    return df


# Parse the flat file, normalise every "no value" cell to '-', discard the
# records that carry no EC number, and index the table by EC number.
df = (
    parse_dat_file('enzyme.dat')
    .replace('', '-')
    .fillna('-')
    .loc[lambda table: table['EC_ID'] != '-']
    .set_index('EC_ID')
)
df.head()


Unnamed: 0_level_0,EC_AN,EC_DE,EC_CA,EC_CC
EC_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1.1.1.1,aldehyde reductase.,alcohol dehydrogenase.,(1) a primary alcohol + NAD(+) = an aldehyde ...,Acts on primary or secondary alcohols or hemi-...
1.1.1.2,aldehyde reductase (NADPH).,alcohol dehydrogenase (NADP(+)).,a primary alcohol + NADP(+) = an aldehyde + H...,Some members of this group oxidize only primar...
1.1.1.3,-,homoserine dehydrogenase.,(1) L-homoserine + NAD(+) = H(+) + L-aspartat...,The enzyme from Saccharomyces cerevisiae acts ...
1.1.1.4,diacetyl (acetoin) reductase.,"(R,R)-butanediol dehydrogenase.","(R,R)-butane-2,3-diol + NAD(+) = (R)-acetoin ...",Also converts diacetyl into acetoin with NADH ...
1.1.1.5,-,Transferred entry: 1.1.1.303 and 1.1.1.304.,-,-


In [8]:
# EC_ID is the index of df at this point, so the index must be written;
# with index=False the TSV would contain no EC numbers at all.
df.to_csv('enzyme.tsv', sep='\t')

In [9]:
# Convert the table into a nested dict keyed by the index (the EC number):
# {EC_ID: {'EC_AN': ..., 'EC_DE': ..., 'EC_CA': ..., 'EC_CC': ...}}
import pandas as pd

ec_dict = df.to_dict(orient='index')



In [10]:
# Inspect a transferred entry: only EC_DE is filled, and it names the replacement EC numbers
ec_dict['4.1.1.3']

{'EC_AN': '-',
 'EC_DE': 'Transferred entry: 4.1.1.112 and 7.2.4.2.',
 'EC_CA': '-',
 'EC_CC': '-'}

In [11]:
# Inspect the first replacement entry named by 4.1.1.3
ec_dict['4.1.1.112']

{'EC_AN': 'oxaloacetate carboxy-lyase.',
 'EC_DE': 'oxaloacetate decarboxylase.',
 'EC_CA': ' H(+) + oxaloacetate = CO2 + pyruvate.',
 'EC_CC': "The enzymes from the fish Gadus morhua (Atlantic cod) and the bacterium Micrococcus luteus prefer Mn(2+), while those from the bacteria Pseudomonas putida and Pseudomonas aeruginosa prefer Mg(2+).-!-Unlike EC 7.2.4.2, there is no evidence of the enzyme's involvement in Na(+) transport.-!-Formerly EC 4.1.1.3."}

In [12]:
# Inspect the second replacement entry named by 4.1.1.3
ec_dict['7.2.4.2']

{'EC_AN': 'oxaloacetate beta-decarboxylase.',
 'EC_DE': 'oxaloacetate decarboxylase (Na(+) extruding).',
 'EC_CA': ' H(+) + 2 Na(+)(in) + oxaloacetate = CO2 + 2 Na(+)(out) + pyruvate.',
 'EC_CC': 'The enzyme from Klebsiella aerogenes is a biotinyl protein and also decarboxylates glutaconyl-CoA and methylmalonyl-CoA.-!-The process is accompanied by the extrusion of two sodium ions from cells.-!-Some animal enzymes require Mn(2+).-!-Differs from EC 4.1.1.112 (oxaloacetate decarboxylase) for which there is no evidence for involvement in Na(+) transport.-!-Formerly EC 4.1.1.3.'}

In [13]:
# Load the tab-separated annotation table, then preview its first rows and its shape
df2 = pd.read_csv(
    'C:/Users/max/Desktop/MetaX_Suite/taxafunc_anno_knn.tsv',
    sep='\t',
)
display(df2.head(), df2.shape)

Unnamed: 0,Sequence,Proteins,LCA_level,Taxon,Taxon_prop,eggNOG_OGs,eggNOG_OGs_prop,max_annot_lvl,max_annot_lvl_prop,COG_category,...,ser_H_3,ser_L_1,ser_L_2,ser_L_4,vlf_H_1,vlf_H_2,vlf_H_3,vlf_L_1,vlf_L_2,vlf_L_3
0,(3sulfo)ADLPFLMPIEDVFTITGR,MGYG000002102_00092,species,d__Bacteria|p__Firmicutes_A|c__Clostridia|o__O...,1.0,"COG0050@1|root,COG0050@2|Bacteria,1TPKC@1239|F...",1.0,186801|Clostridia,1.0,J,...,0.0,2007352.0,1273643.0,114212.8,0.0,0.0,0.0,0.0,0.0,0.0
1,(3sulfo)DMVDDEELIDLVEMETR,MGYG000004523_00251;MGYG000003361_00615;MGYG00...,domain,d__Bacteria,1.0,"COG0050@1|root,COG0050@2|Bacteria,2GK4T@201174...",0.75,84998|Coriobacteriia,0.75,J,...,824764.0,680870.0,1019364.0,1289167.2,231216.0,360142.0,474562.0,299869.0,97449.0,641959.4
2,(3sulfo)LLESGPSAIVDITNEQQIAETVSK,MGYG000000044_00200,species,d__Bacteria|p__Bacteroidota|c__Bacteroidia|o__...,1.0,"COG0451@1|root,COG0451@2|Bacteria,4NE3U@976|Ba...",1.0,976|Bacteroidetes,1.0,GM,...,981461.0,466142.2,664142.0,341779.0,1252034.0,1440497.0,901770.0,1603674.0,1713587.0,781883.0
3,(3sulfo)PAAAAVAVAAGPAAGGAAAAEEKSSFDVVLAEVGGAK,MGYG000002080_00533;MGYG000004456_02011,genus,d__Bacteria|p__Bacteroidota|c__Bacteroidia|o__...,1.0,"COG0222@1|root,COG0222@2|Bacteria,4NQAQ@976|Ba...",1.0,976|Bacteroidetes,1.0,J,...,0.0,0.0,0.0,0.0,751447.6,0.0,0.0,2042882.0,836183.0,42615.0
4,(3sulfo)PDPARESDKPFLM(Oxidation)PVEDVFTISGR,MGYG000003253_01088,species,d__Bacteria|p__Firmicutes_A|c__Clostridia_A|o_...,1.0,"COG0050@1|root,COG0050@2|Bacteria,1TPKC@1239|F...",1.0,186801|Clostridia,1.0,J,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


(162635, 113)

In [17]:
import pandas as pd

def lookup_and_join(ec_nums, column_name, ec_lookup=None):
    """Collect one annotation column for a list of EC numbers and join the values.

    Entries whose ``EC_DE`` text starts with ``'Transferred entry:'`` are
    replaced by the entries they point to (followed recursively), whichever
    column is requested.  The input list is never modified.

    Parameters
    ----------
    ec_nums : iterable of str
        EC numbers to look up; surrounding whitespace is ignored.
    column_name : str
        Key to read from each entry, e.g. ``'EC_DE'``, ``'EC_AN'``,
        ``'EC_CC'`` or ``'EC_CA'``.
    ec_lookup : dict, optional
        Mapping ``EC number -> {column: value}``.  Defaults to the
        notebook-level ``ec_dict``.

    Returns
    -------
    str
        The distinct values in first-seen order joined with ``' | '``, or
        ``'-'`` when there are no values or every value is ``'-'``.  EC numbers
        that are absent from the mapping contribute ``'-'``.

    Raises
    ------
    KeyError
        If ``column_name`` is not a key of a looked-up entry.
    """
    if ec_lookup is None:
        ec_lookup = ec_dict

    transfer_prefix = 'Transferred entry:'

    def collect(ec_num, seen):
        # Return the column values contributed by ec_num, following transfers.
        ec_num = ec_num.strip()
        if ec_num in seen:
            # Already handled (duplicate input or a circular transfer chain)
            return []
        seen.add(ec_num)

        entry = ec_lookup.get(ec_num)
        if entry is None:
            # Unknown EC number: use the table's "no information" placeholder
            return ['-']

        description = entry.get('EC_DE') or ''
        if description.startswith(transfer_prefix):
            # Targets are written as "a and b." or "a, b and c."
            targets = description[len(transfer_prefix):].replace(' and ', ',').split(',')
            found = []
            for target in targets:
                target = target.strip(' .')
                if target:
                    found.extend(collect(target, seen))
            return found

        return [entry[column_name]]

    values = []
    seen = set()
    for ec_num in ec_nums:
        values.extend(collect(ec_num, seen))

    # Join the values with the " | " separator except all values are "-"
    if all(value == '-' for value in values):
        return '-'
    # dict.fromkeys removes duplicates while keeping first-seen order
    return ' | '.join(dict.fromkeys(values))


# Annotation columns to add; EC_DE comes first, matching the original lookup order
ec_columns = ['EC_DE', 'EC_AN', 'EC_CC', 'EC_CA']

# Rows with a usable EC annotation (a missing value is treated like "unknown")
mask_EC = df2['EC'].notna() & (df2['EC'] != 'unknown')

# Many rows share the same EC string, so resolve every distinct string once
# instead of repeating the lookups row by row.
ec_annotations = {}
for ec_value in df2.loc[mask_EC, 'EC'].unique():
    # Split the 'EC' value into multiple EC numbers; the same list object is
    # handed to all four lookups, in column order.
    ec_nums = ec_value.split(',')
    ec_annotations[ec_value] = {
        column_name: lookup_and_join(ec_nums, column_name)
        for column_name in ec_columns
    }

# Map the resolved annotations back onto the rows; rows without a usable EC get "-"
for column_name in ec_columns:
    per_value = {ec_value: found[column_name] for ec_value, found in ec_annotations.items()}
    df2[column_name] = df2['EC'].where(mask_EC).map(per_value).fillna('-')

# Set the new '_prop' columns' values to the values in the 'EC_prop' column
for column_name in ec_columns:
    df2[column_name + '_prop'] = df2['EC_prop']

# Normalise remaining missing values and empty strings to "-"
df2 = df2.fillna('-').replace('', '-')


df2.head()


Unnamed: 0,Sequence,Proteins,LCA_level,Taxon,Taxon_prop,eggNOG_OGs,eggNOG_OGs_prop,max_annot_lvl,max_annot_lvl_prop,COG_category,...,vlf_L_2,vlf_L_3,EC_DE,EC_AN,EC_CC,EC_CA,EC_DE_prop,EC_AN_prop,EC_CC_prop,EC_CA_prop
0,(3sulfo)ADLPFLMPIEDVFTITGR,MGYG000002102_00092,species,d__Bacteria|p__Firmicutes_A|c__Clostridia|o__O...,1.0,"COG0050@1|root,COG0050@2|Bacteria,1TPKC@1239|F...",1.0,186801|Clostridia,1.0,J,...,0.0,0.0,-,-,-,-,1.0,1.0,1.0,1.0
1,(3sulfo)DMVDDEELIDLVEMETR,MGYG000004523_00251;MGYG000003361_00615;MGYG00...,domain,d__Bacteria,1.0,"COG0050@1|root,COG0050@2|Bacteria,2GK4T@201174...",0.75,84998|Coriobacteriia,0.75,J,...,97449.0,641959.4,-,-,-,-,1.0,1.0,1.0,1.0
2,(3sulfo)LLESGPSAIVDITNEQQIAETVSK,MGYG000000044_00200,species,d__Bacteria|p__Bacteroidota|c__Bacteroidia|o__...,1.0,"COG0451@1|root,COG0451@2|Bacteria,4NE3U@976|Ba...",1.0,976|Bacteroidetes,1.0,GM,...,1713587.0,781883.0,-,-,-,-,1.0,1.0,1.0,1.0
3,(3sulfo)PAAAAVAVAAGPAAGGAAAAEEKSSFDVVLAEVGGAK,MGYG000002080_00533;MGYG000004456_02011,genus,d__Bacteria|p__Bacteroidota|c__Bacteroidia|o__...,1.0,"COG0222@1|root,COG0222@2|Bacteria,4NQAQ@976|Ba...",1.0,976|Bacteroidetes,1.0,J,...,836183.0,42615.0,-,-,-,-,1.0,1.0,1.0,1.0
4,(3sulfo)PDPARESDKPFLM(Oxidation)PVEDVFTISGR,MGYG000003253_01088,species,d__Bacteria|p__Firmicutes_A|c__Clostridia_A|o_...,1.0,"COG0050@1|root,COG0050@2|Bacteria,1TPKC@1239|F...",1.0,186801|Clostridia,1.0,J,...,0.0,0.0,-,-,-,-,1.0,1.0,1.0,1.0


In [15]:
# Re-check the shape: the annotation step adds 8 columns (4 EC_* columns plus their 4 '_prop' columns)
display(df2.shape)

(162635, 121)

In [18]:
# Write the annotated table as tab-separated text, without the row index
df2.to_csv("C:/Users/max/Desktop/MetaX_Suite/3.tsv", sep="\t", index=False)