In [31]:
import os
import pathlib
import pprint
from Bio import SeqIO
import gffutils
import pandas

In [19]:
def parse_gff3(gff_path):
    """Summarise the sequence IDs and CDS tag prefixes found in a GFF3 file.

    Blank lines and lines starting with "#" are ignored, as are rows with
    fewer than nine tab-separated columns. The first column of every
    remaining row is recorded as a contig. For rows whose third column is
    "CDS", the ninth column is read as ``key=value`` pairs separated by ";"
    (a repeated key keeps its last value), and the text before the first
    underscore of the ``locus_tag`` and ``ID`` values is recorded.

    Parameters
    ----------
    gff_path : str or path-like
        Location of the GFF3 file to read.

    Returns
    -------
    tuple of (list, list, list)
        Sorted contig names, sorted ``locus_tag`` prefixes and sorted ``ID``
        prefixes, in that order.
    """
    seen_contigs = set()
    seen_locus_prefixes = set()
    seen_id_prefixes = set()

    # Attribute key -> set that collects the prefix of that attribute's value.
    prefix_targets = (
        ("locus_tag", seen_locus_prefixes),
        ("ID", seen_id_prefixes),
    )

    with pathlib.Path(gff_path).open() as handle:
        for raw_line in handle:
            if raw_line.startswith("#") or not raw_line.strip():
                continue

            columns = raw_line.rstrip("\n").split("\t")
            if len(columns) < 9:
                continue

            # Every well-formed feature row contributes its sequence ID,
            # whatever the feature type.
            seen_contigs.add(columns[0])

            if columns[2] == "CDS":
                # Fields without "=" are dropped; later duplicates override
                # earlier ones.
                attr_map = dict(
                    field.split("=", 1)
                    for field in columns[8].split(";")
                    if "=" in field
                )
                for attr_name, bucket in prefix_targets:
                    if attr_name in attr_map:
                        bucket.add(attr_map[attr_name].split("_", 1)[0])

    return (
        sorted(seen_contigs),
        sorted(seen_locus_prefixes),
        sorted(seen_id_prefixes),
    )



In [58]:
# Relative directory that is searched for *.gff3 annotation files below.
annotations_dir = pathlib.Path("fixed_annotations")

In [59]:
# All GFF3 files directly inside annotations_dir. The matches are sorted so the
# file order (and every list later built by iterating over it) is the same on
# every run instead of following the filesystem's directory listing order.
unclean_gffs = sorted(annotations_dir.glob("*.gff3"))

In [5]:
# Show the first collected GFF3 path as a quick sanity check.
unclean_gffs[0]

PosixPath('fixed_annotations/URI44H.gff3')

In [36]:
# Paths of the two tab-separated lookup tables, relative to the working directory.
locus_tag_prefix_to_BSID = 'locus_tag_prefixes.txt'
name_to_BSID = 'biosample_to_name.txt'
# Load both tables; they are joined on their shared 'BioSample' column in a
# later cell.
ltp2bs_df = pandas.read_csv(locus_tag_prefix_to_BSID, sep='\t')
n2bs_df = pandas.read_csv(name_to_BSID, sep='\t')

In [47]:
# Left join on 'BioSample': every row of the name table is kept, and rows with
# no matching entry in the locus-tag prefix table get NaN in its columns.
m_df = n2bs_df.merge(ltp2bs_df, how='left', on='BioSample')

[nan, nan, nan, nan, nan, 'FIT27', 'FIT28', 'FIT34', 'FIT48', 'FIS58', 'FIS68', 'FIS67', 'FIS72', 'FIS81', 'FIS78', 'FIT30', 'FIT42', nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, 'FIT14', 'FIT15', 'FIT16', 'FIT24', 'FIT32', 'FIT33', 'FIT40', 'FIT41', 'FIT44', 'FIS73', 'FIS79', 'FIS82', 'FIS59', 'FIS60', 'FIS63', 'FIS64', 'FIS69', 'FIS74', 'FIS75', 'FIS76', 'FIS87', 'FIT13', 'FIT17', 'FIT20', 'FIT22', 'FIT29', 'FIT31', nan, nan, nan, nan, nan]


In [86]:
# Renumber the merged rows from 0 (the previous index is discarded, not kept as
# a column), then save the table as a tab-separated file without an index column.
m_df = m_df.reset_index(drop=True)
m_df.to_csv('locus_tag_to_asm_name.tsv', sep='\t', index=False)

In [87]:
# Column store for the old -> new locus-tag prefix table. 'isolate' and 'new'
# are filled here, one entry per row of m_df; 'old' starts empty and is filled
# from the GFF3 files in a later cell.
locus_tag_dict = {'isolate': [], 'old': [], 'new': []}
for _, row in m_df.iterrows():
    locus_tag_dict['isolate'].append(row['isolate'])
    new_prefix = row.get('LocusTagPrefix', 'NotFound')
    # row.get() only falls back to its default when the column itself is
    # absent. Rows left unmatched by the left merge hold NaN instead, so map
    # those to the same 'NotFound' sentinel explicitly.
    if pandas.isna(new_prefix):
        new_prefix = 'NotFound'
    locus_tag_dict['new'].append(new_prefix)

In [88]:
# Record the locus-tag prefix currently used in each GFF3 file.
# NOTE(review): entries follow the order of `unclean_gffs`, while the 'isolate'
# and 'new' columns follow the row order of `m_df`. Nothing here ties a file to
# an isolate, so confirm both orders (and lengths) agree before trusting the
# old/new pairing.
old_prefixes = []
for gff in unclean_gffs:
    contigs, locus_pref, id_pref = parse_gff3(gff)
    # parse_gff3 returns the prefixes already sorted, so taking the first one is
    # deterministic. A file without any locus-tagged CDS gets the 'NotFound'
    # sentinel instead of raising IndexError.
    old_prefix = locus_pref[0] if locus_pref else 'NotFound'
    old_prefixes.append(old_prefix)
# Assign rather than append so re-running this cell does not duplicate entries.
locus_tag_dict['old'] = old_prefixes

In [89]:
# One column per key of locus_tag_dict ('isolate', 'old', 'new'); pandas raises
# ValueError here if the three lists differ in length.
lt_df = pandas.DataFrame(locus_tag_dict)

In [90]:
# Renumber rows from 0; drop=True discards the previous index instead of
# inserting it as a column.
lt_df = lt_df.reset_index(drop=True)

In [91]:
# Save the prefix map as a tab-separated file without an index column.
lt_df.to_csv('locus_tag_map.tsv', sep='\t', index=False)