In [1]:
from reportlab.lib import colors
from reportlab.lib.units import cm
from Bio.Graphics import GenomeDiagram
from Bio import SeqIO
from Bio.SeqFeature import SeqFeature, FeatureLocation

# Load the single-record GenBank file that is going to be drawn.
record = SeqIO.read("MPMM01_MPMM_027.gbk", "genbank")

# A diagram holds tracks and a track holds feature sets; one of each is enough.
gd_diagram = GenomeDiagram.Diagram(record.id)
gd_track_for_features = gd_diagram.new_track(1, name="Annotated Features")
gd_feature_set = gd_track_for_features.new_set()

# Draw every "gene" feature as an arrow; all other feature types are skipped.
# The colour alternates with the number of features already in the set so
# that neighbouring genes can be told apart.
for feature in record.features:
    if feature.type == "gene":
        color = colors.blue if len(gd_feature_set) % 2 == 0 else colors.lightblue
        gd_feature_set.add_feature(feature, sigil="ARROW",
                                   color=color, label=True,
                                   label_size=14, label_angle=0)

# Strandless example features: restriction enzyme recognition sites, each
# enzyme drawn (and labelled) in its own colour.
restriction_sites = [
    ("GAATTC", "EcoRI", colors.green),
    ("CCCGGG", "SmaI", colors.orange),
    ("AAGCTT", "HindIII", colors.red),
    ("GGATCC", "BamHI", colors.purple),
]
for site, name, color in restriction_sites:
    site_length = len(site)
    index = record.seq.find(site, start=0)
    while index != -1:
        feature = SeqFeature(FeatureLocation(index, index + site_length))
        gd_feature_set.add_feature(feature, color=color, name=name,
                                   label=True, label_size=10,
                                   label_color=color)
        # Resume the search just past this hit (overlapping hits are skipped).
        index = record.seq.find(site, start=index + site_length)

# Linear layout, split over four fragments on an A4 page.
gd_diagram.draw(format="linear", pagesize='A4', fragments=4,
                start=0, end=len(record))
for output_path, output_format in [
    ("/home/minion/projects/genebanks/plasmid_linear_nice.pdf", "PDF"),
    ("/home/minion/projects/genebanks/plasmid_linear_nice.eps", "EPS"),
    ("/home/minion/projects/genebanks/plasmid_linear_nice.svg", "SVG"),
]:
    gd_diagram.write(output_path, output_format)

# Circular layout on a 20 cm x 20 cm page.
gd_diagram.draw(format="circular", circular=True, pagesize=(20*cm, 20*cm),
                start=0, end=len(record), circle_core=0.5)
for output_path, output_format in [
    ("/home/minion/projects/genebanks/plasmid_circular_nice.pdf", "PDF"),
    ("/home/minion/projects/genebanks/plasmid_circular_nice.eps", "EPS"),
    ("/home/minion/projects/genebanks/plasmid_circular_nice.svg", "SVG"),
]:
    gd_diagram.write(output_path, output_format)

In [None]:
# Reference links (kept as comments so this cell does not raise a SyntaxError when run):
# http://biopython.org/DIST/docs/tutorial/Tutorial.html#htoc262
# http://wilkelab.org/classes/SDS348/2015_spring_discussions/discussion_4.1.15.html

In [None]:
# Import the required components of BioPython
# Entrez allows us to collect data from NCBI
# SeqIO allows us to input and output sequence files
from Bio import Entrez, SeqIO

Entrez.email = "agb214@exeter.ac.uk" # define email to keep the NIH happy

# NCBI ID for human hemoglobin
hemo_id = "NM_000558.4"

# Fetch a record from NCBI (this performs a network request)
hemo_handle = Entrez.efetch(db="nucleotide", id=hemo_id, rettype="gb", retmode="text")

# Parse the handle, and always close it afterwards so the underlying network
# connection is released even if parsing fails
try:
    hemo_record = SeqIO.read(hemo_handle, "genbank")
finally:
    hemo_handle.close()

# Use a print statement (!!!) to make sure we actually fetched the record properly
print(hemo_record)

In [None]:
from reportlab.lib import colors
from reportlab.lib.units import cm
from Bio.Graphics import GenomeDiagram
from Bio import SeqIO
import os
directory = '/home/minion/projects/genebanks/viral_contigs/gbk_files'

# Render one linear PDF per single-record GenBank file found in `directory`.
# The PDF is named after the input file and written to the current working
# directory.
for filename in os.listdir(directory):
    if not filename.endswith(".gbk"):
        continue
    new_name = os.path.splitext(filename)[0]
    print(filename)
    # os.listdir() returns bare file names, so the directory has to be joined
    # back on; otherwise the read only works when the notebook happens to be
    # running from inside `directory`.
    record = SeqIO.read(os.path.join(directory, filename), "genbank")
    gd_diagram = GenomeDiagram.Diagram(new_name)
    gd_track_for_features = gd_diagram.new_track(1, name="Annotated Features")
    gd_feature_set = gd_track_for_features.new_set()

    # Only "gene" features are drawn, in alternating shades of blue.
    for feature in record.features:
        if feature.type != "gene":
            continue
        if len(gd_feature_set) % 2 == 0:
            color = colors.blue
        else:
            color = colors.lightblue
        gd_feature_set.add_feature(feature, sigil="ARROW", color=color, label=True, label_size=10)

    # Draw and write once per file, after all genes have been added. Doing
    # this inside the feature loop re-rendered the PDF for every gene and
    # produced no output at all for records without gene features.
    # NOTE(review): end=200000 is a fixed scale shared by all files; anything
    # beyond that coordinate is not shown - confirm this is intended.
    gd_diagram.draw(format="linear", orientation="landscape", pagesize='A1', fragments=25, start=0, end=200000)
    gd_diagram.write(new_name + ".pdf", "PDF")

In [None]:
# This code is part of the Biopython distribution and governed by its
# license.  Please see the LICENSE file that should have been included
# as part of this package.
#

"""GenomeDiagram script to mimic Proux et al 2002 Figure 6.

You can use the Entrez module to download the 3 required GenBank files

This is an extended version of the example in the Biopython Tutorial
which produces a GenomeDiagram figure close to Proux et al 2002 Figure 6.

See https://doi.org/10.1128/JB.184.21.6026-6036.2002
"""
from reportlab.lib import colors
from reportlab.lib.colors import red, grey, orange, green, brown
from reportlab.lib.colors import blue, lightblue, purple

from Bio.Graphics import GenomeDiagram
from Bio.Graphics.GenomeDiagram import CrossLink

from Bio import SeqIO
from Bio.SeqFeature import SeqFeature, FeatureLocation

name = "Proux Fig 6"

# As explained in the Biopython Tutorial, these are three phage genomes. The
# first two are self-contained GenBank files. The third phage is integrated
# into a bacterial genome, so the full record is sliced down to the prophage
# region and reverse complemented to match the strand orientation of the
# other two phage.

A_rec = SeqIO.read("NC_002703.gbk", "gb")
B_rec = SeqIO.read("AF323668.gbk", "gb")
C_full_rec = SeqIO.read("NC_003212.gbk", "gb")
C_rec = C_full_rec[2587879:2625807].reverse_complement(name=True)

# Look-up of the three records by their record name.
records = {rec.name: rec for rec in [A_rec, B_rec, C_rec]}

# The gene colors are hard coded for simplicity and to match the target image
# (one entry per gene, in feature order). In practice you might have an
# automatic mapping based on the gene annotation or some other classification.

A_colors = (
    [red] * 5 + [grey] * 7 + [orange] * 2 + [grey] * 2 + [orange]
    + [grey] * 11 + [green] * 4 + [grey] + [green] * 2 + [grey, green]
    + [brown] * 5 + [blue] * 4 + [lightblue] * 5 + [grey, lightblue]
    + [purple] * 2 + [grey]
)
B_colors = (
    [red] * 6 + [grey] * 8 + [orange] * 2 + [grey] + [orange]
    + [grey] * 21 + [green] * 5 + [grey] + [brown] * 4 + [blue] * 3
    + [lightblue] * 3 + [grey] * 5 + [purple] * 2
)
C_colors = (
    [grey] * 33 + [green] * 5 + [brown] * 4 + [blue] * 2
    + [grey, blue] + [lightblue] * 2 + [grey] * 8
)

# Here we hard code a list of cross-links with percentage identity scores, based
# on a manual inspection of the target image (there could be mistakes here).
# In practice you might generate the list of cross-mappings from a BLAST report
# or similar computational analysis.
#
# Each entry is a tuple of
#     (percentage identity score, gene identifier in the first genome,
#      gene identifier in the second genome)
# The identifiers are resolved with get_feature(), which by default matches
# them against the "locus_tag", "gene" and "old_locus_tag" qualifiers. The
# score is used further down to shade the link between white (0) and
# firebrick (100).

# Tuc2009 (NC_002703) vs bIL285 (AF323668)
A_vs_B = [
    (99, "Tuc2009_01", "int"),
    (33, "Tuc2009_03", "orf4"),
    (94, "Tuc2009_05", "orf6"),
    (100, "Tuc2009_06", "orf7"),
    (97, "Tuc2009_07", "orf8"),
    (98, "Tuc2009_08", "orf9"),
    (98, "Tuc2009_09", "orf10"),
    (100, "Tuc2009_10", "orf12"),
    (100, "Tuc2009_11", "orf13"),
    (94, "Tuc2009_12", "orf14"),
    (87, "Tuc2009_13", "orf15"),
    (94, "Tuc2009_14", "orf16"),
    (94, "Tuc2009_15", "orf17"),
    (88, "Tuc2009_17", "rusA"),
    (91, "Tuc2009_18", "orf20"),
    (93, "Tuc2009_19", "orf22"),
    (71, "Tuc2009_20", "orf23"),
    (51, "Tuc2009_22", "orf27"),
    (97, "Tuc2009_23", "orf28"),
    (88, "Tuc2009_24", "orf29"),
    (26, "Tuc2009_26", "orf38"),
    (19, "Tuc2009_46", "orf52"),
    (77, "Tuc2009_48", "orf54"),
    (91, "Tuc2009_49", "orf55"),
    (95, "Tuc2009_52", "orf60"),
]

# bIL285 (AF323668) vs Listeria innocua prophage 5 (in NC_003212)
B_vs_C = [
    (42, "orf39", "lin2581"),
    (31, "orf40", "lin2580"),
    (49, "orf41", "lin2579"),  # terL
    (54, "orf42", "lin2578"),  # portal
    (55, "orf43", "lin2577"),  # protease
    (33, "orf44", "lin2576"),  # mhp
    (51, "orf46", "lin2575"),
    (33, "orf47", "lin2574"),
    (40, "orf48", "lin2573"),
    (25, "orf49", "lin2572"),
    (50, "orf50", "lin2571"),
    (48, "orf51", "lin2570"),
    (24, "orf52", "lin2568"),
    (30, "orf53", "lin2567"),
    (28, "orf54", "lin2566"),
]


def get_feature(features, id, tags=("locus_tag", "gene", "old_locus_tag")):
    """Return the first feature that carries ``id`` under one of ``tags``.

    Args:
        features: iterable of objects with a dict-like ``qualifiers``
            attribute mapping a tag name to a list of values.
        id: the identifier to look for.
        tags: qualifier keys to inspect, in order; a key that is missing
            from a feature is simply skipped.

    Returns:
        The first feature (in ``features`` order) with a matching value.

    Raises:
        KeyError: if no feature carries ``id`` under any of the tags.
    """
    for candidate in features:
        qualifiers = candidate.qualifiers
        if any(value == id
               for tag in tags
               for value in qualifiers.get(tag, [])):
            return candidate
    raise KeyError(id)


gd_diagram = GenomeDiagram.Diagram(name)
feature_sets = {}

# The drawing has to be wide enough for the longest of the three records.
max_len = max(0, len(A_rec), len(B_rec), len(C_rec))

# Tracks 5 (top), 3 and 1 (bottom) are used for A, B and C. The empty tracks
# 2 and 4 in between add white space that emphasises the cross links and also
# make the tracks vertically more compressed.
for track_level, record in zip((5, 3, 1), (A_rec, B_rec, C_rec)):
    gd_track_for_features = gd_diagram.new_track(track_level,
                                                 name=record.name,
                                                 greytrack=True, height=0.5,
                                                 start=0, end=len(record))
    assert record.name not in feature_sets
    feature_sets[record.name] = gd_track_for_features.new_set()


def _add_link_box(feature_set, gene, color, border):
    """Add a strandless box covering ``gene`` to ``feature_set``; return the new feature."""
    span = FeatureLocation(gene.location.start, gene.location.end, strand=0)
    return feature_set.add_feature(SeqFeature(span), color=color, border=border)


# The dummy boxes that anchor each cross-link are added BEFORE the gene
# arrows, which ensures the genes are drawn on top of them.
border = colors.lightgrey
for X, Y, X_vs_Y in [("NC_002703", "AF323668", A_vs_B),
                     ("AF323668", "NC_003212", B_vs_C)]:
    features_X, features_Y = records[X].features, records[Y].features
    set_X, set_Y = feature_sets[X], feature_sets[Y]
    for score, x, y in X_vs_Y:
        # Shade from white (score 0) to firebrick (score 100).
        color = colors.linearlyInterpolatedColor(colors.white, colors.firebrick,
                                                 0, 100, score)
        F_x = _add_link_box(set_X, get_feature(features_X, x), color, border)
        F_y = _add_link_box(set_Y, get_feature(features_Y, y), color, border)
        gd_diagram.cross_track_links.append(CrossLink(F_x, F_y, color, border))


# Now the gene arrows themselves, numbered from 1 and coloured from the
# hard-coded per-record colour lists (grey when a list runs out).
for record, gene_colors in zip([A_rec, B_rec, C_rec],
                               [A_colors, B_colors, C_colors]):
    gd_feature_set = feature_sets[record.name]
    genes = [feature for feature in record.features if feature.type == "gene"]
    for i, feature in enumerate(genes):
        if i < len(gene_colors):
            g_color = gene_colors[i]
        else:
            print("Don't have color for %s gene %i" % (record.name, i))
            g_color = grey
        gd_feature_set.add_feature(feature, sigil="BIGARROW",
                                   color=g_color, label=True,
                                   name=str(i + 1),
                                   label_position="start",
                                   label_size=6, label_angle=0)

gd_diagram.draw(format="linear", pagesize='A4', fragments=1,
                start=0, end=max_len)
for extension, output_format in [(".pdf", "PDF"), (".eps", "EPS"), (".svg", "SVG")]:
    gd_diagram.write(name + extension, output_format)

In [15]:
# Palette of ten hex colour strings; the gene-drawing cell below picks its two
# alternating colours from it (entries 0 and 2).
# NOTE(review): the values look like matplotlib's default "tab10" cycle -
# confirm; matplotlib itself is not needed to use them.
new_colors = [
    '#1f77b4',
    '#ff7f0e',
    '#2ca02c',
    '#d62728',
    '#9467bd',
    '#8c564b',
    '#e377c2',
    '#7f7f7f',
    '#bcbd22',
    '#17becf',
]

In [39]:
from reportlab.lib import colors
from reportlab.lib.units import cm
from Bio.Graphics import GenomeDiagram
from Bio import SeqIO

wkdir="/home/minion/projects/genebanks/prokka_top10/"
# AAEN.gbk is a multi-record GenBank file, hence SeqIO.parse (not SeqIO.read).
records = list(SeqIO.parse(wkdir+"AAEN.gbk", "genbank"))
if not records:
    raise ValueError("No GenBank records found in " + wkdir + "AAEN.gbk")

# The diagram has to span the longest record. Using len(record) after the
# loop below would silently pick up whichever record happened to be parsed
# last and clip the drawing to that record's length.
longest_record_len = max(len(rec) for rec in records)

#Create a track, and a diagram
gd_diagram = GenomeDiagram.Diagram()
gd_track_for_features = gd_diagram.new_track(1, name="Annotated Features")
gd_feature_set = gd_track_for_features.new_set()

#Create the feature set and its feature objects.
# NOTE(review): the genes of every record are added to the same feature set
# with their own per-record coordinates, so genes from different records are
# drawn on top of each other - confirm this overlay is what is wanted.
for record in records:
    for feature in record.features:
        if feature.type != "gene":
            #Exclude this feature
            continue
        # Alternate between two palette entries so neighbours are distinguishable
        if len(gd_feature_set) % 2 == 0:
            color = new_colors[0]
        else:
            color = new_colors[2]
        gd_feature_set.add_feature(feature, sigil="ARROW",
                                   color=color, label=True,
                                   label_size = 8, label_angle=0)

gd_diagram.draw(format="circular", circular=True, pagesize=(21.0*cm,29.7*cm),
                start=0, end=longest_record_len, circle_core=0.7)

gd_diagram.write(wkdir+"AAEN.svg", "SVG")

In [83]:
# Re-read every record of the multi-record GenBank file and report how many
# features each one carries (one number per record, in file order).
records = [rec for rec in SeqIO.parse(wkdir+"AAEN.gbk", "genbank")]
for record in records:
    feature_count = len(record.features)
    print(feature_count)
# To inspect the first / last record instead:
#print(records[0].id)  # first record
#print(records[-1].id)  # last record

418
328
313
292
265
313
208
217
196
178
199
166
136
109
79
73
70
106
49
70
49
43
52
28
28
22
10
13
10
4
4
7
4
4
1
7
1
4
4
1
4
4
1
1
4
4
1
1
4
1
1
1
1
1
1
1
1
1
1
4
1
4
1
1
1
1
1
1
1
1
1
1
1
1
4
4
1
4
1
1
1
1
4
1
1
1


In [71]:
from Bio import SeqIO
# Dump the raw feature list of every record in the file, one list per record.
aaen_records = SeqIO.parse(wkdir+"AAEN.gbk", "genbank")
for record in aaen_records:
    record_features = record.features
    print(record_features)

[SeqFeature(FeatureLocation(ExactPosition(0), ExactPosition(130923), strand=1), type='source'), SeqFeature(FeatureLocation(ExactPosition(99), ExactPosition(876), strand=1), type='gene'), SeqFeature(FeatureLocation(ExactPosition(99), ExactPosition(876), strand=1), type='mRNA'), SeqFeature(FeatureLocation(ExactPosition(99), ExactPosition(876), strand=1), type='CDS'), SeqFeature(FeatureLocation(ExactPosition(977), ExactPosition(1607), strand=-1), type='gene'), SeqFeature(FeatureLocation(ExactPosition(977), ExactPosition(1607), strand=-1), type='mRNA'), SeqFeature(FeatureLocation(ExactPosition(977), ExactPosition(1607), strand=-1), type='CDS'), SeqFeature(FeatureLocation(ExactPosition(1609), ExactPosition(2473), strand=-1), type='gene'), SeqFeature(FeatureLocation(ExactPosition(1609), ExactPosition(2473), strand=-1), type='mRNA'), SeqFeature(FeatureLocation(ExactPosition(1609), ExactPosition(2473), strand=-1), type='CDS'), SeqFeature(FeatureLocation(ExactPosition(2871), ExactPosition(2964)

[SeqFeature(FeatureLocation(ExactPosition(0), ExactPosition(622), strand=1), type='source'), SeqFeature(FeatureLocation(ExactPosition(36), ExactPosition(567), strand=-1), type='gene'), SeqFeature(FeatureLocation(ExactPosition(36), ExactPosition(567), strand=-1), type='mRNA'), SeqFeature(FeatureLocation(ExactPosition(36), ExactPosition(567), strand=-1), type='CDS')]
[SeqFeature(FeatureLocation(ExactPosition(0), ExactPosition(574), strand=1), type='source')]
[SeqFeature(FeatureLocation(ExactPosition(0), ExactPosition(568), strand=1), type='source')]
[SeqFeature(FeatureLocation(ExactPosition(0), ExactPosition(507), strand=1), type='source')]
[SeqFeature(FeatureLocation(ExactPosition(0), ExactPosition(504), strand=1), type='source')]
[SeqFeature(FeatureLocation(ExactPosition(0), ExactPosition(503), strand=1), type='source')]
[SeqFeature(FeatureLocation(ExactPosition(0), ExactPosition(493), strand=1), type='source')]
[SeqFeature(FeatureLocation(ExactPosition(0), ExactPosition(483), strand=1