In [None]:
import matplotlib, re, os, glob
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from Bio import SeqIO
import seaborn as sns
import subprocess as sp
# apply seaborn's default theme with the 'notebook' plotting context
sns.set('notebook')
# IPython magic: render matplotlib figures inline in the notebook output
%matplotlib inline 

N.B. If the import cell above raises errors, check that all of the imported packages are installed (e.g. via `pip`).

### set up data, paths, functions

In [None]:
# --- user configuration: replace every "FILL_THIS_IN" before running ---

# path to concatenated protein file (read below as FASTA)
# record headers must be in prodigal format, i.e. match the regex
#   (\S+) # ([0-9]+) # ([0-9]+) # ([1-]+) .+
# (gene name, start, end, strand)
ppath = "FILL_THIS_IN"
# path to foci file - list of gene names of interest,
# one gene name per line (separated by \n)
fpath = "FILL_THIS_IN"
# output directory; "/protein_clustering/..." is appended to it below
outdir = "FILL_THIS_IN"
# path to protein clustering programs
# NOTE: absolute, site-specific path - adjust for your own system
pcpath = "/groups/banfield/users/meheurap/proteinClusteringPipeline/scripts/"
# path to pullseq
# NOTE: absolute, site-specific path - adjust for your own system
pspath = "/shared/software/bin/pullseq"

In [None]:
def cmdir(path):
    """Create the directory `path` if it does not already exist.

    Missing parent directories are created as well, so this also works
    when the output directory itself has not been created yet. An
    existing directory is left untouched.

    Raises:
        FileExistsError: if `path` exists but is not a directory.
    """
    # exist_ok avoids the check-then-create race of isdir() + mkdir()
    os.makedirs(path, exist_ok=True)

In [None]:
def scaffold(gene):
    """Return the scaffold (contig) part of a gene name.

    The scaffold is everything before the trailing ``_<digits>`` suffix,
    e.g. ``"contigA_12_3"`` -> ``"contigA_12"``.

    Args:
        gene: gene name; the literal string ``"None"`` is treated as
            "no gene".

    Returns:
        The scaffold name, or ``None`` when `gene` is ``"None"`` or does
        not end in ``_<digits>``. In the latter case the offending value
        is printed so it can be inspected.
    """
    if gene == "None":
        return None

    try:
        match = re.search("(.+?)_[0-9]+$", gene)
    except TypeError:
        # non-string input (e.g. None / NaN): treated like a non-matching name
        match = None

    if match is None:
        print(gene)
        return None
    return match.group(1)

### compute genomic context table

In [None]:
# build contig db for all proteins:
# scaffold name -> list of gene names, in the order they appear in the file
# (assumes the file lists genes in genomic order along each scaffold - TODO confirm)
contig_db = {}

# prodigal-style header: "<gene> # <start> # <end> # <strand> ..."
# raw string so that "\S" is a regex class and not an invalid string escape
header_pattern = re.compile(r"(\S+) # ([0-9]+) # ([0-9]+) # ([1-]+) .+")

# context manager so the file handle is closed once parsing is done
with open(ppath) as handle:
    for record in SeqIO.parse(handle, "fasta"):

        m = header_pattern.search(record.description)
        if m is None:
            # fail with a readable message instead of an AttributeError on None
            raise ValueError("header is not in prodigal format: " + record.description)

        gene = m.group(1)
        contig_db.setdefault(scaffold(gene), []).append(gene)

In [None]:
# retrieve strandedness information:
# gene name -> strand field of the header, kept as a string
strand_dict = {}

# context manager so the file handle is closed once parsing is done
with open(ppath, "r") as handle:
    for record in SeqIO.parse(handle, "fasta"):
        # header fields are "#"-separated: name is field 0, strand is field 3
        fields = record.description.split("#")
        strand_dict[fields[0].strip()] = fields[3].strip()

In [None]:
# define search radius (# ORFs) on either side of each focal gene
# must be replaced by an integer (e.g. 10): it is added to and subtracted
# from list indices when the neighbors are collected
radius = "FILL_THIS_IN"

In [None]:
# define list of focal genes (one gene name per line, blank lines ignored)
with open(fpath) as handle:
    foci = [line.strip() for line in handle if line.strip()]

# gene name -> {"position": offset from focus in ORFs, "focus": focal gene}
# NOTE: keyed by gene, so a gene within range of several foci is stored once,
# under the last focus processed
neighbors = {}
for focus in foci:
    # get genes assc with contig; skip foci absent from the protein file
    gene_array = contig_db.get(scaffold(focus))
    if gene_array is None or focus not in gene_array:
        continue

    # look the focus up once instead of on every iteration
    focus_idx = gene_array.index(focus)
    # define span, considering contig ends
    upper_bound = min(len(gene_array) - 1, focus_idx + radius)
    lower_bound = max(0, focus_idx - radius)

    # save results for downstream analysis
    for i in range(lower_bound, upper_bound + 1):
        # don't include focus
        if i != focus_idx:
            neighbors[gene_array[i]] = {"position": i - focus_idx, "focus": focus}

# create genomic context table with above results
# explicit columns keep the table well-formed even when no neighbors were found
neighbor_df = pd.DataFrame(
    [{"gene": gene, "position": info["position"], "focus": info["focus"]}
     for gene, info in neighbors.items()],
    columns=["gene", "position", "focus"],
)

In [None]:
# adjust position by strand orientation
# strands are looked up by name; an unknown gene raises KeyError
neighbor_df["gene_strand"] = [strand_dict[gene] for gene in neighbor_df["gene"]]
neighbor_df["focus_strand"] = [strand_dict[focus] for focus in neighbor_df["focus"]]
# flip the sign of the offset for genes whose strand field is -1
# (vectorised; strands are stored as strings, hence the cast)
# NOTE(review): the offset is flipped by the neighbor's strand, not the
# focus's - confirm this is intended if the same-strand filter used for
# plotting is ever removed
neighbor_df["adj_position"] = (
    neighbor_df["position"].astype(int) * neighbor_df["gene_strand"].astype(int)
)
neighbor_df.head()

### run clustering pipeline on proximal proteins

For each of the following steps, the Python code constructs a bash command
and prints it, so that it can be copied and pasted into your terminal session and executed there.

In [None]:
# dump the neighbor gene names (one per line) and build the pullseq
# command that extracts those sequences from the full protein file
cmdir(outdir + "/protein_clustering/")

with open(outdir + "/protein_clustering/neighbors.txt", "w") as outfile:
    outfile.writelines(gene + "\n" for gene in neighbor_df["gene"])

call = "{} -n {}/protein_clustering/neighbors.txt -i {} > {}/protein_clustering/neighbors.faa".format(
    pspath, outdir, ppath, outdir)
#sp.call(call, shell=True)
print(call)

In [None]:
# step 1: subfamily clustering on the neighbor proteins
call = " ".join([
    pcpath + "/subfamilies.py",
    "--output-directory",
    outdir + "/protein_clustering/output/",
    "--cpu 16",
    outdir + "/protein_clustering/neighbors.faa",
])
#sp.call(call, shell=True)
print(call)

In [None]:
# step 2: hmm-hmm comparison, then MCL clustering, to generate families
# both programs read the config written to the clustering output directory
config_json = outdir + "/protein_clustering/output/config.json"

call1 = "{}/hhblits.py --cpu 16 {}".format(pcpath, config_json)
print(call1)
#sp.call(call1, shell=True)

#N.B see program docs for info on --coverage parameter
call2 = "{}/runningMclClustering.py --coverage 0.50 --fasta --cpu 16 {}".format(pcpath, config_json)
#sp.call(call2, shell=True)
print(call2)

### integrate and plot

In [None]:
# integrate with family assignments from the clustering output:
# ORF name (column 1) -> family (column 2) of orf2family.tsv
fams = {}

with open(outdir + "/protein_clustering/output/orf2family.tsv") as handle:
    # skip header line (no-op on an empty file)
    next(handle, None)
    for line in handle:
        splt = line.strip().split("\t")
        # ignore blank / truncated lines instead of failing on splt[1]
        if len(splt) < 2:
            continue
        fams[splt[0]] = splt[1]

# map families to neighbors; genes without an assignment get the string "None"
neighbor_df["fam"] = neighbor_df["gene"].map(lambda gene: fams.get(gene, "None"))
# add metadata about your foci here, if you want to facet by clade, organism, etc
neighbor_df["type"] = "foci_set_1"

In [None]:
# how many families to display? eg, 20, 15
# must be replaced by an integer: it is used in arithmetic and as a
# range bound by the plotting cell
families_to_display = "FILL_THIS_IN"

In [None]:
# one bubble plot per foci "type": family (y) vs. strand-adjusted position (x),
# bubble size = number of neighbors of that family at that position
for item in neighbor_df["type"].unique():

    # subset table to the current type and stipulate same strand as the focus
    # if agnostic to strand orientation, remove second part of conditional
    phy_df = neighbor_df[(neighbor_df["type"] == item) & (neighbor_df["gene_strand"] == neighbor_df["focus_strand"])]

    # subset most common families to reduce plotting complexity;
    # unassigned genes ("None") are dropped before taking the top N so that
    # exactly `families_to_display` families are kept at most
    fam_counts = phy_df.loc[phy_df["fam"] != "None", "fam"].value_counts()
    top_families = list(fam_counts.head(families_to_display).index)

    # count neighbors per (family, position) for the selected families
    ng = (
        phy_df[phy_df["fam"].isin(top_families)]
        .groupby(["fam", "adj_position"])
        .size()
        .reset_index(name="count")
    )
    # restrict radius
    ng = ng[ng["adj_position"].abs() <= radius]

    # nothing to draw for this type
    if ng.empty:
        print("no families to plot for type: " + str(item))
        continue

    # finally, plot
    sns.set(font_scale=1)
    sns.set_style("white", {"axes.edgecolor": "0.8"})
    kws = dict(linewidth=.5, edgecolor="black")
    # x / y passed by keyword: recent seaborn versions reject them positionally
    g = sns.relplot(x="adj_position", y="fam", data=ng, size="count", hue="fam",
        alpha=1, aspect=2, sizes=(50,500), legend="brief", **kws)
    ax = g.ax

    # add matrix lines: one horizontal line per plotted family ...
    for row in range(ng["fam"].nunique()):
        ax.axhline(row, color='grey', linestyle='-', lw=0.5, zorder=0)
    # ... one vertical line per position within the radius ...
    for col in range(-radius, radius + 1):
        ax.axvline(col, color='grey', linestyle='-', lw=0.5, zorder=0)
    # ... and a bold line marking the focal gene at position 0
    ax.axvline(0, color='black', linestyle='-', lw=2, zorder=1)

    g.set(xticks=list(range(-radius, radius + 1, 2)))
    ax.set_title(str(item))
    ax.set_ylabel("")
    ax.set_xlabel("relative gene position")
    sns.despine(left=False, bottom=True, top=True, right=False)
    plt.show()