In [0]:
import sys
import os
import pandas as pd
import numpy as np
import scipy as sp
import shutil
import matplotlib.pyplot as plt
import seaborn as sns
import Bio
from Bio.Blast.Applications import NcbiblastxCommandline
from IPython.display import display
from utils import read_df, save_df
from Bio import SeqIO

In [0]:
# Print kernel/OS details of the host running this notebook (provenance record).
!uname -a

In [0]:
# Protein FASTA files for the two reference species. Each path is passed as the
# `db` argument of a blastx command further down, so a BLAST protein database
# must already exist alongside each file.
bombyx = "/gpfs_fs/home/eckertlab/genomes/Bombyx_mori/protein/protein.fa"
plutella = "/gpfs_fs/home/eckertlab/genomes/Plutella_xylostella/protein/protein.fa"

## Create BLAST databases

```bash
~/g/src/ncbi-blast-2.2.30+/bin/makeblastdb -in protein.fa -dbtype prot
```

In [0]:
# Path to the blastx executable used to build the search commands.
blastx = "/home/cfriedline/g/src/ncbi-blast-2.2.30+/bin/blastx"
# Assembled contigs: used as the blastx query and as the source of contig sequences.
assembly = "/home/cfriedline/eckertlab/projects/gypsy_moth/assemblies/masurca3/CA/10-gapclose/genome.ctg.fasta"
# Root directory for this analysis; the blast/ output folder, blast.sh and the
# beagle40 inputs are all resolved relative to it.
analysis_dir = "/home/cfriedline/eckertlab/gypsy_indiv/raw_demult/analysis/samtools1.3_masurca3/"

In [0]:
# Tabular (format 6) output specification handed to blastx. The single quotes are
# part of the value so the field list stays one shell argument when the command
# is written to a script.
# NOTE(review): `qcov` is not among the documented coverage specifiers I know of
# (`qcovs`, `qcovhsp`) -- confirm it yields a column, otherwise the derived header
# will be one field longer than the data.
outfmt = "'6 qseqid sseqid sgi sacc pident length mismatch gapopen qstart qend sstart send evalue bitscore qcov qcovhsp qlen slen ppos'"

In [0]:
# Build one blastx command per reference proteome. The commands are only
# constructed here (collected in `clines`), not executed.

# The output directory is the same for every database, so resolve and create it
# once. makedirs also creates missing parent directories, and raises if the path
# exists but is not a directory.
outdir = os.path.join(analysis_dir, "blast")
if not os.path.isdir(outdir):
    os.makedirs(outdir)

# Fixed (label, database) pairs; a tuple keeps the command order stable on every
# Python version.
blast_dbs = (("bombyx", bombyx), ("plutella", plutella))

outfiles = []  # result file per database, in the same order as `clines`
clines = []    # blastx command-line objects, one per database
for org, db in blast_dbs:
    out = os.path.join(outdir, "%s.txt" % org)
    outfiles.append(out)
    cline = NcbiblastxCommandline(
        cmd=blastx,
        out=out,
        db=db,
        query=assembly,
        outfmt=outfmt,
        max_target_seqs=5,
        num_threads=20,
        evalue=1e-5,
    )
    clines.append(cline)

In [0]:
# Dump the prepared blastx commands, one per line, into blast.sh inside the
# analysis directory so they can be launched outside the notebook.
blast_script = os.path.join(analysis_dir, "blast.sh")
with open(blast_script, "w") as script:
    script.writelines("%s\n" % command for command in clines)

```bash
~/bin/parallel --no-notice --bar -a blast.sh
```

In [0]:
# Column names for the BLAST tables, derived from the same spec passed to blastx:
# drop the wrapping quote characters, split on whitespace, then discard the
# leading format number.
outfmt_fields = outfmt[1:-1].split()
header = outfmt_fields[1:]

In [0]:
# Load the `z12_swapped` table from the beagle40 sub-directory of the analysis
# directory using the project helper `read_df`.
# NOTE(review): on-disk format and schema are determined by utils.read_df (not
# shown here); downstream code treats all but the last two columns as SNP names.
z12_dir = os.path.join(analysis_dir, "beagle40")
z12_swapped = read_df(z12_dir, "z12_swapped")

In [0]:
# Every column of z12_swapped except the last two is taken as a SNP name.
# NOTE(review): assumes the two trailing columns are not SNPs -- confirm against
# how z12_swapped is produced.
snp_names = set(z12_swapped.columns[:-2])

# Strip the final "_"-separated token from each SNP name, then de-duplicate and
# sort so the resulting list has a reproducible order. (The former bare
# `len(snp_names)` in the middle of this cell was never displayed and was removed.)
snp_name_only = sorted({"_".join(name.split("_")[:-1]) for name in snp_names})

In [0]:
# Directory containing the assembly FASTA, and the path of the FASTA that will
# hold only the SNP-bearing contigs (placed alongside the assembly).
assembly_dir = os.path.dirname(assembly)
contigs_with_snps = os.path.join(assembly_dir, "contigs_with_snps.fa")
# Dictionary-like access to assembly records keyed by FASTA id.
# NOTE(review): Bio.SeqIO.index is documented as parsing records on demand rather
# than loading the whole file -- confirm if memory use matters here.
seq_index = SeqIO.index(assembly, "fasta")

In [0]:
# Number of distinct SNP-name prefixes (the same prefix is stored as `contig` below).
len(snp_name_only)

In [0]:
# Per-SNP lookup table indexed by the full SNP name, with columns:
#   contig -- the name with its last "_"-separated token removed
#   loc    -- that last token parsed as an integer
# Rows are built from sorted(snp_names): iterating the raw set gives an order
# that can change between kernel sessions (string hash randomization), which
# would make the table and any file written from it non-reproducible.
snp_df = pd.DataFrame({"snp": sorted(snp_names)})
snp_df["loc"] = snp_df["snp"].apply(lambda name: int(name.split("_")[-1]))
snp_df["contig"] = snp_df["snp"].apply(lambda name: "_".join(name.split("_")[:-1]))
# Move the SNP name into the index and fix the column order explicitly.
snp_df = snp_df.set_index("snp")[["contig", "loc"]]

In [0]:
# Write the SNP table as tab-separated text with a header row.
# NOTE(review): index=False omits the SNP-name index, so only the data columns
# are written; the relative path resolves against the kernel's working
# directory, not the assembly or analysis directory -- confirm both are intended.
snp_df.to_csv("contigs_with_snps.txt", index=False, header=True, sep="\t")

In [0]:
# Display the destination path of the SNP-contig FASTA as a sanity check.
contigs_with_snps

In [0]:
# Export the sequence of every SNP-bearing contig to a single FASTA file.
# Records are fetched from the assembly index lazily, in `snp_name_only` order,
# and streamed to the file through one SeqIO.write call.
with open(contigs_with_snps, "w") as fasta_out:
    snp_contig_records = (seq_index[contig_id] for contig_id in snp_name_only)
    SeqIO.write(snp_contig_records, fasta_out, "fasta")