In [1]:
import argparse
import logging
import os
import sys

gitpath=os.path.expanduser("~/git/mapseq-processing")
sys.path.append(gitpath)

gitpath=os.path.expanduser("~/git/cshlwork")
sys.path.append(gitpath)

from configparser import ConfigParser

import pandas as pd

from cshlwork.utils import JobRunner, JobStack, JobSet
# for processing initial fastq
from mapseq.core import get_default_config, load_sample_info, load_barcodes, process_fastq_pair, make_summaries
# for processing barcode-specific fasta files
from mapseq.core import make_counts_df, do_threshold, filter_barcodes, write_fasta_for_bowtie
# for bowtie
from alignment.bowtie import run_bowtie, make_bowtie_df, matrix_df_from_btdf

In [2]:
# Setup
cp = get_default_config()
sampleinfo = os.path.expanduser('~/project/mapseq/M205test/Mseq205_sampleinfo.xlsx')
barcodes = os.path.expanduser('~/project/mapseq/M205test/barcode_v2.txt')
outdir = os.path.expanduser('~/project/mapseq/M205testout')
infiles = [
    os.path.expanduser('~/project/mapseq/M205test/M205_HZ_S1_R1_001.fastq.gz'),
    os.path.expanduser('~/project/mapseq/M205test/M205_HZ_S1_R2_001.fastq.gz')
          ]
#logging.getLogger().setLevel(logging.DEBUG)   
logging.getLogger().setLevel(logging.INFO)   

In [3]:
sampdf = load_sample_info(cp, sampleinfo)
sampdf

Unnamed: 0,usertube,ourtube,samplename,siteinfo,rtprimer,brain,col_num
1,OB,1.0,Olfactory Bulb,,1.0,YW143,1.0
2,ACB,2.0,ACB,,2.0,YW143,2.0
3,AI,3.0,AI,,3.0,YW143,3.0
4,CP,4.0,CP (dorsal part - can just dissect out the top...,,4.0,YW143,4.0
5,MTN,5.0,MTN,,5.0,YW143,5.0
6,BLAa,6.0,BLAa,,6.0,YW143,6.0
7,PIR,7.0,Piriform Cortex,,7.0,YW143,7.0
8,VTA,8.0,VTA,,8.0,YW143,8.0
9,TeA,9.0,TeA,,9.0,YW143,9.0
10,ENTl,10.0,ENTl,,10.0,YW143,10.0


In [4]:
# extract list of actually used primers.
rtlist = list(sampdf.rtprimer.dropna())
len(rtlist)

26

In [5]:
# make barcode handler objects
bcolist = load_barcodes(cp, barcodes, labels=rtlist, outdir=outdir)
len(bcolist)

26

In [6]:
# handle all the input. usually takes ~25 minutes
bcnum = "6"
bc1file = os.path.expanduser(f'~/project/mapseq/M205testout/BC{bcnum}.fasta')
if not ( os.path.exists(bc1file) and os.path.getsize(bc1file) > 1 ) :
    process_fastq_pair(cp, infiles[0], infiles[1], bcolist, outdir=outdir)
else:
    print(f'Non-empty BC{bcnum}.fasta exists. Not recalculating...\n')
!ls ~/project/mapseq/M205testout/

2023-01-26 13:41:19,003 (UTC) [ INFO ] core.py:311 root.process_fastq_pair(): handled 1000000 reads. matched=12455 unmatched=987545
2023-01-26 13:42:46,493 (UTC) [ INFO ] core.py:311 root.process_fastq_pair(): handled 2000000 reads. matched=24883 unmatched=1975117
2023-01-26 13:44:15,788 (UTC) [ INFO ] core.py:311 root.process_fastq_pair(): handled 3000000 reads. matched=37377 unmatched=2962623
2023-01-26 13:45:46,344 (UTC) [ INFO ] core.py:311 root.process_fastq_pair(): handled 4000000 reads. matched=49790 unmatched=3950210
2023-01-26 13:47:13,372 (UTC) [ INFO ] core.py:311 root.process_fastq_pair(): handled 5000000 reads. matched=62201 unmatched=4937799
2023-01-26 13:48:39,224 (UTC) [ INFO ] core.py:311 root.process_fastq_pair(): handled 6000000 reads. matched=74843 unmatched=5925157
2023-01-26 13:50:03,961 (UTC) [ INFO ] core.py:311 root.process_fastq_pair(): handled 7000000 reads. matched=87214 unmatched=6912786
2023-01-26 13:51:27,934 (UTC) [ INFO ] core.py:311 root.process_fastq_

BC1.bc.seq.bowtie BC16.fasta        BC22.fasta        BC6.bc.seq.bowtie
BC1.bc.seq.fasta  BC17.fasta        BC23.fasta        BC6.bc.seq.fasta
BC1.counts.tsv    BC18.fasta        BC24.fasta        BC6.counts.tsv
BC1.fasta         BC19.fasta        BC25.fasta        BC6.fasta
BC10.fasta        BC2.bc.seq.bowtie BC26.fasta        BC7.fasta
BC11.fasta        BC2.bc.seq.fasta  BC3.bc.seq.fasta  BC8.fasta
BC12.fasta        BC2.counts.tsv    BC3.counts.tsv    BC9.fasta
BC13.fasta        BC2.fasta         BC3.fasta         [34mindexes[m[m
BC14.fasta        BC20.fasta        BC4.fasta         unmatched.fasta
BC15.fasta        BC21.fasta        BC5.fasta


In [133]:
#process_bcfasta(cp, bc1file)
filepath = os.path.abspath(bc1file)    
dirname = os.path.dirname(filepath)
filename = os.path.basename(filepath)
(base, ext) = os.path.splitext(filename)
logging.getLogger().setLevel(logging.INFO)   
cdf = make_counts_df(cp, bc1file)
of = os.path.join(dirname , f'{base}.counts.tsv')
cdf.to_csv(of, sep='\t') 
!ls ~/project/mapseq/M205testout/BC*
cdf

2022-12-13 16:54:34,190 (UTC) [ INFO ] core.py:117 root.make_counts_df(): kept 26658 non-'N' sequences out of 26658


/Users/jhover/project/mapseq/M205testout/BC1.bc.seq.bowtie
/Users/jhover/project/mapseq/M205testout/BC1.bc.seq.fasta
/Users/jhover/project/mapseq/M205testout/BC1.counts.tsv
/Users/jhover/project/mapseq/M205testout/BC1.fasta
/Users/jhover/project/mapseq/M205testout/BC10.fasta
/Users/jhover/project/mapseq/M205testout/BC11.fasta
/Users/jhover/project/mapseq/M205testout/BC12.fasta
/Users/jhover/project/mapseq/M205testout/BC13.fasta
/Users/jhover/project/mapseq/M205testout/BC14.fasta
/Users/jhover/project/mapseq/M205testout/BC15.fasta
/Users/jhover/project/mapseq/M205testout/BC16.fasta
/Users/jhover/project/mapseq/M205testout/BC17.fasta
/Users/jhover/project/mapseq/M205testout/BC18.fasta
/Users/jhover/project/mapseq/M205testout/BC19.fasta
/Users/jhover/project/mapseq/M205testout/BC2.bc.seq.bowtie
/Users/jhover/project/mapseq/M205testout/BC2.bc.seq.fasta
/Users/jhover/project/mapseq/M205testout/BC2.counts.tsv
/Users/jhover/project/mapseq/M205testout/BC2.fasta
/Users/jhover/

Unnamed: 0,sequence,counts
0,CAATGTGGACGGTAAATTGGTTTTATGGTGCCCAATGTGGACGG,3988
1,TATTTATGGTCAGATGTGCATGGTTCACGTTTTATTTATGGTCA,702
2,AGGGAATCCCTAAGTCTTTATGGTCAAGTTTCAGGGAATCCCTA,649
3,ATTTATGGTTATTTTTATGTGTATTTTCTTTTATTTATGGTTAT,649
4,CTTTATGGTTCAATCTGAACCCTCCCAGCTTTCTTTATGGTTCA,455
...,...,...
985,CTCGTCGCTGCGTTGAGTCCTGCGTTTATGGTCTCGTCGCTGCG,1
986,TTCATCGTCACGTTTATGGTGAACAGTGGATTTTCATCGTCACG,1
987,GTTTATGGTTTGGGGTTCACCTCCCGGTTCTTGTTTATGGTTTG,1
988,TTGTCATATGCTCGCTCGAGAATTTATGGTGCTTGTCATATGCT,1


In [134]:
# 
tdf = do_threshold(cp, cdf)
tdf

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sequence'] = df.sequence.str[:spend]


Unnamed: 0,sequence,counts
0,CAATGTGGACGGTAAATTGGTTTTATGGTGCC,3988
1,TATTTATGGTCAGATGTGCATGGTTCACGTTT,702
2,AGGGAATCCCTAAGTCTTTATGGTCAAGTTTC,649
3,ATTTATGGTTATTTTTATGTGTATTTTCTTTT,649
4,CTTTATGGTTCAATCTGAACCCTCCCAGCTTT,455
...,...,...
416,GAATCCTTCTGTTTATGGTTTAATATTTACTC,3
417,TATGGTAGGGAGTGGTTGGATGCTCCACCATT,3
418,TTCTTGTGGGCTTCAATTGGACATTTATGGTT,3
419,TTGCATGAATGTTTATGGTGTTTATGCTTTTC,3


In [135]:
bcdf = filter_barcodes(cp, tdf)
bcdf

Unnamed: 0,sequence,counts
0,CAATGTGGACGGTAAATTGGTTTTATGGTGCC,3988
1,TATTTATGGTCAGATGTGCATGGTTCACGTTT,702
2,AGGGAATCCCTAAGTCTTTATGGTCAAGTTTC,649
3,ATTTATGGTTATTTTTATGTGTATTTTCTTTT,649
4,CTTTATGGTTCAATCTGAACCCTCCCAGCTTT,455
...,...,...
416,GAATCCTTCTGTTTATGGTTTAATATTTACTC,3
417,TATGGTAGGGAGTGGTTGGATGCTCCACCATT,3
418,TTCTTGTGGGCTTCAATTGGACATTTATGGTT,3
419,TTGCATGAATGTTTATGGTGTTTATGCTTTTC,3


In [139]:
bctool = cp.get('bcfasta','tool')
of = os.path.join(dirname , f'{base}.bc.seq.fasta')
seqfasta = write_fasta_for_bowtie(cp, bcdf, outfile=of)
seqfasta

2022-12-13 16:59:55,145 (UTC) [ INFO ] utils.py:615 root.dataframe_to_seqlist(): made list of 216 SeqRecords


'/Users/jhover/project/mapseq/M205testout/BC6.bc.seq.fasta'

In [140]:
# run bowtie on sequence set. 
of = os.path.join(dirname , f'{base}.bc.seq.{bctool}')
afile = run_bowtie(cp, seqfasta, of, tool=bctool )
afile

2022-12-13 16:59:59,747 (UTC) [ INFO ] bowtie.py:35 root.run_bowtie(): running allxall bowtie on /Users/jhover/project/mapseq/M205testout/BC6.bc.seq.fasta -> /Users/jhover/project/mapseq/M205testout/BC6.bc.seq.bowtie
2022-12-13 16:59:59,915 (UTC) [ INFO ] bowtie.py:67 root.run_bowtie(): bowtie-build done.
2022-12-13 17:00:00,117 (UTC) [ INFO ] bowtie.py:103 root.run_bowtie(): bowtie done.


'/Users/jhover/project/mapseq/M205testout/BC6.bc.seq.bowtie'

In [141]:
btdf = make_bowtie_df(afile)
btdf

Unnamed: 0,name_read,strand,name_align,offset,seq,quals,ceil,mm_desc
0,0,+,0,0,CAATGTGGACGGTAAATTGGTTTTATGGTGCC,IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII,0,
1,0,+,282,0,CAATGTGGACGGTAAATTGGTTTTATGGTGCC,IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII,0,29:A>G
2,0,+,299,0,CAATGTGGACGGTAAATTGGTTTTATGGTGCC,IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII,0,20:G>T
3,0,+,399,0,CAATGTGGACGGTAAATTGGTTTTATGGTGCC,IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII,0,7:T>G
4,0,+,398,0,CAATGTGGACGGTAAATTGGTTTTATGGTGCC,IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII,0,7:A>G
...,...,...,...,...,...,...,...,...
235,416,+,416,0,GAATCCTTCTGTTTATGGTTTAATATTTACTC,IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII,0,
236,417,+,417,0,TATGGTAGGGAGTGGTTGGATGCTCCACCATT,IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII,0,
237,418,+,418,0,TTCTTGTGGGCTTCAATTGGACATTTATGGTT,IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII,0,
238,419,+,419,0,TTGCATGAATGTTTATGGTGTTTATGCTTTTC,IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII,0,


In [145]:
labels = np.unique(btdf[['name_read','name_align']])
labels
sdf = btdf.filter( ['name_read','name_align'], axis=1   )
sdf['val'] = True
sdf
mdf = sdf.pivot(index = 'name_read', 
                columns='name_align', 
                values='val').reindex(columns=labels, index=labels, fill_value=False)
mdf
mdf.sum()


name_align
0      5
1      1
2      1
3      2
4      1
      ..
416    1
417    1
418    1
419    1
420    1
Length: 216, dtype: object