# Process fastq files

## Build a reference
Download reference files used by Cell Ranger from: https://support.10xgenomics.com/single-cell-gene-expression/software/downloads/latest

Build the reference:

In [None]:
# specify directories/files
!ref_dir="/Users/sara/Documents/Tools/kallisto-bustools/mouse-ref"
!dna_fa="${ref_dir}/fasta/genome.fa"
!gtf="${ref_dir}/genes/genes.gtf"

# run kb ref
!kb ref -i ${ref_dir}transcriptome.idx -g ${ref_dir}t2g.txt -f1 ${ref_dir}cdna.fa $dna_fa $gtf

## Check 10x chemistry from fastq files

Download 10x barcodes whitelists from: https://github.com/10XGenomics/cellranger/tree/master/lib/python/cellranger/barcodes

Check 10x chemistry using one of the fastq files, assuming that all fastq files were generated with the same chemistry

In [179]:
# Directory holding the fastq files (trailing slash included); the same value
# is also published as the `fastq_dir` environment variable.
from os import environ

fastq_dir = '/Users/sara/Documents/Data/kaitilin/fastq/'
environ.update({'fastq_dir': fastq_dir})

In [93]:
# get barcodes list from R1 file (assuming barcode length of 16 bases) 
!gzcat /Users/sara/Documents/Data/kaitilin/fastq/C3-3_S1_L001_R1_001.fastq.gz | grep "@" -A 1 | grep -v "@" | grep -v "\-\-" | cut -c 1-16 > /Users/sara/Documents/Data/kaitilin/fastq/R1_bc

In [198]:
# Take the first R1 fastq file of the list, instead of inputting a specific file name - NOT WORKING:
# each `!` line runs in its own subshell, so the shell variable `ff` assigned on one line
# no longer exists when the next `!` line runs.
# !ff=$(ls $fastq_dir*R1*gz | head -n 1)

# get barcodes list from R1 file (assuming barcode length of 16 bases)
# !gzcat $ff | grep "@" -A 1 | grep -v "@" | grep -v "\-\-" | cut -c 1-16 > ${ff}R1_bc

In [102]:
# Load the extracted R1 barcodes and the three 10x whitelists; every file is
# read without a header row, so its single column is labelled 0.
import pandas as pd

fold = [fastq_dir, "R1_bc"]
fpath = fold[0] + fold[1]

r1bc, wht_v3, wht_v2, wht_v1 = (
    pd.read_csv(path, header=None)
    for path in (
        fpath,
        '/Path/to/whitelists/3M-february-2018.txt',
        '/Path/to/whitelists/737K-august-2016.txt',
        '/Path/to/whitelists/737K-april-2014_rc.txt',
    )
)

# Inner-merge the R1 barcodes with each whitelist on column 0: each result
# holds only the R1 barcodes that are present in that chemistry's whitelist.
v3, v2, v1 = (
    r1bc.merge(whitelist, how='inner', on=[0])
    for whitelist in (wht_v3, wht_v2, wht_v1)
)

In [113]:
# largest overlap (number of matching barcodes) across the three whitelists
maxf = max(overlap.size for overlap in (v3, v2, v1))

In [114]:
# identify 10x chemistry: the whitelist with the largest barcode overlap wins
# (ties resolve in the order V3, V2, V1)
if maxf == 0:
    # No barcode matched any whitelist, so there is no evidence for any
    # chemistry; stop instead of silently reporting V3.
    raise ValueError("No R1 barcodes overlap any 10x whitelist; cannot identify the 10x chemistry")

if v3.size == maxf:
    print("10X chemistry is V3")
    tec = "10xv3"
elif v2.size == maxf:
    print("10X chemistry is V2")
    tec = "10xv2"
else:
    print("10X chemistry is V1")
    tec = "10xv1"

10X chemistry is V3


## Check species

Check species with FastQ Screen to align fastq files to the correct genome

Download and install FastQ Screen: https://stevenwingett.github.io/FastQ-Screen/

Obtain pre-built Bowtie2 indices of commonly used reference genomes for FastQ Screen:

In [None]:
# download the pre-built Bowtie2 indices of commonly used reference genomes for FastQ Screen
!fastq_screen --get_genomes

Use the configuration file produced by the previous command; it lists the correct paths to the newly downloaded reference genomes and is located at ./FastQ-Screen-0.15.2/FastQ_Screen_Genomes/fastq_screen.conf

In [None]:
# run FastQ Screen
!fastq_screen /Users/sara/Documents/Data/kaitilin/fastq/C1-1_S1_L001_R2_001.fastq.gz -conf /Users/sara/Documents/Tools/fastq-screen/FastQ-Screen-0.15.2/FastQ_Screen_Genomes/fastq_screen.conf

In [58]:
# load the FastQ Screen report (tab-separated, first line skipped)
screen_report = 'C3-3_S1_L001_R2_001_screen.txt'
fsout = pd.read_csv(screen_report, engine='python', skiprows=1, sep='\t')

In [None]:
# read FastQ Screen output without hard-coding the sample name:
# use the first *_screen.txt report found in the working directory
# (`$ff` is shell syntax and is not valid inside a Python expression)
from glob import glob

screen_reports = sorted(glob('*_screen.txt'))
if not screen_reports:
    raise FileNotFoundError("No FastQ Screen report (*_screen.txt) found in the working directory")
fsout = pd.read_csv(screen_reports[0], sep='\t', engine='python', skiprows=1)

In [80]:
# locate the row whose %Unmapped value is the smallest
unmapped = fsout.loc[:, "%Unmapped"]
min_index = unmapped.idxmin()

In [118]:
# the species is the lower-cased genome name on the row with the lowest %Unmapped
genome = fsout["Genome"]
species = genome.loc[min_index].lower()
print(f"Species is {species}")

Species is mouse


## Generate a raw count matrix

In [None]:
# run kb count
!kb count -i /Users/sara/Documents/Tools/kallisto-bustools/"$species"-ref/"$species"-reftranscriptome.idx -g /Users/sara/Documents/Tools/kallisto-bustools/"$species"-ref/"$species"-reft2g.txt -x $tec -o /Users/sara/Documents/Data/kaitilin/C1_kbcount_output \
/Users/sara/Documents/Data/kaitilin/fastq/C1-1_S1_L001_R1_001.fastq.gz \
/Users/sara/Documents/Data/kaitilin/fastq/C1-1_S1_L001_R2_001.fastq.gz

!kb count -i /Users/sara/Documents/Tools/kallisto-bustools/"$species"-ref/"$species"-reftranscriptome.idx -g /Users/sara/Documents/Tools/kallisto-bustools/"$species"-ref/"$species"-reft2g.txt -x $tec -o C2_kbcount_output \
/Users/sara/Documents/Data/kaitilin/fastq/C2-2_S1_L001_R1_001.fastq.gz \
/Users/sara/Documents/Data/kaitilin/fastq/C2-2_S1_L001_R2_001.fastq.gz

!kb count -i /Users/sara/Documents/Tools/kallisto-bustools/"$species"-ref/"$species"-reftranscriptome.idx -g /Users/sara/Documents/Tools/kallisto-bustools/"$species"-ref/"$species"-reft2g.txt -x $tec -o C3_kbcount_output \
/Users/sara/Documents/Data/kaitilin/fastq/C3-3_S1_L001_R1_001.fastq.gz \
/Users/sara/Documents/Data/kaitilin/fastq/C3-3_S1_L001_R2_001.fastq.gz

!kb count -i /Users/sara/Documents/Tools/kallisto-bustools/"$species"-ref/"$species"-reftranscriptome.idx -g /Users/sara/Documents/Tools/kallisto-bustools/"$species"-ref/"$species"-reft2g.txt -x $tec -o D1_kbcount_output \
/Users/sara/Documents/Data/kaitilin/fastq/D1-1_S1_L001_R1_001.fastq.gz \
/Users/sara/Documents/Data/kaitilin/fastq/D1-1_S1_L001_R2_001.fastq.gz

!kb count -i /Users/sara/Documents/Tools/kallisto-bustools/"$species"-ref/"$species"-reftranscriptome.idx -g /Users/sara/Documents/Tools/kallisto-bustools/"$species"-ref/"$species"-reft2g.txt -x $tec -o D2_kbcount_output \
/Users/sara/Documents/Data/kaitilin/fastq/D2-2_S1_L001_R1_001.fastq.gz \
/Users/sara/Documents/Data/kaitilin/fastq/D2-2_S1_L001_R2_001.fastq.gz

!kb count -i /Users/sara/Documents/Tools/kallisto-bustools/"$species"-ref/"$species"-reftranscriptome.idx -g /Users/sara/Documents/Tools/kallisto-bustools/"$species"-ref/"$species"-reft2g.txt -x $tec -o D3_kbcount_output \
/Users/sara/Documents/Data/kaitilin/fastq/D3-3_S1_L001_R1_001.fastq.gz \
/Users/sara/Documents/Data/kaitilin/fastq/D3-3_S1_L001_R2_001.fastq.gz

!kb count -i /Users/sara/Documents/Tools/kallisto-bustools/"$species"-ref/"$species"-reftranscriptome.idx -g /Users/sara/Documents/Tools/kallisto-bustools/"$species"-ref/"$species"-reft2g.txt -x $tec -o P1_kbcount_output \
/Users/sara/Documents/Data/kaitilin/fastq/P1-1_S1_L001_R1_001.fastq.gz \
/Users/sara/Documents/Data/kaitilin/fastq/P1-1_S1_L001_R2_001.fastq.gz

!kb count -i /Users/sara/Documents/Tools/kallisto-bustools/"$species"-ref/"$species"-reftranscriptome.idx -g /Users/sara/Documents/Tools/kallisto-bustools/"$species"-ref/"$species"-reft2g.txt -x $tec -o P2_kbcount_output \
/Users/sara/Documents/Data/kaitilin/fastq/P2-2_S1_L001_R1_001.fastq.gz \
/Users/sara/Documents/Data/kaitilin/fastq/P2-2_S1_L001_R2_001.fastq.gz

!kb count -i /Users/sara/Documents/Tools/kallisto-bustools/"$species"-ref/"$species"-reftranscriptome.idx -g /Users/sara/Documents/Tools/kallisto-bustools/"$species"-ref/"$species"-reft2g.txt -x $tec -o P3_kbcount_output \
/Users/sara/Documents/Data/kaitilin/fastq/P3-3_S1_L001_R1_001.fastq.gz \
/Users/sara/Documents/Data/kaitilin/fastq/P3-3_S1_L001_R2_001.fastq.gz


## Convert to 10x files

See script conv_kbout_to_10x.R