In [None]:
## Enter the project's top-level directory
# Replace with your own path
project_dir = 'ChIP-seq'
os.chdir(project_dir)

In [None]:
## Make directory tree

# Create every folder the pipeline writes to. makedirs(exist_ok=True) builds
# missing parents and silently accepts folders that already exist, so the cell
# also repairs a partially created tree and is safe to re-run.
for subdir in ('genome',
               'raw_data',
               'trimmed_data',
               'aligned_data/wiggle_files'):
    os.makedirs(os.path.join('data_processing', subdir), exist_ok=True)

# All later cells use paths relative to data_processing
os.chdir('./data_processing')

# Download and process reference genome

In [None]:
# Download genome information from NCBI

! rsync --copy-links --recursive --times --verbose rsync://genomes/refseq/archaea/Halobacterium_salinarum/all_assembly_versions/GCF_000006805.1_ASM680v1 ./genome/


In [None]:
# Make genome files writable

! chmod a+rw ./genome/GCF_000006805.1_ASM680v1/*.gz
! chmod a+rw ./genome/GCF_000006805.1_ASM680v1/*.txt


In [None]:
# Rename sequence file and decompress

shutil.copy2('./genome/GCF_000006805.1_ASM680v1/GCF_000006805.1_ASM680v1_genomic.fna.gz', './genome/NRC1.fna.gz')
shutil.copy2('./genome/GCF_000006805.1_ASM680v1/GCF_000006805.1_ASM680v1_genomic.gff.gz', './genome/NRC1.gff.gz')

! gunzip ./genome/*.gz

In [None]:
# Build genome index library for Bowtie2
# Input: the decompressed FASTA ./genome/NRC1.fna.
# Output: index files sharing the prefix ./genome/NRC1; the alignment step must
# pass this same prefix to bowtie2 -x.

! bowtie2-build ./genome/NRC1.fna ./genome/NRC1

In [None]:
# Create annotation bed file (for peak annotations using bedtools)
# Sorts the records of ./genome/NRC1.gff and writes them to ./genome/NRC1_annotations.bed.
# NOTE(review): sortBed presumably keeps the GFF column layout, so the output is
# GFF-formatted despite the .bed extension -- confirm downstream tools expect that.

! sortBed -i ./genome/NRC1.gff > ./genome/NRC1_annotations.bed

# Processing fastq.gz data files

In [None]:
## Download data files into ~/data_processing/raw_data
# These are available from GEO accession GSE148065

# This assumes the original files are backed up on a server/cloud.
# Rename and decompress

os.rename('./raw_data/AV-S78_S69_L002_R1_001.fastq.gz', './raw_data/0195FLAG1log_IP.fastq.gz')
os.rename('./raw_data/AV-S80_S56_L002_R1_001.fastq.gz', './raw_data/0195FLAG2log_IP.fastq.gz')
os.rename('./raw_data/AV-S82_S57_L002_R1_001.fastq.gz', './raw_data/0195FLAG3log_IP.fastq.gz')
os.rename('./raw_data/AV-S77_S29_L002_R1_001.fastq.gz', './raw_data/0195FLAG1log_WCE.fastq.gz')
os.rename('./raw_data/AV-S79_S8_L002_R1_001.fastq.gz', './raw_data/0195FLAG2log_WCE.fastq.gz')
os.rename('./raw_data/AV-S81_S41_L002_R1_001.fastq.gz', './raw_data/0195FLAG3log_WCE.fastq.gz')
os.rename('./raw_data/AV-S88_S60_L002_R1_001.fastq.gz', './raw_data/0195FLAG1sta_IP.fastq.gz')
os.rename('./raw_data/AV-S90_S63_L002_R1_001.fastq.gz', './raw_data/0195FLAG2sta_IP.fastq.gz')
os.rename('./raw_data/AV-S92_S49_L002_R1_001.fastq.gz', './raw_data/0195FLAG3sta_IP.fastq.gz')
os.rename('./raw_data/AV-S87_S7_L002_R1_001.fastq.gz', './raw_data/0195FLAG1sta_WCE.fastq.gz')
os.rename('./raw_data/AV-S89_S15_L002_R1_001.fastq.gz', './raw_data/0195FLAG2sta_WCE.fastq.gz')
os.rename('./raw_data/AV-S91_S30_L002_R1_001.fastq.gz', './raw_data/0195FLAG3sta_WCE.fastq.gz')
os.rename('./raw_data/AV-S76_S67_L002_R1_001.fastq.gz', './raw_data/ura3FLAGlog_IP.fastq.gz')
os.rename('./raw_data/AV-S75_S6_L002_R1_001.fastq.gz', './raw_data/ura3FLAGlog_WCE.fastq.gz')
os.rename('./raw_data/AV-S86_S62_L002_R1_001.fastq.gz', './raw_data/ura3FLAGsta_IP.fastq.gz')
os.rename('./raw_data/AV-S85_S43_L002_R1_001.fastq.gz', './raw_data/ura3FLAGsta_WCE.fastq.gz')



! gunzip ./raw_data/*.fastq.gz

In [None]:
# List ./raw_data to check the renamed, decompressed fastq files
! ls ./raw_data

In [None]:
## Assess quality of reads
# Runs FastQC quietly (-q) on every decompressed fastq file in ./raw_data.
# The HTML reports opened in the next cell are expected alongside the inputs in ./raw_data.

! fastqc -q ./raw_data/*.fastq

In [None]:
# Open every HTML report found in ./raw_data for visual inspection.
# NOTE(review): `open` looks like the macOS launcher -- on Linux this would
# presumably need xdg-open instead; confirm the target platform.
! open ./raw_data/*.html

In [None]:
## Trim adapter sequences
# Runs Trim Galore on every fastq file in ./raw_data and writes results to ./trimmed_data (-o).
# The alignment step expects the outputs to be named <sample>_trimmed.fq.

! trim_galore ./raw_data/*.fastq -o ./trimmed_data

In [None]:
%%bash

## Align files with Bowtie2
# -x must be the index prefix written by bowtie2-build (../genome/NRC1);
# each trimmed read file <sample>_trimmed.fq yields ../aligned_data/<sample>_trimmed.sam.

cd ./trimmed_data
for file in *_trimmed.fq; do
    # Quote expansions so unusual file names cannot be word-split
    bowtie2 -x ../genome/NRC1 -U "$file" -S "../aligned_data/$(basename "$file" .fq).sam"
done

In [None]:
## Rename, to remove the "trimmed" from file names

%cd ./aligned_data

os.rename('0195FLAG1log_IP_trimmed.sam', '0195FLAG1log_IP.sam')
os.rename('0195FLAG2log_IP_trimmed.sam', '0195FLAG2log_IP.sam')
os.rename('0195FLAG3log_IP_trimmed.sam', '0195FLAG3log_IP.sam')
os.rename('0195FLAG1log_WCE_trimmed.sam', '0195FLAG1log_WCE.sam')
os.rename('0195FLAG2log_WCE_trimmed.sam', '0195FLAG2log_WCE.sam')
os.rename('0195FLAG3log_WCE_trimmed.sam', '0195FLAG3log_WCE.sam')
os.rename('0195FLAG1sta_IP_trimmed.sam', '0195FLAG1sta_IP.sam')
os.rename('0195FLAG2sta_IP_trimmed.sam', '0195FLAG2sta_IP.sam')
os.rename('0195FLAG3sta_IP_trimmed.sam', '0195FLAG3sta_IP.sam')
os.rename('ura3FLAGlog_IP_trimmed.sam', 'ura3FLAGlog_IP.sam')
os.rename('ura3FLAGlog_WCE_trimmed.sam', 'ura3FLAGlog_WCE.sam')
os.rename('ura3FLAGsta_IP_trimmed.sam', 'ura3FLAGsta_IP.sam')
os.rename('ura3FLAGsta_WCE_trimmed.sam', 'ura3FLAGsta_WCE.sam')

# Alignment file processing

In [None]:
%%bash

## Alignment processing
# Convert every SAM file in the current directory to BAM, keeping the sample name

for sam in *.sam; do
    samtools view -bS $sam > ${sam%.sam}.bam
done

In [None]:
%%bash

# Sort bam files
# Each <sample>.bam is coordinate-sorted into <sample>_sorted.bam.

for file in *.bam; do
    # Skip outputs of a previous run; otherwise re-executing this cell would
    # sort them again into <sample>_sorted_sorted.bam
    case "$file" in
        *_sorted.bam) continue ;;
    esac
    samtools sort "$file" -o "$(basename "$file" .bam)_sorted.bam"
done

In [None]:
%%bash

# Index bam files
# Each index is written next to its sorted BAM as <name>_sorted.bam.bai

for bam in *_sorted.bam; do
    samtools index $bam $bam.bai
done

## Generate wiggle files with MOSAiCS

Wiggle files are used for data visualization.
To generate them, run `Generate_wiggle_files.R`.

For details, see documentation of [MOSAiCS package](https://bioconductor.org/packages/release/bioc/html/mosaics.html)