In [None]:
### Step 1. Split reads
# For every sample found in work_dir, run fastq-dump --split-files and collect
# the resulting FASTQ files in a per-sample sub-directory:
# <work_dir>/<sample_name>/.

import glob
import os
import shutil
import subprocess


work_dir = "/mnt/projects/thiomargarita/samples_data"

# The first 11 characters of an entry name are the sample name
# (file = sample_name + .fastq). The set removes duplicates so that a sample
# with several files is only processed once.
sample_names = sorted({entry[0:11] for entry in os.listdir(work_dir)})

for sample_name in sample_names:
    sample_dir = os.path.join(work_dir, sample_name)

    # fastq-dump writes its output into the current directory, so it is run
    # from work_dir; otherwise the move below would not find the new files.
    result = subprocess.run(["fastq-dump", "--split-files", sample_name], cwd=work_dir)
    if result.returncode != 0:
        print(f"fastq-dump failed for {sample_name} (exit code {result.returncode})")

    os.makedirs(sample_dir, exist_ok=True)

    # Equivalent of `mv <work_dir>/<sample_name>*.fastq <work_dir>/<sample_name>`.
    for fastq_path in glob.glob(os.path.join(work_dir, f"{sample_name}*.fastq")):
        shutil.move(fastq_path, os.path.join(sample_dir, os.path.basename(fastq_path)))

In [None]:
### Step 2. Reads quality control before trimming
# Run FastQC on the FASTQ files of every sample directory in work_dir.

import os
import subprocess


work_dir = "/mnt/projects/thiomargarita/samples_data"

for sample_name in sorted(os.listdir(work_dir)):
    sample_dir = os.path.join(work_dir, sample_name)

    # Plain files in work_dir are not sample directories and cannot be listed.
    if not os.path.isdir(sample_dir):
        continue

    # Pass the actual read files (full paths) to FastQC, not the sample name.
    fastq_files = [
        os.path.join(sample_dir, file)
        for file in sorted(os.listdir(sample_dir))
        if file.endswith((".fastq", ".fastq.gz"))
    ]
    if not fastq_files:
        print(f"No FASTQ files found for {sample_name}, skipping FastQC")
        continue

    # One call per sample: FastQC processes the given files in parallel (-t).
    result = subprocess.run(["fastqc", "-t", "32", *fastq_files])
    if result.returncode != 0:
        print(f"fastqc failed for {sample_name} (exit code {result.returncode})")

In [None]:
### Step 3. Trimming
# Run v2trim on the paired reads (<sample>_1.fastq / <sample>_2.fastq) of
# every sample directory in work_dir.

import os
import subprocess


work_dir = "/mnt/projects/thiomargarita/samples_data"

for sample_name in sorted(os.listdir(work_dir)):
    sample_dir = os.path.join(work_dir, sample_name)
    if not os.path.isdir(sample_dir):
        continue

    # The command is built as an argument list so every option is a separate
    # token; concatenating the fragments as strings glued them together
    # ("v2trim-1 ..._1.fastq-2 ...").
    command = [
        "v2trim",
        "-1", f"{sample_dir}/{sample_name}_1.fastq",
        "-2", f"{sample_dir}/{sample_name}_2.fastq",
        "-t", "32",
    ]
    result = subprocess.run(command)
    if result.returncode != 0:
        print(f"v2trim failed for {sample_name} (exit code {result.returncode})")

In [None]:
### Step 4. Reads quality control after trimming

'''
Manual steps (not scripted in this notebook):
- moved all raw data in each sample folder to SRR*/raw
- moved all FastQC results in each sample folder to SRR*/fastqc_results
'''


In [None]:
### Step 5. Assembly
# Assemble the trimmed paired reads of every sample with SPAdes in --meta
# mode; results go to <work_dir>/<sample_name>/spades_results.

import os
import subprocess


work_dir = "/mnt/projects/thiomargarita/samples_data"

for sample_name in sorted(os.listdir(work_dir)):
    sample_dir = os.path.join(work_dir, sample_name)
    if not os.path.isdir(sample_dir):
        continue

    # Argument list instead of string concatenation: the original fragments
    # were joined without spaces ("...spades_results-1 ...fastq-2 ...").
    command = [
        "spades.py", "--meta",
        "-o", f"{sample_dir}/spades_results",
        "-1", f"{sample_dir}/{sample_name}_trim_1.fastq",
        "-2", f"{sample_dir}/{sample_name}_trim_2.fastq",
        "-t", "32",
        "-k", "23,67,99,125",
    ]
    result = subprocess.run(command)
    if result.returncode != 0:
        print(f"spades.py failed for {sample_name} (exit code {result.returncode})")

In [None]:
### Step 6. Comparison of reads and assemblies

'''
Run manually (not scripted in this notebook): multiqc, quast -l
'''

In [None]:
### Step 7. Annotation
# Annotate every assembly with Prokka, then run eggNOG-mapper on the
# predicted proteins (<sample>.faa) produced by Prokka.

import os
import subprocess


# `!export ...` only affects a throwaway subshell. Setting os.environ makes
# the variable visible to the tools started from this kernel.
os.environ["EGGNOG_DATA_DIR"] = "/mnt/projects/users/merirut/software/miniconda3/eggnog-mapper-data"
work_dir = "/mnt/projects/thiomargarita/samples_data"

for sample_name in sorted(os.listdir(work_dir)):
    sample_dir = os.path.join(work_dir, sample_name)
    if not os.path.isdir(sample_dir):
        continue

    # Argument lists keep every option a separate token; the original string
    # fragments were concatenated without spaces.
    prokka_command = [
        "prokka",
        "--outdir", f"{sample_dir}/prokka_results",
        "--prefix", sample_name,
        "--metagenome",
        "--locustag", "gene",
        "--cpus", "32",
        f"{sample_dir}/spades_results/scaffolds.fasta",
    ]
    result = subprocess.run(prokka_command)
    if result.returncode != 0:
        print(f"prokka failed for {sample_name} (exit code {result.returncode})")

    emapper_command = [
        "emapper.py",
        "-i", f"{sample_dir}/prokka_results/{sample_name}.faa",
        "--cpu", "32",
        # The output prefix is the sample's name, not the literal text
        # "sample_name".
        "--output", sample_name,
        "--output_dir", f"{sample_dir}/emapper_results",
    ]
    result = subprocess.run(emapper_command)
    if result.returncode != 0:
        print(f"emapper.py failed for {sample_name} (exit code {result.returncode})")

In [None]:
### Step 8. Extract rRNAs and align them to the reference genome
# For every sample: extract rRNA sequences from the assembly with barrnap,
# then align them against the reference with blastn.
import os
import subprocess


work_dir = "/mnt/projects/thiomargarita/samples_data"

# NOTE(review): `subject_name` (the blastn -subject file) is not defined in
# this cell; it must already exist in the kernel namespace. Referencing it
# here makes a missing definition fail before any tool has been run.
reference_fasta = subject_name

for sample_name in sorted(os.listdir(work_dir)):
    sample_dir = os.path.join(work_dir, sample_name)
    if not os.path.isdir(sample_dir):
        continue

    rrna_fasta = f"{sample_dir}/{sample_name}_rRNAs.fasta"

    # The assembly lives inside the sample directory (the original path
    # omitted the sample name). Argument lists keep options separate tokens.
    barrnap_command = [
        "barrnap", f"{sample_dir}/spades_results/scaffolds.fasta",
        "--threads", "32",
        "--outseq", rrna_fasta,
    ]
    result = subprocess.run(barrnap_command)
    if result.returncode != 0:
        print(f"barrnap failed for {sample_name} (exit code {result.returncode})")

    outfile = f"{sample_dir}/{sample_name}_blast.txt"
    blastn_command = [
        "blastn",
        "-query", rrna_fasta,
        "-subject", reference_fasta,
        "-out", outfile,
    ]
    result = subprocess.run(blastn_command)
    if result.returncode != 0:
        print(f"blastn failed for {sample_name} (exit code {result.returncode})")

In [None]:
### Step 9. Create "pangenome"
# Concatenate the Prokka protein FASTA (<sample>.faa) of every sample into a
# single file. Each header is prefixed with the sample name and its spaces
# are replaced by underscores.

import os


# Named work_dir (as in the other steps) rather than `dir`, which would
# shadow the builtin for the rest of the kernel session.
work_dir = "/mnt/projects/thiomargarita/samples_data"
pangenome_path = "/mnt/projects/thiomargarita/protein_pangenome.fasta"

# `with` closes the output file even if a sample fails half-way through.
with open(pangenome_path, 'w') as fasta_file:
    for sample_name in sorted(os.listdir(work_dir)):
        faa_path = f"{work_dir}/{sample_name}/prokka_results/{sample_name}.faa"

        try:
            infile = open(faa_path, 'r')
        except (FileNotFoundError, NotADirectoryError):
            # No Prokka result for this entry (or the entry is a plain file).
            print(f"Skipping {sample_name}: {faa_path} not found")
            continue

        with infile:
            last_line = "\n"
            for line in infile:
                if line.startswith('>'):
                    fasta_file.write('>' + f"{sample_name}_" + line[1:].replace(' ', '_'))
                else:
                    fasta_file.write(line)
                last_line = line

            # Keep the next sample's first header on its own line when the
            # input file does not end with a newline.
            if not last_line.endswith("\n"):
                fasta_file.write("\n")

In [None]:
### Step 10. Clustering

!mmseqs easy-cluster protein_pangenome.fasta clustered_pangenome tmp