步骤1：使用Python脚本生成SRA ID的列表
首先，使用Python脚本读取Excel文件，并按Bioproject ID生成各自的SRA ID列表文件。

In [None]:
import pandas as pd
import os

# File locations (edit these to match your machine).
excel_path = '/Users/SUO/Desktop/sratest.xlsx'
output_dir = '/Users/SUO/data/sra'
list_dir = '/Users/SUO/Desktop/lists'

# Load the spreadsheet that maps each SRA run to its BioProject.
sra_data = pd.read_excel(excel_path)

# Fail early, with a readable message, if the expected columns are absent.
required_columns = ['Bioproject ID', 'SRA ID']
missing_columns = [col for col in required_columns if col not in sra_data.columns]
if missing_columns:
    raise KeyError(f"Missing required column(s) in {excel_path}: {missing_columns}")

# Make sure both the download directory and the list directory exist.
os.makedirs(output_dir, exist_ok=True)
os.makedirs(list_dir, exist_ok=True)

# Clean the two key columns before grouping: rows with an empty cell would
# otherwise become blank lines in the list files, and the download loop would
# pass an empty accession to prefetch.
sra_clean = sra_data.dropna(subset=required_columns).copy()
for col in required_columns:
    sra_clean[col] = sra_clean[col].astype(str).str.strip()
sra_clean = sra_clean[(sra_clean['Bioproject ID'] != '') & (sra_clean['SRA ID'] != '')]

# Group the runs by BioProject ID.
grouped = sra_clean.groupby('Bioproject ID')

# Write one accession list per BioProject.
for bioproject_id, group in grouped:
    print(f"Processing Bioproject ID: {bioproject_id}")

    # Each accession is written once, even if the sheet lists it repeatedly.
    accessions = group['SRA ID'].drop_duplicates()

    # One accession per line, LF line endings so shell `read` loops see clean IDs.
    acc_list_path = os.path.join(list_dir, f"{bioproject_id}_Acc_List.txt")
    with open(acc_list_path, 'w', encoding='utf-8', newline='\n') as handle:
        handle.writelines(f"{accession}\n" for accession in accessions)

print("SRA ID lists have been generated.")


步骤2：使用Shell脚本下载和转换数据
接下来，使用 shell 脚本读取上一步生成的 SRA ID 列表，并依次对每个 ID 执行 prefetch 和 fasterq-dump 操作。

In [None]:
#!/bin/bash

# Download every SRA run named in the accession list files and convert each
# one to FASTQ.
#   LIST_DIR   - directory holding the *.txt accession lists (one ID per line)
#   OUTPUT_DIR - directory that receives the FASTQ files from fasterq-dump
LIST_DIR="/Users/SUO/Desktop/lists"
OUTPUT_DIR="/Users/SUO/data/sra"

# Make sure the output directory exists.
mkdir -p "$OUTPUT_DIR"

# Number of accessions for which prefetch or fasterq-dump reported an error.
failed=0

# Walk through every list file.
for acc_list in "$LIST_DIR"/*.txt; do
    # When nothing matches, bash leaves the pattern itself in $acc_list;
    # skip it instead of trying to read a file literally named "*.txt".
    if [[ ! -f "$acc_list" ]]; then
        echo "No accession list files found in $LIST_DIR" >&2
        continue
    fi
    echo "Processing file: $acc_list"

    # `|| [[ -n ... ]]` keeps the final line even without a trailing newline.
    while IFS= read -r srr_id || [[ -n "$srr_id" ]]; do
        # Drop stray whitespace / carriage returns and skip blank lines.
        srr_id="${srr_id//[[:space:]]/}"
        [[ -z "$srr_id" ]] && continue

        echo "Downloading $srr_id"
        if ! prefetch "$srr_id"; then
            # Conversion is still attempted, exactly as before; only report it.
            echo "prefetch failed for $srr_id" >&2
        fi

        echo "Converting $srr_id"
        if ! fasterq-dump "$srr_id" -O "$OUTPUT_DIR"; then
            echo "fasterq-dump failed for $srr_id" >&2
            failed=$((failed + 1))
        fi
    done < "$acc_list"
done

echo "All downloads and conversions are complete."
if [[ $failed -gt 0 ]]; then
    echo "$failed accession(s) could not be converted; see messages above." >&2
fi


NCBI SRA 数据批量下载

In [None]:
# Download and run the NCBI Entrez Direct (E-utilities) installer.
sh -c "$(curl -fsSL https://ftp.ncbi.nlm.nih.gov/entrez/entrezdirect/install-edirect.sh)"

# Persist the PATH entry for future login shells, but only once, so that
# re-running this cell does not keep appending duplicate lines.
if ! grep -qs 'edirect' "$HOME/.bash_profile"; then
    echo "export PATH=\$HOME/edirect:\$PATH" >> "$HOME/.bash_profile"
fi

# .bash_profile is only read by new login shells; export PATH here as well so
# esearch/efetch can be found by the commands below in this same session.
export PATH="$HOME/edirect:$PATH"

# Replace the BioProject ID below with your own.
PROJECT_ID="PRJNA1128369"

# Query SRA for the project, take the first CSV column (the run accession) and
# keep only lines that look like run IDs. This removes the "Run" header row and
# also any repeated header rows or blank lines, should they occur.
# NOTE: the list is written to the current directory, while later cells read
# /Users/SUO/PRJNA1128369.txt -- run this from that directory or move the file.
esearch -db sra -query "${PROJECT_ID}[bioproject]" \
    | efetch -format runinfo \
    | cut -f 1 -d ',' \
    | grep -E '^[SED]RR[0-9]+$' > "${PROJECT_ID}.txt"


In [None]:
#!/bin/bash

# Download each run in an accession list, convert it to FASTQ, and delete the
# downloaded .sra file once the conversion has succeeded.

# Path of the accession list (one run ID per line).
# NOTE(review): this previously pointed at PRJNA1128353.txt, but the list is
# generated as PRJNA1128369.txt and the trimming cell reads PRJNA1128369.txt;
# aligned here -- change it back if a different project was intended.
ACC_LIST_PATH="/Users/SUO/PRJNA1128369.txt"

# Directory that receives the FASTQ files.
OUTPUT_DIR="/Users/SUO/data/sra"

# Stop early with a clear message if the accession list is missing.
if [[ ! -f "$ACC_LIST_PATH" ]]; then
    echo "Accession list not found: $ACC_LIST_PATH" >&2
    exit 1
fi

# Check and create output directory if it doesn't exist
mkdir -p "$OUTPUT_DIR"

# Read one accession per line; `|| [[ -n ... ]]` keeps a final line that has
# no trailing newline.
while IFS= read -r srr_id || [[ -n "$srr_id" ]]
do
    # Drop stray whitespace / carriage returns and skip blank lines.
    srr_id="${srr_id//[[:space:]]/}"
    [[ -z "$srr_id" ]] && continue

    echo "Downloading $srr_id"
    # Conversion is still attempted after a failed prefetch, as before.
    prefetch "$srr_id" || echo "prefetch failed for $srr_id" >&2

    # Convert the run to FASTQ file(s) in OUTPUT_DIR.
    echo "Converting $srr_id"
    if fasterq-dump "$srr_id" -O "$OUTPUT_DIR"; then
        # Candidate locations of the downloaded .sra file: the legacy user
        # repository, plus ./<accession>/ under the current directory
        # (presumably where newer sra-tools releases place it -- verify
        # against your sra-tools configuration).
        removed=0
        for sra_file in \
            "${HOME}/ncbi/public/sra/${srr_id}.sra" \
            "${PWD}/${srr_id}/${srr_id}.sra"
        do
            if [[ -f "$sra_file" ]]; then
                rm "$sra_file"
                echo "Removed original SRA file: $sra_file"
                removed=1
            fi
        done
        if [[ $removed -eq 0 ]]; then
            echo "SRA file not found: ${HOME}/ncbi/public/sra/${srr_id}.sra"
        fi
    else
        echo "fasterq-dump failed for $srr_id. SRA file not removed."
    fi
done < "$ACC_LIST_PATH"

#get one specific SRR file to certain path:
#prefetch SRR616206 -O /Users/SUO/ncbi/SRP017096

01. Trim data using Cutadapt

In [None]:
# Activate the conda environment named qiime2-amplicon-2024.2 before running
# the cells below (presumably the environment that provides cutadapt and
# qiime -- confirm against your installation).
conda activate qiime2-amplicon-2024.2

In [None]:
#!/bin/bash

# Quality-trim the paired-end FASTQ files of every sample in the accession
# list with Cutadapt, then delete the untrimmed inputs once trimming succeeds.

# Sample IDs, one per line (carriage returns stripped in case the list was
# edited on Windows).
samples=$(tr -d '\r' < /Users/SUO/PRJNA1128369.txt)

# Directory that holds the inputs and receives the trimmed outputs.
output_dir="/Users/SUO/data/sra"
mkdir -p "${output_dir}"

# Loop through each sample
for sample in ${samples}; do
  echo "Processing sample ${sample}..."

  # Define input and output file paths
  input_forward="/Users/SUO/data/sra/${sample}_1.fastq"
  input_reverse="/Users/SUO/data/sra/${sample}_2.fastq"
  output_forward="${output_dir}/${sample}_tr_R1.fastq"
  output_reverse="${output_dir}/${sample}_tr_R2.fastq"

  # Skip samples whose inputs are missing (for example when this cell is
  # re-run after the originals were already removed), so that existing
  # trimmed outputs are never touched by a doomed Cutadapt run.
  if [[ ! -f "${input_forward}" || ! -f "${input_reverse}" ]]; then
    echo "Input files for sample ${sample} not found. Skipping." >&2
    continue
  fi

  # Run Cutadapt for quality trimming
  if cutadapt -q 20 -m 50 --trim-n \
      -o "${output_forward}" -p "${output_reverse}" \
      "${input_forward}" "${input_reverse}"; then
    # Remove the original fastq files only after a successful run.
    rm "${input_forward}" "${input_reverse}"
    echo "Removed original files: ${input_forward}, ${input_reverse}"
    echo "Finished processing sample ${sample}."
    echo "Output files: ${output_forward}, ${output_reverse}"
  else
    # Discard partial outputs so the manifest step cannot pick them up.
    rm -f "${output_forward}" "${output_reverse}"
    echo "Cutadapt failed for sample ${sample}. Original files not removed." >&2
  fi
done

echo "All samples processed."


In [None]:
#!/bin/bash

# Build a tab-separated paired-end manifest (tr_manifest.tsv) that lists every
# sample in the FASTQ directory having both trimmed read files.

# Enter the directory containing FASTQ files
cd /Users/SUO/data/sra || exit

manifest="tr_manifest.tsv"

# Header row. The columns must be separated by TAB characters; runs of spaces
# would make the whole header a single column.
printf 'sample-id\tforward-absolute-filepath\treverse-absolute-filepath\n' > "$manifest"

# Derive the sample ID from each FASTQ file name (text before the first "_"),
# de-duplicate, and emit one manifest row per sample with both trimmed files.
for fastq in *.fastq; do
    # An unmatched glob stays literal; ignore it.
    [[ -e "$fastq" ]] || continue
    printf '%s\n' "${fastq%%_*}"
done | sort -u | while IFS= read -r sample; do
    forward="${PWD}/${sample}_tr_R1.fastq"
    reverse="${PWD}/${sample}_tr_R2.fastq"
    if [[ -f "$forward" && -f "$reverse" ]]; then
        printf '%s\t%s\t%s\n' "$sample" "$forward" "$reverse"
    else
        # Warnings go to stderr: stdout of this loop is the manifest body.
        echo "Warning: Files for sample $sample not found. Skipping." >&2
    fi
done >> "$manifest"

# A manifest with only the header row cannot be imported; say so now.
if [[ $(wc -l < "$manifest") -le 1 ]]; then
    echo "Warning: no trimmed sample pairs found; $manifest contains only the header." >&2
else
    echo "Wrote manifest: ${PWD}/${manifest}"
fi

# Return to the previous directory
cd - > /dev/null



In [None]:
# Load the trimmed paired-end reads listed in the manifest into a QIIME 2
# artifact (demux_trimmed.qza, written to the current directory).
qiime tools import \
  --input-path /Users/SUO/data/sra/tr_manifest.tsv \
  --input-format PairedEndFastqManifestPhred33V2 \
  --type 'SampleData[PairedEndSequencesWithQuality]' \
  --output-path demux_trimmed.qza

# Produce a visualization summarizing the imported reads.
qiime demux summarize \
  --o-visualization demux_trimmed_summary.qzv \
  --i-data demux_trimmed.qza


In [None]:
# Denoise the paired-end reads with DADA2. Both truncation lengths are set to
# 0 (per the QIIME 2 docs this disables truncation -- the reads were already
# quality-trimmed upstream). Outputs: feature table, representative sequences
# and per-sample denoising statistics.
qiime dada2 denoise-paired \
  --i-demultiplexed-seqs demux_trimmed.qza \
  --p-trunc-len-f 0 \
  --p-trunc-len-r 0 \
  --o-representative-sequences rep-seqs.qza \
  --o-table table.qza \
  --o-denoising-stats denoising-stats.qza

In [None]:
# Render the DADA2 denoising statistics as a viewable table.
qiime metadata tabulate \
  --o-visualization denoising-stats.qzv \
  --m-input-file denoising-stats.qza

# Summarize the feature table; the manifest file is passed as the
# sample metadata file.
qiime feature-table summarize \
  --m-sample-metadata-file /Users/SUO/data/sra/tr_manifest.tsv \
  --o-visualization table.qzv \
  --i-table table.qza

# Tabulate the representative sequences.
qiime feature-table tabulate-seqs \
  --o-visualization rep-seqs.qzv \
  --i-data rep-seqs.qza
