<a href="https://colab.research.google.com/github/atulchander/Concatenate_RNA/blob/main/NEXTFLOW.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#!/usr/bin/env nextflow

// Pipeline parameters (each can be overridden on the command line, e.g. --folder_name <dir>)

// Directory that is scanned for the input FASTQ files
params.folder_name = '/content/drive/MyDrive/Google colab/foldername'

// Directory, nested under the input directory, that holds the concatenated files
params.output_folder = [params.folder_name, 'concatenated_files'].join('/')

// Text file inside the output directory whose lines list the entries already concatenated
params.checkpoint_file = [params.output_folder, 'concatenation_checkpoint.txt'].join('/')

/**
 * Infers a regular expression describing the layout shared by a set of example filenames.
 *
 * Each filename is converted into a pattern by replacing every run of word characters
 * with the capture group (\w+) and escaping literal dots. Patterns from successive
 * examples are merged position by position: characters that agree are kept, and any
 * position that differs (or lies beyond the end of the newer pattern) becomes (.*).
 *
 * Note that '_' is a word character, so "sample_R1.fastq.gz" yields three groups
 * ("sample_R1", "fastq", "gz"), not one group per underscore-separated field.
 *
 * @param example_filenames collection of filename strings; may be null or empty
 * @return the merged pattern as a String, or an empty string when no examples are given
 */
def determineFormat(example_filenames) {
    def pattern = ''
    for (example in (example_filenames ?: [])) {
        // In a regex replacement string a literal backslash must itself be escaped,
        // so the Groovy literal '(\\\\w+)' inserts a capture group for a word run and
        // '\\\\.' inserts an escaped dot.
        def example_pattern = example.toString()
            .replaceAll(/\w+/, '(\\\\w+)')
            .replaceAll(/\./, '\\\\.')

        if (pattern.isEmpty()) {
            pattern = example_pattern
        } else {
            // Snapshot the current pattern so the closure reads a stable value
            def previous = pattern
            pattern = (0..<previous.length()).collect { i ->
                def same = i < example_pattern.length() &&
                           previous.charAt(i) == example_pattern.charAt(i)
                same ? previous.charAt(i) : '(.*)'
            }.join('')
        }
    }
    return pattern
}

// Process that infers the filename regular expression from the input files.
//
// Input:  files        - one or more input files; only their names are inspected
// Output: regexPattern - the pattern returned by determineFormat() for up to two
//                        example filenames
//
// This is pure Groovy logic, so it runs in an `exec:` block (no shell command), and
// the output value is produced by assigning the variable named in the `output:` section.
process determineFilenameFormat {
    input:
    path files

    output:
    val regexPattern

    exec:
    // A `path` input bound to exactly one file is a single Path rather than a list
    def fileList = (files instanceof Collection) ? files : [files]

    // Use the first two filenames as the examples the pattern is inferred from
    def example_filenames = fileList.take(2).collect { it.name }
    regexPattern = determineFormat(example_filenames)
}

// Process to remove duplicate files
//
// For the input file, builds a key from the file name (with everything from ".fastq"
// onward stripped) joined to the file size. If that key is already registered the file
// is reported as a duplicate and deleted; otherwise the file is registered under the key.
//
// NOTE(review): `state` is not defined anywhere in the visible source, so the lookup
//   into `state.unique_files` has nothing to resolve against - confirm where this
//   registry is meant to live and how it would be shared between tasks.
// NOTE(review): `path file into uniqueFiles` uses the DSL1 `into` channel syntax, while
//   this file also declares a DSL2 `workflow` block - confirm which DSL is targeted.
// NOTE(review): the `script:` section does not end with a command string - confirm
//   whether an `exec:` block was intended.
process removeDuplicates {
    input:
    path file

    output:
    path file into uniqueFiles

    script:
    // Key = file name without the ".fastq*" suffix, plus the size of the file
    def baseName = file.name.replaceAll(/\.fastq.*/, '')
    def fileSize = file.size()
    def uniqueKey = "${baseName}_${fileSize}"

    if (file.exists()) {
        if (state.unique_files.containsKey(uniqueKey)) {
            // Same key seen before: report the file and delete it
            println "Duplicate found: ${file} (removing)"
            file.delete()
        } else {
            // First file with this key: remember it
            state.unique_files[uniqueKey] = file
        }
    }
}

// Process that concatenates one group of files into a single combined file.
//
// Inputs:
//   files        - the file(s) belonging to one group
//   regexPattern - filename pattern whose first two capture groups give the sample id
//                  and read direction
// Output:
//   <sampleId>_<readDirection>_combined.fastq.gz, created in the task directory and
//   copied to params.output_folder by publishDir. The output is optional because
//   nothing is produced when the key is already listed in the checkpoint file.
//
// The key is derived from the first file of the group. When the name does not match
// the pattern (or the pattern has fewer than two groups) both parts are null, giving
// the key "null_null".
process concatenateFiles {
    publishDir params.output_folder, mode: 'copy'

    input:
    path files
    val regexPattern

    output:
    path "*_combined.fastq.gz", optional: true

    script:
    // A `path` input bound to exactly one file is a single Path rather than a list
    def fileList = (files instanceof Collection) ? files : [files]

    def matcher = (fileList[0].name =~ regexPattern)
    def hasKey = matcher.find() && matcher.groupCount() >= 2
    def sampleId = hasKey ? matcher.group(1) : null
    def readDirection = hasKey ? matcher.group(2) : null
    // Convert to String: a GString never equals the Strings read from the checkpoint
    def key = "${sampleId}_${readDirection}".toString()

    // Keys of groups finished in earlier runs, one per line
    def checkpoint = new File(params.checkpoint_file.toString())
    def completedFiles = checkpoint.exists() ? checkpoint.readLines() : []

    if (!completedFiles.contains(key)) {
        def outputFile = "${params.output_folder}/${key}_combined.fastq.gz"
        def outputName = "${key}_combined.fastq.gz"
        // Quote every staged name so file names containing spaces stay one argument
        def inputArgs = fileList.collect { "'${it}'" }.join(' ')
        println "Concatenating ${fileList.size()} files into ${outputFile}"
        """
        cat ${inputArgs} > '${outputName}'
        mkdir -p "\$(dirname '${params.checkpoint_file}')"
        echo '${key}' >> '${params.checkpoint_file}'
        """
    } else {
        println "Skipping ${key} as it's already concatenated."
        """
        true
        """
    }
}

// Workflow
//
// 1. Collect every *.fastq* file in params.folder_name.
// 2. Infer a single filename pattern from the whole file set.
// 3. Pass the files through removeDuplicates.
// 4. Key each remaining file by "<group 1>_<group 2>" of the pattern, drop keys already
//    listed in the checkpoint file, group the files per key and concatenate each group.
workflow {
    files = Channel.fromPath("${params.folder_name}/*.fastq*")

    // collect() yields a value channel, so the process runs once and its output is a
    // value channel that can be reused for every downstream task
    determineFilenameFormat(files.collect())
    pattern = determineFilenameFormat.out

    // De-duplicate before grouping so downstream steps only see the surviving files
    removeDuplicates(files)

    // Keys of groups completed in earlier runs (one key per line)
    completedFiles = new File(params.checkpoint_file).exists() ?
        new File(params.checkpoint_file).readLines() :
        []

    grouped = removeDuplicates.out
        .combine(pattern)
        .map { f, p ->
            def m = (f.name =~ p)
            def matched = m.find() && m.groupCount() >= 2
            // Same key scheme as concatenateFiles: unmatched names fall under "null_null"
            def key = matched ? "${m.group(1)}_${m.group(2)}" : 'null_null'
            tuple(key.toString(), f)
        }
        .filter { key, f -> !completedFiles.contains(key) }
        .groupTuple()
        .map { key, group -> group }

    concatenateFiles(grouped, pattern)
}