# Conserved TAD example 1 region

In [9]:
npb chr01:2500000-2900000
oruf chr01:2200000-2600000

Chromosome .fna files created in the 'NPB' folder.


In [6]:
#extracting region sequences from genomes: npb_exampleX.fna, oruf_exampleX.fna

In [7]:
def extract_sequence_from_genomic_coordinates(fasta_file, chromosome, start, end, output_file):
    """Extract a region of one FASTA record and write it to a new FASTA file.

    The output file holds a single record whose header is
    ``>{chromosome}:{start}-{end}`` followed by the extracted bases on one line.

    Args:
        fasta_file: Path to the input FASTA file.
        chromosome: Identifier of the record to read, without the leading '>'.
            A record matches when its whole header equals ``>{chromosome}`` or
            when the first whitespace-delimited word of the header equals
            ``chromosome`` (so ``>chr01 some description`` matches ``chr01``).
        start: 1-based position of the first base to extract.
        end: 1-based position of the last base to extract (inclusive).
        output_file: Path of the FASTA file to write; overwritten if it exists.

    Raises:
        ValueError: If ``start`` is smaller than 1, ``end`` is smaller than
            ``start``, or no record for ``chromosome`` exists in ``fasta_file``.
    """
    # A start below 1 would turn into a negative slice index and silently
    # return bases counted from the END of the sequence.
    if start < 1:
        raise ValueError(f"start must be >= 1 (1-based coordinates), got {start}")
    if end < start:
        raise ValueError(f"end ({end}) must not be smaller than start ({start})")

    target_header = f'>{chromosome}'
    chunks = []          # sequence lines of the requested record
    in_target = False    # True while reading lines that belong to the requested record
    found = False        # True once the requested record header has been seen

    with open(fasta_file, 'r') as infile:
        for line in infile:
            if line.startswith('>'):
                header = line.strip()
                fields = header[1:].split()
                # Accept the exact header as well as "identifier + description" headers.
                in_target = header == target_header or (bool(fields) and fields[0] == chromosome)
                found = found or in_target
            elif in_target:
                chunks.append(line.strip())

    if not found:
        raise ValueError(f"Record '{chromosome}' not found in {fasta_file}")

    # Joining once avoids repeatedly re-building a chromosome-sized string.
    sequence = ''.join(chunks)

    # 1-based inclusive [start, end] -> 0-based half-open [start - 1, end)
    extracted_sequence = sequence[start - 1:end]

    with open(output_file, 'w') as outfile:
        outfile.write(f">{chromosome}:{start}-{end}\n{extracted_sequence}\n")

# Genome to read from and FASTA file to create for the extracted region
fasta_file = '/scratch/ak8725/genomes/orufi.fna'
output_file = '/scratch/ak8725/dotplots/oruf_example1.fna'

# Region of interest (1-based coordinates on the chromosome)
chromosome = 'chr01'
start_coordinate, end_coordinate = 2200000, 2600000

# Write the requested region to the output FASTA file
extract_sequence_from_genomic_coordinates(
    fasta_file=fasta_file,
    chromosome=chromosome,
    start=start_coordinate,
    end=end_coordinate,
    output_file=output_file,
)

# 2. Running MUMmer and gnuplot, then converting the .ps output into .png format with combined output. Visualization of collinearity with dotplots.

sbatch dotplot_region.sh

# 4. Identifying SNPs, SV coordinates

In [3]:
Did not do this; all analysis was done by MUM&Co.
#filter the delta file (optional)
#the inversion coordinates were the same with the filtered and non-filtered delta files
#I used the unfiltered delta file for the rest of the rearrangements
#delta-filter -m -i 90 -l 100 ./NPB-az_alignments/chr06.delta > ./NPB-az_alignments/chr06_filtered.delta
#convert delta file into .coords file
# module load mummer/intel/4.0.0rc1
# show-coords -Trd example1.delta > example1.coords
# look into .coords file and identify blocks with -1 in the FRM column

In [6]:
# Detect structural variants (SVs) between the two extracted example-1 regions with MUM&Co.
# https://github.com/SAMtoBAM/MUMandCo
# Start from a clean module environment, then load samtools and MUMmer for this run.
module purge
module load samtools/intel/1.14
module load mummer/intel/4.0.0rc1
# NOTE(review): -r/-q are presumably the reference (npb) and query (oruf) FASTA files,
# -g the genome size in bp (400000 matches the length of the extracted regions) and
# -o the output prefix -- confirm against the MUM&Co README.
# The later BED-conversion cell reads ./example1_output/example1.SVs_all.tsv.
bash mumandco_v3.8.sh -r npb_example1.fna -q oruf_example1.fna -g 400000 -o example1


Nucmer alignment of genomes, filtering and converting to coordinates

                               #
                              # #
                             #   #
                ###############################
                #                             #
                # MUM&Co is open for business #
                #           version 3.8       #
                ###############################


######################################################################################################
          USING GLOBAL ALIGNMENT COORDINATES FOR DELETIONS, INSERTIONS AND TRANSLOCATIONS
######################################################################################################


Matching query and reference chromosomes

98.1264

Finding alignment gaps


Filtering for size labelling SV


Finding translocation fragments


Checking alignment sense for inversions involving majority of single chromosome bases



######################################################

In [17]:
#extract query (oruf) coordinates of SVs into bed file, creating 1_svs_query.bed
#it is to be plotted in coolbox with the query (oruf)
def create_bed_file(input_file, output_file, chrom='chr01', offset=2200000):
    """Convert the query coordinates of an SV table into a 6-column BED file.

    The first line of ``input_file`` is treated as a header and skipped. Every
    following tab-separated line with at least 8 columns produces one output
    line ``chrom, start, end, SV_type, 1, +``, where start/end are the
    query_start/query_stop columns shifted from region-relative to
    chromosome coordinates. Lines with fewer columns are reported with
    ``print`` and skipped.

    Args:
        input_file: Path to the tab-separated SV table (SV_type in column 6,
            query_start in column 7, query_stop in column 8, counting from 1).
        output_file: Path of the BED file to write; overwritten if it exists.
        chrom: Chromosome name written in the first BED column.
        offset: 1-based chromosome position of the first base of the region
            that was aligned; a region-relative position ``p`` is written as
            ``p + offset - 1``. The default is the start of the oruf example-1
            region.

    Raises:
        ValueError: If a query_start/query_stop field is not an integer.
    """
    with open(input_file, 'r') as infile, open(output_file, 'w') as outfile:
        # Skip the header; the default keeps an empty input file from raising StopIteration.
        next(infile, None)
        for line in infile:
            columns = line.strip().split('\t')
            if len(columns) >= 8:
                # Shift region-relative query coordinates back to chromosome coordinates.
                start = int(columns[6]) + offset - 1  # query_start
                end = int(columns[7]) + offset - 1  # query_stop
                sv_type = columns[5]  # SV_type
                outfile.write(f"{chrom}\t{start}\t{end}\t{sv_type}\t1\t+\n")
            else:
                print(f"Issue parsing line: {line}")

# SV table to read and BED file to create from it
input_file = './example1_output/example1.SVs_all.tsv'
output_file = '1_svs_query.bed'

# Convert the SV table into the BED file
create_bed_file(input_file=input_file, output_file=output_file)

In [18]:
# Make the BED file compatible with coolbox: any record whose end (column 3)
# equals its start (column 2) gets its end increased by 1. The file is
# rewritten in place.
file_path = '1_svs_query.bed'

# Load every record before the file is overwritten below
with open(file_path, 'r') as file:
    lines = file.readlines()

modified_lines = []
for line in lines:
    columns = line.split('\t')
    end_value = int(columns[2])
    start_value = int(columns[1])
    if end_value == start_value:
        # Widen the interval by one base
        columns[2] = str(end_value + 1)
    # Re-assemble the (possibly updated) record
    modified_lines.append('\t'.join(columns))

# Replace the file content with the processed records
with open(file_path, 'w') as file:
    file.writelines(modified_lines)

In [19]:
# Split the combined SV BED file into one BED file per SV type
# (duplications, insertions, deletions).
input_file = '1_svs_query.bed'

# SV type (fourth BED column) -> BED file that collects records of that type
output_files = {
    'duplication': '1_dup.bed',
    'insertion': '1_ins.bed',
    'deletion': '1_del.bed'
}

# Group the records per SV type first so every output file is opened only once.
lines_by_type = {sv_type: [] for sv_type in output_files}
with open(input_file, 'r') as infile:
    for line in infile:
        columns = line.strip().split('\t')
        # Blank or truncated lines have no SV-type column; skip them.
        if len(columns) < 4:
            continue
        sv_type = columns[3]
        # Records of any other SV type have no dedicated output file and are
        # skipped (looking them up used to yield None and make open() raise TypeError).
        if sv_type in lines_by_type:
            lines_by_type[sv_type].append(line)

# Write with 'w' instead of 'a' so that re-running this cell does not
# duplicate the records already written by a previous run.
for sv_type, path in output_files.items():
    with open(path, 'w') as outfile:
        outfile.writelines(lines_by_type[sv_type])