In [None]:
import pandas as pd
import re
import random
from tqdm import tqdm
import numpy as np

In [None]:
# Load data from the CSV file
# NOTE(review): the path below looks like a placeholder — point it at the real
# genomes CSV before running. The frame must provide a 'Sequence' column, which
# is joined into one genome string further down.
csv_path = '/Path/to/genomes.csv'
genome_data = pd.read_csv(csv_path)

# Regular expression pattern to identify SSRs.
# `(\w+?)` lazily captures a motif of one or more word characters (shortest
# motif is tried first) and `\1{4,50}` requires 4 to 50 further back-to-back
# copies of that motif, so one match holds 5 to 51 consecutive copies in total.
ssr_pattern = r'(\w+?)\1{4,50}'  # Motif followed by 4 to 50 additional copies

# Function to identify unique, minimal rotations of SSRs to standardize SSR matches
def smallest_rotation(ssr):
    """Return the lexicographically smallest rotation of ``ssr``.

    All cyclic shifts of the input are compared and the minimum is returned,
    so strings that are rotations of each other (e.g. "TGTGTG" and "GTGTGT")
    map to the same canonical form.

    Args:
        ssr: The sequence to canonicalise.

    Returns:
        The smallest rotation of ``ssr``; an empty input is returned unchanged.
    """
    length = len(ssr)
    if length == 0:
        # min() raises ValueError on an empty iterable; an empty sequence is
        # trivially its own (only) rotation.
        return ssr

    # Every rotation of s is a length-len(s) window of s + s.
    doubled = ssr + ssr
    return min(doubled[offset:offset + length] for offset in range(length))

In [None]:
# Function to process a chunk of sequence and find SSRs within it
def process_chunk(sequence_chunk, start_pos, seen_positions):
    """Find SSR matches in one chunk and keep those not overlapping earlier hits.

    The chunk is scanned with the module-level ``ssr_pattern``. Each match is
    converted to coordinates in the full sequence and accepted only if its
    interval does not overlap any interval already in ``seen_positions``.

    Args:
        sequence_chunk: Substring of the full sequence to scan.
        start_pos: Offset of ``sequence_chunk`` within the full sequence; added
            to every match offset to obtain full-sequence coordinates.
        seen_positions: List of inclusive ``(start, end)`` intervals of SSRs
            accepted so far. Mutated in place: the interval of every newly
            accepted SSR is appended.

    Returns:
        List of ``(ssr, ssr_start)`` tuples, where ``ssr`` is the smallest
        rotation of the whole matched repeat tract (not just the motif) and
        ``ssr_start`` is the tract's start coordinate in the full sequence.
    """
    chunk_ssrs = []  # SSRs accepted from this chunk

    for match in re.finditer(ssr_pattern, sequence_chunk):
        # Inclusive coordinates of the matched tract in the full sequence.
        ssr_start = start_pos + match.start()
        ssr_end = start_pos + match.end() - 1

        # Two inclusive intervals overlap iff each one starts no later than the
        # other ends. Testing only whether the new endpoints fall inside a seen
        # interval would miss a new match that fully encloses an earlier one.
        overlaps_seen = any(
            ssr_start <= seen_end and seen_start <= ssr_end
            for seen_start, seen_end in seen_positions
        )
        if overlaps_seen:
            continue

        # Canonicalise only accepted matches; rejected ones need no rotation work.
        ssr = smallest_rotation(match.group(0))
        chunk_ssrs.append((ssr, ssr_start))
        seen_positions.append((ssr_start, ssr_end))

    return chunk_ssrs

# Function to find SSRs using a sliding window approach on the genome sequence
def sliding_window_ssr_finder(genome_sequence, chunk_size, step_size):
    """Scan a sequence for SSRs with overlapping windows.

    Windows of ``chunk_size`` bases start every ``step_size`` bases. A final
    window aligned to the end of the sequence is always included, so the tail
    is scanned even when the sequence length is not an exact fit, and a
    sequence shorter than ``chunk_size`` is scanned as a single window.

    Args:
        genome_sequence: The full sequence to scan.
        chunk_size: Number of bases per window; must be positive.
        step_size: Distance between consecutive window starts; must be positive.

    Returns:
        Dict mapping each distinct SSR string (as returned by
        ``process_chunk``) to the start position of its first accepted
        occurrence.

    Raises:
        ValueError: If ``chunk_size`` or ``step_size`` is not positive.

    Note:
        A repeat tract cut by the right edge of a window can be recorded in its
        truncated form, because the first accepted match at a position blocks
        later overlapping matches from subsequent windows.
    """
    if chunk_size <= 0 or step_size <= 0:
        raise ValueError("chunk_size and step_size must be positive")

    total_bases = len(genome_sequence)
    unique_ssrs = {}     # SSR string -> first accepted start position
    seen_positions = []  # Inclusive (start, end) intervals shared across windows

    if total_bases == 0:
        return unique_ssrs

    # Regular starts, plus one end-aligned window so the last bases are covered.
    last_start = max(total_bases - chunk_size, 0)
    window_starts = [*range(0, last_start, step_size), last_start]

    for start in tqdm(window_starts, desc="Finding SSRs"):
        sequence_chunk = genome_sequence[start:start + chunk_size]
        chunk_ssrs = process_chunk(sequence_chunk, start, seen_positions)

        for ssr, pos in chunk_ssrs:
            # Keep only the first position at which each SSR string was seen.
            unique_ssrs.setdefault(ssr, pos)

    return unique_ssrs

# Parameters for SSR search
chunk_size = 10000
step_size = 100

# Number of shuffled genomes to generate and scan.
# NOTE(review): the earlier comment said 100 iterations while the code ran 10;
# the code's value is kept here — raise it if 100 was the intended count.
n_iterations = 10

# Seeded generator so that re-running the notebook reproduces the same shuffles
shuffle_seed = 42
rng = np.random.default_rng(shuffle_seed)

# One record (iteration, SSR, position) per unique SSR found in each shuffled genome
ssr_details_per_iteration = []

# Combine all sequences into one large genome sequence
genome_sequence = ''.join(genome_data['Sequence'])

# Split into single bases once; every iteration permutes a copy of this array
genome_bases = np.array(list(genome_sequence))

# Shuffle the genome n_iterations times and collect the SSRs found in each shuffle
for iteration in range(n_iterations):
    print(f"Iteration {iteration + 1}")

    # Shuffle the genome sequence at the base level (permutation returns a copy)
    shuffled_genome_sequence = ''.join(rng.permutation(genome_bases))

    # Extract unique SSRs from the shuffled genome
    unique_ssrs = sliding_window_ssr_finder(shuffled_genome_sequence, chunk_size, step_size)

    # Number of unique SSRs in this iteration (holds the last iteration's count after the loop)
    total_ssrs = len(unique_ssrs)
    for ssr, pos in unique_ssrs.items():
        ssr_details_per_iteration.append({'Iteration': iteration + 1, 'SSR': ssr, 'Position': pos})

# Convert the per-iteration SSR records into a DataFrame (one row per SSR)
ssr_counts_df = pd.DataFrame(ssr_details_per_iteration)

In [None]:
# Destination file for the per-iteration SSR listing
output_file = 'output.txt'

# Write one line per record in 'ssr_details_per_iteration', formatted as
# "Iteration: <iteration>, SSR: <SSR>, Position: <position>"
with open(output_file, 'w') as f:
    f.writelines(
        f"Iteration: {record['Iteration']}, SSR: {record['SSR']}, Position: {record['Position']}\n"
        for record in ssr_details_per_iteration
    )

# Confirmation
print(f"SSR details for all iterations have been written to {output_file}")