Hello user! Here is a little guide on how to use this code:

To create a sequence that only has within-sequence Markovian dependencies, run the first section. If you instead have an input that you want to have a response to, run the second section. You probably don't need to run the third section, it's old and I kept it only as a reference, just in case.

1. Make sure this notebook and the PrettyDat.csv file are in the same folder.
 
2. Whenever you are restarting this notebook, run the 'GLOBAL' cell at the top of this document to install the dependencies. 

3. The first time you are running this code, you will have to run the 'GENERATE CORPORA' cell make the corpus text files for each species from the PrettyDat.csv file that accompanied this notebook. This cell will save the .txt files to the same folder that this notebook and the PrettyDat.csv file are in.

4. **Only edit the User Input section of the next cell.** I've added some annotations there to help you know what each variable means.

5. Run the cell! That's it, easy peasy :) The generated output will be printed below the chunk.

-Ananya

## 2nd Order Markov Generation for Single Fish, no interaction

In [1]:
######## GLOBAL ########
import random
import pandas as pd
from collections import defaultdict, Counter

In [None]:
########## GENERATE CORPORA ############
PrettyDat = pd.read_csv('PrettyDat.csv')

def create_corpus_by_species(df, by_length=True):
    """
    Create dom/sub/full corpora split by species.
    
    Args:
        df: dataframe with columns ['species','trial_id','sequence_id','fish_id','order_id','state','length']
        out_prefix: prefix for output files
        by_length: whether to expand states by 'length'
    """
    # Loop over each species
    for species, species_df in df.groupby('species'):
        dom_filename  = f"corpus_{species}_dom.txt"
        sub_filename  = f"corpus_{species}_sub.txt"
        full_filename = f"corpus_{species}_full.txt"

        grouped = species_df.groupby(['trial_id', 'sequence_id'])

        with open(dom_filename, 'w') as f, open(sub_filename, 'w') as g, open(full_filename, 'w') as h:
            for (trial_id, seq_id), group in grouped:
                # Process dom then sub
                for idx, fish_role in enumerate(['dom', 'sub']):
                    role_group = group[group['fish_id'] == fish_role].sort_values('order_id')
                    sequence_parts = []
                    for _, row in role_group.iterrows():
                        # Repeat state by_length
                        if by_length:
                            chars = [str(row['state'])] * int(row['length'])
                        else:
                            chars = [str(row['state'])]

                        sequence_parts.append(''.join(chars))

                    sequence_line = "<s>" + ''.join(sequence_parts) + '</s>\n'

                    if idx == 0:
                        f.write(sequence_line)
                    else:
                        g.write(sequence_line)

                    h.write(sequence_line)

#create corpora by species and role, save to same folder
create_corpus_by_species(PrettyDat, by_length=True)


In [2]:
############## USER INPUT #####################
species = 'bre' #choose from 'bre', 'mil', 'mul', 'oce', 'orn', 'pul'
type = 'dom' #choose what corpus to reference for conditionals: 'dom' (dom only corpus), 'sub' (sub only corpus), 'full' (dom and sub corpora combined)
start_pair = ('a', 'e') #choose starting pair of states for generation
steps = 30 #how many state transitions to generate
############ NOW JUST RUN :) ##################

### helper functions: ###
def read_lines(filename):
    """Read corpus by line"""
    lines = []
    with open(filename) as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            # Remove <s> and </s>
            line = line.replace("<s>", "").replace("</s>", "").strip()
            seq = list(line)
            lines.append(seq)
    return lines

def find_length_dists(lines):
    """
    For each state, find distribution of lengths.
    Returns:
      - length_dists: dict[state][length] = probabilities
      - df_long: tidy DataFrame with columns [state, length, prob], for reference
    """
    lengths = defaultdict(Counter)
    
    for seq in lines:
        run_char, run_len = seq[0], 1
        for s in seq[1:]:
            if s == run_char:
                run_len += 1
            else:
                lengths[run_char][run_len] += 1
                run_char, run_len = s, 1
        lengths[run_char][run_len] += 1 
    
    # normalize
    length_dists = {}
    tidy_rows = []
    for state, counter in lengths.items():
        total = sum(counter.values())
        length_dists[state] = {l: c / total for l, c in counter.items()}
        for l, c in counter.items():
            tidy_rows.append({"state": state, "length": l, "prob": c / total})
    
    #make a handy table for reference 
    df_long = pd.DataFrame(tidy_rows)

    return length_dists, df_long

def collapse_length(seq):
    """Collapse length to find 2nd order Markov dependencies."""
    collapsed = []
    for s in seq:
        if not collapsed or collapsed[-1] != s:
            collapsed.append(s)
    return collapsed

def find_transitions(lines):
    """
    Build transitions based on the last two unique states.
    Returns dict: (prev_state, last_state) -> {next_state: prob}
    """
    transitions = defaultdict(Counter)
    
    for seq in lines:
        collapsed = collapse_length(seq)
        for t in range(2, len(collapsed)):
            prev2, prev1, curr = collapsed[t-2], collapsed[t-1], collapsed[t]
            transitions[(prev2, prev1)][curr] += 1
    
    # Normalize
    normalized = {}
    for cond, counter in transitions.items():
        total = sum(counter.values())
        normalized[cond] = {s: c / total for s, c in counter.items()}
    return normalized

def sample_from_dist(dist):
    """Sample from distribution, weighted by probability"""
    states, probs = zip(*dist.items())
    return random.choices(states, weights=probs, k=1)[0]

def sim_2nd_ord_markov(trans, length_dists, start_pair, steps):
    """
    Simulate sequence with unique-state transitions AND sampled lengths.
    start_pair: tuple (state1, state2)
    Returns expanded sequence with durations.
    """
    seq_states = [start_pair[0], start_pair[1]]
    expanded_seq = []
    
    # expand start_pair with sampled lengths
    for state in seq_states:
        if state in length_dists:
            dur = sample_from_dist(length_dists[state])
            expanded_seq.extend([state] * dur)
        else:
            expanded_seq.append(state)
    
    for t in range(2, steps):
        cond = (seq_states[-2], seq_states[-1])
        if cond in trans:
            next_state = sample_from_dist(trans[cond])
        else:
            next_state = random.choice(list({s for _, v in trans.items() for s in v}))
        
        seq_states.append(next_state)
        
        # expand using sampled length
        if next_state in length_dists:
            dur = sample_from_dist(length_dists[next_state])
            expanded_seq.extend([next_state] * dur)
        else:
            expanded_seq.append(next_state)
    
    return expanded_seq

### generation: ###
lines = read_lines(f"corpus_{species}_{type}.txt") #read corpus

trans = find_transitions(lines) #record two-state transitions

length_dists, df_long = find_length_dists(lines) #record length distributions per state

# generate! :)
sim_seq = sim_2nd_ord_markov(trans, length_dists, start_pair = start_pair, steps = steps)
print("Generated:", "".join(sim_seq))


Generated: aaaaaaaaaaaaaaaeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeaaaaaaaaaaaaaaaaaaeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeaaeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeaaaaaaaaaaaaaaaaaaaaeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeaaaaaaaaaaaaaaaaaa

## A little experiment: 2 state markovian generation within random markovian environment

**Let D be a dominant sequence in one trial, and S be the subordinate sequence in the same trial. D and S are random variables with Markov chain properties:**

$$P(d_t|d_1,...,d_{t-1}) = P(d_t | d_{t-1})$$

$$P(s_t|s_1,...,s_{t-1}) = P(s_t | s_{t-1})$$

**But D and S are also somehow dependent on each other, since there is feedback between fish during communication:**

Let: 

$r_1$ = the dominant's reaction time

$r_2$ = the subordinate's reaction time

$$P(d_t|d_1,...,d_{t-1}) = P(d_t|d_{t-1}, s_{t-r_{1}})$$

$$P(s_t|s_1,...,s_{t-1}) = P(s_t|s_{t-1}, d_{t-r_{2}})$$

**Furthermore, from my R analyses there seems to be evidence of higher-order Markov dependencies within D and S separately (2nd-order at least)**

Let:

D = dominant sequence with length

S = subordinate sequence with length

D' = dominant sequence collapsed by length

S' = subordinate sequence collapsed by length

$r_1$ = the dominant's reaction time

$r_2$ = the subordinate's reaction time

$$P(d_t|d_1,...,d_{t-1}) = P(d_t|d'_{t-1}, d'_{t-2}, s_{t-r_{1}})$$

$$P(s_t|s_1,...,s_{t-1}) = P(s_t|s'_{t-1}, s'_{t-2}, d_{t-r_{2}})$$



**Thoughts:**

While the physical postures of fish are quite likely to be Markovian, the *meaning* behind postures may not necessarily be structured in a Markovian way. This means that the 'responses' between fish may not be Markov-related...Therefore, it is not so valid to assume that the relationship between D and S is Markovian, even if they are separately Markov chains.... This makes the assumption that their reactions to each other are sequential

but what else to do?

Have set both fish's reaction times to 1 frame, around 1ms which is what most research seems to say (although it is a bit unclear... I would guess they are actually a bit faster than this, but this is the highest resolution that we can do since the camera is only 60fps)


- maybe to do: look at number of combinations in 1st, 2nd, 3rd, etc. order markovs--if there is underlying higher structure/motifs, the number of transitions should plateau at some point

In [4]:
############## USER INPUT #####################
species = 'mul' #choose from 'bre', 'mil', 'mul', 'oce', 'orn', 'pul'
sub_seq = 'ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggffffffffbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbcccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccgggggggggggggghhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhccccccccccccccccccccccccccggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccgggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggghhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhkkkkkkkkkkkkkkkkkkkkkkkkkkcccccccccccccddddddddddddddddddddddddddddddddddddddddddddddddjjjjjjjjjjjjjjjjjjjjjjaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeaaaaaaaaeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeaaaaaaaaaaaaaaaaaeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee'
start_pair = ('a', 'e') #choose starting pair of states for generation
steps = 20 #how many states to generate (for coupled markov only)
############ NOW JUST RUN :) ##################

### helper functions: ###
#read lines from both corpora
def load_both_lines(dom_file, sub_file):
    """Read dom/sub corpora as aligned lists of sequences (per line)."""

    dom_lines = []
    sub_lines = []
    with open(dom_file) as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            # Remove <s> and </s>
            line = line.replace("<s>", "").replace("</s>", "").strip()
            seq = list(line)
            dom_lines.append(seq)
        #dom_lines = [line.strip().replace("<s>", "").replace("</s>", "").split() for line in f]
    with open(sub_file) as g:
        for line in g:
            line = line.strip()
            if not line:
                continue
            # Remove <s> and </s>
            line = line.replace("<s>", "").replace("</s>", "").strip()
            seq = list(line)
            sub_lines.append(seq)
        #sub_lines = [line.strip().replace("<s>", "").replace("</s>", "").split() for line in g]
    
    # Ensure same number of sequences
    assert len(dom_lines) == len(sub_lines), "mismatched sequence numbers :("
    return dom_lines, sub_lines

def find_length_dists(lines):
    """
    For each state, find distribution of lengths.
    Returns:
      - length_dists: dict[state][length] = probabilities
      - df_long: tidy DataFrame with columns [state, length, prob], for reference
    """
    lengths = defaultdict(Counter)
    
    for seq in lines:
        run_char, run_len = seq[0], 1
        for s in seq[1:]:
            if s == run_char:
                run_len += 1
            else:
                lengths[run_char][run_len] += 1
                run_char, run_len = s, 1
        lengths[run_char][run_len] += 1 
    
    # normalize
    length_dists = {}
    tidy_rows = []
    for state, counter in lengths.items():
        total = sum(counter.values())
        length_dists[state] = {l: c / total for l, c in counter.items()}
        for l, c in counter.items():
            tidy_rows.append({"state": state, "length": l, "prob": c / total})
    
    # make a handy table for reference 
    df_long = pd.DataFrame(tidy_rows)

    return length_dists, df_long

def collapse_length(seq):
    """Collapse length to find 2nd order Markov dependencies of d'/s' sequences."""
    collapsed = []
    for s in seq:
        if not collapsed or collapsed[-1] != s:
            collapsed.append(s)
    return collapsed

def coupled_transitions(dom_lines, sub_lines):
    """
    Build coupled 2nd-order Markov transitions:
      Dom: P(d_t | d'_{t-2}, d'_{t-1}, s_{t-1})
      Sub: P(s_t | s'_{t-2}, s'_{t-1}, d_{t-1})
    """

    dom_transitions = defaultdict(Counter)
    sub_transitions = defaultdict(Counter)
    
    for dom_seq, sub_seq in zip(dom_lines, sub_lines):
        # Collapse runs for state-based dependencies
        collapsed_dom = collapse_length(dom_seq)
        collapsed_sub = collapse_length(sub_seq)

        # need to align collapsed indices with token indices
        # Build maps from collapsed index → token index (last token of each run)
        dom_run_ends = []
        sub_run_ends = []
        
        # indices where each collapsed run ends
        run_char = dom_seq[0]
        for i, s in enumerate(dom_seq[1:], start=1):
            if s != run_char:
                dom_run_ends.append(i-1)
                run_char = s
        dom_run_ends.append(len(dom_seq)-1)
        
        run_char = sub_seq[0]
        for i, s in enumerate(sub_seq[1:], start=1):
            if s != run_char:
                sub_run_ends.append(i-1)
                run_char = s
        sub_run_ends.append(len(sub_seq)-1)
        
        L = min(len(collapsed_dom), len(collapsed_sub))
        
        # Iterate through collapsed states
        for t in range(2, L):
            # dom conditional
            cond_dom = (
                collapsed_dom[t-2],           # d'_{t-2}
                collapsed_dom[t-1],           # d'_{t-1}
                sub_seq[dom_run_ends[t-1]]    # s_{t-1} = last token of that sub run
            )
            dom_next = collapsed_dom[t]
            dom_transitions[cond_dom][dom_next] += 1

            # sub conditional
            cond_sub = (
                collapsed_sub[t-2],           # s'_{t-2}
                collapsed_sub[t-1],           # s'_{t-1}
                dom_seq[sub_run_ends[t-1]]    # d_{t-1} = last token of that dom run
            )
            sub_next = collapsed_sub[t]
            sub_transitions[cond_sub][sub_next] += 1
    
    # Normalize into probabilities
    def normalize(trans):
        return {
            cond: {k: v / sum(c.values()) for k, v in c.items()} 
            for cond, c in trans.items()
        }
    
    return normalize(dom_transitions), normalize(sub_transitions)

def sample_from_dist(dist):
    states, probs = zip(*dist.items())
    return random.choices(states, weights=probs, k=1)[0]

def simulate_coupled_markov(dom_trans, sub_trans,
                            dom_length_dists, sub_length_dists,
                            start_pair, steps=20):
    """
    Jointly simulate Dom and Sub with:
      Dom ~ P(d_t | d'_{t-2}, d'_{t-1}, s_{t-1})
      Sub ~ P(s_t | s'_{t-2}, s'_{t-1}, d_{t-1})
    """

    # Track collapsed states
    dom_states = [start_pair[0], start_pair[1]]
    sub_states = [start_pair[0], start_pair[1]]

    # Track expanded tokens
    dom_expanded = []
    sub_expanded = []

    # Expand initial states
    for state in dom_states:
        dur = sample_from_dist(dom_length_dists[state])
        dom_expanded.extend([state] * dur)
    for state in sub_states:
        dur = sample_from_dist(sub_length_dists[state])
        sub_expanded.extend([state] * dur)

    # Generate subsequent states
    for t in range(2, steps):
        # dom update
        sub_prev_token = sub_expanded[-1]  # last sub *token*
        cond_dom = (dom_states[-2], dom_states[-1], sub_prev_token)
        if cond_dom in dom_trans:
            dom_next = sample_from_dist(dom_trans[cond_dom])
        else:
            dom_next = random.choice(list({s for dist in dom_trans.values() for s in dist}))
        dom_states.append(dom_next)

        dur = sample_from_dist(dom_length_dists[dom_next])
        dom_expanded.extend([dom_next] * dur)

        #sub update
        dom_prev_token = dom_expanded[-1]  # last dom *token*
        cond_sub = (sub_states[-2], sub_states[-1], dom_prev_token)
        if cond_sub in sub_trans:
            sub_next = sample_from_dist(sub_trans[cond_sub])
        else:
            sub_next = random.choice(list({s for dist in sub_trans.values() for s in dist}))
        sub_states.append(sub_next)

        dur = sample_from_dist(sub_length_dists[sub_next])
        sub_expanded.extend([sub_next] * dur)

    return dom_expanded, sub_expanded

def generate_dom_given_sub(sub_seq, dom_trans, dom_length_dists, start_states):
    """
    Generate Dom sequence conditioned on full Sub sequence.
    Uses P(d_t | d'_{t-2}, d'_{t-1}, s_{t-1}).
    """
    dom_states = [start_states[0], start_states[1]] 
    dom_expanded = []

    # Expand initial states
    for state in dom_states:
        dur = sample_from_dist(dom_length_dists[state])
        dom_expanded.extend([state] * dur)

    for t in range(1, len(sub_seq)):
        sub_prev_token = sub_seq[t-1]

        cond = (dom_states[-2], dom_states[-1], sub_prev_token)
        if cond in dom_trans:
            dom_next = sample_from_dist(dom_trans[cond])
        else:
            dom_next = random.choice(list({s for dist in dom_trans.values() for s in dist}))
        dom_states.append(dom_next)

        dur = sample_from_dist(dom_length_dists[dom_next])  
        dom_expanded.extend([dom_next] * dur)

        if len(dom_expanded) >= len(sub_seq):
            dom_expanded = dom_expanded[:len(sub_seq)]
            break

    return dom_expanded

### generation: ###
dom_lines, sub_lines = load_both_lines(f"corpus_{species}_dom.txt", f"corpus_{species}_sub.txt")

dom_trans, sub_trans = coupled_transitions(dom_lines, sub_lines)

dom_length_dists, dom_df_long = find_length_dists(dom_lines)
sub_length_dists, sub_df_long = find_length_dists(sub_lines)

#coupled markov (both fish jointly generated, basically the computer talking to itself)
#you probably won't need to use this
#dom_seq, sub_seq = simulate_coupled_markov(dom_trans, sub_trans,
#                            dom_length_dists, sub_length_dists,
#                            start_pair, steps= steps)

#print("Dom:", "".join(dom_seq))
#print("Sub:", "".join(sub_seq))

#response to input sequence (one fish response generated to given input sequence)
dom_seq = generate_dom_given_sub(sub_seq, dom_trans, dom_length_dists, start_pair)

print("Generated:", "".join(dom_seq))


Generated: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeaaeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeehhhhhhhhhheeeeeeeeeeeeeecccccccccccccccccccddddddddddddddddddddddddddddddddddddddddddddddddddddddeeeeeeeeeeeeeeeeeeeeeeeeeeeaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaccccchhhhhhcccchhhhhhhcccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccckkkhhhcccccccccccccccccccccccccccchhhhhhccccceeeeeeeeeeehhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh

## Old, Don't Use: 1st-Order Markov Generation for Single Fish

In [8]:

def load_single_corpus_lines(filename):
    """Read dom/sub corpora as aligned lists of sequences (per line)."""
    lines = []
    with open(filename) as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            # Remove <s> and </s>
            line = line.replace("<s>", "").replace("</s>", "").strip()
            # Treat each character as a state
            seq = list(line)
            if len(seq) >= 3:  # must have at least 3 for transitions
                lines.append(seq)
    return lines

def single_transitions(lines):
    transitions = defaultdict(Counter)
    
    for seq in lines:
        L = len(seq)        
        
        for t in range(1, L-1):
            transitions[seq[t]][seq[t+1]] += 1
    
    # normalize
    def normalize(trans):
        return {cond: {k: v/sum(c.values()) for k,v in c.items()} 
                for cond,c in trans.items()}
    
    return normalize(transitions)


def sample_from_distribution(dist):
    states, probs = zip(*dist.items())
    return random.choices(states, weights=probs, k=1)[0]

def simulate_single(trans, start_state, steps=500):
    sim_seq = [start_state]
    
    for t in range(1, steps):
        # Dom depends on dom[t-1], sub[t-1]
        cond = sim_seq[-1]
        if cond in trans:
            next = sample_from_distribution(trans[cond])
        else:
            next = random.choice(list({k[0] for k in trans}))
        sim_seq.append(next)
    
    return sim_seq

#load species
lines = load_single_corpus_lines("corpus_mul_dom.txt")

# Build transition models
trans = single_transitions(lines)

#stating state for each fish
start_state = 'e'

#generate 1000 states
test_seq= simulate_single(trans, start_state, steps=1000)
print("".join(test_seq))




eeeeeeehccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccchhhhhhhhhhhhhhhhhhhkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkcccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccdddddddcccccccccccccccccccccccddddddddddddddddddddddddddfffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffgggggggggggggggccccccckkkkkkkkkkkkkkkkkkkkkkkkkkccccccccccccccccccccccccccccccceeeeeeeeeeeeeeeeeeeaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaeeeacaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaacccccccccccccaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeecccccccccccccccccccccccccccccccccccccccccccccccccccckkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkccccccccccccccccccaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaacccccc