# Generate n-gram data (trigram model)

In [2]:
import numpy as np

def create_transition_tensor(alphabet_size):
    """
    Build a random transition tensor for a trigram model.

    Every entry is drawn uniformly from [0, 1) and each slice along the
    last axis is then rescaled so that it sums to 1.

    Args:
    alphabet_size (int): Number of distinct tokens in the alphabet (S).

    Returns:
    np.ndarray: Array of shape (alphabet_size, alphabet_size, alphabet_size)
        whose [i, j, :] slices are probability distributions.
    """
    shape = (alphabet_size,) * 3
    weights = np.random.rand(*shape)
    # Dividing by the last-axis totals turns each [i, j, :] slice into a distribution.
    totals = weights.sum(axis=-1, keepdims=True)
    return weights / totals

def generate_sequence_from_trigram_model(transition_tensor, sequence_length):
    """
    Generate a sequence using a trigram model.

    The first two tokens are drawn uniformly at random; every later token is
    sampled from the distribution ``transition_tensor[x[t-2], x[t-1]]``.

    Args:
    transition_tensor (np.ndarray): Transition tensor of shape (alphabet_size, alphabet_size, alphabet_size).
    sequence_length (int): Length of the sequence to generate (T). Lengths 0
        and 1 are supported and yield an empty or single-token sequence.

    Returns:
    np.ndarray: Generated integer sequence of length sequence_length.
    """
    alphabet_size = transition_tensor.shape[0]
    sequence = np.zeros(sequence_length, dtype=int)

    # Seed with up to two uniformly random tokens. Capping the count at
    # sequence_length keeps lengths 0 and 1 from indexing past the array end.
    for t in range(min(2, sequence_length)):
        sequence[t] = np.random.randint(alphabet_size)

    # Each remaining token is conditioned on the two tokens preceding it.
    for t in range(2, sequence_length):
        next_token_probs = transition_tensor[sequence[t - 2], sequence[t - 1]]
        sequence[t] = np.random.choice(alphabet_size, p=next_token_probs)

    return sequence

def generate_sequences_from_trigram_model(alphabet_size, sequence_length, num_sequences):
    """
    Generate multiple sequences using a trigram model.

    A single random transition tensor is created and shared by all
    generated sequences.

    Args:
    alphabet_size (int): Size of the alphabet (S).
    sequence_length (int): Length of each sequence (T).
    num_sequences (int): Number of sequences to generate.

    Returns:
    np.ndarray: Integer array of shape (num_sequences, sequence_length)
        containing the generated sequences. The shape holds even when
        num_sequences is 0.

    Raises:
    ValueError: If num_sequences or sequence_length is negative (raised by
        NumPy when allocating the output array).
    """
    transition_tensor = create_transition_tensor(alphabet_size)

    # Preallocate so the result always has the documented 2-D integer shape;
    # stacking an empty list would give a 1-D float array instead.
    sequences = np.empty((num_sequences, sequence_length), dtype=int)
    for row in range(num_sequences):
        sequences[row] = generate_sequence_from_trigram_model(transition_tensor, sequence_length)
    return sequences

# Experiment configuration
S = 5                 # number of distinct tokens in the alphabet
T = 25                # tokens per sequence
num_sequences = 1000  # how many sequences to draw

# Draw all sequences from one randomly initialised trigram model
sequences = generate_sequences_from_trigram_model(
    alphabet_size=S,
    sequence_length=T,
    num_sequences=num_sequences,
)
# Show only the first 5 rows as a quick sanity check
print("Generated Sequences:", sequences[:5], sep="\n")


Generated Sequences:
[[3 4 4 3 1 2 3 3 2 1 3 0 1 1 4 0 4 0 0 1 4 0 4 2 2]
 [3 2 2 1 2 3 3 1 0 4 3 1 1 4 4 1 0 0 3 3 0 2 3 2 0]
 [0 4 1 0 1 0 3 2 4 1 1 0 3 4 4 2 0 2 0 1 1 4 1 0 1]
 [3 3 1 1 1 0 3 4 4 4 1 0 4 1 1 3 2 2 0 1 3 3 0 0 3]
 [4 0 2 0 4 1 1 1 1 0 1 1 3 4 1 1 3 2 2 1 2 1 0 3 4]]


In [3]:
# Persist the generated sequences to disk in NumPy's .npy format
np.save(file="sequences.npy", arr=sequences)
