In [None]:
import torch
import torch.nn as nn
import numpy as np
import itertools

# Build a batch of random RNA strings
def generate_sample_sequences(num_sequences, sequence_length):
    """Return a list of ``num_sequences`` random RNA strings.

    Each string has ``sequence_length`` characters, drawn with
    ``np.random.choice`` from the four bases A, U, C and G. The global
    NumPy random state is used, so seed it with ``np.random.seed`` for
    reproducible output.
    """
    alphabet = ['A', 'U', 'C', 'G']
    batch = []
    for _ in range(num_sequences):
        # One draw per sequence keeps the random stream consumption simple.
        drawn = np.random.choice(alphabet, sequence_length)
        batch.append(''.join(drawn))
    return batch

# Slide a width-3 window over a sequence
def get_3_nt_subsequences(sequence):
    """Return every overlapping 3-element slice of ``sequence``, in order.

    A sequence of length ``n`` yields ``n - 2`` slices; inputs shorter
    than three elements yield an empty list.
    """
    window = 3
    last_start = len(sequence) - window
    triplets = []
    for start in range(last_start + 1):
        triplets.append(sequence[start:start + window])
    return triplets

# Generate the sample sequences
sample_sequences = generate_sample_sequences(num_sequences=100, sequence_length=1001)

# Flatten the per-sequence 3-nt windows into one list of subsequences
subsequences = list(itertools.chain.from_iterable(get_3_nt_subsequences(seq) for seq in sample_sequences))

# Create a unique index for each subsequence. The vocabulary is sorted before
# numbering because the iteration order of a set of strings can differ between
# interpreter runs, which would silently reshuffle the embedding rows.
unique_subsequences = set(subsequences)
subseq_to_idx = {subseq: idx for idx, subseq in enumerate(sorted(unique_subsequences))}

# Convert sequences into lists of indices
sequences_of_indices = [[subseq_to_idx[subseq] for subseq in get_3_nt_subsequences(seq)] for seq in sample_sequences]

# Create the embedding layer
embedding_dim = 10  # Size of the embedding vector
embedding_layer = nn.Embedding(num_embeddings=len(unique_subsequences), embedding_dim=embedding_dim)

# Randomly initialize weights (for demonstration purposes; in practice they would be learned)
nn.init.uniform_(embedding_layer.weight, -0.1, 0.1)

# Convert the index lists to a tensor and pass them through the embedding layer.
# Every sample sequence has the same length, so the nested list is already
# rectangular and no separate padding step is required.
input_sequences = torch.tensor(sequences_of_indices, dtype=torch.long)
embeddings = embedding_layer(input_sequences)

# Check the shape of the embeddings: (sequences, windows per sequence, embedding_dim)
print(embeddings.shape)