In [2]:
import polars as pl


# Fraction of rows intended for the training split. Named in UPPER_SNAKE_CASE
# rather than `train_test_split` so it cannot be confused with (or clobbered by)
# scikit-learn's `train_test_split` function, which this notebook imports later.
TRAIN_FRACTION = 0.8

# Load the raw CSV data; later code reads its "sequence" and "expression" columns.
df = pl.read_csv("dream_train.csv")

# Longest sequence in the dataset, used as the common padded width.
# `len_bytes` counts bytes, which equals the character count only for
# single-byte (ASCII) text -- assumed here for nucleotide strings; TODO confirm.
max_length = df.select(pl.col("sequence").str.len_bytes().max()).item()

print(max_length)

# Centre a sequence inside a fixed-width window of 'N' filler characters
def pad_sequence(seq: str, max_len: int) -> str:
    """Return ``seq`` flanked by 'N' characters so it is ``max_len`` long.

    The filler is divided between both ends; when the amount is odd, the
    extra 'N' goes on the left. A sequence already at or beyond ``max_len``
    comes back unchanged (nothing is truncated).

    Args:
        seq: Sequence to pad.
        max_len: Target length of the padded result.

    Returns:
        The padded sequence.
    """
    shortfall = max_len - len(seq)
    # divmod floors, so right_n is the smaller half and `odd` (0 or 1) is the
    # leftover character that tips the balance to the left.
    right_n, odd = divmod(shortfall, 2)
    left_n = right_n + odd
    # Multiplying a string by a non-positive count yields "", which is why
    # over-long inputs pass through untouched.
    return "N" * left_n + seq + "N" * right_n

# Pad every sequence to the common width. `map_elements` calls the Python
# helper once per row (this is a Python UDF, not a native Polars expression),
# so the output dtype is declared explicitly instead of being left to
# inference.
df = df.select(
    pl.col("sequence")
    .map_elements(lambda s: pad_sequence(s, max_length), return_dtype=pl.Utf8)
    .alias("sequence"),
    pl.col("expression"),
)

142




In [4]:
# Spot-check the first row's sequence; as the cell's last expression it is
# shown as the cell output.
df.get_column("sequence")[0]

'NNNNNNNNNNNNNNNNTGCATTTTTTTCACATCTCTTTGCCACGGGGTGAAGGATAGGATGGTATCCCCCCAGGCGAAGGACATCTGTGGGGATGGTTAGGTCAGGTGATATCGGTTACGGCTGTTNNNNNNNNNNNNNNNN'

In [7]:
from sklearn.model_selection import train_test_split

# Split settings, named here so the 80/20 ratio and the seed are stated once.
TEST_FRACTION = 0.2  # share of rows held out for testing
SPLIT_SEED = 42      # fixed seed so the shuffle is reproducible

# Convert each column to a NumPy array for the scikit-learn split.
X = df.get_column("sequence").to_numpy()
y = df.get_column("expression").to_numpy()

# Shuffle and partition features and targets with the same row permutation.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_FRACTION,
    random_state=SPLIT_SEED,
)

# Sanity check: the two partitions together must account for every input row.
assert len(X_train) + len(X_test) == len(X), (
    "train/test partition sizes do not add up to the input size"
)

# Rebuild Polars DataFrames with the original column names.
train_df = pl.DataFrame({"sequence": X_train, "expression": y_train})
test_df = pl.DataFrame({"sequence": X_test, "expression": y_test})

# Persist both partitions (paths are relative to the working directory).
train_df.write_csv("train.csv")
test_df.write_csv("test.csv")