## 1. Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sys
sys.path.insert(0, '..')

from src.features.sequence_features import (
    one_hot_encode, extract_kmers, nucleotide_composition, dinucleotide_features
)
from src.utils.data_loader import save_processed_data

## 2. Generate Sample Data

In [None]:
# Build a reproducible toy dataset: random DNA strings plus random targets.
np.random.seed(42)
n_samples = 100

# Draw the sequences first, one at a time, so the random stream is consumed
# in the same order on every run.
sequences = []
for sample_idx in range(n_samples):
    drawn_bases = np.random.choice(['A', 'C', 'G', 'T'], 100)
    sequences.append(''.join(drawn_bases))

# One uniformly distributed target value in [0, 10) per sequence.
expressions = np.random.uniform(0, 10, n_samples)

print(f"Number of samples: {n_samples}")
print(f"Example sequence: {sequences[0][:50]}...")

## 3. Feature Engineering Methods

In [None]:
# Method 1: nucleotide composition.
# Apply the project helper to every sequence and tabulate the results, one row
# per sequence; the printed shape shows how many features it yields.
comp_features = list(map(nucleotide_composition, sequences))
comp_df = pd.DataFrame(comp_features)

print(
    "Nucleotide Composition Features:",
    comp_df.head(),
    f"Shape: {comp_df.shape}",
    sep="\n",
)

In [None]:
# Method 2: dinucleotide features.
# Apply the project helper to every sequence and tabulate the results, one row
# per sequence; the printed shape shows how many features it yields.
dinuc_features = list(map(dinucleotide_features, sequences))
dinuc_df = pd.DataFrame(dinuc_features)

print(
    "Dinucleotide Features:",
    dinuc_df.head(),
    f"Shape: {dinuc_df.shape}",
    sep="\n",
)

In [None]:
# Method 3: K-mer Features (3-mers)
def extract_kmer_features(sequence, k=3):
    """Extract k-mer frequency features.

    Args:
        sequence: Sequence handed unchanged to ``extract_kmers``.
        k: K-mer length. One feature is emitted for each of the ``4**k``
            strings of length ``k`` over the alphabet ``ACGT``.

    Returns:
        A dict mapping ``'kmer_<KMER>'`` to that k-mer's count divided by the
        sum of all counts returned by ``extract_kmers``. Every value is ``0``
        when that sum is zero. Because the denominator covers every counted
        k-mer, the emitted frequencies sum to less than 1 if the counter
        contains k-mers outside the ``ACGT`` alphabet.
    """
    # A regular import replaces the inline ``__import__('itertools')`` call;
    # it is kept local so this notebook cell stays self-contained.
    from itertools import product

    kmer_counter = extract_kmers(sequence, k)
    total = sum(kmer_counter.values())

    features = {}
    for letters in product('ACGT', repeat=k):
        kmer = ''.join(letters)
        # ``.get`` tolerates k-mers that never occurred even when the counter
        # is a plain mapping rather than a ``collections.Counter``.
        count = kmer_counter.get(kmer, 0)
        features[f'kmer_{kmer}'] = count / total if total > 0 else 0
    return features

# Compute the 3-mer frequency dict of every sequence and tabulate them,
# one row per sequence.
kmer_features = []
for seq in sequences:
    kmer_features.append(extract_kmer_features(seq, k=3))
kmer_df = pd.DataFrame(kmer_features)

print(
    "K-mer (3-mer) Features:",
    kmer_df.head(),
    f"Shape: {kmer_df.shape}",
    sep="\n",
)

In [None]:
# Method 4: One-hot Encoding (flatten sequence)
def one_hot_flatten(sequence):
    """Return the one-hot encoding of ``sequence`` collapsed by ``flatten()``.

    The encoding itself comes from the project helper ``one_hot_encode``; this
    wrapper only flattens whatever that helper returns.
    """
    return one_hot_encode(sequence).flatten()

# Stack the flattened encodings of all sequences into a single array.
onehot_features = np.array(list(map(one_hot_flatten, sequences)))

print(f"One-hot Encoded Features Shape: {onehot_features.shape}")
print(f"(Each sequence is flattened to {onehot_features.shape[1]} features)")

## 4. Combine Features

In [None]:
# Place the composition and dinucleotide tables side by side, so each row
# carries both feature groups.
combined_features = pd.concat([comp_df, dinuc_df], axis='columns')

print(f"Combined Features Shape: {combined_features.shape}")
print(f"Feature names: {list(combined_features.columns)}")

## 5. Save Processed Features

In [None]:
# Hand the combined feature matrix and the expression targets to the project's
# persistence helper under the name 'sample_dataset'.
y = expressions
X = combined_features.to_numpy()

save_processed_data(X, y, 'sample_dataset')
print(f"Saved processed features with shape {X.shape}")
print(f"Target shape: {y.shape}")

## 6. Feature Statistics

In [None]:
# Summary statistics (as produced by DataFrame.describe) for every combined
# feature column.
print("Combined Features Statistics:")
feature_summary = combined_features.describe()
print(feature_summary)

In [None]:
# Histogram the first six combined features on a 3x2 grid of subplots.
fig, axes = plt.subplots(3, 2, figsize=(14, 10))
axes = axes.flatten()

# Pair each subplot with one column; zip stops early if fewer than six
# columns exist, leaving the remaining subplots empty.
first_columns = combined_features.columns[:6]
for ax, col in zip(axes, first_columns):
    ax.hist(combined_features[col], bins=30, edgecolor='black', alpha=0.7)
    ax.set_title(f'Distribution of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')

plt.tight_layout()
plt.show()