## 1. Import Required Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
sys.path.insert(0, '..')

from src.utils.data_loader import load_dataset, validate_sequences
from src.features.sequence_features import nucleotide_composition, dinucleotide_features

# Notebook-wide plot appearance: seaborn's 'whitegrid' theme, plus a wider
# default figure size for every figure that does not pass its own figsize.
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

## 2. Load and Explore Promoter Sequence Data

In [None]:
# Demonstration dataset.
# For a real analysis, replace the synthetic data below with your own CSV
# (expected columns: 'sequence' and 'expression'), e.g.:
# df = load_dataset('../data/raw/promoters.csv')

# Fixed seed so the synthetic data is identical on every run.
np.random.seed(42)
n_samples = 100

SEQ_LENGTH = 100            # bases per synthetic promoter
NUCLEOTIDES = list('ACGT')  # alphabet the random sequences are drawn from

# NOTE: the sequences must be drawn before the expression values; swapping
# the order would consume the random stream differently and change the data.
sequences = [
    ''.join(np.random.choice(NUCLEOTIDES, size=SEQ_LENGTH))
    for _ in range(n_samples)
]
expressions = np.random.uniform(0, 10, n_samples)

df = pd.DataFrame({'sequence': sequences, 'expression': expressions})

print(f"Dataset shape: {df.shape}")
print("\nFirst few samples:")
print(df.head())

In [None]:
# Summary statistics for the target variable and for sequence length.
# seq_lengths holds the character count of every entry in df['sequence'].
seq_lengths = df['sequence'].str.len()

for heading, values in (
    ("\nExpression Level Statistics:", df['expression']),
    ("\nSequence Length Statistics:", seq_lengths),
):
    print(heading)
    print(values.describe())

In [None]:
# Validate sequences
# validate_sequences returns a pair; judging by how it is used here, the first
# item is the collection of accepted sequences and the second the positions of
# rejected ones (TODO confirm against src.utils.data_loader).
valid_seqs, invalid_idx = validate_sequences(df['sequence'].values)
print(f"Valid sequences: {len(valid_seqs)}")
print(f"Invalid sequences: {len(invalid_idx)}")

# Test the length explicitly rather than relying on truthiness: the helper is
# fed a NumPy array, and if it also returns one, `if invalid_idx:` raises
# "truth value of an array ... is ambiguous" as soon as it holds more than one
# element. len() works for lists, tuples and arrays alike.
if len(invalid_idx) > 0:
    print(f"Invalid sequence indices: {invalid_idx}")

## 3. Sequence Composition Analysis

In [None]:
# Compute the composition features of every sequence and collect them in a
# DataFrame (one row per sequence, in the same order as df).
composition_features = list(map(nucleotide_composition, df['sequence']))
composition_df = pd.DataFrame(composition_features)

print("Nucleotide Composition Statistics:")
composition_summary = composition_df.describe()
print(composition_summary)

In [None]:
# One histogram per nucleotide frequency column, laid out on a 2x2 grid.
fig, axes = plt.subplots(2, 2, figsize=(12, 10))

# axes.flat walks the grid row by row, so the panels are filled in the order
# top-left, top-right, bottom-left, bottom-right.
for ax, col in zip(axes.flat, ['A_freq', 'C_freq', 'G_freq', 'T_freq']):
    ax.hist(composition_df[col], bins=30, edgecolor='black', alpha=0.7)
    ax.set(xlabel=col, ylabel='Frequency', title=f'Distribution of {col}')

fig.tight_layout()
plt.show()

In [None]:
# Histogram of the 'GC_content' column, followed by its mean and standard
# deviation.
gc_content = composition_df['GC_content']

gc_fig, gc_ax = plt.subplots(figsize=(10, 6))
gc_ax.hist(gc_content, bins=30, edgecolor='black', alpha=0.7)
gc_ax.set_xlabel('GC Content')
gc_ax.set_ylabel('Frequency')
gc_ax.set_title('Distribution of GC Content in Promoter Sequences')
plt.show()

print(f"Mean GC Content: {gc_content.mean():.3f}")
print(f"Std GC Content: {gc_content.std():.3f}")

## 4. Gene Expression Analysis

In [None]:
# Histogram of the target variable (the 'expression' column of df).
expr_fig, expr_ax = plt.subplots(figsize=(10, 6))
expr_ax.hist(df['expression'], bins=30, edgecolor='black', alpha=0.7)
expr_ax.set(
    xlabel='Expression Level',
    ylabel='Frequency',
    title='Distribution of Gene Expression Levels',
)
plt.show()

## 5. Correlation Analysis

In [None]:
# Attach the expression values to a copy of the composition features.
# .values strips the index, so the column is aligned by position rather than
# by index label.
features_with_expression = composition_df.assign(
    expression=df['expression'].values
)

# Correlation of every feature with expression, sorted ascending; the
# trivial self-correlation entry is dropped.
correlation_matrix = features_with_expression.corr()
expression_corr = correlation_matrix['expression']
correlations = expression_corr.drop('expression').sort_values()

print("Correlation with Gene Expression:")
print(correlations)

In [None]:
# Horizontal bar chart of the feature/expression correlations, drawn in the
# order in which `correlations` is sorted.
corr_fig, corr_ax = plt.subplots(figsize=(10, 6))
correlations.plot.barh(ax=corr_ax)
corr_ax.set_xlabel('Correlation Coefficient')
corr_ax.set_title('Correlation of Sequence Features with Gene Expression')
corr_fig.tight_layout()
plt.show()

In [None]:
# Scatter plots of expression against the (up to) four features with the
# largest absolute correlation.
top_features = correlations.abs().nlargest(4).index

fig, axes = plt.subplots(2, 2, figsize=(12, 10))

# Panels are filled row by row; if fewer than four features exist, the
# remaining panels are simply left empty.
for ax, feature in zip(axes.flat, top_features):
    corr = correlations[feature]  # signed value, shown in the panel title
    ax.scatter(features_with_expression[feature], df['expression'], alpha=0.6)
    ax.set(
        xlabel=feature,
        ylabel='Expression Level',
        title=f'{feature} vs Expression (r={corr:.3f})',
    )

fig.tight_layout()
plt.show()