# DNA Base Sequence Content Analysis

This notebook demonstrates how to analyze and visualize the per-position distribution of nucleotide bases in DNA sequence data using Polars, `polars_bio`, and matplotlib.

## 0. Setup Environment

Before we begin our analysis, we should clean up any temporary files created by DataFusion/Apache Arrow from previous runs. These temporary catalogs can accumulate over time as they are created each time we run queries.

In [None]:
import os
import shutil

def cleanTMP():
    """Delete the ``tmp`` entry in the current working directory, if present.

    The path is resolved against ``os.getcwd()`` at call time. Removal is
    best-effort: ``shutil.rmtree`` is called with ``ignore_errors=True``, so
    the completion message is printed even if some entries could not be
    deleted. Status messages are written to stdout.

    Returns:
        None.
    """
    tmp_path = os.path.join(os.getcwd(), 'tmp')

    # Guard clause: nothing on disk means there is nothing to clean.
    if not os.path.exists(tmp_path):
        print("Folder tymczasowy nie istnieje - brak potrzeby czyszczenia.")
        return

    print(f"Usuwanie folderu tymczasowego: {tmp_path}")
    shutil.rmtree(tmp_path, ignore_errors=True)
    print("Folder tymczasowy usunięty.")

# Start from a clean slate: drop any ./tmp left behind by an earlier run.
cleanTMP()

## 1. Create Sample DNA Sequence Data

Let's import the necessary libraries for our analysis.

In [None]:
import polars as pl
import matplotlib.pyplot as plt
from polars_bio.quality_control_op import base_sequence_content
from polars_bio.quality_control_viz import plot_base_content

# Set matplotlib style for better visualizations
plt.style.use('ggplot')
# IPython line magic (not plain Python): render figures inline in the cell output.
%matplotlib inline

Now we'll create a simple example dataset with DNA sequences.

In [None]:
# Toy input: four reads of four bases each, stored in a single "sequence" column.
short_sequences = pl.DataFrame({
    "sequence": ["ATGC", "AAGC", "ATTC", "GTCC"]
})

## 2. Analyze Base Sequence Content

Now we'll use the `base_sequence_content` function to analyze the distribution of bases at each position.

In [None]:
# Run the polars_bio base-content analysis on the toy reads.
result = base_sequence_content(short_sequences)
# Leave the result as the cell's last expression so Jupyter renders it with its
# rich display (presumably a Polars DataFrame table — confirm), consistent with
# how the larger result is shown later, instead of as printed plain text.
result

## 3. Visualize Base Distribution

Let's visualize the distribution of bases at each position using our custom plotting function.

In [None]:
# figsize is handed to the polars_bio helper — presumably a matplotlib (width, height) in inches; confirm.
plot_base_content(result, figsize=(10, 6))

## 4. Creating More Realistic Data

Generate a more realistic dataset with longer sequences to better visualize base content distribution.

In [None]:
import random

def generate_dna(length, n_freq=0.05):
    """Build a random string of ``length`` characters over A/C/G/T plus 'N'.

    Every position is drawn independently from the module-level ``random``
    generator: with probability ``n_freq`` the position becomes ``'N'``;
    otherwise one of ``'A'``, ``'C'``, ``'G'``, ``'T'`` is picked uniformly.

    Args:
        length: Number of characters to generate.
        n_freq: Probability that a position is ``'N'`` (default 0.05).

    Returns:
        str: The generated sequence.
    """
    alphabet = ('A', 'C', 'G', 'T')
    # The threshold draw always happens first and a base is drawn only when the
    # position is not 'N', so the global RNG is consumed in a fixed order.
    return ''.join(
        'N' if random.random() < n_freq else random.choice(alphabet)
        for _ in range(length)
    )

# Seed Python's global RNG before generating so the synthetic reads are reproducible.
random.seed(42)
num_sequences = 100  # number of synthetic reads
seq_length = 100     # characters per read

sequences = [generate_dna(seq_length) for _ in range(num_sequences)]
df_sequences = pl.DataFrame({"sequence": sequences})

# Preview the first rows of the synthetic dataset.
df_sequences.head()

Analyze base content on our larger dataset

In [None]:
# Same analysis as for the toy reads, now applied to the 100 synthetic sequences.
result_large = base_sequence_content(df_sequences)
# Show only the first rows rather than the whole table.
result_large.head()

Plot the base content distribution for our larger dataset

In [None]:
# Plot the base-content result computed from the synthetic reads.
plot_base_content(
    result_large,
    figsize=(12, 7),
    title='Base Distribution Across Sequence Positions',
)

## 5. Processing Real FASTQ Data

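Extract the read sequences from a FASTQ file. The code below expects `example.fastq` in the notebook's working directory and takes the second line of every four-line record as the sequence.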

In [None]:
# Relative path, so it is resolved against the notebook's current working directory.
fastq_path = "./example.fastq"

def extract_sequences_from_fastq(file_path):
    """Return the sequence line of every record in a FASTQ file.

    The file is treated as consecutive four-line records; the second line of
    each record (0-based line indices 1, 5, 9, ...) is taken as the sequence,
    with surrounding whitespace stripped. The record layout is not validated,
    so wrapped (multi-line) records or stray blank lines shift which lines are
    picked.

    Args:
        file_path: Path to the FASTQ file; it is opened in text mode.

    Returns:
        list[str]: One entry per record, in file order. A trailing record that
        has a header line but no sequence line contributes nothing.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    with open(file_path, 'r') as file:
        # Iterate the handle lazily instead of calling readlines(): only one
        # line is held in memory at a time, which matters for large FASTQ files.
        return [
            line.strip()
            for line_index, line in enumerate(file)
            if line_index % 4 == 1
        ]

# Use a dedicated name here: rebinding `sequences` would overwrite the synthetic
# reads generated in section 4 with data of a different origin.
fastq_sequences = extract_sequences_from_fastq(fastq_path)

# Single-column frame, the same layout used for the earlier example datasets.
fastq_df = pl.DataFrame({"sequence": fastq_sequences})

print(f"Loaded {len(fastq_df)} sequences from FASTQ file")
print("\nFirst 5 sequences:")
print(fastq_df.head(5))

Process the sequences with base content analysis function

In [None]:
# Run the base-content analysis on the reads loaded from the FASTQ file.
fastq_results = base_sequence_content(fastq_df)

print("Base content analysis results (first 10 positions):")
print(fastq_results.head(10))

# Column-wise maxima of the count columns. The four standard-base columns are
# read unconditionally; only "n_count" is treated as optional.
max_a = fastq_results["a_count"].max()
max_t = fastq_results["t_count"].max()
max_g = fastq_results["g_count"].max()
max_c = fastq_results["c_count"].max()
if "n_count" in fastq_results.columns:
    max_n = fastq_results["n_count"].max()
else:
    # No N column in the result: report zero instead of failing.
    max_n = 0

print(f"\nMaximum counts - A: {max_a}, T: {max_t}, G: {max_g}, C: {max_c}, N: {max_n}")

Visualize Base Distribution in FASTQ Data

In [None]:
# Plot the FASTQ-derived result with the same helper used for the earlier datasets.
plot_base_content(
    fastq_results, 
    figsize=(14, 8), 
    title='Base Distribution in FASTQ Sequences'
)

## 6. Clean Up Temporary Files

Clean up temporary files created during the analysis

In [None]:
# Reuses cleanTMP from the setup cell, so that cell must have been run in this kernel session.
cleanTMP()