In [1]:
!pip install biopython

Collecting biopython
  Downloading biopython-1.84-cp310-cp310-win_amd64.whl.metadata (13 kB)
Downloading biopython-1.84-cp310-cp310-win_amd64.whl (2.8 MB)
   ---------------------------------------- 0.0/2.8 MB ? eta -:--:--
   ---------------------------------------- 0.0/2.8 MB ? eta -:--:--
    --------------------------------------- 0.0/2.8 MB 495.5 kB/s eta 0:00:06
   - -------------------------------------- 0.1/2.8 MB 657.6 kB/s eta 0:00:05
   - -------------------------------------- 0.1/2.8 MB 656.4 kB/s eta 0:00:05
   -- ------------------------------------- 0.1/2.8 MB 655.8 kB/s eta 0:00:05
   --- ------------------------------------ 0.2/2.8 MB 942.1 kB/s eta 0:00:03
   ----- ---------------------------------- 0.4/2.8 MB 1.3 MB/s eta 0:00:02
   -------- ------------------------------- 0.6/2.8 MB 1.7 MB/s eta 0:00:02
   ---------- ----------------------------- 0.7/2.8 MB 1.9 MB/s eta 0:00:02
   ------------- -------------------------- 0.9/2.8 MB 2.1 MB/s eta 0:00:01
   ----------


[notice] A new release of pip is available: 23.3.1 -> 24.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
from Bio import SeqIO
fastq_file = "16S_WT_day3_11_SRR2628505_1.fastq"
with open(fastq_file, "r") as handle:
    for i, record in enumerate(SeqIO.parse(handle, "fastq")):
        print(f"ID: {record.id}")
        print(f"Sequence: {record.seq}")
        print(f"Quality: {record.letter_annotations['phred_quality']}")
        if i >= 10:
            break

ID: SRR2628505.45838
Sequence: CCTACGGGGGGCAGCAGTGAGGAATATTGGTCAATGGACGAGAGTCTGAACCAGCCAAGTAGCGTGAAGGATGACTGCCCTATGGGTTGTAAACTTCTTTTATATGGGAATAAAACAGGGTATGCATACCCTCTTGTATGTACCATATGAATAAGGATCGGCTAACTCCGTGCCAGCAGCCGCGGTAATACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGGTGGA
Quality: [37, 37, 37, 37, 37, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 38, 38, 38, 38, 38, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 38, 39, 39, 39, 39, 39, 39, 39, 38, 38, 38, 38, 38, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 38, 38, 38, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37

In [5]:
def extract_kmers(sequence, k):
    kmers = {}
    for i in range(len(sequence) - k + 1):
        kmer = sequence[i:i + k]
        predecessor = sequence[i - 1] if i > 0 else None
        successor = sequence[i + k] if i + k < len(sequence) else None
        
        if kmer not in kmers:
            kmers[kmer] = {'count': 0, 'predecessors': set(), 'successors': set()}
        
        kmers[kmer]['count'] += 1
        
        if predecessor:
            kmers[kmer]['predecessors'].add(predecessor)
        
        if successor:
            kmers[kmer]['successors'].add(successor)
    
    return kmers

In [8]:
def parse_dataset_and_get_kmers(filepath, k):
    kmer_frequencies = {}
    with open(filepath, 'r') as file:
        while True:
            header = file.readline().strip()
            if not header:
                break
            sequence = file.readline().strip()
            plus = file.readline().strip()
            quality = file.readline().strip()
            
            kmers = extract_kmers(sequence, k)
            for kmer, data in kmers.items():
                if kmer not in kmer_frequencies:
                    kmer_frequencies[kmer] = {'count': 0, 'predecessors': set(), 'successors': set()}
                
                kmer_frequencies[kmer]['count'] += data['count']
                kmer_frequencies[kmer]['predecessors'].update(data['predecessors'])
                kmer_frequencies[kmer]['successors'].update(data['successors'])
    
    return kmer_frequencies

k = 21
kmer_frequencies = parse_dataset_and_get_kmers(fastq_file, k)

for kmer, data in kmer_frequencies.items():
    predecessors = ', '.join(data['predecessors']) if data['predecessors'] else 'None'
    successors = ', '.join(data['successors']) if data['successors'] else 'None'
    print(f"{kmer}: {data['count']} (Predecessors: {predecessors}, Successors: {successors})")

CCTACGGGGGGCAGCAGTGAG: 1841 (Predecessors: A, Successors: G, T)
CTACGGGGGGCAGCAGTGAGG: 1843 (Predecessors: C, A, Successors: G, C, A)
TACGGGGGGCAGCAGTGAGGA: 1842 (Predecessors: C, A, Successors: A)
ACGGGGGGCAGCAGTGAGGAA: 1842 (Predecessors: T, Successors: G, T)
CGGGGGGCAGCAGTGAGGAAT: 1843 (Predecessors: G, A, Successors: T, A)
GGGGGGCAGCAGTGAGGAATA: 1843 (Predecessors: G, T, C, Successors: G, T)
GGGGGCAGCAGTGAGGAATAT: 1850 (Predecessors: G, C, T, A, Successors: G, T)
GGGGCAGCAGTGAGGAATATT: 1850 (Predecessors: G, C, Successors: G, T)
GGGCAGCAGTGAGGAATATTG: 1858 (Predecessors: G, T, A, Successors: G, C)
GGCAGCAGTGAGGAATATTGG: 5585 (Predecessors: G, C, T, A, Successors: G, C, T, A)
GCAGCAGTGAGGAATATTGGT: 5569 (Predecessors: G, C, T, A, Successors: G, C)
CAGCAGTGAGGAATATTGGTC: 5576 (Predecessors: G, T, A, Successors: G, T, A)
AGCAGTGAGGAATATTGGTCA: 5578 (Predecessors: T, C, Successors: G, A)
GCAGTGAGGAATATTGGTCAA: 6924 (Predecessors: G, C, T, A, Successors: G, T, C)
CAGTGAGGAATATTGGTCAAT: 

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)

