First, we import all the necessary libraries.

In [8]:
# Basic Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

#Biopython for Fasta file processing
from Bio import SeqIO

#Tensorflow and Keras for deep learning
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, MaxPooling1D, Flatten, Dropout, LSTM, Embedding
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Scikit-learn for preprocessing and evaluation
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Utility Libraries
import os
import random

In [10]:
from Bio import SeqIO
from collections import defaultdict

def kmer_count(sequence, k):
    """Count every overlapping k-mer (length-k substring) in a sequence.

    The function is alphabet-agnostic: it slices the string as given, so it
    works for nucleotide and protein sequences alike.

    Args:
        sequence: The sequence to scan, as a string.
        k: Length of each k-mer; must be at least 1.

    Returns:
        A ``defaultdict(int)`` mapping each k-mer to the number of times it
        occurs. The mapping is empty when ``sequence`` is shorter than ``k``.

    Raises:
        ValueError: If ``k`` is less than 1. Without this check, ``k == 0``
            would count the empty string and a negative ``k`` would produce
            meaningless slices.
    """
    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k!r}")

    kmer_counts = defaultdict(int)
    # Slide a window of width k across the sequence; the last valid start
    # index is len(sequence) - k.
    for start in range(len(sequence) - k + 1):
        kmer_counts[sequence[start:start + k]] += 1
    return kmer_counts

def encode_top_kmers(fasta_file, k, top_n=10):
    """Build k-mer count profiles for the first ``top_n`` records of a FASTA file.

    Args:
        fasta_file: Path or handle passed to ``SeqIO.parse`` in "fasta" format.
        k: K-mer length forwarded to ``kmer_count``.
        top_n: Maximum number of records to process, taken in file order.

    Returns:
        A ``(sequence_ids, kmer_features)`` tuple of two parallel lists: the
        record IDs and the k-mer count mapping computed for each record.
    """
    ids, profiles = [], []

    records = SeqIO.parse(fasta_file, "fasta")
    for index, record in enumerate(records):
        # Stop as soon as the requested number of records has been handled.
        if index >= top_n:
            break
        profiles.append(kmer_count(str(record.seq), k))
        ids.append(record.id)

    return ids, profiles

# Demo: compute k-mer profiles for the first few records of a FASTA file.
fasta_file = "Human.fasta"  # Point this at your own FASTA file
k, top_n = 4, 10  # k-mer length and number of records to read
sequence_ids, kmer_features = encode_top_kmers(fasta_file, k=k, top_n=top_n)

# Report each record's ID followed by its k-mer tallies; wrapping in dict()
# gives a tidier printout than the defaultdict representation.
for seq_id, counts in zip(sequence_ids, kmer_features):
    print(f"Sequence ID: {seq_id}")
    print("K-mer Counts:", dict(counts))


# NUCLEOTIDES = ['A', 'C', 'G', 'T']

# def clean_sequence(seq):
#     return ''.join([nuc for nuc in seq if nuc in NUCLEOTIDES])



Sequence ID: sp|A0A075B706|TRDJ1_HUMAN
K-mer Counts: {'TDKL': 1, 'DKLI': 1, 'KLIF': 1, 'LIFG': 1, 'IFGK': 1, 'FGKG': 1, 'GKGT': 1, 'KGTR': 1, 'GTRV': 1, 'TRVT': 1, 'RVTV': 1, 'VTVE': 1, 'TVEP': 1}
Sequence ID: sp|A0A0G2JS06|LV539_HUMAN
K-mer Counts: {'MAWT': 1, 'AWTP': 1, 'WTPL': 1, 'TPLL': 1, 'PLLL': 1, 'LLLL': 3, 'LLLS': 1, 'LLSH': 1, 'LSHC': 1, 'SHCT': 1, 'HCTG': 1, 'CTGS': 1, 'TGSL': 1, 'GSLS': 1, 'SLSQ': 1, 'LSQP': 1, 'SQPV': 1, 'QPVL': 1, 'PVLT': 1, 'VLTQ': 1, 'LTQP': 1, 'TQPT': 1, 'QPTS': 1, 'PTSL': 1, 'TSLS': 1, 'SLSA': 1, 'LSAS': 1, 'SASP': 1, 'ASPG': 1, 'SPGA': 1, 'PGAS': 1, 'GASA': 1, 'ASAR': 1, 'SARF': 1, 'ARFT': 1, 'RFTC': 1, 'FTCT': 1, 'TCTL': 1, 'CTLR': 1, 'TLRS': 1, 'LRSG': 1, 'RSGI': 1, 'SGIN': 1, 'GINV': 1, 'INVG': 1, 'NVGT': 1, 'VGTY': 1, 'GTYR': 1, 'TYRI': 1, 'YRIY': 1, 'RIYW': 1, 'IYWY': 1, 'YWYQ': 1, 'WYQQ': 1, 'YQQK': 1, 'QQKP': 1, 'QKPG': 1, 'KPGS': 1, 'PGSL': 1, 'GSLP': 1, 'SLPR': 1, 'LPRY': 1, 'PRYL': 1, 'RYLL': 1, 'YLLR': 1, 'LLRY': 1, 'LRYK': 1, 'RYKS': 1, '