In [3]:
from collections import defaultdict
from tqdm import tqdm
import itertools

class KmerAnalyzer:
    """Count k-mers in sequences and compare them with a uniform expectation.

    The reference alphabet is ``self.amino_acids`` (20 one-letter codes).
    Counting itself is alphabet-agnostic: any substring of length ``k`` found
    in the input is tallied, including ones with symbols outside the alphabet.
    """

    def __init__(self):
        # Alphabet used to enumerate the "possible" k-mers in the statistics.
        self.amino_acids = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']

    def generate_all_possible_kmers(self, k):
        """Return every length-``k`` string over ``self.amino_acids``.

        The list has ``len(self.amino_acids) ** k`` entries, in
        ``itertools.product`` order, so it grows very quickly with ``k``.
        """
        return [''.join(p) for p in itertools.product(self.amino_acids, repeat=k)]

    def analyze_kmers(self, sequences, k):
        """Count every overlapping window of length ``k`` in ``sequences``.

        Args:
            sequences: iterable of sliceable sequences (e.g. strings).
            k: window length. Sequences shorter than ``k`` contribute nothing.

        Returns:
            A ``defaultdict(int)`` mapping each observed window to its count.
        """
        kmer_counts = defaultdict(int)

        for seq in sequences:
            # A sequence of length n has n - k + 1 overlapping windows.
            for start in range(len(seq) - k + 1):
                kmer_counts[seq[start:start + k]] += 1

        return kmer_counts

    def calculate_statistics(self, kmer_counts, k):
        """Build per-k-mer statistics against a uniform distribution.

        Args:
            kmer_counts: mapping of k-mer -> observed count.
            k: k-mer length used to enumerate the possible k-mers.

        Returns:
            ``(results, total_counts)``. ``results`` has one dict per possible
            k-mer over ``self.amino_acids`` with the keys ``kmer``, ``count``,
            ``percentage``, ``expected`` and ``fold_difference``, sorted by
            ``count`` in descending order. ``total_counts`` is the sum of all
            values in ``kmer_counts``; k-mers containing symbols outside the
            alphabet are included in that total but get no result row.

        Raises:
            ValueError: if ``k`` is negative.
        """
        if k < 0:
            raise ValueError("k must be non-negative")

        total_counts = sum(kmer_counts.values())

        # Number of possible k-mers, computed without materialising them.
        total_possible_kmers = len(self.amino_acids) ** k

        # Reported expectation stays an integer (floor) so callers can keep
        # formatting it with integer format specs.
        expected_value = total_counts // total_possible_kmers

        results = []
        # Enumerate lazily: only the result rows are kept in memory, not an
        # additional list of all k-mer strings.
        for letters in itertools.product(self.amino_acids, repeat=k):
            kmer = ''.join(letters)
            count = kmer_counts.get(kmer, 0)
            if total_counts > 0:
                percentage = count / total_counts * 100
                # Use the exact expectation total_counts / total_possible_kmers.
                # Dividing by the floored integer would give 0 for every k-mer
                # as soon as total_counts < total_possible_kmers.
                fold_difference = count * total_possible_kmers / total_counts
            else:
                percentage = 0
                fold_difference = 0
            results.append({
                'kmer': kmer,
                'count': count,
                'percentage': percentage,
                'expected': expected_value,
                'fold_difference': fold_difference
            })

        # Stable sort: ties keep their enumeration order.
        results.sort(key=lambda row: row['count'], reverse=True)

        return results, total_counts

In [4]:
import pandas as pd
from tqdm import tqdm
import sys, os, math

# Make the sibling '../dlp' directory importable; this must run before the
# data_access import below.
sys.path.insert(0, '../dlp')
# Second argument passed to PQDataAccess below.
# NOTE(review): presumably the number of records returned per batch — confirm.
batch_size = 1_000_000
# Name of the dataset sub-directory appended to the export path.
dataset_name = "corpus"
from data_access import PQDataAccess
# NOTE(review): absolute, machine-specific path; it will not resolve on another host.
da = PQDataAccess(f"/home/aac/Alireza/datasets/export_pqt_4_taxseq_new/{dataset_name}", batch_size)

# Default k-mer size; the loop cells further down rebind k.
k = 1  # Can be modified for different k-mer sizes
# Collect the 'Sequence' entry of every item yielded by one get_batch() call.
# NOTE(review): assumes each item supports item['Sequence'] and that the value is a string — confirm.
sequences = [b['Sequence'] for b in da.get_batch()]  # Your data loading

def show_k_mers(sequences, k, top_n=10):
    """Print a table of the most frequent k-mers in ``sequences``.

    Counts all overlapping windows of length ``k`` with ``KmerAnalyzer``,
    then prints summary totals, the ``top_n`` most frequent k-mers over the
    analyzer's alphabet, and a closing row that sums/averages the printed rows.

    Args:
        sequences: iterable of sequences passed to ``KmerAnalyzer.analyze_kmers``.
        k: k-mer length.
        top_n: number of rows to print (default 10); ``None`` prints all rows.

    Returns:
        None. All output goes to stdout.
    """
    analyzer = KmerAnalyzer()

    # Count k-mers
    print(f"\nAnalyzing {k}-mers...")
    kmer_counts = analyzer.analyze_kmers(sequences, k)

    # Calculate statistics (rows arrive sorted by count, descending)
    results, total_counts = analyzer.calculate_statistics(kmer_counts, k)
    results = results[:top_n]

    # The number of possible k-mers is alphabet_size ** k; computing it
    # directly avoids enumerating every k-mer again just to take a length.
    possible_kmers = len(analyzer.amino_acids) ** k

    # Print results
    print(f"\nTotal {k}-mers processed: {total_counts}")
    print(f"Number of unique {k}-mers found: {len(kmer_counts)}")
    print(f"Number of possible {k}-mers: {possible_kmers}")

    print("\nK-mer analysis results:")
    print(f"{'#':<4} {'K-mer':<8} {'Count':>12} {'Percentage':>12} {'Expected':>12} {'Fold Diff':>12}")
    print("-" * 64)
    for rank, row in enumerate(results, 1):
        print(f"{rank:<4} {row['kmer']:<8} {row['count']:>12,d} {row['percentage']:>11.2f}% "
              f"{row['expected']:>12,d} {row['fold_difference']:>12,.2f}")
    print("-" * 64)

    # Totals cover only the printed rows, not every possible k-mer.
    total_count = sum(row['count'] for row in results)
    total_expected = sum(row['expected'] for row in results)
    total_percentage = sum(row['percentage'] for row in results)
    # Fold difference is averaged rather than summed; guard the empty slice
    # (e.g. top_n=0) against a division by zero.
    avg_fold_diff = (sum(row['fold_difference'] for row in results) / len(results)) if results else 0.0

    print(f"{'Sum':<4} {'Total':<8} {total_count:>12,d} {total_percentage:>11.2f}% "
          f"{total_expected:>12,d} {avg_fold_diff:>12,.2f}")

 WORLD_SIZE=1 , LOCAL_WORLD_SIZE=1,RANK =0,LOCAL_RANK = 0 


In [23]:
# Print the top k-mer table for k = 0, 1, 2, 3, 4 over the loaded sequences.
# NOTE(review): range(5) starts at k=0, where every window is the empty string,
# so the "0-mers" table has a single blank row — confirm k=0 is intended.
for k in range(5):
    show_k_mers(sequences, k)


Analyzing 0-mers...

Total 0-mers processed: 351545107
Number of unique 0-mers found: 1
Number of possible 0-mers: 1

K-mer analysis results:
#    K-mer           Count   Percentage     Expected    Fold Diff
----------------------------------------------------------------
1              351,545,107      100.00%  351,545,107         1.00
----------------------------------------------------------------
Sum  Total     351,545,107      100.00%  351,545,107         1.00

Analyzing 1-mers...

Total 1-mers processed: 350545107
Number of unique 1-mers found: 24
Number of possible 1-mers: 20

K-mer analysis results:
#    K-mer           Count   Percentage     Expected    Fold Diff
----------------------------------------------------------------
1    L          34,645,079        9.88%   17,527,255         1.98
2    A          31,752,766        9.06%   17,527,255         1.81
3    G          25,516,597        7.28%   17,527,255         1.46
4    V          24,092,651        6.87%   17,527,255   

In [None]:
# Print the top k-mer table for k = 7, 8, 9 over the loaded sequences.
# NOTE(review): calculate_statistics builds one result row per possible k-mer
# (20**k, i.e. 1.28e9 rows at k=7). This cell has no execution count, so it
# presumably never ran to completion — confirm before re-running.
for k in range(7, 10):
    show_k_mers(sequences, k)


Analyzing 7-mers...
