In [None]:
import kagglehub

# Download the latest version of the cytokine dataset and print the local
# path returned by kagglehub.
CYTOKINE_DATASET = "malloulifares/d2d-cytokine-data"
path = kagglehub.dataset_download(CYTOKINE_DATASET)
print("Path to dataset files:", path)

In [None]:
import kagglehub

# Download the latest version of the first converted-FASTQ dataset and print
# the local path returned by kagglehub.
FASTQ_BATCH_ONE_DATASET = "noob786/mpeg-g-microbiomeclassificationconvertedfastqfiles"
path = kagglehub.dataset_download(FASTQ_BATCH_ONE_DATASET)
print("Path to dataset files:", path)

In [None]:
import kagglehub

# Download the latest version of the second FASTQ batch and print the local
# path returned by kagglehub.
FASTQ_BATCH_TWO_DATASET = "noob786/secondbatchoffastqfiles"
path = kagglehub.dataset_download(FASTQ_BATCH_TWO_DATASET)
print("Path to dataset files:", path)

In [None]:
# System package: installs KMC, which provides the `kmc` command the pipeline
# cells invoke through subprocess.
!apt-get update -y && apt-get install -y kmc
# Python packages for the notebook cells.
!pip install transformers torch pandas numpy tqdm qdrant-client

In [None]:
# Remove triton before the pipeline cell imports anything; the pipeline cell's
# header notes that this avoids a `trans_b` error.
!pip uninstall -y triton

In [None]:
# [1] CRITICAL FIX: Uninstall Triton to avoid trans_b error
# Run this in a code cell BEFORE importing anything:
# !pip uninstall -y triton

# [2] IMPORTS
import torch
import subprocess
import pandas as pd
import numpy as np
import pickle
import uuid
import shutil
import os
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel

# [3] CONFIGURATION (Matches your provided paths)
class Config:
    """Static settings for the vectorization run.

    Groups the metadata CSV locations, the folders searched for FASTQ files,
    and the model / k-mer parameters read by the functions in this cell.
    """

    # --- Model & encoding parameters ---
    DENSE_MODEL_ID = "zhihan1996/DNABERT-2-117M"
    KMER_SIZE = 6
    MAX_DNA_LEN = 512  # cap on the sequence snippet and on the tokenizer's max_length
    OUTPUT_FILENAME = "bio_memory_dump.pkl"

    # --- Metadata tables ---
    TRAIN_CSV = "/kaggle/input/trainmpeg/Train.csv"
    SUBJECT_CSV = "/kaggle/input/trainmpeg/Train_Subjects.csv"
    CYTOKINE_CSV = "/kaggle/input/d2d-cytokine-data/cytokine_profiles.csv"

    # --- Folders searched, in this order, for each sample's FASTQ file ---
    FASTQ_DIRS = [
        "/kaggle/input/mpeg-g-microbiomeclassificationconvertedfastqfiles/TrainFiles/TrainFiles",
        "/kaggle/input/secondbatchoffastqfiles/TrainFiles",
    ]

# Use the GPU when torch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"‚öôÔ∏è Hardware Acceleration: {device}")

# [4] DATA INGESTION LOGIC
def load_metadata():
    """Load the metadata CSVs, join them, and attach each row's FASTQ path.

    Train rows are inner-joined to the cytokine table on ``SampleID`` and then
    left-joined to the subject table on ``SubjectID``. Rows whose FASTQ file
    cannot be found under ``Config.FASTQ_DIRS`` are dropped.

    Returns:
        pandas.DataFrame: the merged metadata with an added ``filepath``
        column and a fresh 0..n-1 index.
    """
    print("üìÇ Loading & Merging Metadata...")
    train = pd.read_csv(Config.TRAIN_CSV)
    subj = pd.read_csv(Config.SUBJECT_CSV)
    cyto = pd.read_csv(Config.CYTOKINE_CSV)

    # Compare join keys as stripped strings so a dtype mismatch or stray
    # whitespace cannot silently shrink the merge.
    for frame, key in (
        (train, "SampleID"),
        (cyto, "SampleID"),
        (train, "SubjectID"),
        (subj, "SubjectID"),
    ):
        frame[key] = frame[key].astype(str).str.strip()

    # Merge all metadata into one view
    df = pd.merge(train, cyto, on="SampleID", how="inner")
    df = pd.merge(df, subj, on="SubjectID", how="left")

    def get_path(fname):
        """Return the first existing FASTQ path for ``fname``, or None."""
        # A missing filename arrives as NaN; there is nothing to look up.
        if pd.isna(fname):
            return None
        base = str(fname).replace('.mgb', '.fastq')
        for d in Config.FASTQ_DIRS:
            for ext in ('', '.gz'):
                p = os.path.join(d, base + ext)
                if os.path.exists(p):
                    return p
        return None

    print("   Mapping FASTQ paths...")
    df['filepath'] = df['filename'].apply(get_path)
    df_clean = df.dropna(subset=['filepath']).reset_index(drop=True)
    print(f"‚úÖ Ready to process {len(df_clean)} samples.")
    return df_clean

# [5] VECTORIZATION ENGINES
print("üß† Loading DNABERT-2 (Genomic Foundation Model)...")
tokenizer = AutoTokenizer.from_pretrained(Config.DENSE_MODEL_ID, trust_remote_code=True)
model = AutoModel.from_pretrained(Config.DENSE_MODEL_ID, trust_remote_code=True).to(device)

def _read_first_sequence(file_path):
    """Return the stripped second line of a FASTQ file ('' when it is absent)."""
    import gzip

    # The path lookup may return a '.gz' file; plain open() cannot decode it.
    opener = gzip.open if str(file_path).endswith(".gz") else open
    with opener(file_path, "rt") as handle:
        next(handle, None)  # line 1 is the read header
        return next(handle, "").strip()


def generate_dense_embedding(file_path):
    """
    Reads DNA sequence -> DNABERT-2 -> Semantic Vector (768d)

    Only the first read of the file is embedded: the header line is skipped
    and the following line, cut to ``Config.MAX_DNA_LEN`` characters, is
    tokenized and mean-pooled over the token axis.

    Args:
        file_path: path to a FASTQ file, plain text or gzip-compressed.

    Returns:
        list[float]: the pooled embedding. A 768-element zero vector is
        returned when the file cannot be read or holds no sequence.
    """
    import warnings
    import zlib

    try:
        sequence_snippet = _read_first_sequence(file_path)
    except (OSError, EOFError, ValueError, zlib.error) as exc:
        # Best-effort fail safe, but surface the problem instead of hiding it.
        warnings.warn(f"Could not read {file_path}: {exc}; using a zero vector.")
        return np.zeros(768).tolist()

    if not sequence_snippet:
        # Nothing to embed; use the same placeholder as for unreadable files.
        return np.zeros(768).tolist()
    sequence_snippet = sequence_snippet[:Config.MAX_DNA_LEN]

    inputs = tokenizer(sequence_snippet, return_tensors="pt", padding=True, truncation=True, max_length=Config.MAX_DNA_LEN).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
        # Mean pooling to represent whole sequence chunk
        embedding = torch.mean(outputs[0], dim=1).squeeze().cpu().numpy()

    return embedding.tolist()

def generate_sparse_embedding(file_path, kmer_size=None, index_space=100000):
    """
    Reads DNA -> KMC Count -> Sparse Vector (Indices, Values)
    This captures exact microbial motifs.

    Args:
        file_path: sequence file handed to KMC.
        kmer_size: k-mer length; defaults to ``Config.KMER_SIZE``.
        index_space: number of buckets the k-mer strings are hashed into.

    Returns:
        tuple[list[int], list[int]]: ``(indices, values)``. Indices are
        unique; counts of k-mers that share a bucket are summed. ``([], [])``
        is returned when KMC is missing or fails.
    """
    import tempfile
    import warnings
    import zlib

    if kmer_size is None:
        kmer_size = Config.KMER_SIZE

    # Pick KMC's input-format flag from the extension: '-fm' is multi-FASTA,
    # so FASTQ input must be passed with '-fq'.
    lowered = str(file_path).lower()
    if lowered.endswith(".gz"):
        lowered = lowered[:-3]
    input_flag = "-fm" if lowered.endswith((".fa", ".fasta", ".fna")) else "-fq"

    bucket_counts = {}
    # A private scratch directory holds KMC's database, bins and dump, and is
    # removed on every exit path.
    with tempfile.TemporaryDirectory() as work_dir:
        db_prefix = os.path.join(work_dir, "kmers")
        dump_path = db_prefix + ".txt"
        # NOTE(review): KMC caps each counter at its -cs maximum (255 by
        # default per the KMC docs) - confirm saturated counts are acceptable.
        commands = (
            # 1. Run KMC (Fast C++ k-mer counter)
            ["kmc", f"-k{kmer_size}", "-ci1", input_flag, str(file_path), db_prefix, work_dir],
            # 2. Dump KMC database to readable text
            ["kmc_tools", "transform", db_prefix, "dump", dump_path],
        )
        try:
            # Argument lists (no shell) keep unusual paths from being
            # re-interpreted by a shell.
            for argv in commands:
                subprocess.run(argv, stdout=subprocess.DEVNULL,
                               stderr=subprocess.DEVNULL, check=True)
        except (OSError, subprocess.CalledProcessError) as exc:
            warnings.warn(f"k-mer counting failed for {file_path}: {exc}")
            return [], []

        if not os.path.exists(dump_path):
            return [], []

        # 3. Parse text to Sparse Format
        with open(dump_path, 'r') as dump:
            for line in dump:
                parts = line.split()
                if len(parts) < 2:
                    continue
                # crc32 is stable across processes, unlike the built-in
                # hash() of a str, so indices are reproducible between runs.
                bucket = zlib.crc32(parts[0].encode("ascii", "replace")) % index_space
                bucket_counts[bucket] = bucket_counts.get(bucket, 0) + int(parts[1])

    return list(bucket_counts), list(bucket_counts.values())

# [6] EXECUTION PIPELINE
df = load_metadata()

# === OPTIONAL smoke-test cap ===
# Drop the following line to vectorize every sample instead of the first 50.
df = df.head(50)

vectors_payload = []

print(f"üöÄ Starting Hybrid Vectorization on {len(df)} samples...")
for idx, row in tqdm(df.iterrows(), total=len(df)):
    fastq_path = row['filepath']

    # One dense (DNABERT-2) and one sparse (k-mer) vector per file.
    dense = generate_dense_embedding(fastq_path)
    sp_idx, sp_val = generate_sparse_embedding(fastq_path)

    # Metadata stored next to the vectors; columns absent from the merged
    # frame fall back to the defaults passed to .get().
    payload = {
        "SampleID": row['SampleID'],
        "BodySite": row.get('SampleType', 'Unknown'),
        # Metabolic Targets for Analysis
        "IL22": row.get('IL22', 0),
        "EGF": row.get('EGF', 0),
        "TNFA": row.get('TNFA', 0),
        "InsulinSensitivity": row.get('Insulin_Sensitivity_Label', 'Unknown'),
    }
    vector = {
        "dense": dense,
        "sparse": {"indices": sp_idx, "values": sp_val},
    }

    # Record layout intended for Qdrant: id / vector / payload.
    item = {"id": idx, "vector": vector, "payload": payload}
    vectors_payload.append(item)

# [7] EXPORT
print(f"üíæ Saving Bio-Memory to {Config.OUTPUT_FILENAME}...")
with open(Config.OUTPUT_FILENAME, 'wb') as f:
    pickle.dump(vectors_payload, f)

print("‚úÖ Done! Check the 'Output' tab to download your .pkl file.")

‚öôÔ∏è Hardware Acceleration: cuda
üß† Loading DNABERT-2 (Genomic Foundation Model)...


2026-01-19 20:38:20.729718: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1768855100.751637     943 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1768855100.758448     943 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1768855100.775645     943 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1768855100.775664     943 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1768855100.775666     943 computation_placer.cc:177] computation placer alr

üìÇ Loading & Merging Metadata...
   Mapping FASTQ paths...
‚úÖ Ready to process 1982 samples.
üöÄ Starting Hybrid Vectorization on 50 samples...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50/50 [00:21<00:00,  2.31it/s]

üíæ Saving Bio-Memory to bio_memory_dump.pkl...
‚úÖ Done! Check the 'Output' tab to download your .pkl file.





In [None]:
# [1] CRITICAL FIX: Uninstall Triton to avoid trans_b error
# Run this in a code cell BEFORE importing anything:
# !pip uninstall -y triton

# [2] IMPORTS
import torch
import subprocess
import pandas as pd
import numpy as np
import pickle
import uuid
import shutil
import os
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel

# [3] CONFIGURATION (Matches your provided paths)
class Config:
    """Static settings for the vectorization run.

    Groups the metadata CSV locations, the folders searched for FASTQ files,
    and the model / k-mer parameters read by the functions in this cell.
    """

    # --- Model & encoding parameters ---
    DENSE_MODEL_ID = "zhihan1996/DNABERT-2-117M"
    KMER_SIZE = 6
    MAX_DNA_LEN = 512  # cap on the sequence snippet and on the tokenizer's max_length
    OUTPUT_FILENAME = "bio_memory_dump.pkl"

    # --- Metadata tables ---
    TRAIN_CSV = "/kaggle/input/trainmpeg/Train.csv"
    SUBJECT_CSV = "/kaggle/input/trainmpeg/Train_Subjects.csv"
    CYTOKINE_CSV = "/kaggle/input/d2d-cytokine-data/cytokine_profiles.csv"

    # --- Folders searched, in this order, for each sample's FASTQ file ---
    FASTQ_DIRS = [
        "/kaggle/input/mpeg-g-microbiomeclassificationconvertedfastqfiles/TrainFiles/TrainFiles",
        "/kaggle/input/secondbatchoffastqfiles/TrainFiles",
    ]

# Use the GPU when torch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"‚öôÔ∏è Hardware Acceleration: {device}")

# [4] DATA INGESTION LOGIC
def load_metadata():
    """Load the metadata CSVs, join them, and attach each row's FASTQ path.

    Train rows are inner-joined to the cytokine table on ``SampleID`` and then
    left-joined to the subject table on ``SubjectID``. Rows whose FASTQ file
    cannot be found under ``Config.FASTQ_DIRS`` are dropped.

    Returns:
        pandas.DataFrame: the merged metadata with an added ``filepath``
        column and a fresh 0..n-1 index.
    """
    print("üìÇ Loading & Merging Metadata...")
    train = pd.read_csv(Config.TRAIN_CSV)
    subj = pd.read_csv(Config.SUBJECT_CSV)
    cyto = pd.read_csv(Config.CYTOKINE_CSV)

    # Compare join keys as stripped strings so a dtype mismatch or stray
    # whitespace cannot silently shrink the merge.
    for frame, key in (
        (train, "SampleID"),
        (cyto, "SampleID"),
        (train, "SubjectID"),
        (subj, "SubjectID"),
    ):
        frame[key] = frame[key].astype(str).str.strip()

    # Merge all metadata into one view
    df = pd.merge(train, cyto, on="SampleID", how="inner")
    df = pd.merge(df, subj, on="SubjectID", how="left")

    def get_path(fname):
        """Return the first existing FASTQ path for ``fname``, or None."""
        # A missing filename arrives as NaN; there is nothing to look up.
        if pd.isna(fname):
            return None
        base = str(fname).replace('.mgb', '.fastq')
        for d in Config.FASTQ_DIRS:
            for ext in ('', '.gz'):
                p = os.path.join(d, base + ext)
                if os.path.exists(p):
                    return p
        return None

    print("   Mapping FASTQ paths...")
    df['filepath'] = df['filename'].apply(get_path)
    df_clean = df.dropna(subset=['filepath']).reset_index(drop=True)
    print(f"‚úÖ Ready to process {len(df_clean)} samples.")
    return df_clean

# [5] VECTORIZATION ENGINES
print("üß† Loading DNABERT-2 (Genomic Foundation Model)...")
tokenizer = AutoTokenizer.from_pretrained(Config.DENSE_MODEL_ID, trust_remote_code=True)
model = AutoModel.from_pretrained(Config.DENSE_MODEL_ID, trust_remote_code=True).to(device)

def _read_first_sequence(file_path):
    """Return the stripped second line of a FASTQ file ('' when it is absent)."""
    import gzip

    # The path lookup may return a '.gz' file; plain open() cannot decode it.
    opener = gzip.open if str(file_path).endswith(".gz") else open
    with opener(file_path, "rt") as handle:
        next(handle, None)  # line 1 is the read header
        return next(handle, "").strip()


def generate_dense_embedding(file_path):
    """
    Reads DNA sequence -> DNABERT-2 -> Semantic Vector (768d)

    Only the first read of the file is embedded: the header line is skipped
    and the following line, cut to ``Config.MAX_DNA_LEN`` characters, is
    tokenized and mean-pooled over the token axis.

    Args:
        file_path: path to a FASTQ file, plain text or gzip-compressed.

    Returns:
        list[float]: the pooled embedding. A 768-element zero vector is
        returned when the file cannot be read or holds no sequence.
    """
    import warnings
    import zlib

    try:
        sequence_snippet = _read_first_sequence(file_path)
    except (OSError, EOFError, ValueError, zlib.error) as exc:
        # Best-effort fail safe, but surface the problem instead of hiding it.
        warnings.warn(f"Could not read {file_path}: {exc}; using a zero vector.")
        return np.zeros(768).tolist()

    if not sequence_snippet:
        # Nothing to embed; use the same placeholder as for unreadable files.
        return np.zeros(768).tolist()
    sequence_snippet = sequence_snippet[:Config.MAX_DNA_LEN]

    inputs = tokenizer(sequence_snippet, return_tensors="pt", padding=True, truncation=True, max_length=Config.MAX_DNA_LEN).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
        # Mean pooling to represent whole sequence chunk
        embedding = torch.mean(outputs[0], dim=1).squeeze().cpu().numpy()

    return embedding.tolist()

def generate_sparse_embedding(file_path, kmer_size=None, index_space=100000):
    """
    Reads DNA -> KMC Count -> Sparse Vector (Indices, Values)
    This captures exact microbial motifs.

    Args:
        file_path: sequence file handed to KMC.
        kmer_size: k-mer length; defaults to ``Config.KMER_SIZE``.
        index_space: number of buckets the k-mer strings are hashed into.

    Returns:
        tuple[list[int], list[int]]: ``(indices, values)``. Indices are
        unique; counts of k-mers that share a bucket are summed. ``([], [])``
        is returned when KMC is missing or fails.
    """
    import tempfile
    import warnings
    import zlib

    if kmer_size is None:
        kmer_size = Config.KMER_SIZE

    # Pick KMC's input-format flag from the extension: '-fm' is multi-FASTA,
    # so FASTQ input must be passed with '-fq'.
    lowered = str(file_path).lower()
    if lowered.endswith(".gz"):
        lowered = lowered[:-3]
    input_flag = "-fm" if lowered.endswith((".fa", ".fasta", ".fna")) else "-fq"

    bucket_counts = {}
    # A private scratch directory holds KMC's database, bins and dump, and is
    # removed on every exit path.
    with tempfile.TemporaryDirectory() as work_dir:
        db_prefix = os.path.join(work_dir, "kmers")
        dump_path = db_prefix + ".txt"
        # NOTE(review): KMC caps each counter at its -cs maximum (255 by
        # default per the KMC docs) - confirm saturated counts are acceptable.
        commands = (
            # 1. Run KMC (Fast C++ k-mer counter)
            ["kmc", f"-k{kmer_size}", "-ci1", input_flag, str(file_path), db_prefix, work_dir],
            # 2. Dump KMC database to readable text
            ["kmc_tools", "transform", db_prefix, "dump", dump_path],
        )
        try:
            # Argument lists (no shell) keep unusual paths from being
            # re-interpreted by a shell.
            for argv in commands:
                subprocess.run(argv, stdout=subprocess.DEVNULL,
                               stderr=subprocess.DEVNULL, check=True)
        except (OSError, subprocess.CalledProcessError) as exc:
            warnings.warn(f"k-mer counting failed for {file_path}: {exc}")
            return [], []

        if not os.path.exists(dump_path):
            return [], []

        # 3. Parse text to Sparse Format
        with open(dump_path, 'r') as dump:
            for line in dump:
                parts = line.split()
                if len(parts) < 2:
                    continue
                # crc32 is stable across processes, unlike the built-in
                # hash() of a str, so indices are reproducible between runs.
                bucket = zlib.crc32(parts[0].encode("ascii", "replace")) % index_space
                bucket_counts[bucket] = bucket_counts.get(bucket, 0) + int(parts[1])

    return list(bucket_counts), list(bucket_counts.values())

# [6] EXECUTION PIPELINE
df = load_metadata()

# No row cap in this revision: every sample returned by load_metadata() is vectorized.
print(f"üìä Processing complete dataset: {len(df)} samples")

vectors_payload = []

print(f"üöÄ Starting Hybrid Vectorization on {len(df)} samples...")
for idx, row in tqdm(df.iterrows(), total=len(df)):
    fastq_path = row['filepath']

    # One dense (DNABERT-2) and one sparse (k-mer) vector per file.
    dense = generate_dense_embedding(fastq_path)
    sp_idx, sp_val = generate_sparse_embedding(fastq_path)

    # Metadata stored next to the vectors; columns absent from the merged
    # frame fall back to the defaults passed to .get().
    payload = {
        "SampleID": row['SampleID'],
        "BodySite": row.get('SampleType', 'Unknown'),
        # Metabolic Targets for Analysis
        "IL22": row.get('IL22', 0),
        "EGF": row.get('EGF', 0),
        "TNFA": row.get('TNFA', 0),
        "InsulinSensitivity": row.get('Insulin_Sensitivity_Label', 'Unknown'),
    }
    vector = {
        "dense": dense,
        "sparse": {"indices": sp_idx, "values": sp_val},
    }

    # Record layout intended for Qdrant: id / vector / payload.
    item = {"id": idx, "vector": vector, "payload": payload}
    vectors_payload.append(item)

# [7] EXPORT
print(f"üíæ Saving Bio-Memory to {Config.OUTPUT_FILENAME}...")
with open(Config.OUTPUT_FILENAME, 'wb') as f:
    pickle.dump(vectors_payload, f)

print("‚úÖ Done! Check the 'Output' tab to download your .pkl file.")

‚öôÔ∏è Hardware Acceleration: cuda
üß† Loading DNABERT-2 (Genomic Foundation Model)...


Some weights of BertModel were not initialized from the model checkpoint at zhihan1996/DNABERT-2-117M and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


üìÇ Loading & Merging Metadata...
   Mapping FASTQ paths...
‚úÖ Ready to process 1982 samples.
üìä Processing complete dataset: 1982 samples
üöÄ Starting Hybrid Vectorization on 1982 samples...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1982/1982 [16:45<00:00,  1.97it/s]

üíæ Saving Bio-Memory to bio_memory_dump.pkl...
‚úÖ Done! Check the 'Output' tab to download your .pkl file.





In [None]:
import pandas as pd
import os

# --- PATHS ---
# Locations of the three metadata tables read by verify_merge_compatibility().
# Adjust these if your input directory structure is different.
TRAIN_CSV = "/kaggle/input/trainmpeg/Train.csv"
SUBJECT_CSV = "/kaggle/input/trainmpeg/Train_Subjects.csv"
CYTOKINE_CSV = "/kaggle/input/d2d-cytokine-data/cytokine_profiles.csv"

def inspect_dataframe(name, df, key_col):
    """Print a short profile of one key column of a DataFrame.

    Reports the frame's shape, the key column's dtype, whether any text key
    contains whitespace, the first three key values, and how many rows carry
    a key that is not unique.

    Args:
        name: label shown in the banner line.
        df: DataFrame to inspect.
        key_col: name of the key column to profile.

    Returns:
        None. The report is printed.
    """
    keys = df[key_col]

    print(f"\n{'='*20} ANALYZING: {name} {'='*20}")
    print(f"Shape: {df.shape}")
    print(f"Key Column: '{key_col}'")

    # Check Key Type
    key_dtype = keys.dtype
    print(f"Key Data Type: {key_dtype}")

    # Whitespace only applies to text keys. Testing the dtype this way also
    # covers pandas' dedicated string dtype, not just 'object'.
    is_text = (pd.api.types.is_object_dtype(key_dtype)
               or pd.api.types.is_string_dtype(key_dtype))
    if is_text:
        # na=False: a missing key counts as "no whitespace" instead of
        # feeding NaN into .any().
        has_whitespace = bool(keys.str.contains(r'\s', regex=True, na=False).any())
        print(f"Contains Whitespace? {has_whitespace}")

    # Show example IDs (same output for text and non-text keys)
    print(f"Example IDs: {keys.head(3).tolist()}")

    # Check Uniqueness
    n_unique = keys.nunique()
    print(f"Unique Keys: {n_unique} (Duplicates: {len(df) - n_unique})")

def verify_merge_compatibility():
    """Report whether the three metadata CSVs can be joined on their keys.

    Reads ``TRAIN_CSV``, ``SUBJECT_CSV`` and ``CYTOKINE_CSV``, profiles each
    key column with ``inspect_dataframe``, then prints how many ``SampleID``
    values Train shares with the cytokine table and how many ``SubjectID``
    values Train shares with the subject table. IDs are compared as stripped
    strings; missing IDs are ignored.

    Returns:
        None. The report is printed; a missing file prints an error and ends
        the check early.
    """
    def _clean_ids(frame, column):
        # Drop missing keys first so NaN is not counted as an ID named 'nan'.
        return set(frame[column].dropna().astype(str).str.strip())

    print("Loading CSVs...")
    try:
        df_train = pd.read_csv(TRAIN_CSV)
        df_subj = pd.read_csv(SUBJECT_CSV)
        df_cyto = pd.read_csv(CYTOKINE_CSV)
    except FileNotFoundError as e:
        print(f"‚ùå Error: File not found. Check your paths.\n{e}")
        return

    # 1. Inspect Individual Files
    inspect_dataframe("Train.csv (Samples)", df_train, "SampleID")
    inspect_dataframe("Cytokine.csv (Targets)", df_cyto, "SampleID")
    inspect_dataframe("Subjects.csv (Metadata)", df_subj, "SubjectID")

    print("\n" + "="*60)
    print("               MERGE COMPATIBILITY CHECK")
    print("="*60)

    # 2. Check SampleID Match (Train vs Cytokine)
    train_samples = _clean_ids(df_train, 'SampleID')
    cyto_samples = _clean_ids(df_cyto, 'SampleID')

    common_samples = train_samples.intersection(cyto_samples)
    print(f"\n[1] SampleID Merge (Train <-> Cytokine)")
    print(f"   - Train Samples: {len(train_samples)}")
    print(f"   - Cytokine Samples: {len(cyto_samples)}")
    print(f"   - ‚úÖ Common Samples: {len(common_samples)}")

    if len(common_samples) == 0:
        print("   ‚ö†Ô∏è CRITICAL WARNING: No common SampleIDs found. Check ID formats!")
    elif len(common_samples) < len(train_samples):
        # The pipeline joins Train to the cytokine table with an inner merge,
        # so these samples disappear without any other notice.
        print(f"   - Note: {len(train_samples) - len(common_samples)} Train samples have no cytokine row and are dropped by an inner merge.")

    # 3. Check SubjectID Match (Train vs Subjects)
    # A dtype mismatch between the two key columns is flagged below.
    print(f"\n[2] SubjectID Merge (Train <-> Subjects)")

    # Raw Types
    type_train = df_train['SubjectID'].dtype
    type_subj = df_subj['SubjectID'].dtype
    print(f"   - Train 'SubjectID' Type: {type_train}")
    print(f"   - Subj 'SubjectID' Type: {type_subj}")

    if type_train != type_subj:
        print("   ‚ö†Ô∏è WARNING: Type Mismatch detected! One is Int, one is Object/String.")
        print("   -> The pipeline MUST convert both to strings before merging.")

    # Test Overlap (converting to string to simulate the fix)
    train_subs = _clean_ids(df_train, 'SubjectID')
    subj_subs = _clean_ids(df_subj, 'SubjectID')

    common_subs = train_subs.intersection(subj_subs)
    print(f"   - Train Subjects: {len(train_subs)}")
    print(f"   - Metadata Subjects: {len(subj_subs)}")
    print(f"   - ‚úÖ Common Subjects (after string conversion): {len(common_subs)}")

    if len(common_subs) == 0:
        print("   ‚ùå ERROR: Even after string conversion, no common Subjects found. Data IDs are fundamentally different.")
    elif len(common_subs) < len(train_subs):
        # The difference counts distinct subjects, not samples.
        print(f"   ‚ÑπÔ∏è Note: {len(train_subs) - len(common_subs)} subjects in Train do not have matching Subject Metadata.")

# Run the diagnostic as soon as the cell executes; it prints its report and returns nothing.
verify_merge_compatibility()

Loading CSVs...

Shape: (2901, 4)
Key Column: 'SampleID'
Key Data Type: object
Contains Whitespace? False
Example IDs: ['Sample_AFTIWE', 'Sample_JQJVNK', 'Sample_YJWGWW']
Unique Keys: 1262 (Duplicates: 1639)

Shape: (670, 73)
Key Column: 'SampleID'
Key Data Type: object
Contains Whitespace? False
Example IDs: ['Sample_BDRJDQ', 'Sample_ESYUZA', 'Sample_CNKYCP']
Unique Keys: 670 (Duplicates: 0)

Shape: (66, 17)
Key Column: 'SubjectID'
Key Data Type: object
Contains Whitespace? False
Example IDs: ['Subject_UDAXIH', 'Subject_NHOSIZ', 'Subject_AYZFWN']
Unique Keys: 66 (Duplicates: 0)

               MERGE COMPATIBILITY CHECK

[1] SampleID Merge (Train <-> Cytokine)
   - Train Samples: 1262
   - Cytokine Samples: 670
   - ‚úÖ Common Samples: 670

[2] SubjectID Merge (Train <-> Subjects)
   - Train 'SubjectID' Type: object
   - Subj 'SubjectID' Type: object
   - Train Subjects: 66
   - Metadata Subjects: 66
   - ‚úÖ Common Subjects (after string conversion): 66


In [None]:

# [2] IMPORTS
import gzip
import os
import pickle
import shutil
import subprocess
import tempfile
import uuid
import warnings
import zlib

import numpy as np
import pandas as pd
import torch
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel

# [3] CONFIGURATION
class Config:
    """Static settings for the vectorization run.

    Groups the metadata CSV locations, the folders searched for FASTQ files,
    and the model / k-mer parameters read by the functions in this cell.
    """

    # Model Settings
    DENSE_MODEL_ID = "zhihan1996/DNABERT-2-117M"
    KMER_SIZE = 6
    MAX_DNA_LEN = 512  # cap on the sequence snippet and on the tokenizer's max_length
    OUTPUT_FILENAME = "bio_memory_dump.pkl"

    # Input Paths
    TRAIN_CSV = "/kaggle/input/trainmpeg/Train.csv"
    SUBJECT_CSV = "/kaggle/input/trainmpeg/Train_Subjects.csv"
    CYTOKINE_CSV = "/kaggle/input/d2d-cytokine-data/cytokine_profiles.csv"

    # FASTQ File Locations, searched in this order
    FASTQ_DIRS = [
        "/kaggle/input/mpeg-g-microbiomeclassificationconvertedfastqfiles/TrainFiles/TrainFiles",
        "/kaggle/input/secondbatchoffastqfiles/TrainFiles",
    ]

# Use the GPU when torch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"‚öôÔ∏è Hardware Acceleration: {device}")

# [4] DATA INGESTION
def load_metadata():
    """Load the metadata CSVs, join them, and attach each row's FASTQ path.

    Join keys are converted to stripped strings, Train is inner-joined to the
    cytokine table on ``SampleID`` and left-joined to the subject table on
    ``SubjectID``. Rows with no FASTQ file under ``Config.FASTQ_DIRS`` are
    dropped.

    Returns:
        pandas.DataFrame: the merged metadata with an added ``filepath``
        column and a fresh 0..n-1 index.
    """
    print("üìÇ Loading Metadata...")
    train = pd.read_csv(Config.TRAIN_CSV)
    subj = pd.read_csv(Config.SUBJECT_CSV)
    cyto = pd.read_csv(Config.CYTOKINE_CSV)

    # Force every join key to a whitespace-trimmed string so IDs match exactly.
    print("   üßπ Normalizing IDs...")
    key_columns = (
        (train, 'SampleID'),
        (cyto, 'SampleID'),
        (train, 'SubjectID'),
        (subj, 'SubjectID'),
    )
    for table, key in key_columns:
        table[key] = table[key].astype(str).str.strip()

    # Sample -> Cytokine -> Subject
    with_cytokines = pd.merge(train, cyto, on="SampleID", how="inner")
    df = pd.merge(with_cytokines, subj, on="SubjectID", how="left")

    def get_path(fname):
        """Return the first existing FASTQ path for ``fname``, or None."""
        base = str(fname).replace('.mgb', '.fastq')
        candidates = (
            os.path.join(folder, base + suffix)
            for folder in Config.FASTQ_DIRS
            for suffix in ['', '.gz']
        )
        return next((c for c in candidates if os.path.exists(c)), None)

    print("   Mapping file paths...")
    df['filepath'] = df['filename'].apply(get_path)
    located = df.dropna(subset=['filepath'])
    df_clean = located.reset_index(drop=True)

    print(f"‚úÖ Loaded {len(df_clean)} samples with valid metadata & files.")
    return df_clean

# [5] VECTORIZATION ENGINES
print("üß† Loading Genomic Foundation Model (DNABERT-2)...")
tokenizer = AutoTokenizer.from_pretrained(Config.DENSE_MODEL_ID, trust_remote_code=True)
model = AutoModel.from_pretrained(Config.DENSE_MODEL_ID, trust_remote_code=True).to(device)

def _read_first_sequence(file_path):
    """Return the stripped second line of a FASTQ file ('' when it is absent)."""
    # The path lookup may return a '.gz' file; plain open() cannot decode it.
    opener = gzip.open if str(file_path).endswith(".gz") else open
    with opener(file_path, "rt") as handle:
        next(handle, None)  # Skip header
        return next(handle, "").strip()


def generate_dense_embedding(file_path):
    """Semantic DNA embedding via Transformer.

    Only the first read of the file is embedded: the header line is skipped
    and the following line, cut to ``Config.MAX_DNA_LEN`` characters, is
    tokenized and mean-pooled over the token axis.

    Args:
        file_path: path to a FASTQ file, plain text or gzip-compressed.

    Returns:
        list[float]: the pooled embedding. A 768-element zero vector is
        returned when the file cannot be read or holds no sequence.
    """
    try:
        seq = _read_first_sequence(file_path)
    except (OSError, EOFError, ValueError, zlib.error) as exc:
        # Best-effort fail safe, but surface the problem instead of hiding it.
        warnings.warn(f"Could not read {file_path}: {exc}; using a zero vector.")
        return np.zeros(768).tolist()

    if not seq:
        # Nothing to embed; use the same placeholder as for unreadable files.
        return np.zeros(768).tolist()
    seq = seq[:Config.MAX_DNA_LEN]

    # Fixed truncation to prevent shape errors
    inputs = tokenizer(seq, return_tensors="pt", padding=True, truncation=True, max_length=Config.MAX_DNA_LEN).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
        # Mean over the token axis gives one vector for the snippet.
        embedding = torch.mean(outputs[0], dim=1).squeeze().cpu().numpy()
    return embedding.tolist()

def generate_sparse_embedding(file_path, kmer_size=None, index_space=100000):
    """K-mer Counting (Microbial Signature).

    Runs KMC on the file, dumps the k-mer database to text, and folds the
    counts into a sparse vector.

    Args:
        file_path: sequence file handed to KMC.
        kmer_size: k-mer length; defaults to ``Config.KMER_SIZE``.
        index_space: number of buckets the k-mer strings are hashed into.

    Returns:
        tuple[list[int], list[int]]: ``(indices, values)``. Indices are
        unique; counts of k-mers that share a bucket are summed. ``([], [])``
        is returned when KMC is missing or fails.
    """
    if kmer_size is None:
        kmer_size = Config.KMER_SIZE

    # Pick KMC's input-format flag from the extension: '-fm' is multi-FASTA,
    # so FASTQ input must be passed with '-fq'.
    lowered = str(file_path).lower()
    if lowered.endswith(".gz"):
        lowered = lowered[:-3]
    input_flag = "-fm" if lowered.endswith((".fa", ".fasta", ".fna")) else "-fq"

    bucket_counts = {}
    # A private scratch directory holds KMC's database, bins and dump, and is
    # removed on every exit path.
    with tempfile.TemporaryDirectory() as work_dir:
        db_prefix = os.path.join(work_dir, "kmers")
        dump_path = db_prefix + ".txt"
        # NOTE(review): KMC caps each counter at its -cs maximum (255 by
        # default per the KMC docs) - confirm saturated counts are acceptable.
        commands = (
            ["kmc", f"-k{kmer_size}", "-ci1", input_flag, str(file_path), db_prefix, work_dir],
            ["kmc_tools", "transform", db_prefix, "dump", dump_path],
        )
        try:
            # Argument lists (no shell) keep unusual paths from being
            # re-interpreted by a shell.
            for argv in commands:
                subprocess.run(argv, stdout=subprocess.DEVNULL,
                               stderr=subprocess.DEVNULL, check=True)
        except (OSError, subprocess.CalledProcessError) as exc:
            warnings.warn(f"k-mer counting failed for {file_path}: {exc}")
            return [], []

        if not os.path.exists(dump_path):
            return [], []

        with open(dump_path, 'r') as dump:
            for line in dump:
                parts = line.split()
                if len(parts) < 2:
                    continue
                # crc32 is stable across processes, unlike the built-in
                # hash() of a str, so indices are reproducible between runs.
                bucket = zlib.crc32(parts[0].encode("ascii", "replace")) % index_space
                bucket_counts[bucket] = bucket_counts.get(bucket, 0) + int(parts[1])

    return list(bucket_counts), list(bucket_counts.values())

# [6] EXECUTION PIPELINE
df = load_metadata()
vectors_payload = []

print(f"üöÄ Processing FULL Dataset ({len(df)} samples)...")

# Main loop: one record per merged metadata row.
for idx, row in tqdm(df.iterrows(), total=len(df)):
    fastq_path = row['filepath']

    # 1. One dense (DNABERT-2) and one sparse (k-mer) vector per file.
    dense = generate_dense_embedding(fastq_path)
    sp_ind, sp_val = generate_sparse_embedding(fastq_path)

    # 2. Metadata stored next to the vectors; columns absent from the merged
    #    frame fall back to the defaults passed to .get().
    payload = {
        "SampleID": row['SampleID'],
        "BodySite": row.get('SampleType', 'Unknown'),
        # Key name kept as 'InsulinSensitivity' for dashboard compatibility
        "InsulinSensitivity": row.get('Insulin_Sensitivity_Label', 'Unknown'),
        # Cytokines
        "IL22": row.get('IL22', 0),
        "EGF": row.get('EGF', 0),
        "TNFA": row.get('TNFA', 0),
    }
    vector = {
        "dense": dense,
        "sparse": {"indices": sp_ind, "values": sp_val},
    }

    item = {"id": idx, "vector": vector, "payload": payload}
    vectors_payload.append(item)

# [7] EXPORT
print(f"üíæ Saving Bio-Memory to {Config.OUTPUT_FILENAME}...")
with open(Config.OUTPUT_FILENAME, 'wb') as f:
    pickle.dump(vectors_payload, f)

print(f"‚úÖ Success! Go to the 'Output' tab to download '{Config.OUTPUT_FILENAME}'.")

‚öôÔ∏è Hardware Acceleration: cuda
üß† Loading Genomic Foundation Model (DNABERT-2)...


Some weights of BertModel were not initialized from the model checkpoint at zhihan1996/DNABERT-2-117M and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


üìÇ Loading Metadata...
   üßπ Normalizing IDs...
   Mapping file paths...
‚úÖ Loaded 1982 samples with valid metadata & files.
üöÄ Processing FULL Dataset (1982 samples)...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1982/1982 [16:51<00:00,  1.96it/s]

üíæ Saving Bio-Memory to bio_memory_dump.pkl...
‚úÖ Success! Go to the 'Output' tab to download 'bio_memory_dump.pkl'.



