In [1]:
%pip install -r requirements.txt




    fonttools>=3.0<4.0
             ~~~~~^


In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM

# Load the pretrained DNABERT-2 (117M) masked-LM checkpoint and its tokenizer.
# Both objects are module-level globals reused by the cells below.
# NOTE(review): trust_remote_code=True presumably lets the checkpoint
# repository supply its own Python model/tokenizer code — confirm the source
# is trusted and consider pinning a specific `revision`.
model_name = "zhihan1996/DNABERT-2-117M"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForMaskedLM.from_pretrained(model_name, trust_remote_code=True)


  from .autonotebook import tqdm as notebook_tqdm


In [20]:
def read_fastq(file_path):
    """Read the nucleotide sequences from a FASTQ file.

    The file is consumed as 4-line records (identifier, sequence, separator,
    quality scores); only the sequence line of each record is kept.

    Args:
        file_path: Path of the FASTQ file to read.

    Returns:
        list[str]: The non-empty sequences in file order, with surrounding
        whitespace stripped. Records whose sequence line is empty
        (zero-length reads) are skipped rather than ending the parse.
    """
    sequences = []
    with open(file_path, "r") as file:
        while True:
            # Keep the raw line: readline() returns "" only at end of file,
            # whereas a blank line is "\n". Stripping first would make the
            # two indistinguishable.
            identifier = file.readline()  # Read identifier (@...)
            if not identifier:  # End of file
                break
            if not identifier.strip():
                # Stray blank line between records or at the end of the file;
                # skipping it keeps the 4-line framing aligned.
                continue

            sequence = file.readline().strip()  # Read DNA sequence
            file.readline()                     # Skip separator (+)
            file.readline()                     # Skip quality scores

            if sequence:
                sequences.append(sequence)  # Store DNA sequence

    return sequences

# Example usage: load every read from a FASTQ file in the working directory.
# `dna_sequences` is reused by the masking cell further down.
fastq_file = "ERR13259920.fastq"  # Replace with your file
dna_sequences = read_fastq(fastq_file)

# Print the first 5 sequences as a quick sanity check of the parse
print(dna_sequences[:5])


['TGGAGTGGAGTTTTCCTGTGGAGAGGAGCCATGCCTA', 'CTCCTCCTGCCCTTCCTCATGAACCCCAGTCGCGCCTCTGCCTTCTCATCCCTGC', 'ACTGTGCCCTAGACCTGCTCCCCTAGGCACTACAGTGGGGCCCTTGGTTGCAA', 'GAGGCAGGGAAGGCGGACAAAACTGGGAGAGGGAGAGAGTGTT', 'CACTAACTCAGGGAAGGCTTCCCTAACTGCCTCCC']


In [35]:
import random
def count_positional_matches(list1, list2):
    """Count the positions at which two sequences hold equal elements.

    Elements are paired by index; pairing stops at the end of the shorter
    input, so trailing items of the longer one are ignored.

    Args:
        list1: First iterable of items.
        list2: Second iterable of items.

    Returns:
        int: Number of index positions where the paired items compare equal.
    """
    matches = 0
    for left, right in zip(list1, list2):
        if left == right:
            matches += 1
    return matches
def mask_dna_sequences(sequences, min_len=3, max_len=6, rng=None):
    """Replace one random contiguous span of each sequence with "[MASK]".

    Args:
        sequences: Iterable of DNA strings.
        min_len: Smallest span length to remove. Sequences shorter than this
            are returned unchanged.
        max_len: Largest span length to remove. For sequences shorter than
            ``max_len`` the span length is capped at the sequence length.
        rng: Optional ``random.Random`` instance used for all draws, for
            reproducible masking. Defaults to the module-level ``random``
            generator.

    Returns:
        tuple[list[str], list[str | None]]: The masked sequences and, at the
        same index, the subsequence that was removed (``None`` when the
        sequence was too short to mask).

    Raises:
        ValueError: If ``min_len`` is greater than ``max_len`` (propagated
            from ``randint``).
    """
    if rng is None:
        rng = random

    masked_sequences = []
    removed_subsequences = []

    for seq in sequences:
        if len(seq) < min_len:
            # Skip sequences that are too short
            masked_sequences.append(seq)
            removed_subsequences.append(None)
            continue

        # Choose a random subsequence length, never longer than the sequence:
        # otherwise the start-position draw below gets a negative upper bound
        # and randint raises ValueError.
        sub_len = rng.randint(min_len, min(max_len, len(seq)))

        # Choose a random start position (ensuring the subsequence fits)
        start_idx = rng.randint(0, len(seq) - sub_len)
        end_idx = start_idx + sub_len

        # Extract the removed subsequence
        removed_subseq = seq[start_idx:end_idx]

        # Replace it with '[MASK]'
        masked_seq = seq[:start_idx] + "[MASK]" + seq[end_idx:]

        # Store results
        masked_sequences.append(masked_seq)
        removed_subsequences.append(removed_subseq)

    return masked_sequences, removed_subsequences

In [39]:
# Mask one random span in each of the first 100 reads and display the
# (masked sequences, removed subsequences) pair.
# NOTE(review): mask_dna_sequences draws from `random` and no seed is set in
# the visible cells, so this output differs from run to run.
masked, removed = mask_dna_sequences(dna_sequences[:100])
masked, removed

(['TGGAGTGGAGTTTTCCT[MASK]AGGAGCCATGCCTA',
  'CTCCTCCTGCCCTTCCTCATGAACCCC[MASK]CGCGCCTCTGCCTTCTCATCCCTGC',
  'ACTGTGCCC[MASK]CCTGCTCCCCTAGGCACTACAGTGGGGCCCTTGGTTGCAA',
  'GAGGCAGGGAAGGC[MASK]CAAAACTGGGAGAGGGAGAGAGTGTT',
  '[MASK]ACTCAGGGAAGGCTTCCCTAACTGCCTCCC',
  'CTCCCACCTTGGCCTCC[MASK]GTGCTGAGATTACAGGCATGAGCCAC',
  'CTCGCTCTGTCACCCAGGGTGGAGTGCAGTGATGGGGTCCACTGCAACCTCCA[MASK]AGGTTCAAGTGATTCTCCTGCCTCAGCCTCCCGAGTAGGTGAGAC',
  'ACCTGGATAGGCACAGTTTTCAGAAATCCTTTCAA[MASK]TACCAATTCTCCTGTCTACAAGTTTACCTTCCTC',
  '[MASK]AGCAGCAGCCTCAGAAGCAAAAGTTGTCTCTGACCTTCTCCTGCCCTCCTGTCTCTCAGTCTCATTCT',
  'AATCCATTCCCACAAGCACACAGCACAGCTAAACCAGCTCCAAGGAG[MASK]GAGTGTC',
  'GCAGGAGGCTG[MASK]GGGAGGATCCTTTGAGCCTAGGAGTT',
  'AGGCGT[MASK]ACCGTGCCTGGCCAAGTCTAGCACCTTTTAAA',
  'GTGTGCCTGCAAAGATGGTAGAGTAGATGACGGGTTGGGCCAGGGGATTAA[MASK]GTACGGGA',
  'GTAGTACGTGTCGTGTAGTACGATGTCTAGTGATGAGTTTGCTAATACA[MASK]CAGTCAGGCCACCTACGGT',
  'ACTACGGCGGACTAATCTTCAAC[MASK]ACATACTTCCCCCATTATTCCT',
  'CGAACCTGATCTCTTA[MASK]GTATCCTTAATCAT

In [42]:

# Tokenize the masked reads as one padded batch and run the masked-LM forward
# pass without tracking gradients.
inputs = tokenizer(masked, padding='longest', return_tensors="pt")
print('Tokenized : ', tokenizer.batch_decode(inputs['input_ids']))
with torch.no_grad():
    outputs = model(**inputs)

# Locate every [MASK] token as a (batch row, token position) pair.
# Both indices are needed: indexing the logits with row 0 only would read the
# predictions for every sequence from the first sequence's logits.
# nonzero() returns the pairs in row-major order, so the predictions follow the
# order of `masked`; a sequence with no [MASK] (too short to be masked, its
# `removed` entry is None) contributes no prediction.
mask_rows, masked_index = (inputs["input_ids"] == tokenizer.mask_token_id).nonzero(as_tuple=True)
predicted_token_id = outputs.logits[mask_rows, masked_index].argmax(dim=-1)

# Decode all predicted ids into one string; downstream code recovers the
# individual tokens by splitting it on whitespace.
# NOTE(review): each [MASK] stands for a single tokenizer token, while the
# removed spans are 3-6 nucleotides cut at arbitrary positions — an exact
# match with `removed` is presumably rare even for a good prediction.
predicted_token = tokenizer.decode(predicted_token_id)

print("Predicted Token:", predicted_token)
print("Real Tokens : ", removed)

Tokenized :  ['[CLS] TGGA GTGGA GTTTT CC T [MASK] A GGA GCCATG CCTA [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]', '[CLS] CTCCTCC TGCC CTT CCTCA TGAA CCCC [MASK] CGC GCCTCTG CCTT CTCA TCCCTG C [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]', '[CLS] A CTG TGCC C [MASK] CCTG CTCC CCTAGG CACTA CAGTG GGGCC CTTGGTT GCAA [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]', '[CLS] GAGGCAGG GAAGG C [MASK] CAAAA CTGGGA GAGGGAGA GAGTGTT [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]', '[CLS] [MASK] A CTCA GGGAA GGCTT CCCTAA CTG CCTCC C [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]', '[CLS] CTCC CACCTT GGCCTCC [MASK] GTGCTGA GATTA CAGG CATGA GCCA C [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [P

In [41]:
# Count how many whitespace-separated predicted tokens exactly equal the
# removed subsequence at the same list position.
# NOTE(review): `removed` holds None for sequences that were too short to
# mask, which have no prediction — presumably that shifts the pairing for all
# later entries; verify alignment before trusting the count.
count_positional_matches(predicted_token.split(), removed)

0

In [16]:
from transformers import AutoModelForMaskedLM, AutoTokenizer
import torch

# example sequence and tokenization
sequences = ['ATATACGGCCGNC','GGGTATCGCTTCCGAC']
tokens = tokenizer(sequences,padding="longest")['input_ids']
print(f"Tokenzied sequence: {tokenizer.batch_decode(tokens)}")

# inference
# NOTE: model.to(device) moves the shared `model` object itself, so any cell
# run after this one must also place its inputs on `device`.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)
inputs = tokenizer(sequences, truncation=True, padding='max_length', max_length=512, 
                   return_tensors="pt")
inputs = {k: v.to(device) for k, v in inputs.items()}
# Pure inference: skip autograd bookkeeping to save memory and time.
with torch.no_grad():
    outs = model(
        **inputs,
        output_hidden_states=True
    )

# get the final layer embeddings and prediction logits
# .cpu() is required before .numpy(): when `device` is CUDA the tensors live
# on the GPU and cannot be converted to NumPy arrays directly.
embeddings = outs['hidden_states'][-1].detach().cpu().numpy()
logits = outs['logits'].detach().cpu().numpy()


Tokenzied sequence: ['[CLS] A TATA CGGCC G [UNK] C [SEP]', '[CLS] GGGTA TCGCTT CCGA C [SEP] [PAD] [PAD]']
