In [2]:
import torch
from transformers import AutoTokenizer, AutoModel,BertConfig

# Hugging Face Transformers entry points used in this cell:
# - AutoTokenizer: resolves and loads the tokenizer for the checkpoint.
# - AutoModel: resolves and loads the model for the checkpoint.
# - BertConfig: configuration object handed explicitly to the model loader.

# Single source of truth for the checkpoint id, so tokenizer, config and model
# cannot drift apart through a typo in one of the three calls.
MODEL_NAME = "zhihan1996/DNABERT-2-117M"

# NOTE(review): trust_remote_code=True runs Python code shipped in the model
# repository. Consider pinning a `revision` so the executed code cannot change
# between runs.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
config = BertConfig.from_pretrained(MODEL_NAME)
# NOTE(review): loading reports the pooler weights as newly initialised. Later
# cells only read the first element of the model output, so this is presumably
# harmless; confirm before using the pooled output.
model = AutoModel.from_pretrained(MODEL_NAME, trust_remote_code=True, config=config)

Some weights of BertModel were not initialized from the model checkpoint at zhihan1996/DNABERT-2-117M and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
from Bio import SeqIO

# FASTA inputs for the two classes (nucleus and cytoplasm).
# NOTE: these names hold file paths only until the end of this cell, where they
# are rebound to the records parsed from those files.
Nucleus_data = "./data/TrainingSet/mRNA_sublocation_TrainingSet_nucleus_data.fasta"
Cytoplasm_data = "./data/TrainingSet/mRNA_sublocation_TrainingSet_cytoplasm_data.fasta"

# Read a FASTA file and return all of its records.
def read_fasta_file(file_path):
    """Parse a FASTA file into a list of ``(record id, sequence)`` tuples.

    Args:
        file_path: Source handed unchanged to ``SeqIO.parse`` with the
            ``"fasta"`` format.

    Returns:
        list: One ``(record.id, str(record.seq))`` tuple per parsed record, in
        the order the parser yields them. Every record is returned; there is
        no limit on the number of records.
    """
    return [
        (record.id, str(record.seq))
        for record in SeqIO.parse(file_path, "fasta")
    ]


# Rebind each name from its FASTA path to the parsed list of
# (record id, sequence) tuples returned by read_fasta_file.
Nucleus_data = read_fasta_file(Nucleus_data)
Cytoplasm_data = read_fasta_file(Cytoplasm_data)

In [4]:
def extract_elements(input_list):
    """Collect the second element of every entry that has at least two elements.

    Args:
        input_list: Iterable of sized, indexable entries.

    Returns:
        list: ``entry[1]`` for each entry whose length is 2 or more, in input
        order. Shorter entries are skipped.
    """
    return [entry[1] for entry in input_list if len(entry) >= 2]

In [5]:
# Reduce each (record id, sequence) tuple to its sequence string.
# The names are rebound in place, so the guards skip lists that already hold
# plain strings. Without them, running this cell a second time would index
# into the strings and replace every sequence with its second character.
if Nucleus_data and not isinstance(Nucleus_data[0], str):
    Nucleus_data = extract_elements(Nucleus_data)
if Cytoplasm_data and not isinstance(Cytoplasm_data[0], str):
    Cytoplasm_data = extract_elements(Cytoplasm_data)

In [6]:
import warnings
from tqdm import tqdm
# Install a blanket "ignore" filter: no category is given, so it applies to
# every warning raised from this point on, not only those from one library.
warnings.filterwarnings(action='ignore')

def dnabert(sequences):
    """Embed each sequence as the mean of the model's first output.

    Uses the notebook-level ``tokenizer`` and ``model``. Every sequence is
    tokenized and passed through the model on its own, wrapped in a ``tqdm``
    progress bar.

    Args:
        sequences: Iterable of sequence strings.

    Returns:
        list: One tensor per input sequence, in input order. Each tensor is
        the mean over dimension 0 of the first batch entry of the model's
        first output (presumably the token axis; TODO confirm).
    """
    def _embed(sequence):
        # Only the token ids are forwarded to the model.
        input_ids = tokenizer(sequence, return_tensors='pt', padding=True)["input_ids"]
        # Inference only: no autograd graph is recorded.
        with torch.no_grad():
            first_output = model(input_ids)[0]
            # One sequence per call, so batch entry 0 is the only entry.
            return torch.mean(first_output[0], dim=0)

    return [_embed(sequence) for sequence in tqdm(sequences)]

In [7]:
# Embed every sequence of both classes with the dnabert helper.
# This is the expensive step of the notebook: the recorded progress bars show
# about 1h42 for the 2028 nucleus sequences and about 1h23 for the 2634
# cytoplasm sequences.
Nucleus_BERT = dnabert(Nucleus_data)
Cytoplasm_BERT = dnabert(Cytoplasm_data)

100%|████████████████████████████████████████████████████████████████████████████| 2028/2028 [1:41:34<00:00,  3.01s/it]
100%|████████████████████████████████████████████████████████████████████████████| 2634/2634 [1:22:57<00:00,  1.89s/it]


In [8]:
import torch
import numpy as np
import pandas as pd

def _embeddings_to_frame(embeddings):
    """Return ``embeddings`` as a DataFrame with one row per embedding.

    Args:
        embeddings: Either a list of tensors (as returned by ``dnabert``) or a
            DataFrame produced by an earlier run of this cell.

    Returns:
        pandas.DataFrame: The input unchanged when it is already a DataFrame,
        otherwise a new frame built from the tensors' NumPy values.
    """
    # This cell rebinds Nucleus_BERT / Cytoplasm_BERT to DataFrames, so accept
    # a DataFrame here to keep the cell re-runnable.
    if isinstance(embeddings, pd.DataFrame):
        return embeddings
    # detach()/cpu() keep the conversion valid even if a tensor carries
    # gradient history or lives on an accelerator.
    return pd.DataFrame([tensor.detach().cpu().numpy() for tensor in embeddings])


# Convert both embedding lists to DataFrames and write each one to its own CSV
# file, leaving the row index out of the file.
Nucleus_BERT = _embeddings_to_frame(Nucleus_BERT)
csv_file_path = "Nucleus_BERT.csv"
Nucleus_BERT.to_csv(csv_file_path, index=False)

Cytoplasm_BERT = _embeddings_to_frame(Cytoplasm_BERT)
csv_file_path = "Cytoplasm_BERT.csv"
Cytoplasm_BERT.to_csv(csv_file_path, index=False)


In [9]:
import warnings
warnings.filterwarnings(action='ignore')  # Suppress warnings to avoid unnecessary output.

# Embed the nucleus sequences with the shared `dnabert` helper instead of a
# hand-copied version of its loop. The helper performs the same tokenize /
# forward / mean steps and returns the same list of tensors; the only visible
# difference is that it also shows a tqdm progress bar.
# NOTE(review): this repeats the nucleus embedding run performed earlier in
# the notebook. Confirm that the recomputation is still needed.
feature = dnabert(Nucleus_data)



In [10]:
import torch
import numpy as np
import pandas as pd

# Replace each tensor in `feature` by its NumPy array (the name is rebound),
# then build a DataFrame with one row per array and write it to CSV without
# the row index. The target path is the same "Nucleus_BERT.csv" that an
# earlier cell writes.
csv_file_path = "Nucleus_BERT.csv"
feature = [embedding.numpy() for embedding in feature]
df = pd.DataFrame(data=feature)
df.to_csv(csv_file_path, index=False)

In [14]:
import pandas as pd
# Reload the nucleus embeddings from the CSV file written earlier in the notebook.
csv_file_path = 'Nucleus_BERT.csv'
Nucleus= pd.read_csv(csv_file_path)
# Bare last expression: the frame is rendered as the cell output.
Nucleus

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
0,-0.051352,0.110863,0.017913,-0.117001,-0.130711,-0.051733,-0.087305,-0.113558,0.015367,0.044985,...,0.067846,-0.047920,-0.044799,0.009836,-0.027684,-0.060108,0.051503,0.074873,0.094912,0.142718
1,-0.028948,0.076648,0.053617,-0.151656,-0.125982,-0.033365,-0.052527,-0.106612,0.023717,0.048803,...,0.100743,-0.063734,-0.007245,0.007870,-0.022348,-0.066921,0.064694,0.101658,0.116233,0.136837
2,-0.053768,0.076979,0.011430,-0.088812,-0.123918,-0.040801,-0.099212,-0.122977,0.022925,0.024804,...,0.093461,-0.080484,-0.063357,0.037052,-0.023793,-0.052618,0.047666,0.088847,0.100192,0.156532
3,-0.013776,0.108522,0.030686,-0.113928,-0.116355,-0.049274,-0.067821,-0.104782,0.024167,0.016909,...,0.102133,-0.044457,-0.043053,0.013077,-0.001783,-0.055099,0.052132,0.093488,0.079941,0.162534
4,-0.041918,0.088144,0.067097,-0.108955,-0.116393,-0.039382,-0.087630,-0.126116,0.024942,0.020732,...,0.065886,-0.056244,-0.031527,0.027563,-0.006933,-0.035199,0.053570,0.106644,0.127546,0.129778
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023,-0.065482,0.058212,0.087450,-0.099524,-0.115115,-0.039661,-0.085792,-0.082386,0.037272,0.110976,...,0.065969,-0.090654,-0.121650,0.026158,0.039764,-0.044805,0.086667,0.070814,0.161227,0.155025
2024,-0.032713,0.058071,0.051727,-0.127587,-0.133989,-0.067111,-0.122108,-0.119116,0.032532,0.039487,...,0.047474,-0.082577,-0.079534,0.056675,-0.049686,-0.030281,0.011857,0.125576,0.122744,0.111504
2025,-0.052127,0.064980,0.110142,-0.152069,-0.109950,-0.031362,-0.082754,-0.046401,0.027816,0.078237,...,0.069121,-0.100153,-0.106001,0.048932,-0.003868,-0.047066,0.057358,0.095477,0.183928,0.146542
2026,-0.029458,0.147538,0.046201,-0.070029,-0.094834,-0.015033,-0.077472,-0.089705,0.003579,0.044217,...,0.085879,-0.059654,-0.015829,0.003033,-0.013082,-0.035074,0.070929,0.069388,0.090325,0.167693


In [15]:
# Reload the cytoplasm embeddings from the CSV file written earlier in the notebook.
csv_file_path = 'Cytoplasm_BERT.csv'
Cytoplasm = pd.read_csv(csv_file_path)
# Bare last expression: the frame is rendered as the cell output.
Cytoplasm

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
0,-0.038683,0.099217,0.071029,-0.172517,-0.102961,-0.045533,-0.088665,-0.116615,0.042821,0.041199,...,0.083354,-0.054207,-0.067697,0.061139,0.011034,-0.031217,0.021183,0.149285,0.152610,0.154324
1,-0.029777,0.074477,0.080808,-0.091672,-0.101717,-0.072539,-0.096508,-0.118764,0.041104,0.036672,...,0.072461,-0.066581,-0.067741,0.043531,-0.034703,-0.033670,-0.000296,0.112364,0.104377,0.137735
2,-0.025191,0.086182,0.051297,-0.123063,-0.074201,-0.046324,-0.065256,-0.083716,-0.035255,0.077267,...,0.077912,-0.090654,-0.062982,0.063398,0.005881,-0.050979,0.021004,0.043983,0.124656,0.085392
3,-0.073419,0.104642,0.051115,-0.131540,-0.109487,-0.046536,-0.085996,-0.094143,0.011976,0.040443,...,0.086169,-0.066325,-0.021002,0.038939,-0.004247,-0.049874,0.048066,0.086488,0.109812,0.127672
4,-0.040243,0.075767,0.042241,-0.108171,-0.142914,-0.059885,-0.070055,-0.092079,0.015970,0.069475,...,0.056925,-0.077418,-0.048793,0.029728,-0.017299,-0.066004,0.034396,0.110094,0.122443,0.163892
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2629,-0.045684,0.073154,0.031592,-0.088793,-0.112148,-0.057297,-0.112139,-0.115460,0.027914,0.033002,...,0.065734,-0.071451,-0.064951,0.026554,-0.024152,-0.040399,0.054827,0.108673,0.107649,0.108352
2630,-0.051314,0.101583,0.049372,-0.130485,-0.146191,-0.078210,-0.049869,-0.120071,0.025753,0.044124,...,0.087611,-0.069669,-0.118523,0.001044,-0.004282,-0.044469,0.064367,0.085585,0.090057,0.146606
2631,-0.072383,0.114879,0.043321,-0.113051,-0.119378,-0.046503,-0.109755,-0.090195,0.016729,0.033405,...,0.056422,-0.073933,-0.074885,0.044387,-0.000744,-0.048722,-0.010145,0.098738,0.119175,0.117543
2632,-0.077676,0.075969,0.107718,-0.085082,-0.079427,-0.056554,-0.071221,-0.067641,0.003955,0.040569,...,0.044884,-0.071051,-0.068419,-0.018095,0.009988,-0.081945,0.103257,-0.006743,0.098912,0.106491


In [16]:
# Label every nucleus row 1 and every cytoplasm row 0. The label lists are
# sized from the frames themselves rather than from hard-coded row counts, so
# the cell keeps working when the number of input sequences changes.
Nucleus_label = [1] * len(Nucleus)
Cytoplasm_label = [0] * len(Cytoplasm)
Nucleus['label'] = Nucleus_label
Cytoplasm['label'] = Cytoplasm_label
# Stack both classes into a single frame with a fresh 0..n-1 row index.
DNABERT_data = pd.concat([Nucleus, Cytoplasm], ignore_index=True)
DNABERT_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,759,760,761,762,763,764,765,766,767,label
0,-0.051352,0.110863,0.017913,-0.117001,-0.130711,-0.051733,-0.087305,-0.113558,0.015367,0.044985,...,-0.047920,-0.044799,0.009836,-0.027684,-0.060108,0.051503,0.074873,0.094912,0.142718,1
1,-0.028948,0.076648,0.053617,-0.151656,-0.125982,-0.033365,-0.052527,-0.106612,0.023717,0.048803,...,-0.063734,-0.007245,0.007870,-0.022348,-0.066921,0.064694,0.101658,0.116233,0.136837,1
2,-0.053768,0.076979,0.011430,-0.088812,-0.123918,-0.040801,-0.099212,-0.122977,0.022925,0.024804,...,-0.080484,-0.063357,0.037052,-0.023793,-0.052618,0.047666,0.088847,0.100192,0.156532,1
3,-0.013776,0.108522,0.030686,-0.113928,-0.116355,-0.049274,-0.067821,-0.104782,0.024167,0.016909,...,-0.044457,-0.043053,0.013077,-0.001783,-0.055099,0.052132,0.093488,0.079941,0.162534,1
4,-0.041918,0.088144,0.067097,-0.108955,-0.116393,-0.039382,-0.087630,-0.126116,0.024942,0.020732,...,-0.056244,-0.031527,0.027563,-0.006933,-0.035199,0.053570,0.106644,0.127546,0.129778,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4657,-0.045684,0.073154,0.031592,-0.088793,-0.112148,-0.057297,-0.112139,-0.115460,0.027914,0.033002,...,-0.071451,-0.064951,0.026554,-0.024152,-0.040399,0.054827,0.108673,0.107649,0.108352,0
4658,-0.051314,0.101583,0.049372,-0.130485,-0.146191,-0.078210,-0.049869,-0.120071,0.025753,0.044124,...,-0.069669,-0.118523,0.001044,-0.004282,-0.044469,0.064367,0.085585,0.090057,0.146606,0
4659,-0.072383,0.114879,0.043321,-0.113051,-0.119378,-0.046503,-0.109755,-0.090195,0.016729,0.033405,...,-0.073933,-0.074885,0.044387,-0.000744,-0.048722,-0.010145,0.098738,0.119175,0.117543,0
4660,-0.077676,0.075969,0.107718,-0.085082,-0.079427,-0.056554,-0.071221,-0.067641,0.003955,0.040569,...,-0.071051,-0.068419,-0.018095,0.009988,-0.081945,0.103257,-0.006743,0.098912,0.106491,0


In [17]:
# Destination for the combined, labelled embedding table.
csv_file_path = './data/TrainingSet/mRNA_sublocation_TrainingSet_DNABERT_data.csv'
# Write the frame to CSV; index=False leaves the row index out of the file.
DNABERT_data.to_csv(csv_file_path, index=False)