In [2]:
import torch
from transformers import AutoTokenizer, AutoModel,BertConfig

# Load the DNABERT-2 (117M) tokenizer, configuration and model. All three
# from_pretrained calls use the same checkpoint id.
# - tokenizer: AutoTokenizer, loaded with trust_remote_code=True.
# - config:    BertConfig read from the checkpoint and handed to the model below.
# - model:     AutoModel, loaded with trust_remote_code=True and the config above.
tokenizer = AutoTokenizer.from_pretrained(
    "zhihan1996/DNABERT-2-117M",
    trust_remote_code=True,
)
config = BertConfig.from_pretrained("zhihan1996/DNABERT-2-117M")
model = AutoModel.from_pretrained(
    "zhihan1996/DNABERT-2-117M",
    trust_remote_code=True,
    config=config,
)

Some weights of BertModel were not initialized from the model checkpoint at zhihan1996/DNABERT-2-117M and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
from Bio import SeqIO

# Paths of the two FASTA inputs (nucleus and cytoplasm sequences).
# NOTE: these names hold path strings only until the end of this cell, where
# they are rebound to the records returned by read_fasta_file.
Nucleus_data = "./data/TestSet/mRNA_sublocation_TestSet_nucleus_data.fasta"
Cytoplasm_data = "./data/TestSet/mRNA_sublocation_TestSet_cytoplasm_data.fasta"

def read_fasta_file(file_path):
    """Read every record of a FASTA file.

    Args:
        file_path: Location of the FASTA file; passed unchanged to
            ``SeqIO.parse(file_path, "fasta")``.

    Returns:
        A list of ``(record_id, sequence)`` tuples in file order, where the
        sequence has been converted to ``str``. All records are returned;
        there is no limit on how many are read.
    """
    return [(record.id, str(record.seq)) for record in SeqIO.parse(file_path, "fasta")]


# Parse both FASTA files. From here on Nucleus_data / Cytoplasm_data are lists
# of (record id, sequence) tuples instead of the path strings assigned above.
Nucleus_data = read_fasta_file(Nucleus_data)
Cytoplasm_data = read_fasta_file(Cytoplasm_data)

In [6]:
def extract_elements(input_list):
    """Collect the second element of every item that has at least two elements.

    Args:
        input_list: Iterable of sized, indexable items, e.g. ``(id, sequence)``
            tuples.

    Returns:
        A list containing ``item[1]`` for each item whose length is 2 or more,
        in input order. Shorter items are skipped silently.
    """
    return [entry[1] for entry in input_list if len(entry) >= 2]

In [8]:
# Keep only the sequence strings (element 1 of each (id, sequence) tuple).
# NOTE(review): this cell is not idempotent. Running it a second time passes a
# list of strings to extract_elements, which would then return item[1] of each
# string, i.e. the second character of every sequence.
Nucleus_data = extract_elements(Nucleus_data)
Cytoplasm_data = extract_elements(Cytoplasm_data)


In [10]:
import warnings
from tqdm import tqdm
# Install a blanket 'ignore' filter: warnings raised after this point are suppressed.
warnings.filterwarnings(action='ignore')

def dnabert(sequences):
    """Embed each sequence as the mean of the model output along dimension 0.

    Relies on the module-level ``tokenizer`` and ``model``. Every sequence is
    tokenized and passed through the model on its own (one forward pass per
    sequence) with gradient tracking disabled.

    Args:
        sequences: Iterable of sequence strings; progress is reported via tqdm.

    Returns:
        A list with one tensor per input sequence: element 0 of the model
        output, indexed at its first entry and averaged over dimension 0.
    """
    embeddings = []
    # Inference only: no autograd graph is needed for any of the forward passes.
    with torch.no_grad():
        for seq in tqdm(sequences):
            input_ids = tokenizer(seq, return_tensors='pt', padding=True)["input_ids"]
            # First element of the model output, then its first entry along
            # the leading axis (a single sequence is passed per call).
            token_states = model(input_ids)[0][0]
            embeddings.append(token_states.mean(dim=0))
    return embeddings

In [11]:
# Compute one embedding tensor per sequence for each class. This is the slow
# step of the notebook (one forward pass per sequence; the recorded progress
# bars below show the wall-clock time of the original run).
Nucleus_BERT = dnabert(Nucleus_data)
Cytoplasm_BERT = dnabert(Cytoplasm_data)

100%|████████████████████████████████████████████████████████████████████████████████| 228/228 [11:43<00:00,  3.08s/it]
100%|████████████████████████████████████████████████████████████████████████████████| 290/290 [07:36<00:00,  1.57s/it]


In [12]:
import torch
import numpy as np
import pandas as pd

# On entry Nucleus_BERT and Cytoplasm_BERT are the lists of tensors returned by
# dnabert. Each name is rebound twice below: first to a list of NumPy arrays,
# then to a DataFrame with one row per sequence.
# NOTE: because of that rebinding this cell is not idempotent; re-run the
# dnabert cell before running it again.

# Tensors -> NumPy arrays
Nucleus_BERT = [emb.numpy() for emb in Nucleus_BERT]
Cytoplasm_BERT = [emb.numpy() for emb in Cytoplasm_BERT]

# Nucleus embeddings: build the table and write it without the index column
csv_file_path = "Nucleus_BERT.csv"
Nucleus_BERT = pd.DataFrame(Nucleus_BERT)
Nucleus_BERT.to_csv(csv_file_path, index=False)

# Cytoplasm embeddings: same steps, separate output file
csv_file_path = "Cytoplasm_BERT.csv"
Cytoplasm_BERT = pd.DataFrame(Cytoplasm_BERT)
Cytoplasm_BERT.to_csv(csv_file_path, index=False)


In [13]:
import warnings
warnings.filterwarnings(action='ignore')  # Suppress warnings to avoid unnecessary output.

# Embed every nucleus sequence. The loop that used to live here repeated the
# body of dnabert() line for line (tokenize, forward pass under
# torch.no_grad(), mean over dimension 0), so the helper is called instead.
# `feature` is a list with one embedding tensor per sequence in Nucleus_data.
# NOTE(review): dnabert(Nucleus_data) was already evaluated earlier in the
# notebook (assigned to Nucleus_BERT), so this is a second, full forward pass
# over the same sequences.
feature = dnabert(Nucleus_data)



In [14]:
import torch
import numpy as np
import pandas as pd

# Convert each embedding to a NumPy array. np.asarray is used instead of
# tensor.numpy() so the cell can be re-run: entries that are already arrays are
# passed through unchanged instead of raising AttributeError.
feature = [np.asarray(embedding) for embedding in feature]

# One row per embedding; written without the index column.
# NOTE: "Nucleus_BERT.csv" is the same path the earlier Nucleus_BERT export
# wrote to, so that file is overwritten here.
df = pd.DataFrame(feature)
csv_file_path = "Nucleus_BERT.csv"
df.to_csv(csv_file_path, index=False)

In [15]:
import pandas as pd
# Reload the nucleus embeddings from the CSV written earlier in the notebook.
# The bare name on the last line displays the DataFrame as the cell output.
csv_file_path = 'Nucleus_BERT.csv'
Nucleus= pd.read_csv(csv_file_path)
Nucleus

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
0,-0.029158,0.100403,0.067271,-0.116924,-0.109726,-0.056563,-0.095235,-0.063101,0.026661,0.055791,...,0.080308,-0.053218,-0.046008,0.024874,-0.005043,-0.057149,0.022523,0.123339,0.146821,0.165012
1,-0.051528,0.083082,0.069916,-0.125172,-0.106272,-0.032767,-0.076169,-0.062987,0.034608,0.097928,...,0.110263,-0.060201,-0.053725,0.023453,0.007158,-0.104437,0.018186,0.066489,0.168612,0.168197
2,-0.042795,0.084680,0.087846,-0.136758,-0.113154,-0.070079,-0.075557,-0.065823,0.027558,0.086018,...,0.094097,-0.086440,-0.074154,0.037891,0.010463,-0.048493,0.094847,0.098022,0.145162,0.164766
3,-0.011203,0.055997,0.090977,-0.172956,-0.144822,-0.070694,-0.123100,-0.074747,0.023865,0.037438,...,0.053678,-0.072469,-0.075094,0.079060,-0.053585,-0.033657,-0.002991,0.147035,0.143220,0.139921
4,-0.023741,0.064950,0.127039,-0.152309,-0.082382,-0.070835,-0.145091,-0.057588,0.054729,0.018183,...,0.106842,-0.060701,-0.105159,0.082175,-0.042828,-0.013750,-0.040331,0.205573,0.173816,0.187638
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
223,-0.026316,0.104323,0.055210,-0.140533,-0.119295,-0.047682,-0.062932,-0.121822,0.022041,0.036620,...,0.095601,-0.048452,-0.007237,0.021239,-0.021181,-0.059708,0.082249,0.108367,0.105260,0.162868
224,-0.020238,0.047819,0.134550,-0.121638,-0.135974,-0.094798,-0.126838,-0.131943,0.054405,0.072603,...,0.052637,-0.129961,-0.113173,0.026893,0.058380,-0.092897,0.047230,0.011677,0.126219,0.192245
225,-0.036119,0.097049,0.067029,-0.113603,-0.077666,-0.070627,-0.074438,-0.117321,0.021768,0.062337,...,0.050450,-0.073496,-0.017835,0.041560,0.000900,-0.017476,0.022009,0.091376,0.090168,0.131675
226,-0.062857,0.094254,0.057334,-0.151865,-0.096824,-0.056928,-0.069792,-0.143615,0.009395,0.067408,...,0.058263,-0.023868,-0.056047,0.007927,0.038953,-0.035562,0.023845,0.120890,0.067947,0.161538


In [16]:
# Reload the cytoplasm embeddings from the CSV written earlier in the notebook.
# The bare name on the last line displays the DataFrame as the cell output.
csv_file_path = 'Cytoplasm_BERT.csv'
Cytoplasm = pd.read_csv(csv_file_path)
Cytoplasm

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
0,-0.042189,0.107438,0.080789,-0.112513,-0.127146,-0.037983,-0.059289,-0.098743,0.031087,0.052883,...,0.096625,-0.068310,-0.041489,0.025254,0.012674,-0.040745,0.040516,0.105912,0.134756,0.157224
1,-0.042215,0.089809,0.078107,-0.119019,-0.105611,-0.055412,-0.092076,-0.121383,0.036334,0.045928,...,0.075161,-0.097369,-0.087646,0.011577,0.024423,-0.025878,0.073588,0.077925,0.110391,0.141546
2,-0.070403,0.073220,0.104448,-0.121054,-0.119362,-0.093688,-0.087088,-0.119679,0.042382,0.039526,...,0.029260,-0.103871,-0.086329,0.012559,0.008106,-0.032820,0.058929,0.082133,0.102700,0.129086
3,-0.032643,0.061018,0.091801,-0.127989,-0.139731,-0.077670,-0.053395,-0.108600,0.037156,0.075086,...,0.068130,-0.090856,-0.103651,0.025696,0.011325,-0.075645,0.082667,0.049499,0.129458,0.145019
4,-0.039035,0.081107,0.040233,-0.113893,-0.139828,-0.065178,-0.083956,-0.109940,0.015175,0.024564,...,0.092401,-0.084888,-0.045685,-0.014303,-0.005285,-0.030946,0.072814,0.097811,0.099510,0.167611
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
285,-0.038825,0.066094,0.105642,-0.127595,-0.096262,-0.075581,-0.077916,-0.082450,0.028816,0.078428,...,0.074637,-0.083052,-0.081839,0.025406,0.034709,-0.057688,0.085145,0.059397,0.119146,0.114285
286,-0.048585,0.073876,0.038217,-0.139759,-0.106150,-0.056394,-0.057778,-0.105467,0.015478,0.053942,...,0.066384,-0.050953,-0.018395,-0.001208,-0.011797,-0.064337,0.017005,0.087882,0.118566,0.137715
287,-0.039191,0.074916,0.047489,-0.125525,-0.114935,-0.063233,-0.059096,-0.098679,-0.006991,0.081666,...,0.066718,-0.077407,-0.028342,0.029947,-0.024831,-0.072220,-0.001702,0.085262,0.091931,0.144309
288,-0.028589,0.071419,0.104370,-0.118703,-0.104961,-0.059899,-0.096033,-0.123387,0.024864,0.105596,...,0.057353,-0.115380,-0.046133,0.049324,-0.001430,-0.059596,0.059717,0.050933,0.101675,0.132865


In [17]:
# Class labels: 1 for every row of the Nucleus DataFrame, 0 for every row of
# the Cytoplasm DataFrame. The label lists are sized from the DataFrames
# themselves rather than from hard-coded row counts, so the assignment cannot
# fail with a length mismatch when the number of sequences changes.
Nucleus_label = [1] * len(Nucleus)
Cytoplasm_label = [0] * len(Cytoplasm)
# Adds (or overwrites) a 'label' column on each DataFrame in place.
Nucleus['label'] = Nucleus_label
Cytoplasm['label'] = Cytoplasm_label
# Stack nucleus rows on top of cytoplasm rows; ignore_index=True renumbers the
# combined rows from 0.
DNABERT_data = pd.concat([Nucleus, Cytoplasm], ignore_index=True)
DNABERT_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,759,760,761,762,763,764,765,766,767,label
0,-0.029158,0.100403,0.067271,-0.116924,-0.109726,-0.056563,-0.095235,-0.063101,0.026661,0.055791,...,-0.053218,-0.046008,0.024874,-0.005043,-0.057149,0.022523,0.123339,0.146821,0.165012,1
1,-0.051528,0.083082,0.069916,-0.125172,-0.106272,-0.032767,-0.076169,-0.062987,0.034608,0.097928,...,-0.060201,-0.053725,0.023453,0.007158,-0.104437,0.018186,0.066489,0.168612,0.168197,1
2,-0.042795,0.084680,0.087846,-0.136758,-0.113154,-0.070079,-0.075557,-0.065823,0.027558,0.086018,...,-0.086440,-0.074154,0.037891,0.010463,-0.048493,0.094847,0.098022,0.145162,0.164766,1
3,-0.011203,0.055997,0.090977,-0.172956,-0.144822,-0.070694,-0.123100,-0.074747,0.023865,0.037438,...,-0.072469,-0.075094,0.079060,-0.053585,-0.033657,-0.002991,0.147035,0.143220,0.139921,1
4,-0.023741,0.064950,0.127039,-0.152309,-0.082382,-0.070835,-0.145091,-0.057588,0.054729,0.018183,...,-0.060701,-0.105159,0.082175,-0.042828,-0.013750,-0.040331,0.205573,0.173816,0.187638,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
513,-0.038825,0.066094,0.105642,-0.127595,-0.096262,-0.075581,-0.077916,-0.082450,0.028816,0.078428,...,-0.083052,-0.081839,0.025406,0.034709,-0.057688,0.085145,0.059397,0.119146,0.114285,0
514,-0.048585,0.073876,0.038217,-0.139759,-0.106150,-0.056394,-0.057778,-0.105467,0.015478,0.053942,...,-0.050953,-0.018395,-0.001208,-0.011797,-0.064337,0.017005,0.087882,0.118566,0.137715,0
515,-0.039191,0.074916,0.047489,-0.125525,-0.114935,-0.063233,-0.059096,-0.098679,-0.006991,0.081666,...,-0.077407,-0.028342,0.029947,-0.024831,-0.072220,-0.001702,0.085262,0.091931,0.144309,0
516,-0.028589,0.071419,0.104370,-0.118703,-0.104961,-0.059899,-0.096033,-0.123387,0.024864,0.105596,...,-0.115380,-0.046133,0.049324,-0.001430,-0.059596,0.059717,0.050933,0.101675,0.132865,0


In [18]:
# Destination of the combined table (embedding columns plus the 'label' column).
csv_file_path = './data/TestSet/mRNA_sublocation_TestSet_DNABERT_data.csv'

# Write DNABERT_data as CSV; index=False leaves the row index out of the file.
DNABERT_data.to_csv(csv_file_path, index=False)
