### performance and time

In [4]:
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report, accuracy_score, matthews_corrcoef, roc_auc_score, confusion_matrix
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
import torch
import time
import joblib

# Read data from CSV files
train_df = pd.read_csv('train_seq1.csv')  # Update with the correct file path
test_df = pd.read_csv('test_seq.csv')    # Update with the correct file path

# Extract sequences and labels from the dataframes
train_sequences = train_df['sequence'].tolist()
train_labels = train_df['label'].tolist()
test_sequences = test_df['sequence'].tolist()
test_labels = test_df['label'].tolist()

# Tokenizer
tokenizer = BertTokenizer.from_pretrained("dmis-lab/biobert-base-cased-v1.1")

# Encode peptide sequences
train_encodings = tokenizer(train_sequences, truncation=True, padding=True, return_tensors='pt')
test_encodings = tokenizer(test_sequences, truncation=True, padding=True, return_tensors='pt')

# Convert labels to tensors
train_labels = torch.tensor(train_labels).long()
test_labels = torch.tensor(test_labels).long()

# Split data into train and test sets
train_inputs, test_inputs, train_masks, test_masks = train_encodings.input_ids, test_encodings.input_ids, \
                                                      train_encodings.attention_mask, test_encodings.attention_mask

# Load pre-trained model
model = BertForSequenceClassification.from_pretrained("dmis-lab/biobert-base-cased-v1.1", num_labels=2)

# Optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

# Training loop
batch_size = 16
epochs = 3

start_time = time.time()  # Start training time measurement

for epoch in range(epochs):
    for i in range(0, len(train_inputs), batch_size):
        batch_inputs = train_inputs[i:i + batch_size]
        batch_masks = train_masks[i:i + batch_size]
        batch_labels = train_labels[i:i + batch_size]

        optimizer.zero_grad()
        outputs = model(batch_inputs, attention_mask=batch_masks, labels=batch_labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

end_time = time.time()  # End training time measurement
training_time = end_time - start_time
print(f"Training time: {training_time} seconds")

# Save the model to a .pkl file
model_path = "biobert_model.pkl"
joblib.dump(model, model_path)
print(f"Model saved to {model_path}")

# Print the size of the model
model_size = joblib.os.path.getsize(model_path)
print(f"Model size: {model_size} bytes")

# Evaluation on test set
model.eval()
with torch.no_grad():
    outputs = model(test_inputs, attention_mask=test_masks)
    logits = outputs.logits
    predictions = np.argmax(logits.cpu().numpy(), axis=1)

# Classification report
print(classification_report(test_labels, predictions))

# Calculate additional evaluation metrics
accuracy = accuracy_score(test_labels, predictions)
mcc = matthews_corrcoef(test_labels, predictions)
auc = roc_auc_score(test_labels, logits[:, 1])  # Assuming the second class is the positive class
tn, fp, fn, tp = confusion_matrix(test_labels, predictions).ravel()
sensitivity = tp / (tp + fn)
specificity = tn / (tn + fp)

print(f"Accuracy: {accuracy}")
print(f"MCC: {mcc}")
print(f"AUC: {auc}")
print(f"Sensitivity: {sensitivity}")
print(f"Specificity: {specificity}")


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.1 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training time: 1748.1826651096344 seconds
Model saved to biobert_model.pkl
Model size: 433354997 bytes
              precision    recall  f1-score   support

           0       0.87      0.87      0.87      1104
           1       0.87      0.87      0.87      1104

    accuracy                           0.87      2208
   macro avg       0.87      0.87      0.87      2208
weighted avg       0.87      0.87      0.87      2208

Accuracy: 0.8668478260869565
MCC: 0.733696856125056
AUC: 0.9354315991913463
Sensitivity: 0.8677536231884058
Specificity: 0.8659420289855072


In [21]:
train_df = pd.read_csv('train_seq1.csv').iloc[:,0:20] 
train_df

Unnamed: 0.1,Unnamed: 0,label,sequence
0,0,1,GIGSAILSAGKSIIKGLAKGLAEHF
1,1,1,GCCSVPPCIANHPELCV
2,2,1,GWCGDPGATCGKLRLYCCSGACDCYTKTCKDKSSA
3,3,1,GRCCHPACGQNTKC
4,4,0,AFIEGSRGYFQRELKRTDLDLLEKFNFEAALAT
...,...,...,...
8823,8823,0,IFCFLALVIAVASANKHGKNKDNAGP
8824,8824,0,AIVEQQGAPGLGRIINKK
8825,8825,1,GCCSDPRCAWRC
8826,8826,0,QADPNAFYGLM


In [45]:
import numpy as np
import pandas as pd
from transformers import BertTokenizer, BertModel
import torch
import pickle  # Import the pickle module

# Read data from CSV files
train_df = pd.read_csv('train_seq1.csv')#.iloc[0:20,:]  # Update with the correct file path

# Extract sequences from the dataframe
train_sequences = train_df['sequence'].tolist()

# Tokenizer
tokenizer = BertTokenizer.from_pretrained("dmis-lab/biobert-base-cased-v1.1")

# Encode peptide sequences
train_encodings = tokenizer(train_sequences, truncation=True, padding=True,max_length=35, return_tensors='pt')

# Load pre-trained BERT model
model = BertModel.from_pretrained("dmis-lab/biobert-base-cased-v1.1")

# Forward pass to extract features (embeddings)
with torch.no_grad():
    model.eval()
    outputs = model(**train_encodings)

# Extract the features (embeddings) from the last hidden state
features = outputs.last_hidden_state.numpy()

# Save features to a .pkl file
with open('Bio_bert_features.pkl', 'wb') as f:
    pickle.dump(features, f)

# embedding with mean pooling
mean_pooled_features = np.mean(features, axis=1)
mean_pooled_features.shape

# 'features' now contains the embeddings for each token in your input sequences and is saved in 'features.pkl'
# Save features to a .csv file
c = pd.DataFrame(mean_pooled_features)
print(c.shape)
c.to_csv("Bio_bert_train_features.csv")

(8828, 768)


In [46]:
import numpy as np
import pandas as pd
from transformers import BertTokenizer, BertModel
import torch
import pickle  # Import the pickle module

# Read data from CSV files
train_df = pd.read_csv('test_seq.csv')#.iloc[0:20,:]  # Update with the correct file path

# Extract sequences from the dataframe
train_sequences = train_df['sequence'].tolist()

# Tokenizer
tokenizer = BertTokenizer.from_pretrained("dmis-lab/biobert-base-cased-v1.1")

# Encode peptide sequences
train_encodings = tokenizer(train_sequences, truncation=True, padding=True,max_length=35, return_tensors='pt')

# Load pre-trained BERT model
model = BertModel.from_pretrained("dmis-lab/biobert-base-cased-v1.1")
print(model)
# Forward pass to extract features (embeddings)
with torch.no_grad():
    model.eval()
    outputs = model(**train_encodings)

# Extract the features (embeddings) from the last hidden state
features = outputs.last_hidden_state.numpy()

# Save features to a .pkl file
with open('Bio_bert_features.pkl', 'wb') as f:
    pickle.dump(features, f)

# embedding with mean pooling
mean_pooled_features = np.mean(features, axis=1)
mean_pooled_features.shape

# 'features' now contains the embeddings for each token in your input sequences and is saved in 'features.pkl'
# Save features to a .csv file
c = pd.DataFrame(mean_pooled_features)
print(c.shape)
c.to_csv("Bio_bert_test_features.csv")

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(28996, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

In [39]:
c = features.reshape(len(features),-1)
print(c.shape)
df.to_csv("Bio_bert_features")

(20, 19968)


NameError: name 'df' is not defined

In [44]:
print(c.shape)

(10, 19968)


In [41]:
# embedding with mean pooling
mean_pooled_features = np.mean(features, axis=1)
mean_pooled_features.shape

(20, 768)

In [42]:
df = pd.DataFrame(c)

In [43]:
df.to_csv("Bio_bert_features")

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,19958,19959,19960,19961,19962,19963,19964,19965,19966,19967
0,0.295766,-0.248414,-0.174619,-0.081224,-0.133414,-0.332837,0.111752,0.159537,0.056456,0.287091,...,-0.362634,0.244347,-0.566376,-0.710691,-0.103561,0.409266,-0.127931,0.433246,-0.031696,-0.583235
1,0.55545,0.128328,-0.252557,-0.096326,-0.611277,-0.066753,0.003202,0.28338,0.160758,0.077789,...,0.15425,0.206976,-0.330134,-0.342158,-0.47545,0.042247,-0.045918,0.418576,0.211306,0.123509
2,0.471527,0.184619,-0.131111,0.104443,-0.11603,-0.482175,-0.096873,0.423787,-0.011152,-0.104417,...,0.712148,0.268008,-0.347479,-0.280486,-1.067829,0.532668,0.332722,-0.21292,-0.743492,-0.130254
3,0.69055,0.066964,0.048544,0.053613,-0.1961,-0.478306,0.021435,0.455768,0.093702,0.064175,...,0.1556,0.082402,-0.308487,-0.667787,-0.216188,0.260508,-0.023848,0.149997,-0.01891,0.155258
4,0.25741,-0.225721,0.288564,0.156947,-0.125648,-0.134172,-0.326167,0.118326,0.175957,-0.063314,...,-0.427944,0.303428,-0.338711,-0.75733,-0.185437,0.194008,0.093013,0.535545,-0.369737,-0.213575
5,0.501313,-0.00633,0.025337,-0.101124,-0.215039,-0.479141,0.070616,0.252544,0.052038,0.059564,...,0.048654,-0.058742,-0.219182,-0.525096,-0.238855,0.171851,0.033156,0.259201,-0.128199,-0.008079
6,0.436745,-0.093769,-0.019925,0.010236,-0.237587,-0.422568,0.082617,0.449262,0.091441,0.154673,...,0.110469,0.087394,-0.332398,-0.798675,-0.092132,0.256308,0.144653,0.21285,-0.075593,-0.326994
7,0.436316,-0.12613,-0.009946,-0.060373,-0.342109,-0.352349,0.086356,0.375431,0.111905,0.241896,...,-0.171949,0.048242,-0.331375,-0.538188,-0.260457,0.454257,0.111089,0.134799,-0.080075,0.171425
8,0.355946,-0.234255,0.06228,0.060911,-0.167959,-0.455376,-0.095367,0.01983,0.080931,0.135346,...,-0.338103,0.062188,-0.063671,-0.726631,-0.107139,0.62987,0.134321,0.392437,-0.055412,-0.093124
9,0.506715,0.199175,-0.091976,-0.013905,-0.36191,-0.187476,-0.039624,0.459731,0.167451,-0.067518,...,0.000247,-0.057924,-0.391357,-0.85109,-0.186147,0.102033,-0.081892,0.050635,-0.135018,-0.010355
