In [1]:
#Import necessary packages
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch_geometric.nn import GCNConv
from torch.utils.data import TensorDataset, DataLoader
from transformers import BertModel, BertTokenizer, BertForSequenceClassification,BertForTokenClassification
from sklearn.model_selection import train_test_split 
import torch.optim as optim
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [2]:
# Read the CSV file into a DataFrame
df = pd.read_csv('csv_files/finalized_complete_NLS_sequence_table.csv')

In [3]:
protein_sequences= df["Sequence_full"].tolist()

In [4]:
# Initialize labels list
labels = []

# Iterate over sequences
for index, row in df.iterrows():
    sequence = row['Sequence_full']
    begin = row['Begin']
    end = row['End']
    
    # Initialize label for the sequence
    sequence_labels = [0] * len(sequence)  # Initially set all labels to 0
    
    # Mark the signal region with 1s
    for i in range(begin - 1, end):
        sequence_labels[i] = 1
    
    labels.append(sequence_labels)

# Print the encoded labels
for seq, label in zip(df['Sequence_full'], labels):
    print(seq, label)


MPYKLKKEKEPPKVAKCTAKPSSSGKDGGGENTEEAQPQPQPQPQPQAQSQPPSSNKRPSNSTPPPTQLSKIKYSGGPQIVKKERRQSSSRFNLSKNRELQKLPALKDSPTQEREELFIQKLRQCCVLFDFVSDPLSDLKFKEVKRAGLNEMVEYITHSRDVVTEAIYPEAVTMFSVNLFRTLPPSSNPTGAEFDPEEDEPTLEAAWPHLQLVYEFFLRFLESPDFQPNIAKKYIDQKFVLALLDLFDSEDPRERDFLKTILHRIYGKFLGLRAYIRRQINHIFYRFIYETEHHNGIAELLEILGSIINGFALPLKEEHKMFLIRVLLPLHKVKSLSVYHPQLAYCVVQFLEKESSLTEPVIVGLLKFWPKTHSPKEVMFLNELEEILDVIEPSEFSKVMEPLFRQLAKCVSSPHFQVAERALYYWNNEYIMSLISDNAARVLPIMFPALYRNSKSHWNKTIHGLIYNALKLFMEMNQKLFDDCTQQYKAEKQKGRFRMKEREEMWQKIEELARLNPQYPMFRAPPPLPPVYSMETETPTAEDIQLLKRTVETEAVQMLKDIKKEKVLLRRKSELPQDVYTIKALEAHKRAEEFLTASQEAL [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [27]:
protein_sequences

['MPYKLKKEKEPPKVAKCTAKPSSSGKDGGGENTEEAQPQPQPQPQPQAQSQPPSSNKRPSNSTPPPTQLSKIKYSGGPQIVKKERRQSSSRFNLSKNRELQKLPALKDSPTQEREELFIQKLRQCCVLFDFVSDPLSDLKFKEVKRAGLNEMVEYITHSRDVVTEAIYPEAVTMFSVNLFRTLPPSSNPTGAEFDPEEDEPTLEAAWPHLQLVYEFFLRFLESPDFQPNIAKKYIDQKFVLALLDLFDSEDPRERDFLKTILHRIYGKFLGLRAYIRRQINHIFYRFIYETEHHNGIAELLEILGSIINGFALPLKEEHKMFLIRVLLPLHKVKSLSVYHPQLAYCVVQFLEKESSLTEPVIVGLLKFWPKTHSPKEVMFLNELEEILDVIEPSEFSKVMEPLFRQLAKCVSSPHFQVAERALYYWNNEYIMSLISDNAARVLPIMFPALYRNSKSHWNKTIHGLIYNALKLFMEMNQKLFDDCTQQYKAEKQKGRFRMKEREEMWQKIEELARLNPQYPMFRAPPPLPPVYSMETETPTAEDIQLLKRTVETEAVQMLKDIKKEKVLLRRKSELPQDVYTIKALEAHKRAEEFLTASQEAL',
 'MLTCNKAGSRMVVDAANSNGPFQPVVLLHIRDVPPADQEKLFIQKLRQCCVLFDFVSDPLSDLKWKEVKRAALSEMVEYITHNRNVITEPIYPEVVHMFAVNMFRTLPPSSNPTGAEFDPEEDEPTLEAAWPHLQLVYEFFLRFLESPDFQPNIAKKYIDQKFVLQLLELFDSEDPRERDFLKTTLHRIYGKFLGLRAYIRKQINNIFYRFIYETEHHNGIAELLEILGSIINGFALPLKEEHKIFLLKVLLPLHKVKSLSVYHPQLAYCVVQFLEKDSTLTEPVVMALLKYWPKTHSPKEVMFLNELEEILDVIEPSEFVKIMEPLFRQLAKCVSSPHFQVAERALYYWNNEYIMSLISDNAAKILPIMFPSLYRNSKTHWNKTIHGLIY

In [59]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import get_linear_schedule_with_warmup
from tqdm import tqdm

In [60]:
# Set device to GPU if available, otherwise use CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Tokenize protein sequences using a BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
max_length = 512  # Adjust according to the sequences' lengths
tokenized_sequences = [tokenizer.encode(seq, add_special_tokens=True, max_length=max_length, truncation=True) for seq in df['Sequence_full']]

# Pad sequences to a fixed length
padded_sequences = torch.tensor([seq + [0] * (max_length - len(seq)) for seq in tokenized_sequences])

# Encode localization signals
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(df['Sequence_nls'])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, labels, test_size=0.2, random_state=42)

# Convert data to PyTorch tensors
train_dataset = TensorDataset(X_train, torch.tensor(y_train))
test_dataset = TensorDataset(X_test, torch.tensor(y_test))

# Define data loaders
batch_size = 8  # Adjust batch size based on your resources
train_dataloader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=batch_size)
test_dataloader = DataLoader(test_dataset, sampler=SequentialSampler(test_dataset), batch_size=batch_size)

# Load pre-trained BERT model for sequence classification
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=len(label_encoder.classes_))
model.to(device)

# Define optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
epochs = 3  # Adjust as needed
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in tqdm(train_dataloader, desc=f"Epoch {epoch+1}"):
        input_ids = batch[0].to(device)
        labels = batch[1].to(device)
        model.zero_grad()
        outputs = model(input_ids, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
    
    # Calculate average training loss
    avg_train_loss = total_loss / len(train_dataloader)

    # Validation
    model.eval()
    total_eval_loss = 0
    for batch in tqdm(test_dataloader, desc="Validation"):
        input_ids = batch[0].to(device)
        labels = batch[1].to(device)
        with torch.no_grad():
            outputs = model(input_ids, labels=labels)
            loss = outputs.loss
            total_eval_loss += loss.item()
    
    # Calculate average validation loss
    avg_val_loss = total_eval_loss / len(test_dataloader)
    
    print(f"Epoch {epoch+1}:")
    print(f"Avg. Training Loss: {avg_train_loss:.4f}")
    print(f"Avg. Validation Loss: {avg_val_loss:.4f}")

# Evaluation (if needed)
# Perform evaluation on a separate test dataset using similar steps as validation above


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1: 100%|██████████| 137/137 [36:32<00:00, 16.01s/it]  
Validation: 100%|██████████| 35/35 [01:00<00:00,  1.72s/it]


Epoch 1:
Avg. Training Loss: 6.5473
Avg. Validation Loss: 6.5318


Epoch 2: 100%|██████████| 137/137 [45:28<00:00, 19.92s/it]
Validation: 100%|██████████| 35/35 [01:53<00:00,  3.23s/it]


Epoch 2:
Avg. Training Loss: 6.4213
Avg. Validation Loss: 6.5755


Epoch 3: 100%|██████████| 137/137 [48:29<00:00, 21.24s/it]
Validation: 100%|██████████| 35/35 [01:45<00:00,  3.00s/it]

Epoch 3:
Avg. Training Loss: 6.3117
Avg. Validation Loss: 6.6084





In [61]:
# Evaluation
model.eval()
predictions = []
true_labels = []
total_eval_loss = 0

# Turn off gradient calculations
with torch.no_grad():
    for batch in tqdm(test_dataloader, desc="Evaluation"):
        input_ids = batch[0].to(device)
        labels = batch[1].to(device)
        outputs = model(input_ids, labels=labels)
        loss = outputs.loss
        logits = outputs.logits
        total_eval_loss += loss.item()
        
        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = labels.to('cpu').numpy()
        
        # Store predictions and true labels
        predictions.extend(np.argmax(logits, axis=1))
        true_labels.extend(label_ids)

# Calculate average evaluation loss
avg_eval_loss = total_eval_loss / len(test_dataloader)
print(f"Avg. Evaluation Loss: {avg_eval_loss:.4f}")

# Calculate evaluation metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

accuracy = accuracy_score(true_labels, predictions)
precision = precision_score(true_labels, predictions, average='macro')
recall = recall_score(true_labels, predictions, average='macro')
f1 = f1_score(true_labels, predictions, average='macro')

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")


Evaluation: 100%|██████████| 35/35 [01:47<00:00,  3.08s/it]

Avg. Evaluation Loss: 6.6084
Accuracy: 0.0110
Precision: 0.0001
Recall: 0.0047
F1-score: 0.0001



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
