In [70]:
#Import necessary packages
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch_geometric.nn import GCNConv
from torch.utils.data import TensorDataset, DataLoader
from transformers import BertModel, BertTokenizer, BertForSequenceClassification,BertForTokenClassification
from sklearn.model_selection import train_test_split 
import torch.optim as optim
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [60]:
# Read the CSV file into a DataFrame
df = pd.read_csv('csv_files/finalized_complete_NLS_sequence_table.csv')

In [61]:
protein_sequences= df["Sequence_full"].tolist()

In [62]:
# Initialize labels list
labels = []

# Iterate over sequences
for index, row in df.iterrows():
    sequence = row['Sequence_full']
    begin = row['Begin']
    end = row['End']
    
    # Initialize label for the sequence
    sequence_labels = [0] * len(sequence)  # Initially set all labels to 0
    
    # Mark the signal region with 1s
    for i in range(begin - 1, end):
        sequence_labels[i] = 1
    
    labels.append(sequence_labels)

# Print the encoded labels
for seq, label in zip(df['Sequence_full'], labels):
    print(seq, label)


MPYKLKKEKEPPKVAKCTAKPSSSGKDGGGENTEEAQPQPQPQPQPQAQSQPPSSNKRPSNSTPPPTQLSKIKYSGGPQIVKKERRQSSSRFNLSKNRELQKLPALKDSPTQEREELFIQKLRQCCVLFDFVSDPLSDLKFKEVKRAGLNEMVEYITHSRDVVTEAIYPEAVTMFSVNLFRTLPPSSNPTGAEFDPEEDEPTLEAAWPHLQLVYEFFLRFLESPDFQPNIAKKYIDQKFVLALLDLFDSEDPRERDFLKTILHRIYGKFLGLRAYIRRQINHIFYRFIYETEHHNGIAELLEILGSIINGFALPLKEEHKMFLIRVLLPLHKVKSLSVYHPQLAYCVVQFLEKESSLTEPVIVGLLKFWPKTHSPKEVMFLNELEEILDVIEPSEFSKVMEPLFRQLAKCVSSPHFQVAERALYYWNNEYIMSLISDNAARVLPIMFPALYRNSKSHWNKTIHGLIYNALKLFMEMNQKLFDDCTQQYKAEKQKGRFRMKEREEMWQKIEELARLNPQYPMFRAPPPLPPVYSMETETPTAEDIQLLKRTVETEAVQMLKDIKKEKVLLRRKSELPQDVYTIKALEAHKRAEEFLTASQEAL [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [63]:
len(protein_sequences)
len(labels)

1363

In [65]:
# Tokenize sequences
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
tokenized_sequences = [tokenizer.encode(seq, add_special_tokens=False) for seq in protein_sequences]

# Pad sequences
max_seq_length = max(len(seq) for seq in tokenized_sequences)
padded_sequences = [seq + [tokenizer.pad_token_id] * (max_seq_length - len(seq)) for seq in tokenized_sequences]

# Pad labels to match the length of sequences
padded_labels = [label + [0] * (5890 - len(label)) for label in labels]

# Convert labels to tensors
label_tensors = torch.tensor(padded_labels)

In [66]:
# Split dataset
train_sequences, test_sequences, train_labels, test_labels = train_test_split(padded_sequences, label_tensors, test_size=0.2, random_state=42)

# Convert sequences and labels to PyTorch tensors
train_sequences_tensor = torch.tensor(train_sequences)
test_sequences_tensor = torch.tensor(test_sequences)

# Create DataLoader for training and testing
train_dataset = TensorDataset(train_sequences_tensor, train_labels)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

test_dataset = TensorDataset(test_sequences_tensor, test_labels)
test_loader = DataLoader(test_dataset, batch_size=32)

In [55]:
# Initialize model
model = BertForTokenClassification.from_pretrained('bert-base-uncased', num_labels=2)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [71]:
optimizer = optim.Adam(model.parameters(), lr=2e-5)
criterion = nn.CrossEntropyLoss()
num_epochs= 5
# Training loop
for epoch in range(num_epochs):
    model.train()
    for batch_inputs, batch_labels in train_loader:
        optimizer.zero_grad()
        outputs = model(input_ids=batch_inputs, labels=batch_labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

# Evaluation loop
model.eval()
all_preds = []
all_labels = []
with torch.no_grad():
    for batch_inputs, batch_labels in test_loader:
        outputs = model(input_ids=batch_inputs)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=-1)
        all_preds.extend(preds.tolist())
        all_labels.extend(batch_labels.tolist())

# Compute evaluation metrics
accuracy = accuracy_score(all_labels, all_preds)
precision = precision_score(all_labels, all_preds)
recall = recall_score(all_labels, all_preds)
f1 = f1_score(all_labels, all_preds)


ValueError: Expected input batch_size (1600) to match target batch_size (188480).