<a href="https://colab.research.google.com/github/avrymi-asraf/IDL-huji/blob/main/ex1/ex1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
import pandas as pd
import torch
from torch.utils.data import TensorDataset, DataLoader
import plotly.express as px

In [13]:
# for clab
# !git clone https://github.com/avrymi-asraf/IDL-huji.git
# !mv /content/IDL-huji/ex1/ex1_data* .

In [14]:
# make data into 180 vector
def load_raw(folder_path ='C:\\Users\H\PycharmProjects\IDL-huji\ex1\ex1_data\\'):
    raw_neg_data = open(folder_path+'neg_A0201.txt', 'r').read().split('\n')
    raw_pos_data = open(folder_path+'pos_A0201.txt', 'r').read().split('\n')
    return raw_neg_data, raw_pos_data
# raw_neg_data , raw_pos_data = load_raw()
raw_neg_data , raw_pos_data = load_raw('ex1_data/')

amino_to_ind = {i:c for c,i in enumerate(set("".join(raw_neg_data)))}


In [15]:
def peptide2vec(peptides):
    t = torch.zeros(len(peptides),len(amino_to_ind) * len(peptides[0]))
    for j,peptide in enumerate(peptides):
        for i,amino in enumerate(peptide):
            t[j, i*len(amino_to_ind) +amino_to_ind[amino]] = 1
    return t

In [16]:
#print(peptide2vec(load_raw()[1]))
# t = peptide2vec(load_raw()[0][:10])
# px.imshow(t)

In [17]:
def load_vec_data(raw_neg_data , raw_pos_data):
    neg_data = peptide2vec(raw_neg_data)
    pos_data = peptide2vec(raw_pos_data)
    return neg_data, pos_data


In [18]:
def split_train_test(neg_data, pos_data,ratio=0.9):
    shuffle_pos = torch.randperm(len(pos_data))
    num_train_pos = int(ratio*len(pos_data))
    idx_train_pos = shuffle_pos[num_train_pos:]
    idx_test_pos = shuffle_pos[:num_train_pos]

    shuffle_neg = torch.randperm(len(neg_data))
    num_train_neg = int(ratio*len(neg_data))
    idx_train_neg = shuffle_neg[num_train_neg:]
    idx_test_neg = shuffle_neg[:num_train_neg]

    return pos_data[idx_train_pos],pos_data[idx_test_pos], neg_data[idx_train_neg], neg_data[idx_test_neg]

#todo: try splitting after building data sets


In [19]:
from torch.utils.data import WeightedRandomSampler


def make_data_set(pos_train, pos_test, neg_train, neg_test):
    train_data = torch.cat((pos_train, neg_train))
    train_labels = torch.cat((torch.ones(len(pos_train)), torch.zeros(len(neg_train))))
    train_data_set = TensorDataset(train_data, train_labels)

    test_data = torch.cat((pos_test, neg_test))
    test_labels = torch.cat((torch.ones(len(pos_test)), torch.zeros(len(neg_test))))

    class_count = torch.bincount(train_labels.to(int))
    class_weights = 1. / class_count.float()
    sample_weights = class_weights[train_labels.to(int)]
    train_sampler = WeightedRandomSampler(weights=sample_weights, num_samples=len(sample_weights), replacement=True)
    return train_data_set, train_sampler, test_data, test_labels

# todo: use BCEWithLogitsLoss whith pos_weight to balance the classes

# todo: another option: use a dataloader, put the pos len(neg/pos) times in the dataset.


In [116]:
def make_unbiased_data_loader(train_data_set, test_data, test_labels,batch_size=16,sampler=None):
    train_loader = DataLoader(train_data_set, batch_size=batch_size,sampler=sampler)
    # train_loader = DataLoader(train_data_set, batch_size=batch_size)
    return train_loader, test_data, test_labels

In [286]:
class MLP_multi_perceptron(torch.nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(MLP_multi_perceptron, self).__init__()
        self.fc1 = torch.nn.Linear(input_size, hidden_size)
        self.relu = torch.nn.ReLU()
        self.fc2 = torch.nn.Linear(hidden_size, hidden_size)
        self.fc3 = torch.nn.Linear(hidden_size, output_size)

    def forward(self, x):
        x = self.fc1(x)
        # x = self.relu(x)
        x = self.fc2(x)
        # x = self.relu(x)
        x = self.fc3(x)
        return x

In [287]:
def train_model(model,train_loader,test_data,test_labels,loss_fn,optimizer,epochs,device):
    record_data = []
    record_data_2 = []
    record_data_3 = []
    for epoch in range(epochs):
        model.train()
        train_loss = 0
        for i, (data, labels) in enumerate(train_loader):
            data, labels = data.to(device), labels.to(device)
            optimizer.zero_grad()
            output = model(data)
            loss = loss_fn(output.squeeze(), labels)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
        train_loss /= len(train_loader)
        model.eval()
        with torch.no_grad():
            test_data, test_labels = test_data.to(device), test_labels.to(device)
            test_output = model(test_data)
            test_loss = loss_fn(test_output.squeeze(), test_labels).item()
            false_positive = ((test_output.squeeze() > 0) & (test_labels == 0)).sum().item()
            false_negative = ((test_output.squeeze() < 0) & (test_labels == 1)).sum().item()
            true_positive = ((test_output.squeeze() > 0) & (test_labels == 1)).sum().item()
            true_negative = ((test_output.squeeze() < 0) & (test_labels == 0)).sum().item()
            accuracy = (true_positive + true_negative) / len(test_labels)
            precision = true_positive / (true_positive + false_positive + 1e-6)
            recall = true_positive / (true_positive + false_negative + 1e-6)
            f1 = 2 * precision * recall / (precision + recall + 1e-6)
            print(f'epoch: {epoch}, train_loss: {train_loss}, test_loss: {test_loss}, accuracy: {accuracy}, precision: {precision}, recall: {recall}, f1: {f1}')
        record_data.append({'epoch':epoch,'train_loss':train_loss,'test_loss':test_loss})
        record_data_2.append({'epoch':epoch,'accuracy':accuracy,'precision':precision,'recall':recall,'f1':f1})
        print(f'epoch: {epoch}, train_loss: {train_loss}, test_loss: {test_loss}')
    return pd.DataFrame(record_data), pd.DataFrame(record_data_2)

In [288]:
BATCH_SIZE = 16
EPOCHS = 25
LEARNING_RATE = 0.001
HIDDEN_SIZE = 7
INPUT_SIZE = len(amino_to_ind) * 9
OUTPUT_SIZE = 1

In [289]:
raw_neg_data, raw_pos_data = load_raw('ex1_data/')
# raw_neg_data, raw_pos_data = load_raw()
neg_data, pos_data = load_vec_data(raw_neg_data, raw_pos_data)
pos_train, pos_test, neg_train, neg_test = split_train_test(neg_data, pos_data)
train_data_set, sampler, test_data, test_labels = make_data_set(pos_train, pos_test, neg_train, neg_test)
train_loader, test_data, test_labels = make_unbiased_data_loader(train_data_set, test_data, test_labels,BATCH_SIZE)

In [290]:
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
model = MLP_multi_perceptron(INPUT_SIZE, HIDDEN_SIZE, OUTPUT_SIZE).to(DEVICE)
# loss_fn = torch.nn.BCELoss()
MULT = 0.9
LOSS_WHIGHT = torch.tensor([MULT *len(neg_train)/len(pos_train)]).to(DEVICE)
loss_fn = torch.nn.BCEWithLogitsLoss(pos_weight=LOSS_WHIGHT)
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [291]:
record_data = train_model(model,train_loader,test_data,test_labels,loss_fn,optimizer,EPOCHS,DEVICE)

epoch: 0, train_loss: 1.134126679668593, test_loss: 1.0661250352859497, accuracy: 0.89359207600566, precision: 0.6162790673787633, recall: 0.059063892994404205, f1: 0.10779645047906382
epoch: 0, train_loss: 1.134126679668593, test_loss: 1.0661250352859497
epoch: 1, train_loss: 0.9992452173732048, test_loss: 0.9372166991233826, accuracy: 0.8950879320800486, precision: 0.7298578164461715, recall: 0.05720653786879401, f1: 0.10609700601471214
epoch: 1, train_loss: 0.9992452173732048, test_loss: 0.9372166991233826
epoch: 2, train_loss: 0.8737311769103588, test_loss: 0.705594003200531, accuracy: 0.9066100667070952, precision: 0.5612572159842023, recall: 0.6500742939635683, f1: 0.6024091410353051
epoch: 2, train_loss: 0.8737311769103588, test_loss: 0.705594003200531
epoch: 3, train_loss: 0.6361404678589383, test_loss: 0.6133033037185669, accuracy: 0.9011522134627047, precision: 0.5328021246924829, recall: 0.7451708763948103, f1: 0.6213406937298016
epoch: 3, train_loss: 0.6361404678589383, tes

In [292]:
px.line(record_data[0],x='epoch',y=['train_loss','test_loss'])

In [293]:
px.line(pd.DataFrame(record_data[1]),x='epoch',y=['accuracy','precision','recall','f1'])

# hyper parameters
1. batch: 32, epochs: ->∞ , hidden: 25, score: 80

In [None]:
import plotly.graph_objects as go
from sklearn.metrics import roc_curve, auc

# Sample data (replace with your actual model output and labels)
pred = model(test_data.to('cuda')).detach().cpu().numpy()
true_labels = test_labels.detach().cpu().numpy()
# Calculate ROC curve data
fpr, tpr, thresholds = roc_curve(true_labels, pred)
roc_auc = auc(fpr, tpr)


In [None]:

# Create the Plotly figure
fig = go.Figure()
fig.add_trace(go.Scatter(x=fpr, y=tpr,
                         mode='lines',
                         line=dict(color='darkorange', width=2),
                         name=f'ROC curve (area = {roc_auc:.2f})'))
fig.add_trace(go.Scatter(x=[0, 1], y=[0, 1],
                         mode='lines',
                         line=dict(color='navy', width=2, dash='dash'),
                         showlegend=False))
fig.update_layout(
    title='Receiver Operating Characteristic (ROC) Curve',
    xaxis_title='False Positive Rate',
    yaxis_title='True Positive Rate',
    xaxis=dict(range=[0, 1]),
    yaxis=dict(range=[0, 1.05]),
    width=800,
    height=600
)

fig.show()