<a href="https://colab.research.google.com/github/avrymi-asraf/IDL-huji/blob/main/ex1/ex1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import torch
from torch.utils.data import TensorDataset, DataLoader
import plotly.express as px

In [2]:
# for clab
# !git clone https://github.com/avrymi-asraf/IDL-huji.git
# !mv /content/IDL-huji/ex1/ex1_data* .

In [3]:
# make data into 180 vector
def load_raw(folder_path ='C:\\Users\H\PycharmProjects\IDL-huji\ex1\ex1_data\\'):
    raw_neg_data = open(folder_path+'neg_A0201.txt', 'r').read().split('\n')
    raw_pos_data = open(folder_path+'pos_A0201.txt', 'r').read().split('\n')
    return raw_neg_data, raw_pos_data
# raw_neg_data , raw_pos_data = load_raw()
raw_neg_data , raw_pos_data = load_raw('ex1_data/')

amino_to_ind = {i:c for c,i in enumerate(set("".join(raw_neg_data)))}


In [4]:
def peptide2vec(peptides):
    t = torch.zeros(len(peptides),len(amino_to_ind) * len(peptides[0]))
    for j,peptide in enumerate(peptides):
        for i,amino in enumerate(peptide):
            t[j, i*len(amino_to_ind) +amino_to_ind[amino]] = 1
    return t

In [5]:
#print(peptide2vec(load_raw()[1]))
# t = peptide2vec(load_raw()[0][:10])
# px.imshow(t)

In [6]:
def load_vec_data(raw_neg_data , raw_pos_data):
    neg_data = peptide2vec(raw_neg_data)
    pos_data = peptide2vec(raw_pos_data)
    return neg_data, pos_data


In [7]:
def split_train_test(neg_data, pos_data,ratio=0.9):
    shuffle_pos = torch.randperm(len(pos_data))
    num_train_pos = int(ratio*len(pos_data))
    idx_train_pos = shuffle_pos[num_train_pos:]
    idx_test_pos = shuffle_pos[:num_train_pos]

    shuffle_neg = torch.randperm(len(neg_data))
    num_train_neg = int(ratio*len(neg_data))
    idx_train_neg = shuffle_neg[num_train_neg:]
    idx_test_neg = shuffle_neg[:num_train_neg]

    return pos_data[idx_train_pos],pos_data[idx_test_pos], neg_data[idx_train_neg], neg_data[idx_test_neg]

In [8]:
from torch.utils.data import WeightedRandomSampler


def make_data_set(pos_train, pos_test, neg_train, neg_test):
    train_data = torch.cat((pos_train, neg_train))
    train_labels = torch.cat((torch.ones(len(pos_train)), torch.zeros(len(neg_train))))
    train_data_set = TensorDataset(train_data, train_labels)

    test_data = torch.cat((pos_test, neg_test))
    test_labels = torch.cat((torch.ones(len(pos_test)), torch.zeros(len(neg_test))))

    class_count = torch.bincount(train_labels.to(int))
    class_weights = 1. / class_count.float()
    sample_weights = class_weights[train_labels.to(int)]
    train_sampler = WeightedRandomSampler(weights=sample_weights, num_samples=len(sample_weights), replacement=True)
    return train_data_set, train_sampler, test_data, test_labels

# todo: use BCEWithLogitsLoss whith pos_weight to balance the classes

# todo: another option: use a dataloader, put the pos len(neg/pos) times in the dataset.


In [9]:
def make_unbiased_data_loader(train_data_set, test_data, test_labels,batch_size=16,sampler=None):
    train_loader = DataLoader(train_data_set, batch_size=batch_size,sampler=sampler)
    # train_loader = DataLoader(train_data_set, batch_size=batch_size)
    return train_loader, test_data, test_labels

In [10]:
class MLP_multi_perceptron(torch.nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(MLP_multi_perceptron, self).__init__()
        self.fc1 = torch.nn.Linear(input_size, hidden_size)
        self.relu = torch.nn.ReLU()
        self.fc2 = torch.nn.Linear(hidden_size, output_size)
        self.sigmoid = torch.nn.Sigmoid()

    def forward(self, x):
        out = self.fc1(x)
        out = self.relu(out)
        out = self.fc2(out)
        # out = self.sigmoid(out) todo: loss function is BCEWithLogits 
        return out
    def predict(self,x,threshold=0.5):
        # return (self.forward(x)>threshold).to(float) todo: loss function is BCE
        return (self.sigmoid(self.forward(x))>threshold).to(float)

In [11]:
def train_model(model, train_loader, test_data, test_labels, loss_fn, optimizer, epochs,device):
    test_data, test_labels = test_data.to(device), test_labels.to(device)
    records = pd.DataFrame(columns=['epoch', 'train_loss', 'test_loss'],index=range(epochs))
    for epoch in range(epochs):
        model.train()
        train_epoch_loss = 0
        for i, (x, y) in enumerate(train_loader):
            x, y = x.to(device), y.to(device)
            cur_loss = loss_fn(model(x).flatten(), y)
            train_epoch_loss += cur_loss.item()
            cur_loss.backward()
            optimizer.step()
            optimizer.zero_grad()
        model.eval()
        with torch.no_grad():
            # prod = model.predict(test_data.to(device),threshold=0.99)
            prod = model.predict(test_data.to(device))
            accuracy = torch.mean((prod == test_labels).to(float)).item()
            false_pos = torch.mean(((prod == 1) & (test_labels == 0)).to(float)).item()
            true_pos = torch.mean(((prod == 1) & (test_labels == 1)).to(float)).item()
            false_neg = torch.mean(((prod == 0) & (test_labels == 1)).to(float)).item()
            true_neg = torch.mean(((prod == 0) & (test_labels == 0)).to(float)).item()
            print(f'epoch:{epoch} - accuracy: {accuracy*100:.2f}%, Precision: {(true_pos/(max(true_pos+false_pos,1e-10)))*100:.2f}:%, Recall {(true_pos/max(true_pos+false_neg,1e-10))*100:.2f}%')
            # print(f'epoch:{epoch} - accuracy: {accuracy*100:.2f}%, false positive: {false_pos*100:.2f}:%, false negative {false_neg*100:.2f}%')
            test_loss = loss_fn(model(test_data).flatten(), test_labels).item()
        records.iloc[epoch] = [epoch, train_epoch_loss, test_loss]
    return  records

In [12]:
BATCH_SIZE = 32
EPOCHS = 100
LEARNING_RATE = 0.001
HIDDEN_SIZE = 7
INPUT_SIZE = len(amino_to_ind) * 9
OUTPUT_SIZE = 1

In [13]:
raw_neg_data, raw_pos_data = load_raw('ex1_data/')
# raw_neg_data, raw_pos_data = load_raw()
neg_data, pos_data = load_vec_data(raw_neg_data, raw_pos_data)
pos_train, pos_test, neg_train, neg_test = split_train_test(neg_data, pos_data)
train_data_set, sampler, test_data, test_labels = make_data_set(pos_train, pos_test, neg_train, neg_test)
train_loader, test_data, test_labels = make_unbiased_data_loader(train_data_set, test_data, test_labels,BATCH_SIZE)

In [14]:
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
model = MLP_multi_perceptron(INPUT_SIZE, HIDDEN_SIZE, OUTPUT_SIZE).to(DEVICE)
# loss_fn = torch.nn.BCELoss()
LOSS_WHIGHT = torch.tensor([len(neg_train)/len(pos_train)]).to(DEVICE)
loss_fn = torch.nn.BCEWithLogitsLoss(pos_weight=LOSS_WHIGHT)
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [15]:
record_data = train_model(model,train_loader,test_data,test_labels,loss_fn,optimizer,EPOCHS,DEVICE)

epoch:0 - accuracy: 34.22%, Precision: 10.88:%, Recall 70.18%
epoch:1 - accuracy: 79.13%, Precision: 10.88:%, Recall 12.77%
epoch:2 - accuracy: 76.32%, Precision: 10.88:%, Recall 16.36%
epoch:3 - accuracy: 76.35%, Precision: 10.88:%, Recall 16.31%
epoch:4 - accuracy: 75.39%, Precision: 10.88:%, Recall 17.55%
epoch:5 - accuracy: 74.96%, Precision: 10.88:%, Recall 18.10%
epoch:6 - accuracy: 74.28%, Precision: 10.88:%, Recall 18.96%
epoch:7 - accuracy: 74.18%, Precision: 10.88:%, Recall 19.09%
epoch:8 - accuracy: 73.65%, Precision: 10.88:%, Recall 19.77%
epoch:9 - accuracy: 73.73%, Precision: 10.88:%, Recall 19.67%
epoch:10 - accuracy: 73.33%, Precision: 10.88:%, Recall 20.17%
epoch:11 - accuracy: 73.32%, Precision: 10.88:%, Recall 20.20%
epoch:12 - accuracy: 73.24%, Precision: 10.88:%, Recall 20.29%
epoch:13 - accuracy: 73.32%, Precision: 10.88:%, Recall 20.19%
epoch:14 - accuracy: 73.22%, Precision: 10.88:%, Recall 20.32%
epoch:15 - accuracy: 73.43%, Precision: 10.88:%, Recall 20.05%
ep

KeyboardInterrupt: 

In [None]:
px.line(record_data,x='epoch',y=['train_loss','test_loss'])

# hyper parameters
1. batch: 32, epochs: ->∞ , hidden: 25, score: 80

In [None]:
import plotly.graph_objects as go
from sklearn.metrics import roc_curve, auc

# Sample data (replace with your actual model output and labels)
pred = model(test_data.to('cuda')).detach().cpu().numpy()
true_labels = test_labels.detach().cpu().numpy()
# Calculate ROC curve data
fpr, tpr, thresholds = roc_curve(true_labels, pred)
roc_auc = auc(fpr, tpr)


In [None]:

# Create the Plotly figure
fig = go.Figure()
fig.add_trace(go.Scatter(x=fpr, y=tpr,
                         mode='lines',
                         line=dict(color='darkorange', width=2),
                         name=f'ROC curve (area = {roc_auc:.2f})'))
fig.add_trace(go.Scatter(x=[0, 1], y=[0, 1],
                         mode='lines',
                         line=dict(color='navy', width=2, dash='dash'),
                         showlegend=False))
fig.update_layout(
    title='Receiver Operating Characteristic (ROC) Curve',
    xaxis_title='False Positive Rate',
    yaxis_title='True Positive Rate',
    xaxis=dict(range=[0, 1]),
    yaxis=dict(range=[0, 1.05]),
    width=800,
    height=600
)

fig.show()