<a href="https://colab.research.google.com/github/avrymi-asraf/IDL-huji/blob/main/ex1/ex1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import torch
from torch.utils.data import TensorDataset, DataLoader
import plotly.express as px

In [3]:
# # for clab
# !git clone https://github.com/avrymi-asraf/IDL-huji.git
# !mv /content/IDL-huji/ex1/ex1_data* .

In [4]:
# make data into 180 vector
def load_raw():
    raw_neg_data = open('C:\\Users\H\PycharmProjects\IDL-huji\ex1\ex1_data\\neg_A0201.txt', 'r').read().split('\n')
    raw_pos_data = open('C:\\Users\H\PycharmProjects\IDL-huji\ex1\ex1_data\pos_A0201.txt', 'r').read().split('\n')
    return raw_neg_data, raw_pos_data
raw_neg_data , raw_pos_data = load_raw()

amino_to_ind = {i:c for c,i in enumerate(set("".join(raw_neg_data)))}
print(amino_to_ind)

{'A': 0, 'I': 1, 'E': 2, 'M': 3, 'Y': 4, 'V': 5, 'C': 6, 'L': 7, 'K': 8, 'S': 9, 'W': 10, 'G': 11, 'D': 12, 'H': 13, 'R': 14, 'P': 15, 'N': 16, 'Q': 17, 'T': 18, 'F': 19}


In [5]:
def peptide2vec(peptides):
    t = torch.zeros(len(peptides),len(amino_to_ind) * len(peptides[0]))
    for j,peptide in enumerate(peptides):
        for i,amino in enumerate(peptide):
            t[j, i*len(amino_to_ind) +amino_to_ind[amino]] = 1
    return t

In [6]:
#print(peptide2vec(load_raw()[1]))
# t = peptide2vec(load_raw()[0][:10])
# px.imshow(t)

In [7]:
def load_vec_data():
    raw_neg_data , raw_pos_data = load_raw()
    neg_data = peptide2vec(raw_neg_data)
    pos_data = peptide2vec(raw_pos_data)
    return neg_data, pos_data


In [8]:
def split_train_test(ratio=0.9):
    neg_data, pos_data = load_vec_data()
    shuffle_pos = torch.randperm(len(pos_data))
    num_train_pos = int(ratio*len(pos_data))
    idx_train_pos = shuffle_pos[num_train_pos:]
    idx_test_pos = shuffle_pos[:num_train_pos]
    
    shuffle_neg = torch.randperm(len(neg_data))
    num_train_neg = int(ratio*len(neg_data))
    idx_train_neg = shuffle_neg[num_train_neg:]
    idx_test_neg = shuffle_neg[:num_train_neg]
    
    return pos_data[idx_train_pos],pos_data[idx_test_pos], neg_data[idx_train_neg], neg_data[idx_test_neg]  

In [45]:
from torch.utils.data import WeightedRandomSampler


def make_data_set():
    pos_train, pos_test, neg_train, neg_test = split_train_test()
    train_data = torch.cat((pos_train, neg_train))
    train_labels = torch.cat((torch.ones(len(pos_train)), torch.zeros(len(neg_train))))
    train_data_set = TensorDataset(train_data, train_labels)
    
    test_data = torch.cat((pos_test, neg_test))
    test_labels = torch.cat((torch.ones(len(pos_test)), torch.zeros(len(neg_test))))
    
    class_count = torch.bincount(train_labels.to(int))
    class_weights = 1. / class_count.float()
    sample_weights = class_weights[train_labels.to(int)]
    train_sampler = WeightedRandomSampler(weights=sample_weights, num_samples=len(sample_weights), replacement=True)
    return train_data_set, train_sampler, test_data, test_labels

# todo: use BCEWithLogitsLoss whith pos_weight to balance the classes

# todo: another option: use a dataloader, put the pos len(neg/pos) times in the dataset.


In [46]:
def make_unbiased_data_loader(batch_size=16):
    train_data_set, sampler, test_data, test_labels = make_data_set()
    train_loader = DataLoader(train_data_set, batch_size=batch_size, sampler=sampler)
    return train_loader, test_data, test_labels

In [72]:
class MLP_multi_perceptron(torch.nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(MLP_multi_perceptron, self).__init__()
        self.fc1 = torch.nn.Linear(input_size, hidden_size)
        self.relu = torch.nn.ReLU()
        self.fc2 = torch.nn.Linear(hidden_size, output_size)
        self.sigmoid = torch.nn.Sigmoid()
        
    def forward(self, x):
        out = self.fc1(x)
        out = self.relu(out)
        out = self.fc2(out)
        out = self.sigmoid(out)
        return out
    def predict(self,x,threshold=0.5):
        return (self.forward(x)>threshold).to(float)

In [73]:
BATCH_SIZE = 16
EPOCHS = 10
LEARNING_RATE = 0.001
HIDDEN_SIZE = 100
INPUT_SIZE = len(amino_to_ind) * 9
OUTPUT_SIZE = 1

In [74]:
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
train_loader ,test_data,test_labels= make_unbiased_data_loader(BATCH_SIZE)
model = MLP_multi_perceptron(INPUT_SIZE, HIDDEN_SIZE, OUTPUT_SIZE).to(DEVICE)
loss_fn = torch.nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [75]:
def train_model(model, train_loader, test_data, test_labels, loss_fn, optimizer, epochs):
    records = pd.DataFrame(columns=['epoch', 'train_loss', 'test_loss'],index=range(epochs))
    for epoch in range(epochs):
        model.train()
        train_epoch_loss = 0
        for i, (x, y) in enumerate(train_loader):
            x, y = x.to(DEVICE), y.to(DEVICE)
            cur_loss = loss_fn(model(x).flatten(), y)
            train_epoch_loss += cur_loss.item()
            cur_loss.backward()
            optimizer.step()
            optimizer.zero_grad()
        model.eval()
        with torch.no_grad():
            prod = model.predict(test_data)
            accuracy = torch.mean((prod == test_labels).to(float)).item()
            print("epoch: ", epoch, accuracy)
            test_loss = loss_fn(model(test_data).flatten(), test_labels).item()
        records.iloc[epoch] = [epoch, train_epoch_loss, test_loss]   
    return  records

In [76]:
record_data = train_model(model,train_loader,test_data,test_labels,loss_fn,optimizer,EPOCHS)

epoch:  0 0.6755543885460799
epoch:  1 0.7207199892746272
epoch:  2 0.7267294179429914
epoch:  3 0.7315053323057439
epoch:  4 0.748521504008796
epoch:  5 0.7525383326450183
epoch:  6 0.7470349611276744
epoch:  7 0.7606985042052181
epoch:  8 0.7660121042909296
epoch:  9 0.768574018617969


In [77]:
px.line(record_data)