# **Data preparation**

## **Generating non-redundant datasets**

In [3]:
!pip install biopython



In [4]:
import pandas as pd
import numpy as np
import os
from Bio import SeqIO

In [6]:
def filter_redundancy(unique_ids, total_dataset, cols):
    """Keep only the rows of a dataset whose identifier is in a reference list.

    Both inputs are header-less, tab-separated files that are read with the
    same column names. A row of ``total_dataset`` is kept when the value in
    its first column also occurs in the first column of ``unique_ids``.

    Parameters
    ----------
    unique_ids : path to the TSV file listing the identifiers to keep.
    total_dataset : path to the TSV file holding the full dataset.
    cols : column names applied to both files when they are read.

    Returns
    -------
    pandas.DataFrame
        The matching rows of ``total_dataset``, with their original index
        labels and the columns named after ``cols``.
    """
    # Same parsing options for both files: no header row, tab separated.
    reader_options = {'header': None, 'sep': '\t', 'names': cols}
    reference_table = pd.read_csv(unique_ids, **reader_options)
    full_table = pd.read_csv(total_dataset, **reader_options)

    # Membership is decided on the first column only (the identifier).
    is_representative = full_table.iloc[:, 0].isin(reference_table.iloc[:, 0])
    return full_table.loc[is_representative]

# Column names applied to the header-less TSV files; the two sets differ
# only in their last column.
positives_cols = [
    'id',
    'organism_name',
    'kingdom',
    'sequence_length',
    'cleavage_site',
]
negatives_cols = [
    'id',
    'organism_name',
    'kingdom',
    'sequence_length',
    'transmembrane',
]

# Restrict each collected dataset to the identifiers listed in its
# non-redundant file.
non_redundant_pos = filter_redundancy(
    '../2_data_preparation/non_redundant/uniq.pos.tsv',
    '../1_data_collection/datasets/positive.tsv',
    positives_cols,
)
non_redundant_neg = filter_redundancy(
    '../2_data_preparation/non_redundant/uniq.neg.tsv',
    '../1_data_collection/datasets/negative.tsv',
    negatives_cols,
)

# Sizes after filtering: negatives first, then positives.
print(len(non_redundant_neg), len(non_redundant_pos), sep='\n')

8934
1093


## **Dividing into training and test sets**

In [7]:
def split_train_test(df: pd.DataFrame, split: int, random_seed: int = 42) -> tuple:
    """Shuffle a DataFrame and split it into a training and a test part.

    Parameters
    ----------
    df : the DataFrame to split; it is not modified.
    split : percentage (0-100) of the rows assigned to the training set.
        The cut position is ``split * len(df) // 100``, i.e. rounded down.
    random_seed : seed passed to ``DataFrame.sample`` for the shuffle.
        Defaults to 42 so existing calls produce the same split as before.

    Returns
    -------
    (training_set, test_set) : two independent copies that together contain
        every row of ``df`` exactly once. Rows keep their original index
        labels, so the index of each part is in shuffled order.

    Raises
    ------
    ValueError
        If ``split`` is not between 0 and 100 (inclusive).
    """
    if not 0 <= split <= 100:
        raise ValueError(f'split must be a percentage between 0 and 100, got {split}')

    # frac=1 returns all rows in random order (a full shuffle).
    shuffle_df = df.sample(frac=1, random_state=random_seed)
    split_idx = split * len(shuffle_df) // 100

    # Copies, so that columns can later be added without touching `df`.
    training_set = shuffle_df.iloc[:split_idx, :].copy()
    test_set = shuffle_df.iloc[split_idx:, :].copy()
    return training_set, test_set

# 80% of each class goes to training and the rest to testing; positives and
# negatives are split separately.
training_set_pos, test_set_pos = split_train_test(df=non_redundant_pos, split=80)
training_set_neg, test_set_neg = split_train_test(df=non_redundant_neg, split=80)

In [8]:
# Sanity check: every non-redundant row ended up in exactly one of the two parts.
assert len(training_set_pos) + len(test_set_pos) == len(non_redundant_pos)
assert len(training_set_neg) + len(test_set_neg) == len(non_redundant_neg)

# Report the total, training and test sizes of each class.
print(f'the length of the positive set is {len(training_set_pos)+len(test_set_pos)}; training:{len(training_set_pos)}, testing:{len(test_set_pos)}')
print(f'the length of the negative set is {len(training_set_neg)+len(test_set_neg)}; training:{len(training_set_neg)}, testing:{len(test_set_neg)}')


the length of the positive set is 1093; training:874, testing219
the length of the negative set is 8934; training:7147, testing1787


## **Dividing the training set into 5 validation sets**

In [9]:
# NOTE(review): `bins` is not referenced by any cell visible here (it looks
# like a leftover from a pd.cut-based approach) - delete once confirmed.
bins = [0, 25, 50, 75, 100]
bin_labels = [1, 2, 3, 4, 5]


def assign_validation_folds(df: pd.DataFrame, fold_labels: list) -> pd.DataFrame:
    """Return a copy of ``df`` with cross-validation fold labels added.

    Two columns are added, in this order:

    * ``temp_row_num`` - the 0-based position of each row in ``df``;
    * ``validation_n`` - the fold label, obtained by cutting the row
      positions into ``len(fold_labels)`` equal-sized quantile bins.

    The row *position* is used rather than the index label. The training
    sets come from ``DataFrame.sample``, which keeps the original index
    labels, so binning on the index would group rows by their position in
    the source file and discard the shuffle.

    Parameters
    ----------
    df : the (already shuffled) DataFrame to label; it is not modified.
    fold_labels : one label per fold, e.g. ``[1, 2, 3, 4, 5]``.
    """
    labelled = df.copy()
    row_positions = np.arange(len(labelled), dtype=np.int64)
    labelled['temp_row_num'] = row_positions
    # qcut on an array returns a Categorical, which is assigned positionally.
    labelled['validation_n'] = pd.qcut(row_positions, q=len(fold_labels), labels=fold_labels)
    return labelled


training_set_pos = assign_validation_folds(training_set_pos, bin_labels)
training_set_neg = assign_validation_folds(training_set_neg, bin_labels)

In [10]:
# Class label column: 1 for the positive sets, 0 for the negative sets.
training_set_pos.loc[:, "sp_type"] = 1
test_set_pos.loc[:, "sp_type"] = 1

training_set_neg.loc[:, "sp_type"] = 0
test_set_neg.loc[:, "sp_type"] = 0

In [11]:
# The helper column was only needed to derive 'validation_n'; remove it
# from both training sets before they are merged.
training_set_pos = training_set_pos.drop('temp_row_num', axis='columns')
training_set_neg = training_set_neg.drop('temp_row_num', axis='columns')

In [13]:
# Stack positives on top of negatives for each split and write the result
# as a tab-separated file without the index column.
training_set = pd.concat((training_set_pos, training_set_neg), axis=0)
training_set.to_csv('../2_data_preparation/sets/training_set.tsv', index=False, sep='\t')

test_set = pd.concat((test_set_pos, test_set_neg), axis=0)
test_set.to_csv('../2_data_preparation/sets/test_set.tsv', index=False, sep='\t')