# **Data preparation**

## **Generating non-redundant datasets**

In [None]:
!pip install biopython

Collecting biopython
  Downloading biopython-1.85-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading biopython-1.85-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m30.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: biopython
Successfully installed biopython-1.85


In [None]:
import pandas as pd
import numpy as np
import os
from Bio import SeqIO

In [None]:
def filter_redundancy(unique_ids, total_dataset):
    """Keep only the rows of a dataset whose identifier is listed as unique.

    Both inputs are read as tab-separated tables without a header row.

    Args:
        unique_ids: path or file-like object; its first column holds the
            identifiers to keep.
        total_dataset: path or file-like object for the full table; its first
            column is matched against those identifiers.

    Returns:
        A DataFrame with the rows of the full table whose first-column value
        appears among the identifiers. The rows keep their original index
        labels.
    """
    representative_ids = pd.read_csv(unique_ids, header=None, sep='\t').iloc[:, 0]
    full_table = pd.read_csv(total_dataset, header=None, sep='\t')
    keep_mask = full_table.iloc[:, 0].isin(representative_ids)
    return full_table[keep_mask]

# Reduce each class to its non-redundant entries, then report the sizes
# (negative set first, positive set second, one count per line).
non_redundant_pos = filter_redundancy(unique_ids='uniq.pos.tsv', total_dataset='positive.tsv')
non_redundant_neg = filter_redundancy(unique_ids='uniq.neg.tsv', total_dataset='negative.tsv')
print(len(non_redundant_neg), len(non_redundant_pos), sep='\n')

8934
1093


## **Splitting into training and test sets**

In [None]:
def split_train_test(df: pd.DataFrame, split: int, random_seed: int = 42):
    """Shuffle a DataFrame and split it into a training and a test part.

    Args:
        df: table to split; it is not modified.
        split: percentage of rows (0-100) that goes to the training set. The
            row count is rounded down.
        random_seed: seed for the shuffle. The default of 42 reproduces the
            previously hard-coded behaviour.

    Returns:
        A ``(training_set, test_set)`` tuple of independent copies. The rows
        keep their original index labels.

    Raises:
        ValueError: if ``split`` is outside 0-100. Such values used to produce
            a silently wrong partition (a negative value slices from the end).
    """
    if not 0 <= split <= 100:
        raise ValueError(f'split must be a percentage between 0 and 100, got {split}')

    shuffle_df = df.sample(frac=1, random_state=random_seed)
    # Integer arithmetic keeps the cut point an int, as required by iloc slicing.
    split_idx = split * len(df.index) // 100
    training_set = shuffle_df.iloc[:split_idx, :].copy()
    test_set = shuffle_df.iloc[split_idx:, :].copy()
    return training_set, test_set

# 80 % of each class goes to training and the remaining rows to the test set.
training_set_pos, test_set_pos = split_train_test(df=non_redundant_pos, split=80)
training_set_neg, test_set_neg = split_train_test(df=non_redundant_neg, split=80)

In [None]:
# Sanity check: for each class, print the total number of rows and how many
# ended up in the training and test sets.
print(f'the lenght of the positive set is {len(training_set_pos)+len(test_set_pos)}; training:{len(training_set_pos)}, testing{len(test_set_pos)}')
print(f'the lenght of the negative set is {len(training_set_neg)+len(test_set_neg)}; training:{len(training_set_neg)}, testing{len(test_set_neg)}')


the lenght of the positive set is 1093; training:874, testing219
the lenght of the negative set is 8934; training:7147, testing1787


## **Dividing the training set into 5 validation folds**

In [None]:
bins = [0, 25, 50, 75, 100]  # not used by any visible cell; kept in case a later cell reads it
bin_lables = [1, 2, 3, 4, 5]

# Assign every training row to one of five validation folds, class by class.
# pd.qcut cuts the rows' index labels (the row numbers inherited from the input
# table) into five equal-frequency bins labelled 1-5, so a row's fold depends
# on where it sat in the original file rather than on the shuffled order.
# NOTE(review): if the input files are sorted (e.g. by organism), the folds
# will not be random samples - confirm this is intended.
for _class_frame in (training_set_pos, training_set_neg):
    _row_labels = _class_frame.index.to_numpy(dtype=np.int64)
    # Helper column; a following cell drops it again.
    _class_frame.loc[:, 'temp_row_num'] = _row_labels
    _class_frame.loc[:, 'validation_n'] = pd.qcut(_class_frame['temp_row_num'], q=5, labels=bin_lables)
del _class_frame, _row_labels

In [None]:
# The helper column was only needed to compute the folds; remove it from both classes.
training_set_neg = training_set_neg.drop('temp_row_num', axis='columns')
training_set_pos = training_set_pos.drop('temp_row_num', axis='columns')

In [None]:
# Combined tables: positives first, negatives after.
training_set = pd.concat([training_set_pos, training_set_neg])
test_set = pd.concat([test_set_pos, test_set_neg])

# Write every table as a tab-separated file with no header row and no index
# column. The dict preserves insertion order, so the files are written in the
# order listed.
_exports = {
    'training_set_pos.tsv': training_set_pos,
    'test_set_pos.tsv': test_set_pos,
    'training_set_neg.tsv': training_set_neg,
    'test_set_neg.tsv': test_set_neg,
    'training_set.tsv': training_set,
    'test_set.tsv': test_set,
}
for _path, _frame in _exports.items():
    _frame.to_csv(_path, sep='\t', index=False, header=False)
del _exports, _path, _frame

In [None]:
# Preview the first rows of the negative training set (the last column is the validation fold).
training_set_neg.head()

Unnamed: 0,0,1,2,3,4,validation_n
15381,Q07163,Saccharomyces cerevisiae,Fungi,1755,False,4
10122,Q924K8,Mus musculus,Metazoa,591,False,3
4596,P21145,Homo sapiens,Metazoa,153,True,1
15161,Q9UUM2,Schizosaccharomyces pombe (strain 972 / ATCC 2...,Fungi,971,False,4
5673,Q04781,Saccharomyces cerevisiae (strain ATCC 204508 /...,Fungi,1562,False,2


In [None]:
# Number of distinct validation folds. nunique has to be called; without the
# parentheses the bound method itself is printed instead of the count.
print(training_set_neg['validation_n'].nunique())

<bound method IndexOpsMixin.nunique of 15381    4
10122    3
4596     1
15161    4
5673     2
        ..
15093    4
4372     1
10545    3
10217    3
19621    5
Name: validation_n, Length: 7147, dtype: category
Categories (5, int64): [1 < 2 < 3 < 4 < 5]>
