# Time and memory consumption

In this notebook, training time and peak memory consumption are measured as a function of the number of labelled training instances available. We use the avGFP dataset and randomly select 100, 1000 and 10000 training instances. For each method (the best method of each strategy), we measure the training time and peak memory in each case.



In [1]:
import pickle as pk
import numpy as np
import pandas as pd

import os 
import time 
import tracemalloc

import random

from sklearn.model_selection import train_test_split

### DCA encoding

In [2]:
# Load the DCA-encoded avGFP matrices (Xl_* / Xu_* -- presumably the labelled
# and unlabelled sets; confirm against the dataset docs).
# Files are opened in `with` blocks so every handle is closed right after
# unpickling instead of lingering in the kernel.
# NOTE(review): pickle can execute arbitrary code while loading; only use
# dataset files from a trusted source.
with open('datasets/avgfp_Xl_dcae.pk', 'rb') as fh:
    Xl_dcae = pk.load(fh)
print(Xl_dcae.shape)
with open('datasets/avgfp_Xu_dcae.pk', 'rb') as fh:
    Xu_dcae = pk.load(fh)
print(Xu_dcae.shape)

# `indexes` is a row selector for the other encodings' labelled matrices;
# `wild_type` is the DCA encoding loaded from the *_wt_* file.
with open('datasets/avgfp_indexes.pk', 'rb') as fh:
    indexes = pk.load(fh)
with open('datasets/avgfp_wt_dcae.pk', 'rb') as fh:
    wild_type = pk.load(fh)

(30466, 210)
(697, 210)


### PAM250 encoding

In [3]:
# Load the PAM250-encoded matrices. Each sample is flattened to a single
# feature vector; the Xl matrix is additionally row-indexed with `indexes`
# (the Xu matrix is not).
# `with` blocks close each file handle as soon as unpickling finishes.
with open('datasets/avgfp_Xl_pam250.pk', 'rb') as fh:
    Xl_pam250 = pk.load(fh)
Xl_pam250 = Xl_pam250.reshape((Xl_pam250.shape[0], -1))[indexes]
print(Xl_pam250.shape)
with open('datasets/avgfp_Xu_pam250.pk', 'rb') as fh:
    Xu_pam250 = pk.load(fh)
Xu_pam250 = Xu_pam250.reshape((Xu_pam250.shape[0], -1))
print(Xu_pam250.shape)

(30466, 4700)
(697, 4700)


### Unirep encoding

In [4]:
# Load the UniRep-encoded Xl matrix, flatten each sample to one feature
# vector and row-index it with `indexes`.
# The `with` block closes the file handle once the array is unpickled.
with open('datasets/avgfp_Xl_unirep.pk', 'rb') as fh:
    Xl_unirep = pk.load(fh)
Xl_unirep = Xl_unirep.reshape((Xl_unirep.shape[0], -1))[indexes]
print(Xl_unirep.shape)

(30466, 1900)


### eUnirep encoding

In [5]:
# Load the eUnirep-encoded Xl matrix, flatten each sample to one feature
# vector and row-index it with `indexes`.
# The `with` block closes the file handle once the array is unpickled.
with open('datasets/avgfp_Xl_eunirep.pk', 'rb') as fh:
    Xl_eunirep = pk.load(fh)
Xl_eunirep = Xl_eunirep.reshape((Xl_eunirep.shape[0], -1))[indexes]
print(Xl_eunirep.shape)

(30466, 1900)


### Target y

In [6]:
# Load the target vector; the `with` block closes the file handle after
# unpickling.
with open('datasets/avgfp_y_dcae.pk', 'rb') as fh:
    y = pk.load(fh)
# Binary label: 1 where y is at or above its own 75th percentile, else 0.
# It serves as the stratification label when subsampling training rows.
y_cat = np.where(y >= np.percentile(y, 75), 1, 0)
y.shape

(30466,)

# Strategy 0: Unirep + Random Forest

In [7]:
from sklearn.ensemble import RandomForestRegressor

# Strategy 0 benchmark: fit a default RandomForestRegressor on the UniRep
# encoding for growing training subsets and print, per subset size, the
# wall-clock fit time and tracemalloc's (current, peak) traced bytes.
Xl = Xl_unirep
estimator = RandomForestRegressor()
row_ids = np.arange(len(Xl))  # loop-invariant, so built once

for n in [100, 1000, 10000]:

    print(f'N: {n}')
    # Request exactly n training rows. Deriving a float test_size from n
    # (1 - n/len(y)) can round to n-1 rows; an integer train_size cannot.
    n_indexes, _, _, _ = train_test_split(row_ids,
                                          y_cat,
                                          train_size=n,
                                          random_state=1234,
                                          stratify=y_cat)
    Xl_train = Xl[n_indexes]
    y_train = y[n_indexes]

    # NOTE(review): tracing stays enabled during the timed fit, so the
    # reported time may include tracemalloc's bookkeeping overhead; and only
    # allocations made through Python's allocators are traced.
    tracemalloc.start()
    try:
        # perf_counter is monotonic and high-resolution, unlike time.time().
        t_start = time.perf_counter()

        estimator.fit(Xl_train, y_train)

        t_end = time.perf_counter()
        print(f'time: {t_end-t_start}')
        print(f'mem: {tracemalloc.get_traced_memory()}')
    finally:
        # Always stop tracing, even if fit raises, so the kernel is not left
        # with tracemalloc silently enabled.
        tracemalloc.stop()

N: 100
time: 4.1439597606658936
mem: (124175, 915166)
N: 1000
time: 113.55108332633972
mem: (59548, 7733214)
N: 10000
time: 2300.7957031726837
mem: (57727, 76388462)


# Strategy 1: DCA + SVM

In [8]:
from sklearn.svm import SVR

# Strategy 1 benchmark: fit a default SVR on the DCA encoding for growing
# training subsets and print, per subset size, the wall-clock fit time and
# tracemalloc's (current, peak) traced bytes.
Xl = Xl_dcae
estimator = SVR()
row_ids = np.arange(len(Xl))  # loop-invariant, so built once

for n in [100, 1000, 10000]:

    print(f'N: {n}')
    # Request exactly n training rows. Deriving a float test_size from n
    # (1 - n/len(y)) can round to n-1 rows; an integer train_size cannot.
    n_indexes, _, _, _ = train_test_split(row_ids,
                                          y_cat,
                                          train_size=n,
                                          random_state=1234,
                                          stratify=y_cat)
    Xl_train = Xl[n_indexes]
    y_train = y[n_indexes]

    # NOTE(review): tracing stays enabled during the timed fit, so the
    # reported time may include tracemalloc's bookkeeping overhead; and only
    # allocations made through Python's allocators are traced.
    tracemalloc.start()
    try:
        # perf_counter is monotonic and high-resolution, unlike time.time().
        t_start = time.perf_counter()

        estimator.fit(Xl_train, y_train)

        t_end = time.perf_counter()
        print(f'time: {t_end-t_start}')
        print(f'mem: {tracemalloc.get_traced_memory()}')
    finally:
        # Always stop tracing, even if fit raises, so the kernel is not left
        # with tracemalloc silently enabled.
        tracemalloc.stop()

N: 100
time: 0.00244903564453125
mem: (165310, 240103)
N: 1000
time: 0.09348249435424805
mem: (1354213, 1747030)
N: 10000
time: 12.046603679656982
mem: (12783728, 16866998)


# Strategy 2: DCA + MERGE[SVM]

In [9]:
from models.MERGE_v2 import Merge
from sklearn.svm import SVR

# Strategy 2 benchmark: build and fit Merge (given the wild-type DCA encoding
# and a fresh SVR base regressor) on the DCA encoding for growing training
# subsets, printing wall-clock time and tracemalloc's (current, peak) bytes.
# The previously assigned module-level `estimator` was never used here, so it
# has been dropped; Merge receives its own SVR() each iteration.
Xl = Xl_dcae
row_ids = np.arange(len(Xl))  # loop-invariant, so built once

for n in [100, 1000, 10000]:

    print(f'N: {n}')
    # Request exactly n training rows. Deriving a float test_size from n
    # (1 - n/len(y)) can round to n-1 rows; an integer train_size cannot.
    n_indexes, _, _, _ = train_test_split(row_ids,
                                          y_cat,
                                          train_size=n,
                                          random_state=1234,
                                          stratify=y_cat)
    Xl_train = Xl[n_indexes]
    y_train = y[n_indexes]

    # NOTE(review): tracing stays enabled during the timed section, so the
    # reported time may include tracemalloc's bookkeeping overhead; and only
    # allocations made through Python's allocators are traced.
    tracemalloc.start()
    try:
        # perf_counter is monotonic and high-resolution, unlike time.time().
        t_start = time.perf_counter()

        # Model construction is deliberately inside the timed section.
        merge = Merge(wild_type=wild_type, base_regressor=SVR())
        merge.fit(Xl_train, y_train)

        t_end = time.perf_counter()
        print(f'time: {t_end-t_start}')
        print(f'mem: {tracemalloc.get_traced_memory()}')
    finally:
        # Always stop tracing, even if fit raises, so the kernel is not left
        # with tracemalloc silently enabled.
        tracemalloc.stop()

N: 100
time: 0.7134420871734619
mem: (290839, 305037)
N: 1000
time: 0.8090908527374268
mem: (1400290, 1756136)
N: 10000
time: 29.692567586898804
mem: (12958994, 16948136)


# Strategy 3: PAM250 + TriTr[RF]

In [12]:
from models.TriTrainingRegressor import TriTrainingRegressor
from sklearn.ensemble import RandomForestRegressor

# Strategy 3 benchmark: build and fit TriTrainingRegressor with a
# RandomForestRegressor base estimator on the PAM250 encoding, for growing
# training subsets augmented with the Xu_pam250 rows. Prints wall-clock time
# and tracemalloc's (current, peak) traced bytes per subset size.
Xl = Xl_pam250
estimator = RandomForestRegressor()
row_ids = np.arange(len(Xl))  # loop-invariant, so built once

for n in [100, 1000, 10000]:

    print(f'N: {n}')
    # Request exactly n training rows. Deriving a float test_size from n
    # (1 - n/len(y)) can round to n-1 rows; an integer train_size cannot.
    n_indexes, _, _, _ = train_test_split(row_ids,
                                          y_cat,
                                          train_size=n,
                                          random_state=1234,
                                          stratify=y_cat)
    Xl_train = Xl[n_indexes]
    y_train = y[n_indexes]
    # Append the Xu rows with None as their target -- presumably the marker
    # TriTrainingRegressor uses for unlabelled samples; verify in its source.
    Xl_train_tritr = np.concatenate((Xl_train, Xu_pam250))
    y_train_tritr = np.concatenate((y_train, np.full(Xu_pam250.shape[0], None)))

    # NOTE(review): tracing stays enabled during the timed section, so the
    # reported time may include tracemalloc's bookkeeping overhead; and only
    # allocations made through Python's allocators are traced.
    tracemalloc.start()
    try:
        # perf_counter is monotonic and high-resolution, unlike time.time().
        t_start = time.perf_counter()

        # Model construction is deliberately inside the timed section.
        tritr = TriTrainingRegressor(base_estimator=estimator)
        tritr.fit(Xl_train_tritr, y_train_tritr)

        t_end = time.perf_counter()
        print(f'time: {t_end-t_start}')
        print(f'mem: {tracemalloc.get_traced_memory()}')
    finally:
        # Always stop tracing, even if fit raises, so the kernel is not left
        # with tracemalloc silently enabled.
        tracemalloc.stop()

N: 100
time: 10.523196458816528
mem: (35849377, 93909205)
N: 1000
time: 385.058908700943
mem: (363348, 159119623)
N: 10000
time: 20368.903308153152
mem: (343946, 1344709988)


# Strategy 4: DCA + TriTr[SVM]

In [13]:
from models.TriTrainingRegressor import TriTrainingRegressor
from sklearn.svm import SVR

# Strategy 4 benchmark: build and fit TriTrainingRegressor with an SVR base
# estimator on the DCA encoding, for growing training subsets augmented with
# the Xu_dcae rows. Prints wall-clock time and tracemalloc's (current, peak)
# traced bytes per subset size.
Xl = Xl_dcae
estimator = SVR()
row_ids = np.arange(len(Xl))  # loop-invariant, so built once

for n in [100, 1000, 10000]:

    print(f'N: {n}')
    # Request exactly n training rows. Deriving a float test_size from n
    # (1 - n/len(y)) can round to n-1 rows; an integer train_size cannot.
    n_indexes, _, _, _ = train_test_split(row_ids,
                                          y_cat,
                                          train_size=n,
                                          random_state=1234,
                                          stratify=y_cat)
    Xl_train = Xl[n_indexes]
    y_train = y[n_indexes]
    # Append the Xu rows with None as their target -- presumably the marker
    # TriTrainingRegressor uses for unlabelled samples; verify in its source.
    Xl_train_tritr = np.concatenate((Xl_train, Xu_dcae))
    y_train_tritr = np.concatenate((y_train, np.full(Xu_dcae.shape[0], None)))

    # NOTE(review): tracing stays enabled during the timed section, so the
    # reported time may include tracemalloc's bookkeeping overhead; and only
    # allocations made through Python's allocators are traced.
    tracemalloc.start()
    try:
        # perf_counter is monotonic and high-resolution, unlike time.time().
        t_start = time.perf_counter()

        # Model construction is deliberately inside the timed section.
        tritr = TriTrainingRegressor(base_estimator=estimator)
        tritr.fit(Xl_train_tritr, y_train_tritr)

        t_end = time.perf_counter()
        print(f'time: {t_end-t_start}')
        print(f'mem: {tracemalloc.get_traced_memory()}')
    finally:
        # Always stop tracing, even if fit raises, so the kernel is not left
        # with tracemalloc silently enabled.
        tracemalloc.stop()

N: 100
time: 0.02988266944885254
mem: (444128, 2032524)
N: 1000
time: 3.310210943222046
mem: (4096276, 12064677)
N: 10000
time: 235.62896537780762
mem: (38388233, 106986605)
