In [None]:
import os
import numpy as np
import h5py
import random

In [2]:
class seq2tensor(object):
    """Turn token sequences into stacked embedding vectors.

    The lookup table is read from a text file holding one entry per line:
    the token, a tab, then whitespace-separated float components. The
    embedding dimension is taken from the first entry; vectors of later
    entries are truncated to that length.

    Attributes are documented inline in ``__init__``.
    """

    def __init__(self, filename):
        """Load the token -> vector table from ``filename``.

        Args:
            filename: Path of the embedding text file.

        Raises:
            IndexError: If a non-blank line has no tab-separated vector field.
            ValueError: If a vector component cannot be parsed as a float.
        """
        # token string -> 1-D float vector
        self.t2v = {}
        # length of every stored vector; None until the first entry is read
        self.dim = None
        with open(filename, "r") as fin:
            for line in fin:
                line = line.strip()
                if not line:
                    # Tolerate blank lines (e.g. a trailing newline at EOF)
                    # instead of failing with IndexError on the missing field.
                    continue
                fields = line.split("\t")
                t = fields[0]
                v = np.array([float(x) for x in fields[1].split()])
                if self.dim is None:
                    self.dim = len(v)
                else:
                    v = v[:self.dim]
                self.t2v[t] = v

    def embed(self, seq):
        """Embed ``seq`` token by token, silently dropping unknown tokens.

        Surrounding whitespace is ignored. If the remaining text contains a
        space it is split on whitespace into tokens; otherwise every
        character is a token.

        Args:
            seq: Sequence string.

        Returns:
            A 2-D array with one row per known token. When no token is known
            (including the empty sequence) the result has shape
            ``(0, self.dim)`` so that callers can always treat it as 2-D.
        """
        # Strip before choosing the tokenisation mode so that leading or
        # trailing whitespace cannot flip the decision.
        seq = seq.strip()
        tokens = seq.split() if " " in seq else list(seq)
        vectors = [self.t2v[tok] for tok in tokens if tok in self.t2v]
        if not vectors:
            # np.array([]) would be 1-D and break concatenation with padding.
            return np.zeros((0, self.dim or 0))
        return np.array(vectors)

    def embed_normalized(self, seq, seq_size):
        """Embed ``seq`` and force the result to exactly ``seq_size`` rows.

        Longer embeddings are cut after ``seq_size`` rows; shorter ones are
        padded at the end with all-zero rows.

        Args:
            seq: Sequence string, interpreted as in ``embed``.
            seq_size: Number of rows of the returned array.

        Returns:
            A 2-D array with ``seq_size`` rows.
        """
        rst = self.embed(seq)
        n_rows = len(rst)
        if n_rows > seq_size:
            return rst[:seq_size]
        if n_rows < seq_size:
            # Pad with the embedding's own width so the shapes always agree.
            padding = np.zeros((seq_size - n_rows, rst.shape[1]))
            return np.concatenate((rst, padding))
        return rst

In [None]:
# --- Run configuration -------------------------------------------------
# Name of the dataset sub-directory (inside data_dir) to process.
spe = "yeast1"

# Root directory that holds the dataset sub-directories and the embedding table.
data_dir = "ppi-data"

# Seed both the stdlib and the NumPy global generators with the same value.
seed = 1234
random.seed(seed)
np.random.seed(seed)

# Files that live in the selected dataset's own directory.
seq_file, action_file, embedding_h5 = (
    os.path.join(data_dir, spe, fname)
    for fname in ("seq.tsv", "action.tsv", "embedding.h5")
)

# Token embedding table, located directly under data_dir.
emb_file = os.path.join(data_dir, "embedding/vec5_CTC.txt")


In [4]:
# Embedder backed by the token vector table stored in emb_file.
seq2t = seq2tensor(emb_file)
# Sequence ID -> raw sequence string, as read from seq_file.
id2seq = {}

# Embed every sequence listed in seq_file and store each embedding matrix in
# the HDF5 file under its ID. Mode "w" truncates an existing file, so the
# cache is rebuilt from scratch on every run of this cell.
with open(seq_file, "r") as fin, h5py.File(embedding_h5, "w") as h5fout:
    for line in fin:
        line = line.strip()
        if not line:
            continue
        ss = line.split("\t")
        # Named seq_id rather than `id` so the builtin id() is not shadowed
        # for the rest of the kernel session.
        seq_id = ss[0]
        seq = ss[1]
        if seq_id in id2seq:
            # Duplicate ID: the first occurrence wins for both id2seq and the
            # HDF5 file, so the stored embedding always matches id2seq.
            continue
        id2seq[seq_id] = seq
        x = seq2t.embed(seq)
        # The file starts empty and IDs are de-duplicated above, so the
        # dataset cannot already exist; data is cast to float32 on write.
        dset = h5fout.create_dataset(
            seq_id,
            data=x,
            dtype="float32",
            compression="lzf",
        )
