In [1]:
import multiprocessing
import os
import pickle
import random

import numpy as np
import pandas as pd
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

from embeddings_reproduction import embedding_tools

In [20]:
def train(X, k, window, modeldir_path):
    """Train a Doc2Vec model for one (k, window) setting and save it to disk.

    The model is written to ``modeldir_path + '<X>_<k>_<window>.pkl'``. If a
    file already exists at that exact location, training is skipped.

    Parameters
    ----------
    X : str
        Sample name; used as the key into the module-level ``sequence_dict``
        and as the first component of the output file name.
    k : int
        k-mer size forwarded to ``embedding_tools.Corpus``.
    window : int
        Doc2Vec ``window`` hyper-parameter.
    modeldir_path : str
        Directory prefix for the saved model. It is concatenated directly
        with the file name, so it must end with a path separator.

    Returns
    -------
    None
    """
    name_list = [X, str(k), str(window)]
    model_path = modeldir_path + '_'.join(name_list) + '.pkl'
    # Look for the cached model where it is actually saved. Checking the bare
    # file name (i.e. the current working directory) never matched, so every
    # model was retrained on each run.
    if os.path.isfile(model_path):
        return
    print('X\t\tk\twindow')
    print(name_list[0] + '\t\t' + '\t'.join(name_list[1:]))
    kmer_hypers = {'k':k, 
                   'overlap':False,
                   'merge':False}
    # TODO (from the original author's note): add a random seed.
    model_hypers = {'vector_size': 64,
                    'min_count': 0,
                    'epochs': 25,
                    'window':window,
                    'workers': 4}
    # Reads the module-level ``sequence_dict``; it must be defined before calling.
    documents = embedding_tools.Corpus(sequence_dict[X], kmer_hypers)
    print(documents)
    model = Doc2Vec(**model_hypers)
    model.build_vocab(documents)
    model.train(documents,total_examples=model.corpus_count, epochs=model.epochs)
    model.save(model_path)

In [4]:
def infer_vectors(df, model, k, dest_file, overlap=False, method=None):
    """Infer embeddings for the sequences in ``df`` and pickle them.

    The result is written as a ``(embeds, terms)`` tuple to
    ``dest_file + 'X_' + <model file name>``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table passed to ``embedding_tools.get_seqs``; its index becomes
        the index of the embedding frame.
    model : str
        Path to the saved model file. It is forwarded unchanged to
        ``embedding_tools.get_embeddings_new`` and its base name is reused in
        the output file name.
    k : int
        k-mer size forwarded to ``embedding_tools.get_embeddings_new``.
    dest_file : str
        Output directory prefix. It is concatenated directly with the file
        name, so it must end with a path separator.
    overlap : bool, optional
        Forwarded to ``embedding_tools.get_embeddings_new``.
    method : optional
        If not None, sequences are first passed through
        ``embedding_tools.randomize_seqs`` with this method.

    Returns
    -------
    None
    """
    # Local import: ntpath splits on both '\\' and '/' on every platform,
    # whereas splitting on '\\' alone kept the directory part for '/' paths.
    import ntpath

    seqs = embedding_tools.get_seqs(df)
    if method is not None:
        seqs = embedding_tools.randomize_seqs(seqs, method=method)  # randomize sequences
    embeds = embedding_tools.get_embeddings_new(model, seqs, k=k,
                                                overlap=overlap)
    embeds = pd.DataFrame(embeds, index=df.index)
    # One integer label per embedding column.
    terms = list(range(embeds.shape[1]))
    name = ntpath.basename(model)
    with open(dest_file + 'X_' + name, 'wb') as f:
        pickle.dump((embeds, terms), f)

# 开始训练任务

In [17]:
# Raw strings keep the Windows backslashes literal. The previous plain strings
# relied on invalid escape sequences (\B, \m, \F), which only work by accident
# and emit a warning on recent Python versions.
mut_path = r'H:\BLOOD_Task\mixdata\multi-ISTH-RJ.csv'
mutdata = pd.read_csv(mut_path)
# One entry per row of the CSV; used to locate the matching FASTA files.
mutsites = mutdata['Amino Acid Substitution']
fstdir_path = r'H:\BLOOD_Task\mixdata\FASTA'

def _read_first_fasta_sequence(fasta_path):
    """Return the sequence of the first record in a FASTA file as one string."""
    with open(fasta_path, 'r') as opf:
        lines = opf.read().splitlines()
    residues = []
    # Line 0 is treated as the header. Every following non-blank line up to
    # the next '>' header belongs to the first record; reading only line 1
    # silently truncated line-wrapped sequences.
    for line in lines[1:]:
        line = line.strip()
        if line.startswith('>'):
            break
        if line:
            residues.append(line)
    if not residues:
        raise ValueError('No sequence found in FASTA file: ' + fasta_path)
    return ''.join(residues)


def get_fasta(mutsites,fstdir_path):
    """Load one FASTA sequence per mutation site.

    For every entry ``m`` in ``mutsites`` the file ``vWF_<m>.fasta`` inside
    ``fstdir_path`` is read.

    Parameters
    ----------
    mutsites : iterable of str
        Mutation-site labels used to build the file names.
    fstdir_path : str
        Directory containing the FASTA files.

    Returns
    -------
    dict
        ``{'vWF': DataFrame}`` where the frame has the columns ``mutsite``
        and ``sequence``.

    Raises
    ------
    ValueError
        If a FASTA file contains no sequence lines after its header.
    """
    sequence = []
    for mutsite in mutsites:
        fasta_path = os.path.join(fstdir_path, 'vWF_' + mutsite + '.fasta')
        sequence.append(_read_first_fasta_sequence(fasta_path))
    return {'vWF': pd.DataFrame({'mutsite':mutsites,'sequence':sequence})}

In [18]:
# Build the {'vWF': DataFrame} lookup that train() reads, then show it as the cell output.
sequence_dict = get_fasta(mutsites=mutsites, fstdir_path=fstdir_path)
sequence_dict

{'vWF':     mutsite                                           sequence
 0      G19R  MIPARFAGVLLALALILPRTLCAEGTRGRSSTARCSLFGSDFVNTF...
 1      D47H  MIPARFAGVLLALALILPGTLCAEGTRGRSSTARCSLFGSDFVNTF...
 2      S49R  MIPARFAGVLLALALILPGTLCAEGTRGRSSTARCSLFGSDFVNTF...
 3      S85P  MIPARFAGVLLALALILPGTLCAEGTRGRSSTARCSLFGSDFVNTF...
 4     L129M  MIPARFAGVLLALALILPGTLCAEGTRGRSSTARCSLFGSDFVNTF...
 ..      ...                                                ...
 288  L2617M  MIPARFAGVLLALALILPGTLCAEGTRGRSSTARCSLFGSDFVNTF...
 289  P2628P  MIPARFAGVLLALALILPGTLCAEGTRGRSSTARCSLFGSDFVNTF...
 290  L2702P  MIPARFAGVLLALALILPGTLCAEGTRGRSSTARCSLFGSDFVNTF...
 291  G2705R  MIPARFAGVLLALALILPGTLCAEGTRGRSSTARCSLFGSDFVNTF...
 292  C2750Y  MIPARFAGVLLALALILPGTLCAEGTRGRSSTARCSLFGSDFVNTF...
 
 [293 rows x 2 columns]}

In [22]:
# Train one Doc2Vec model per (k, window) combination.
# NOTE(review): only k=1 is trained here, yet the inference cell below reports
# 35 models -- presumably this cell was run earlier with a wider k range.
X = 'vWF'
# Backslashes are escaped explicitly; the previous literal relied on invalid
# escape sequences (\B, \m, \d). The string value is unchanged.
modeldir_path = 'H:\\BLOOD_Task\\mixdata\\doc\\vwF_model\\'
# model.save() does not create missing directories.
os.makedirs(modeldir_path, exist_ok=True)
for k in range(1,2):
    for window in range(1,8):
        train(X,k,window,modeldir_path)

X		k	window
vWF		1	1
<embeddings_reproduction.embedding_tools.Corpus object at 0x0000015ACB56BBE0>
X		k	window
vWF		1	2
<embeddings_reproduction.embedding_tools.Corpus object at 0x0000015ACC792B80>
X		k	window
vWF		1	3
<embeddings_reproduction.embedding_tools.Corpus object at 0x0000015ACC010E50>
X		k	window
vWF		1	4
<embeddings_reproduction.embedding_tools.Corpus object at 0x0000015ACC792B80>
X		k	window
vWF		1	5
<embeddings_reproduction.embedding_tools.Corpus object at 0x0000015ACB7A3400>
X		k	window
vWF		1	6
<embeddings_reproduction.embedding_tools.Corpus object at 0x0000015ACB795FD0>
X		k	window
vWF		1	7
<embeddings_reproduction.embedding_tools.Corpus object at 0x0000015ACC792B80>


In [104]:
# Collect the saved model files. Backslashes are escaped explicitly (the
# previous literal relied on invalid escape sequences); the path is unchanged.
models = os.listdir('H:\\BLOOD_Task\\mixdata\\doc\\vwF_model\\')
# Keep only '.pkl' files and sort so the processing order is reproducible.
models = sorted(m for m in models if m.endswith('.pkl'))

In [105]:
# Infer and pickle embeddings for every trained model.
# Backslashes are escaped explicitly; the previous literals relied on invalid
# escape sequences. The path values are unchanged.
dest = 'H:\\BLOOD_Task\\mixdata\\doc\\' + X + '_embeddings\\'
# infer_vectors opens its output file directly, so the directory must exist.
os.makedirs(dest, exist_ok=True)
for model in models:
    # Model files are named '<X>_<k>_<window>.pkl' (see train). Parse k from
    # the name parts instead of a fixed character offset, which broke as soon
    # as k or window had more than one digit.
    k = int(os.path.splitext(model)[0].split('_')[-2])
    print('Inferring...')
    infer_vectors(sequence_dict['vWF'], 'H:\\BLOOD_Task\\mixdata\\doc\\vwF_model\\'+ model, k, dest)

Inferring...
Inferring...
Inferring...
Inferring...
Inferring...
Inferring...
Inferring...
Inferring...
Inferring...
Inferring...
Inferring...
Inferring...
Inferring...
Inferring...
Inferring...
Inferring...
Inferring...
Inferring...
Inferring...
Inferring...
Inferring...
Inferring...
Inferring...
Inferring...
Inferring...
Inferring...
Inferring...
Inferring...
Inferring...
Inferring...
Inferring...
Inferring...
Inferring...
Inferring...
Inferring...


# 带入多标签的训练模型尝试训练

In [23]:
import pickle

# Path to one pickled embedding file. Note: there should be several such files
# in this directory. A raw string keeps the backslashes literal; the previous
# literal relied on invalid escape sequences (\B, \m, \d, \X).
path = r'H:\BLOOD_Task\mixdata\doc\vwF_embeddings\X_vWF_1_1.pkl'

with open(path,'rb') as f:
    data = pickle.load(f)    # data[0] holds the data we need (the embedding frame)

data

(           0         1         2         3         4         5         6   \
 0   -0.067734  0.043939  0.050996 -0.078280  0.043260  0.134887  0.019991   
 1   -0.027166  0.150776 -0.006376 -0.084445  0.033147  0.126106 -0.012924   
 2   -0.080785  0.040340  0.017973 -0.072143  0.047508  0.138301  0.021396   
 3   -0.029599  0.175719 -0.011658 -0.047247  0.035556  0.135953 -0.003931   
 4   -0.095703  0.073214 -0.003976 -0.025609  0.029893  0.127486 -0.015924   
 ..        ...       ...       ...       ...       ...       ...       ...   
 288 -0.086131  0.144778 -0.015277 -0.076415  0.060201  0.142823 -0.009647   
 289 -0.122592  0.124922  0.005250 -0.040201  0.053257  0.126796  0.027786   
 290 -0.079587  0.089508 -0.029105 -0.054867  0.042508  0.112346  0.007114   
 291 -0.128343  0.096922  0.020772 -0.039588  0.093079  0.095187 -0.003901   
 292 -0.046974  0.116484  0.009552 -0.091489  0.083511  0.076209 -0.012418   
 
            7         8         9   ...        54        55   

In [2]:
from sklearn.model_selection import train_test_split
import pandas as pd 

# Relies on ``data`` (the (embeddings, terms) tuple) loaded by the previous cell.
num = data[0].index
# 80/20 hold-out split of the row labels, then a second 80/20 split of the
# train+validation part. Fixed random_state keeps the splits reproducible.
trainval_idx,test_idx = train_test_split(num,test_size=0.2,random_state=0)
train_idx,valid_idx = train_test_split(trainval_idx,test_size=0.2,random_state=0)

# Raw string keeps the backslashes literal; the previous literal relied on
# invalid escape sequences (\B, \m).
mut_path = r'H:\BLOOD_Task\mixdata\multi-ISTH-RJ.csv'
mutdata = pd.read_csv(mut_path)
# Multi-label target matrix Y: the six columns class_0 .. class_5.
target = mutdata[['class_' + str(i) for i in range(6)]].copy()


X_trainval = data[0].loc[trainval_idx]
X_test = data[0].loc[test_idx]

y_trainval = target.loc[trainval_idx]
y_test = target.loc[test_idx]

In [20]:
import os 
import pickle
from sklearn.model_selection import cross_val_score
from skmultilearn.adapt import MLkNN
import numpy as np 
from scipy import sparse
import tqdm
import warnings
warnings.filterwarnings('ignore') # suppress non-critical warnings

# Evaluate every pickled embedding file with a grid-searched MLkNN classifier.
# Backslashes are escaped explicitly (the previous literals relied on invalid
# escape sequences); the path values are unchanged.
veclst = os.listdir('H:\\BLOOD_Task\\mixdata\\doc\\vwF_embeddings\\')
# Skip anything in the directory that is not a pickled embedding file.
veclst = [v for v in veclst if v.endswith('.pkl')]
test_score = []

for vec in tqdm.tqdm(veclst):
    vec_path = 'H:\\BLOOD_Task\\mixdata\\doc\\vwF_embeddings\\' + str(vec)
    with open(vec_path,'rb') as f:
        data = pickle.load(f)
    num = data[0].index
    trainval_idx,test_idx = train_test_split(num,test_size=0.2,random_state=0)
    # train_idx / valid_idx are not used below; model selection uses cross-validation.
    train_idx,valid_idx = train_test_split(trainval_idx,test_size=0.2,random_state=0)

    X_trainval = data[0].loc[trainval_idx]
    X_test = data[0].loc[test_idx]

    # Reads ``target`` (the class_0..class_5 label frame) defined in an earlier cell.
    y_trainval = sparse.lil_matrix(target.loc[trainval_idx])
    y_test = sparse.lil_matrix(target.loc[test_idx])

    model_name = 'MLKNN'
    # Start below any attainable score so the first valid grid point is always
    # recorded. Starting at 0 with a strict '>' left best_parameters empty when
    # every CV score was 0, silently falling back to MLkNN() defaults.
    best_score = -np.inf
    best_parameters ={}
    for k in range(1,6):
        for s in [0.000001,0.0001,0.01,1,100,1000]:
            mlknn = MLkNN(k=k,s=s)
            scores = cross_val_score(mlknn,X_trainval, y_trainval,cv=5,scoring='f1_samples')
            score = np.mean(scores)
            if score > best_score:
                best_score = score
                best_parameters = {'k':k,'s':s}

    # Refit on the full train+validation split with the selected hyper-parameters.
    mlknn = MLkNN(**best_parameters)
    mlknn.fit(X_trainval,y_trainval)
    # NOTE(review): mlknn.score() uses the estimator's default metric, which may
    # differ from the 'f1_samples' metric used for selection -- confirm intended.
    test_score.append([os.path.splitext(vec)[0],model_name,mlknn.score(X_test,y_test),best_parameters])

100%|██████████| 35/35 [08:02<00:00, 13.78s/it]


In [None]:
from gpmodel import gpmodel  # ?? What's this from?
from gpmodel import gpkernel
from gpmodel import gptools