In [2]:
import tensorflow as tf
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt



import numpy as np
from matplotlib import pyplot as plt

from sklearn.datasets import make_hastie_10_2
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier

# First, load train_terms.tsv, which holds the list of annotated terms
# (functions) for the proteins. The labels (GO term IDs) are extracted from it
# later on to build the label dataframe that accompanies the protein embeddings.
terms_path = "~/cafa/resources/data/raw/Train/train_terms.tsv"
train_terms = pd.read_csv(terms_path, sep="\t")  # tab-separated file
print(train_terms.shape)
train_terms.head()

2023-07-25 21:31:14.805383: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-07-25 21:31:14.913528: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.


(5363863, 3)


Unnamed: 0,EntryID,term,aspect
0,A0A009IHW8,GO:0008152,BPO
1,A0A009IHW8,GO:0034655,BPO
2,A0A009IHW8,GO:0072523,BPO
3,A0A009IHW8,GO:0044270,BPO
4,A0A009IHW8,GO:0006753,BPO


In [4]:
# Load the precomputed protein embeddings prepared by Sergei Fironov.
# The embeddings used for training are stored in train_embeds.npy and the
# matching protein identifiers are stored in train_ids.npy.
#
# This cell reads the protein identifiers of the training-set embeddings
# (train_ids.npy) into a numpy array.
train_ids_path = '/home/admin/cafa/resources/data/interim/t5embeds/train_ids.npy'
train_protein_ids = np.load(train_ids_path)
print(train_protein_ids.shape)

(142246,)


In [5]:
# Convert the embeddings into a pandas dataframe. Each protein embedding is a
# fixed-length vector, so the resulting frame gets one column per vector
# position (Column_1 .. Column_<vector length>).
train_embeds_path = '/home/admin/cafa/resources/data/interim/t5embeds/train_embeds.npy'
train_embeddings = np.load(train_embeds_path)

# One column name per embedding dimension, numbered from 1.
column_num = train_embeddings.shape[1]
embedding_columns = [f"Column_{i}" for i in range(1, column_num + 1)]

train_df = pd.DataFrame(train_embeddings, columns=embedding_columns)
print(train_df.shape)

# train_df is the feature matrix (X) passed to the model further down.

(142246, 1024)


In [6]:
# Build the binary label matrix: one row per training protein, one column per
# GO term, 1.0 where the protein is annotated with the term and 0.0 otherwise.
num_of_labels = 1500

# Keep the `num_of_labels` most frequent GO term IDs as the label set.
labels = train_terms['term'].value_counts().index[:num_of_labels].tolist()
# The data may contain fewer distinct terms than requested; size everything
# from the labels actually selected so the matrix width matches `labels`.
num_of_labels = len(labels)

train_terms_updated = train_terms.loc[train_terms['term'].isin(labels)]
train_size = train_protein_ids.shape[0]  # number of training proteins
train_labels = np.zeros((train_size, num_of_labels))

# numpy -> pandas so that the vectorised `isin` can be used below.
series_train_protein_ids = pd.Series(train_protein_ids)

# Group the annotations once (term -> array of unique protein IDs) instead of
# re-filtering the whole annotation frame for every label.
proteins_by_label = train_terms_updated.groupby('term')['EntryID'].unique()

for i, label in enumerate(labels):
    # Proteins annotated with this GO term.
    label_related_proteins = proteins_by_label.loc[label]

    # Column i: 1.0 if the protein carries the label, otherwise 0.0.
    train_labels[:, i] = series_train_protein_ids.isin(label_related_proteins).astype(float)

# Wrap the numpy label matrix in a dataframe whose columns are the GO term IDs.
labels_df = pd.DataFrame(data=train_labels, columns=labels)
print(labels_df.shape)

(142246, 1500)


In [7]:
# Preview the first rows of the label dataframe (columns are GO term IDs).
labels_df.head()
# Y = labels_df

Unnamed: 0,GO:0005575,GO:0008150,GO:0110165,GO:0003674,GO:0005622,GO:0009987,GO:0043226,GO:0043229,GO:0005488,GO:0043227,...,GO:0034250,GO:0140053,GO:0031345,GO:0098802,GO:0045861,GO:0051783,GO:0031674,GO:0001818,GO:0006874,GO:0016887
0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
# Declare the classifier. Note this is a single decision tree
# (DecisionTreeClassifier), not a random forest. random_state is fixed so that
# the fitted trees, and therefore the grid-search ranking, are reproducible
# between runs; the value 42 itself is arbitrary.
clf = DecisionTreeClassifier(random_state=42)
# Hyper-parameter grid to search: min_samples_split from 2 up to 792 in steps of 10.
parametrs = {'min_samples_split': range(2, 800, 10)}


In [9]:
# Run the grid search. Two metrics are tracked per candidate: ROC AUC and
# accuracy. `refit='AUC'` names AUC as the metric used for the final refit.
scoring = {
    'AUC': 'roc_auc',
    'Accuracy': make_scorer(accuracy_score),
}

gs = GridSearchCV(
    estimator=clf,
    param_grid=parametrs,
    scoring=scoring,
    refit='AUC',
    return_train_score=True,  # also record scores on the training folds
)

# Fit on the embedding features against the multi-column label frame.
gs.fit(train_df, labels_df)
results = gs.cv_results_

In [None]:
# Display the result: the parameter setting the grid search selected.
gs.best_params_