In [2]:
from pathlib import Path

import numpy as np
import pandas as pd


from numpy import mean
from numpy import std
from sklearn.model_selection import RepeatedKFold
from sklearn.metrics import accuracy_score

# Использовать только процессор.
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
# Изменить уровень отображения логов
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
import tensorflow as tf

# Корень проекта.
DIR_ROOT = Path.cwd().parent.parent
# Путь к удаленной директории с ресурсами: данные, модели и т.д.
DIR_REMOTE: Path | None = Path('/home/admin/cafa/resources')

if DIR_REMOTE is not None and DIR_REMOTE.exists():
    DIR_RESOURCE = DIR_REMOTE
else:
    DIR_RESOURCE = DIR_ROOT

In [3]:
print("TensorFlow v" + tf.__version__)

TensorFlow v2.13.0


In [6]:
# загружаем эмбединги белков, связанные с метками по индексу
train_df = pd.read_csv(DIR_RESOURCE / 'data/prepared/train_df.csv')
train_df.head()

Unnamed: 0,Column_1,Column_2,Column_3,Column_4,Column_5,Column_6,Column_7,Column_8,Column_9,Column_10,...,Column_1015,Column_1016,Column_1017,Column_1018,Column_1019,Column_1020,Column_1021,Column_1022,Column_1023,Column_1024
0,0.049488,-0.032935,0.032473,-0.033723,-0.059505,0.075936,-0.04586,-0.104476,-0.072112,0.038806,...,-0.046516,-0.028773,-0.021878,-0.097883,0.056475,0.08095,-0.020938,-0.043532,0.096463,0.07307
1,-0.044616,0.064925,-0.080263,-0.075338,-0.004731,0.025408,-0.024685,-0.016568,-0.03898,-0.03387,...,-0.040173,0.003925,0.00613,0.007362,-0.066848,0.106882,-0.030134,0.026724,0.027879,-0.04843
2,-0.020128,-0.049779,0.007894,-0.000829,-0.047737,0.086453,-0.038107,-0.036379,0.029611,0.045029,...,0.029388,0.008456,0.000697,0.013502,-0.005968,-0.011571,0.005704,-0.036103,0.007693,0.106234
3,-0.007515,0.060628,0.004045,0.027056,-0.021542,0.01038,-0.025064,-0.055834,0.068238,0.027764,...,0.020792,0.023307,0.009009,0.018211,0.02082,-0.031738,0.013279,-0.018357,0.008087,0.010917
4,0.013468,0.041516,0.018435,-0.035595,0.00877,0.018699,-0.015452,-0.038092,-0.038326,-0.012299,...,-0.044742,-0.025432,-0.060886,-0.026915,0.026342,0.017237,0.014731,-0.016861,-0.016272,0.037054


In [7]:
# загружаем подготовленные метки, связанные с эмбедингами по индексу, 
# 1 == принадлежит онтологии, 0 == нет.
lables_df = pd.read_csv(
    DIR_RESOURCE / 'data/prepared/train_lbls_top1500_goterms.csv'
)
lables_df.head()

Unnamed: 0,GO:0005575,GO:0008150,GO:0110165,GO:0003674,GO:0005622,GO:0009987,GO:0043226,GO:0043229,GO:0005488,GO:0043227,...,GO:0034250,GO:0140053,GO:0031345,GO:0098802,GO:0045861,GO:0051783,GO:0031674,GO:0001818,GO:0006874,GO:0016887
0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# сокращаем количество белков и меток до 1000, чтобы код выполнялся за минуту
# на этапе тестирования аккуратности модели

lables_df = lables_df.iloc[:1000]
train_df = train_df.iloc[:1000]

# преобразуем в ndarray для работы с кодом evaluate_model()

arr_lables = lables_df.to_numpy()
arr_train = train_df.to_numpy()

In [10]:
# у нас 1000 белков с 1024 признаками для каждого
arr_train.shape

(1000, 1024)

In [71]:
# у нас 1000 меток для каждого белка с указанием принадлежности его к одной из
# 1500 онтологий
arr_lables.shape

(1000, 1500)

### За baseline взята статья [Multi-Label Classification with Deep Learning](https://machinelearningmastery.com/multi-label-classification-with-deep-learning/)

In [11]:
# собираем модель по baseline с Kaggle
def get_model(n_inputs: int, n_outputs: int):
    model = tf.keras.Sequential([
        tf.keras.layers.BatchNormalization(input_shape=(n_inputs,)),
        tf.keras.layers.Dense(units=512, activation='relu'),
        tf.keras.layers.Dense(units=512, activation='relu'),
        tf.keras.layers.Dense(units=512, activation='relu'),
        tf.keras.layers.Dense(units=n_outputs, activation='sigmoid'),
    ])
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.002),
        loss='binary_crossentropy',
        metrics=['binary_accuracy', tf.keras.metrics.AUC()],
    )
    return model

In [73]:
# оцениваем модель с использованием k-fold cross-validation
def evaluate_model(X: np.ndarray, y: np.ndarray):
	results = []
	n_inputs, n_outputs = X.shape[1], y.shape[1]
	# разделение данных на обучающие и тестовые наборы с использованием повторяющейся K-Fold кросс-валидации. В данном случае, данные будут разделены на 2 фолда и это повторится 2 раза, а также используется фиксированное значение random_state для воспроизводимости.
	cv = RepeatedKFold(n_splits=2, n_repeats=2, random_state=1)
	for train_ix, test_ix in cv.split(X):
		# собираем данные исходя из индексов, полученных при разделении набора белков на тест и трейн
		# пример train_ix: list [  1   4   5   7  10  15  18  20  21 ...]
		X_train, X_test = X[train_ix], X[test_ix]
		y_train, y_test = y[train_ix], y[test_ix]
		# получем нашу модель
		model = get_model(n_inputs, n_outputs)
		# тренируем
		model.fit(X_train, y_train, verbose=0, epochs=10)
		# получаем предсказания обученной модели
		y_pred = model.predict(X_test)
		# данные нужно транспонировать, тк судя по коду из документации (внизу) сравнение идем по столбцам, те если первый столбец из предсказаний равен первому столбцу меток, то F1Score дает 1, у нас метки для одного белка идут в виде строки изначально, поэтому транспонировав их в столбец мы как раз сравниваем предсказания по одному и тому же белку с метками реальными
		y_pred_transposed = y_pred.transpose()
		y_test_transposed = y_test.transpose()
		#Работает только на TensorFlow v2.13.0, threshold - после какого порога значения равны 0.5 не входит, а 0.51 уже равно 1, т.е 1 == 0.51 при threshold=0.5 даст True
		metric = tf.keras.metrics.F1Score(threshold=0.5)
		metric.update_state(y_test_transposed, y_pred_transposed)
		result = metric.result()
		acc = result.numpy()
		# изначальный код использовал accuracy_score и оно работало на рандомных цифрах из статьи, но на наших данных стало давать 0, хотя визуально там явно не 0% совпадения
		# acc = accuracy_score(y_test, yhat)
		results.append(acc)
	return results

In [74]:
X = arr_train
y = arr_lables
results = evaluate_model(X, y)
print('Accuracy: %.3f (%.3f)' % (mean(results), std(results)))

Accuracy: 0.338 (0.174)


In [76]:
# тут можно посмотреть как работает RepeatedKFold на наших данных

X = arr_train
cv = RepeatedKFold(n_splits=2, n_repeats=2, random_state=1)

for i, (train_index, test_index) in enumerate(cv.split(X)):
    print(f"Fold {i}:")
    print(f"  Train: index={train_index[:20]}")
    print(f"  Test:  index={test_index[:20]}")

    # X_train, X_test = X[train_index], X[test_index]
    # print(X_train, X_test)

Fold 0:
  Train: index=[ 1  4  5  7 10 15 18 20 21 22 24 25 26 27 28 32 33 36 37 38]
  Test:  index=[ 0  2  3  6  8  9 11 12 13 14 16 17 19 23 29 30 31 34 35 39]
Fold 1:
  Train: index=[ 0  2  3  6  8  9 11 12 13 14 16 17 19 23 29 30 31 34 35 39]
  Test:  index=[ 1  4  5  7 10 15 18 20 21 22 24 25 26 27 28 32 33 36 37 38]
Fold 2:
  Train: index=[ 1  2  5  6  7  9 12 14 15 16 19 20 21 22 23 24 25 26 27 29]
  Test:  index=[ 0  3  4  8 10 11 13 17 18 28 30 36 37 38 39 40 43 47 48 49]
Fold 3:
  Train: index=[ 0  3  4  8 10 11 13 17 18 28 30 36 37 38 39 40 43 47 48 49]
  Test:  index=[ 1  2  5  6  7  9 12 14 15 16 19 20 21 22 23 24 25 26 27 29]


In [63]:
# код из докуметации по использованию F1Score, первый ответ равен 1, потому
# что первый столбец y_true совпадает (по прогу) с первым столбцом y_pred.
# если бы сравнение шло по строкам, то трейти ответ не был бы == 1, '1, 1, 0'
# не равно '0.71, 0.4, 0' - 0.4 ниже порога 0.71

metric = tf.keras.metrics.F1Score(threshold=0.7)
y_true = np.array([[1, 1, 1],
                   [1, 0, 0],
                   [1, 1, 0]], np.int32)
y_pred = np.array([[1, 1, 1],
                   [1, 0, 0],
                   [0.71, 0.4, 0.0]], np.float32)
metric.update_state(y_true, y_pred)
result = metric.result()
result.numpy()

array([1.       , 0.6666667, 1.       ], dtype=float32)