In [8]:
import pandas as pd
import joblib
import pickle
import torch
import numpy as np
import sys
import os

sys.path.append(os.path.abspath('../model'))
sys.path.append(os.path.abspath('../utils'))
sys.path.append(os.path.abspath('../data'))

from tag_extraction import extract_tags, tag_mapping, filter_second_level_by_first_level
from data_preprocessing import process_data, TagDataset
from cnn_model import TextCNN

  from tqdm.autonotebook import tqdm, trange


In [9]:
test_data = pd.read_csv("/content/sample_submission.csv")

In [11]:
# Загрузка моделей
model_first_level = TextCNN(num_classes=30)
model_first_level.load_state_dict(torch.load('model_first_level.pth'))
model_first_level.eval()

model_second_level = TextCNN(num_classes=329)
model_second_level.load_state_dict(torch.load('model_second_level.pth'))
model_second_level.eval()

  model_first_level.load_state_dict(torch.load('model_first_level.pth'))
  model_second_level.load_state_dict(torch.load('model_second_level.pth'))


TextCNN(
  (conv1): Conv1d(768, 100, kernel_size=(3,), stride=(1,), padding=(1,))
  (conv2): Conv1d(768, 100, kernel_size=(4,), stride=(1,), padding=(2,))
  (conv3): Conv1d(768, 100, kernel_size=(5,), stride=(1,), padding=(2,))
  (dropout): Dropout(p=0.5, inplace=False)
  (fc): Linear(in_features=300, out_features=329, bias=True)
)

In [12]:
# Загрузка бинаризаторов и сопоставлений
mlb_first_level = joblib.load('mlb_first_level.pkl')
mlb_second_level = joblib.load('mlb_second_level.pkl')
tag_mapping = joblib.load('tag_mapping.pkl')

In [15]:
# test_data['first_level_tags'], test_data['second_level_tags'] = zip(*test_data['tags'].apply(extract_tags))
# process_data(test_data)

In [14]:
process_data(test_data)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/642 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/711M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/1.65M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



In [16]:
# Предсказания для первого уровня
model_first_level.eval()
predictions_first_level_test = []
with torch.no_grad():
    inputs_test = np.vstack(test_data['combined_vector'].values)
    outputs_first_level = model_first_level(torch.tensor(inputs_test, dtype=torch.float32))
    predictions_first_level_test.append((torch.sigmoid(outputs_first_level) > 0.2).cpu().numpy())

predictions_first_level_test = np.vstack(predictions_first_level_test)

# Предсказания для второго уровня
model_second_level.eval()
predictions_second_level_test = []
with torch.no_grad():
    outputs_second_level = model_second_level(torch.tensor(inputs_test, dtype=torch.float32))
    predictions_second_level_test.append((torch.sigmoid(outputs_second_level) > 0.4).cpu().numpy())

predictions_second_level_test = np.vstack(predictions_second_level_test)

# Фильтрация предсказаний второго уровня на основе первого уровня
filtered_predictions_second_level_test = filter_second_level_by_first_level(
    predictions_second_level_test,
    predictions_first_level_test,
    mlb_first_level,
    mlb_second_level,
    tag_mapping
)

predicted_first_level_tags_test = [
    ", ".join(mlb_first_level.classes_[pred.astype(bool)]) for pred in predictions_first_level_test
]

predicted_second_level_tags_test = []
for i, pred in enumerate(filtered_predictions_second_level_test):
    first_level_pred = mlb_first_level.classes_[predictions_first_level_test[i].astype(bool)]
    second_level_pred = mlb_second_level.classes_[pred.astype(bool)]

    # Создаем строки вида [Первый уровень: Второй уровень]
    combined_tags = []

    for fl_tag in first_level_pred:
        if fl_tag in tag_mapping:  # Если тег первого уровня есть в mapping
            # Получаем соответствующие теги второго уровня
            corresponding_second_level_tags = [f"{fl_tag}: {sl_tag}" for sl_tag in second_level_pred if sl_tag in tag_mapping[fl_tag]]
            combined_tags.extend(corresponding_second_level_tags)

            # Добавляем тег первого уровня, если у него нет соответствующих тегов второго уровня
            if not corresponding_second_level_tags:
                combined_tags.append(fl_tag)

    predicted_second_level_tags_test.append(combined_tags)

# Создание итогового DataFrame
result_test_df = pd.DataFrame({
    'video_id': test_data['video_id'].values,
    'predicted_tags': predicted_second_level_tags_test
})

# Устанавливаем video_id как индекс
result_test_df.set_index('video_id', inplace=True)

# Сохранение результата в CSV файл
result_test_df['predicted_tags'] = result_test_df['predicted_tags'].astype(str)
result_test_df[['predicted_tags']].to_csv('submission_test_data.csv')