# Base Vector Caching (参照データのベクトル化と保存)
リアルタイム推論を実現するため、判定の基準となるデータセット（Safe/Out）を事前にBERTでベクトル化し、高速に読み込み可能なPickle形式で保存します。これにより、デモ実行時の待機時間を大幅に短縮します。

## 1. Setup & Model Loading
環境構築と、特徴量抽出に使用する事前学習済みBERTモデルの準備を行います。

In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import torch
from transformers import BertTokenizer, BertModel
import os
import pickle

# Pick the compute device first: use the GPU when CUDA is available,
# otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Load the pretrained BERT tokenizer (inputs capped at 512 tokens) and encoder.
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    model_max_length=512,
)
model = BertModel.from_pretrained('bert-base-uncased')

# Move the encoder weights onto the selected device.
model.to(device)

## 2. Mean Pooling Vectorizer
文全体の意味を凝縮した分散表現を得るため、BERTの各トークン出力をattention maskで重み付けして平均化する（パディングトークンは平均から除外する）Mean Poolingを実装します。

In [None]:
def _mean_pool(token_embeddings, attention_mask):
    """Average token embeddings along dim 1, weighted by ``attention_mask``.

    ``token_embeddings`` is expected to carry one more trailing dimension than
    ``attention_mask`` (e.g. (batch, seq_len, hidden) vs. (batch, seq_len)).
    """
    # Add a trailing axis so the mask broadcasts across the embedding dimension.
    mask = attention_mask.unsqueeze(-1).float()
    summed = torch.sum(token_embeddings * mask, 1)
    # Clamp the per-row mask total so an all-zero mask cannot divide by zero.
    counts = torch.clamp(mask.sum(1), min=1e-9)
    return summed / counts


def get_vectors(file, batch_size=64):
    """Read ``<file>.csv`` and embed its ``cleaned_text`` column with BERT.

    Texts are tokenized in batches (truncated/padded, at most 512 tokens),
    passed through the module-level ``model`` on ``device``, and reduced to one
    vector per text with attention-mask-weighted mean pooling.

    Args:
        file: Path of the CSV file without the ``.csv`` extension. The file
            must contain the columns ``cleaned_text``, ``severe_toxic``,
            ``obscene``, ``threat``, ``insult`` and ``identity_hate``.
        batch_size: Number of texts sent through the model per forward pass.
            Defaults to 64.

    Returns:
        A 7-tuple ``(vectors, texts, severe, obscene, threat, insult,
        identity)``. ``vectors`` is a list with one NumPy array per row;
        the remaining items are plain lists taken from the matching columns.

    Raises:
        ValueError: If ``batch_size`` is not positive.

    Side effects:
        Switches the module-level ``model`` to evaluation mode.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    df_workbook = pd.read_csv(file + '.csv')
    # Force every entry to str so the tokenizer never receives a non-string.
    # NOTE(review): missing values presumably become the literal text "nan"
    # here rather than an empty string -- confirm this is intended.
    df_workbook['cleaned_text'] = df_workbook['cleaned_text'].astype(str)

    texts = df_workbook['cleaned_text'].tolist()
    severe = df_workbook['severe_toxic'].tolist()
    obscene = df_workbook['obscene'].tolist()
    threat = df_workbook['threat'].tolist()
    insult = df_workbook['insult'].tolist()
    identity = df_workbook['identity_hate'].tolist()

    vectors = []
    model.eval()

    # Inference only: skip gradient tracking.
    with torch.no_grad():
        for start in tqdm(range(0, len(texts), batch_size)):
            # Slice the already-materialised list instead of re-indexing the
            # DataFrame column on every iteration.
            batch_texts = texts[start:start + batch_size]
            encoded = tokenizer(
                batch_texts,
                truncation=True,
                padding=True,
                max_length=512,
                return_tensors='pt',
            ).to(device)
            outputs = model(**encoded)
            # outputs[0] is the first element of the model output: the
            # per-token embeddings that get pooled into one vector per text.
            batch_vectors = _mean_pool(outputs[0], encoded['attention_mask'])
            # Extending with a 2-D array appends one row (vector) per text.
            vectors.extend(batch_vectors.cpu().numpy())

    return vectors, texts, severe, obscene, threat, insult, identity

## 3. Data Persistence (Pickle Serialization)
抽出したベクトルと、それに対応するテキスト・ラベルを、それぞれ別のPickleファイル（バイナリ形式）として保存します。CSVと異なり、型情報を保持したまま高速にロードすることが可能です。

In [None]:
# Embed each of the four CSV splits. Every call returns the vectors, the raw
# texts and the five label columns, in that order.
(
    safe_test_vec,
    safe_test_text,
    safe_test_severe,
    safe_test_obscene,
    safe_test_threat,
    safe_test_insult,
    safe_test_identity,
) = get_vectors("test_safe")
(
    out_test_vec,
    out_test_text,
    out_test_severe,
    out_test_obscene,
    out_test_threat,
    out_test_insult,
    out_test_identity,
) = get_vectors("test_out")
(
    safe_train_vec,
    safe_train_text,
    safe_train_severe,
    safe_train_obscene,
    safe_train_threat,
    safe_train_insult,
    safe_train_identity,
) = get_vectors("train_safe")
(
    out_train_vec,
    out_train_text,
    out_train_severe,
    out_train_obscene,
    out_train_threat,
    out_train_insult,
    out_train_identity,
) = get_vectors("train_out")

# Merge the test and train splits of each category (test rows come first).
safe_vec_all = [*safe_test_vec, *safe_train_vec]
out_vec_all = [*out_test_vec, *out_train_vec]
safe_txt_all = [*safe_test_text, *safe_train_text]
out_txt_all = [*out_test_text, *out_train_text]
safe_severe_all = [*safe_test_severe, *safe_train_severe]
out_severe_all = [*out_test_severe, *out_train_severe]
safe_obscene_all = [*safe_test_obscene, *safe_train_obscene]
out_obscene_all = [*out_test_obscene, *out_train_obscene]
safe_threat_all = [*safe_test_threat, *safe_train_threat]
out_threat_all = [*out_test_threat, *out_train_threat]
safe_insult_all = [*safe_test_insult, *safe_train_insult]
out_insult_all = [*out_test_insult, *out_train_insult]
safe_identity_all = [*safe_test_identity, *safe_train_identity]
out_identity_all = [*out_test_identity, *out_train_identity]

# Payload 1: the sentence vectors only.
vec_data = {
    'safe_vec_all': safe_vec_all,
    'out_vec_all': out_vec_all,
}

# Payload 2: the texts and their label columns.
texts_data = {
    'safe_txt_all': safe_txt_all,
    'out_txt_all': out_txt_all,
    'safe_severe_all': safe_severe_all,
    'out_severe_all': out_severe_all,
    'safe_obscene_all': safe_obscene_all,
    'out_obscene_all': out_obscene_all,
    'safe_threat_all': safe_threat_all,
    'out_threat_all': out_threat_all,
    'safe_insult_all': safe_insult_all,
    'out_insult_all': out_insult_all,
    'safe_identity_all': safe_identity_all,
    'out_identity_all': out_identity_all,
}

# Serialize both payloads to binary pickle files in the working directory.
with open("bert_vectors_data.pkl", 'wb') as f:
    pickle.dump(vec_data, f)
with open("texts_label_data.pkl", 'wb') as f:
    pickle.dump(texts_data, f)