In [1]:
import os

# Make HTTPS clients trust the CA certificate installed by the Netskope agent.
# The path is specific to a macOS machine running that agent, so the variables
# are only exported when the file is really there: pointing them at a missing
# bundle would make certificate verification fail on every other machine.
NETSKOPE_CA_BUNDLE = (
    "/Library/Application Support/Netskope/STAgent/data/nscacert.pem"
)

if os.path.isfile(NETSKOPE_CA_BUNDLE):
    for ca_env_var in ("REQUESTS_CA_BUNDLE", "SSL_CERT_FILE"):
        os.environ[ca_env_var] = NETSKOPE_CA_BUNDLE

In [2]:
import pprint
import re
from pathlib import Path

import pandas as pd
from keyphrase_extractors import (
    BaseExtractor,
    EmbeddingModel,
    EmbeddingPrompts,
    KeyBERTBasedExtractor,
    PKEBasedExtractor,
)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
input_text_filepath = Path(
    "../data/不規則な時系列データのモデリング - Neural CDEs の理論の導入部と実装.md"
)

# The article is Japanese text: decode it explicitly as UTF-8 instead of
# relying on the platform's default encoding.
input_text = input_text_filepath.read_text(encoding="utf-8")

# Replace Markdown links "[label](url)" by their label. Both http and https
# are accepted so that this step agrees with the bare-URL removal below;
# otherwise an http link would be left behind as a dangling "[label](".
input_text = re.sub(r"\[(.+?)\]\(https?://[^\)]+\)", r"\1", input_text)
# Drop any remaining bare URLs (everything up to the next whitespace).
input_text = re.sub(r"https?://[^\s]+", "", input_text)
print(input_text)

# 不規則な時系列データのモデリング - Neural CDEs の理論の導入部と実装 - ABEJA Tech Blog

こんにちは！ABEJAでデータサイエンティストをしている藤原です。今年の4月に新卒で入社しました！

個人的な趣味で、少し前から「Neural Differential Equations」という分野の勉強を少しずつしているのですが、その中で「Neural Controlled Differential Equations」という研究が面白いなと感じました。そこで、理論の勉強だけじゃなく実際に動かしてみよう！と思い、今回は「Neural Controlled Differential Equations」について、前半では理論の導入部を、後半では具体的な実装と併せて紹介します。

今回の説明や実験で使っている実装の一式は GitHub - flatton/using_NeuralCDE_with_diffrax にて公開しています。

# 目次

- 目次
- はじめに
- 前提知識の補足：微分方程式（Differential Equations）
- Neural Differential Equations; Neural DEs
    - Neural DEs とは？
    - 微分方程式とニューラルネットワークの関係
    - Neural DEs のメリット・デメリット
- Neural Controlled Differential Equations; Neural CDEs
    - Neural CDEs とは？ 〜Neural ODEs との違い〜
    - RNNとの関係
    - Neural CDEs の特徴
- 実装を交えた説明
    - 補間（Interpolation）
    - Neural CDEs
        - 初期値を生成するMLP
        - ベクトル場をパラメータ化したMLP
        - 隠れ状態から出力値を生成するための全結合層
        - Neural CDEs 全体の実装について
    - Neural CDEs をオイラー陽解法で離散化した RNNライクなニューラルネットワーク
        - RNNライクな構造のニューラルネットワークブロ

# Base Class

In [4]:
# Instantiate the base extractor only to obtain its stop-word list; the same
# list is handed to both the PKE-based and the KeyBERT-based extractors below.
base = BaseExtractor()
stop_words = base.stop_words

In [5]:
# Accumulator for the method comparison: later cells add one column of
# keyphrases per extraction method before it is written to ./01_result.csv.
result_df = pd.DataFrame()

# PKE

In [6]:
# PKE-based extractor, configured with the stop words taken from BaseExtractor.
extractor_pke = PKEBasedExtractor(stop_words=stop_words)

In [7]:
# Request up to 30 keyphrases for the article from the PKE-based extractor.
# NOTE(review): max_characters=None presumably disables input truncation —
# confirm against keyphrase_extractors.
keyphrases = extractor_pke.get_keyphrase(
    input_text=input_text, top_n_phrases=30, max_characters=None
)
# Only the first element of the returned value is stored as the "PKE" column.
# NOTE(review): presumably this is the list of phrases — confirm.
result_df["PKE"] = keyphrases[0]
display(result_df)

Unnamed: 0,PKE
0,NeuralCDEs
1,データサイエンティスト
2,微分方程式
3,ニューラルネットワーク
4,NeuralDifferentialEquations
5,データ
6,NeuralDEs
7,モデル
8,RNNライク
9,stepsize


# KeyBERT

In [9]:
embedding_model_config = EmbeddingModel(
    name="cl-nagoya/ruri-base",
    prompts=EmbeddingPrompts(query="クエリ: ", passage="文章: "),
)

extractor_keybert = KeyBERTBasedExtractor(
    model_config=embedding_model_config,
    stop_words=stop_words,
    batchsize=32,
    show_progress_bar=False,
)

# The four runs differ only in these two switches. The dict key becomes the
# suffix of the result column, and the insertion order fixes the column order.
keybert_variants = {
    "vanilla": {"filter_sentences": False, "phrasing": False},
    "phrasing": {"filter_sentences": False, "phrasing": True},
    "filtering_sentence": {"filter_sentences": True, "phrasing": False},
    "phrasing_and_filtering_sentence": {"filter_sentences": True, "phrasing": True},
}

for variant_name, variant_options in keybert_variants.items():
    keyphrases = extractor_keybert.get_keyphrase(
        input_text=input_text,
        max_characters=None,
        diversity_mode="normal",  # "normal", "use_maxsum", "use_mmr"
        top_n_phrases=30,
        max_filtered_phrases=30,
        max_filtered_sentences=30,
        threshold=None,
        nr_candidates=30,
        diversity=0.7,
        **variant_options,
    )
    # As for PKE, only the first element of the returned value is kept.
    result_df[f"{embedding_model_config.name}_{variant_name}"] = keyphrases[0]
display(result_df)

Unnamed: 0,PKE,cl-nagoya/ruri-base_vanilla,cl-nagoya/ruri-base_phrasing,cl-nagoya/ruri-base_filtering_sentence,cl-nagoya/ruri-base_phrasing_and_filtering_sentence
0,NeuralCDEs,NeuralCDE,"""NeuralOrdinaryDifferentialEquations""",不規則,時系列データ
1,データサイエンティスト,ニューラル,"""Neuralcontrolleddifferentialequationsforirreg...",時系列,NeuralControlledDifferentialEquations
2,微分方程式,深層学習,|離散化NeuralCDE,理論,不規則
3,ニューラルネットワーク,学習,"""Neuralordinarydifferentialequations",Equations,時系列
4,NeuralDifferentialEquations,Training,論文NeuralOrdinaryDifferentialEquations,勉強,NeuralDifferentialEquations
5,データ,勉強,"""Onneuraldifferentialequations",モデリング,理論
6,NeuralDEs,線形,離散化NeuralCDEs,導入部,NeuralCDEs
7,モデル,CDEs,離散化NeuralCDE,CDEs,NeuralDEs
8,RNNライク,微分,"""Neuralcontrolleddifferentialequationsforonlin...",今回,勉強
9,stepsize,基礎,NeuralControlledDifferentialEquations,ニューラル,導入部


In [10]:
# Persist the method comparison; index=False leaves the row index out of the
# file. The last cell of the notebook converts every *.csv in this directory
# into a Markdown table.
result_df.to_csv("./01_result.csv", index=False)

In [11]:
# Embedding models compared in the next cell, identified by model name.
# Entries without `prompts` use whatever EmbeddingModel defaults to; the others
# declare explicit query/passage prefixes. Commented-out entries are excluded
# from the comparison.
embedding_models: list[EmbeddingModel] = [
    # EmbeddingModel(name="BAAI/bge-m3"),
    EmbeddingModel(name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"),
    EmbeddingModel(name="sentence-transformers/paraphrase-multilingual-mpnet-base-v2"),
    # EmbeddingModel(name="Alibaba-NLP/gte-multilingual-base"),
    # The recorded output of the comparison cell shows that no
    # sentence-transformers model is found for the two JaColBERT names, so a
    # new model with mean pooling is created for them.
    EmbeddingModel(name="bclavie/JaColBERTv2"),
    EmbeddingModel(name="answerdotai/JaColBERTv2.5"),
    # Models configured with English "query: " / "passage: " prefixes.
    EmbeddingModel(
        name="intfloat/multilingual-e5-large",
        prompts=EmbeddingPrompts(query="query: ", passage="passage: "),
    ),
    EmbeddingModel(
        name="intfloat/multilingual-e5-base",
        prompts=EmbeddingPrompts(query="query: ", passage="passage: "),
    ),
    EmbeddingModel(
        name="pkshatech/GLuCoSE-base-ja-v2",
        prompts=EmbeddingPrompts(query="query: ", passage="passage: "),
    ),
    EmbeddingModel(
        name="pkshatech/RoSEtta-base-ja",
        prompts=EmbeddingPrompts(query="query: ", passage="passage: "),
    ),
    # Models configured with Japanese prefixes.
    EmbeddingModel(
        name="cl-nagoya/ruri-large",
        prompts=EmbeddingPrompts(query="クエリ: ", passage="文章: "),
    ),
    EmbeddingModel(
        name="cl-nagoya/ruri-base",
        prompts=EmbeddingPrompts(query="クエリ: ", passage="文章: "),
    ),
]

In [14]:
# Run the same KeyBERT configuration (sentence filtering and phrasing both
# enabled) once per embedding model and collect one column per model.
#
# The columns are gathered in a dict and the frame is built once at the end.
# Wrapping each result in a Series lets pandas pad shorter columns with NaN,
# whereas assigning plain lists column by column raises ValueError as soon as
# one model yields a different number of phrases than the first one.
keyphrases_by_model = {}
for embedding_model_config in embedding_models:
    extractor_keybert = KeyBERTBasedExtractor(
        model_config=embedding_model_config,
        stop_words=stop_words,
        batchsize=32,
        show_progress_bar=False,
    )

    keyphrases = extractor_keybert.get_keyphrase(
        input_text=input_text,
        max_characters=None,
        diversity_mode="normal",  # "normal", "use_maxsum", "use_mmr"
        top_n_phrases=30,
        max_filtered_phrases=30,
        max_filtered_sentences=30,
        threshold=None,
        nr_candidates=30,
        diversity=0.7,
        filter_sentences=True,
        phrasing=True,
    )
    # Only the first element of the returned value is kept, as in the cells above.
    keyphrases_by_model[f"{embedding_model_config.name}"] = pd.Series(keyphrases[0])

compare_models_df = pd.DataFrame(keyphrases_by_model)
display(compare_models_df)

No sentence-transformers model found with name bclavie/JaColBERTv2. Creating a new one with mean pooling.
No sentence-transformers model found with name answerdotai/JaColBERTv2.5. Creating a new one with mean pooling.
Some weights of the model checkpoint at answerdotai/JaColBERTv2.5 were not used when initializing BertModel: ['linear.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Unnamed: 0,sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2,sentence-transformers/paraphrase-multilingual-mpnet-base-v2,bclavie/JaColBERTv2,answerdotai/JaColBERTv2.5,intfloat/multilingual-e5-large,intfloat/multilingual-e5-base,pkshatech/GLuCoSE-base-ja-v2,pkshatech/RoSEtta-base-ja,cl-nagoya/ruri-large,cl-nagoya/ruri-base
0,NeuralControlledDifferentialEquations,NeuralControlledDifferentialEquations,NeuralControlledDifferentialEquations,微分方程式,NeuralCDEs,NeuralCDEs,ABEJATechBlog,微分方程式,時系列データ,時系列データ
1,NeuralDifferentialEquations,NeuralDifferentialEquations,Differential,tools,ABEJATechBlog,NeuralDEs,NeuralControlledDifferentialEquations,詳しく,時系列,NeuralControlledDifferentialEquations
2,"""NeuralOrdinaryDifferentialEquations""",NeuralCDEs,ControlledDifferentialEquations,NeuralDEs,CDEs,ABEJATechBlog,Blog,微分,NeuralControlledDifferentialEquations,不規則
3,NeuralCDEs,Neural,Equations,NeuralControlledDifferentialEquations,NeuralDifferentialEquations,時系列データ,実装,ベクトル場,データ,時系列
4,Neural,実装,NeuralDEs,Diffrax,NeuralDEs,NeuralControlledDifferentialEquations,Equations,説明,理論,NeuralDifferentialEquations
5,NeuralDEs,勉強,OrdinaryDifferentialEquations,方程式,時系列データ,モデリング,NeuralCDEs,方程式,NeuralDEs,理論
6,Differential,理論,Controlled,DEs,NeuralControlledDifferentialEquations,離散化NeuralCDEs,微分方程式,具体的,実際,NeuralCDEs
7,NeuralDEs全体,時系列データ,Neural,Differential,ABEJA,Equations,GitHub,実装,NeuralDifferentialEquations,NeuralDEs
8,Equations,"""NeuralOrdinaryDifferentialEquations""",導入部,NeuralCDEs,NeuralCDEs全体,実装方法,ABEJA,CubicInterpolation,不規則,勉強
9,論文,研究,NeuralDifferentialEquations,参考,実装方法,Neural,具体的,NeuralControlledDifferentialEquations,導入部,導入部


In [15]:
# Persist the per-model comparison; index=False leaves the row index out of
# the file.
compare_models_df.to_csv("./02_compare_models.csv", index=False)

In [19]:
import csv
from collections.abc import Iterable, Sequence


def rows_to_markdown_table(rows: Iterable[Sequence[str]]) -> str:
    """Render CSV rows as a Markdown pipe table.

    The first row is treated as the header and is followed by a ``---``
    separator row with one column per header cell.

    Args:
        rows: Rows of string cells, e.g. a ``csv.reader`` object.

    Returns:
        The table as text with one line per row, each terminated by a
        newline; an empty string when ``rows`` yields nothing.
    """
    table_lines = []
    for row_index, row in enumerate(rows):
        # "|" delimits columns and a line break ends a table row, so both
        # have to be neutralised inside a cell or the table falls apart
        # (the extracted keyphrases do contain "|").
        safe_cells = (
            cell.replace("|", "\\|").replace("\r\n", "<br>").replace("\n", "<br>")
            for cell in row
        )
        table_lines.append("|" + "".join(f" {cell} |" for cell in safe_cells))
        if row_index == 0:
            table_lines.append("|" + " --- |" * len(row))
    return "".join(line + "\n" for line in table_lines)


# Convert every CSV in the working directory into a Markdown table stored
# next to it as hatena_<stem>.txt.
for filepath in Path("./").glob("*.csv"):
    # newline="" is what the csv module asks for when opening files, and the
    # CSVs hold Japanese text, so the encoding is stated explicitly.
    with filepath.open(mode="r", encoding="utf-8", newline="") as f:
        res = rows_to_markdown_table(csv.reader(f, delimiter=","))

    output_filepath = filepath.parent / (f"hatena_{filepath.stem}.txt")
    with output_filepath.open(mode="w", encoding="utf-8") as f:
        f.write(res)