# ResearChainを使った自動研究のデモ
- https://miro.com/app/board/uXjVK44N9U8=/
- 上記のアーキテクチャの一部を実装

<a href="https://colab.research.google.com/github/auto-res/researchchain/blob/develop-tanaka/examples/researchchain_demo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%pip install --upgrade -q researchgraph

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.4/50.4 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.3/2.3 MB[0m [31m19.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m367.8/367.8 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.8/295.8 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m866.9/866.9 kB[0m [31m18.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.4/76.4 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.9/77.9 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
from researchchain.llm_component.llm_component import LLMComponent
from researchchain.retriever_component.semantic_scholar import SemanticScholarRetriever
from researchchain.retriever_component.github import GithubRetriever

import os
import pprint
from google.colab import userdata
# NOTE(review): the install cell of this notebook installs `researchgraph`,
# while the imports above come from `researchchain` — confirm the installed
# distribution actually provides a `researchchain` module.

# Copy the OpenAI key returned by google.colab's `userdata.get` into the
# process environment.
os.environ["OPENAI_API_KEY"] = userdata.get('OPENAI_API_KEY')
# Keys for the other providers; presumably needed only when one of the
# Gemini / Claude model names below is selected — confirm.
#os.environ["GOOGLE_API_KEY"] = userdata.get('GOOGLE_API_KEY')
#os.environ["ANTHROPIC_API_KEY"] = userdata.get('ANTHROPIC_API_KEY')

# Model identifier passed as the first argument to every LLMComponent call in
# the cells below. The commented-out lines are alternative choices.
llm_name = "gpt-4o-2024-08-06"
#llm_name = "gpt-4o-2024-05-13"
#llm_name =  "gpt-4o-mini-2024-07-18"
#llm_name = "gpt-4-turbo-2024-04-09"
#llm_name = "gpt-4-0125-preview"
#llm_name = "gemini-1.0-pro"
#llm_name = "gemini-1.5-pro"
#llm_name = "gemini-1.5-flash"
#llm_name = "claude-3-5-sonnet-20240620"
#llm_name = "claude-3-opus-20240229"

# Memory

In [None]:
# Initial shared state: each component below receives this dict and the value
# it returns is bound back to `memory`.
memory = {
    # Description of the experimental environments that are available.
    "environment": """
    The following two experimental environments are available
    ・Fine tuning of the LLM and experiments with rewriting the Optimizer or loss function.
    ・Verification of the accuracy of prompt engineering.
    """,
    # Research objective to pursue.
    "objective": """
    Batch Size Grokking: Assessing the impact of the training batchsize on the grokking phenomenon. Modify the experiments to dynamically adjust the batch size during training, starting with a small batch size and gradually increasing it. This could potentially lead to faster generalization on the validation set.
    """,
}

# 各Componentの処理

- Keyworder1

In [None]:
# Configuration for Keyworder1.
# "input", "output" and "prompt" each hold two entries; they appear to be
# index-aligned (entry i of "prompt" is filled from the keys in entry i of
# "input" and yields the keys in entry i of "output") — confirm against
# LLMComponent. The {placeholders} in each prompt match the key names listed
# in the corresponding "input" entry.
# The prompt text was corrected for typos ("ojbective", "in in JSON format",
# "JSON formatt"); each prompt is written as adjacent string literals that
# Python concatenates into one string.
json_data = {
    "input": [
        ["environment", "objective"],
        ["environment", "objective", "keywords_mid_thought_1"],
    ],
    "output": [
        ["keywords_mid_thought_1"],
        ["keywords_1"],
    ],
    "prompt": [
        # Stage 1: interpret the objective and draft keywords.
        (
            "<RULE>You have to think of a 5 KEYWORDs regarding academic search.</RULE>"
            "<RULE role=\"assistant\">There is an objective and limitation that we can handle, so you have to first interpret what the objective really means in keyword search.Answer should be surrounded with <keywords_mid_thought_1></keywords_mid_thought_1> tag.</RULE>"
            "<objective>\n{objective}\n</objective>"
            "<environment>\n{environment}\n</environment>"
            "<EOS></EOS>"
        ),
        # Stage 2: turn the draft into the final keyword list.
        (
            "<RULE>You have to think of a 5 KEYWORDs in JSON format.</RULE>"
            "<RULE role=\"assistant\">Read all the information and make a report in JSON format\n\n You have to write keyword ONLY <example>keywords_1: [\"LLM\", ...]</example>.Answer should be surrounded with <keywords_1></keywords_1> tag.</RULE>"
            "<objective>\n{objective}\n</objective>"
            "<environment>\n{environment}\n</environment>"
            "<keywords_mid_thought_1>\n{keywords_mid_thought_1}\n</keywords_mid_thought_1>"
            "<EOS></EOS>"
        ),
    ],
}
keyworder1 = LLMComponent(json_data=json_data)

# Run: call the component with the model name and the shared memory, bind the
# returned value back to `memory`, then pretty-print it.
memory = keyworder1(llm_name, memory)
pprint.pprint(memory, width=200)

*  Retriever1

In [None]:
# Configuration for the first paper retriever.
# Semantic Scholar API: https://www.semanticscholar.org/product/api
# (empty slot for an API key) os.environ[""] = userdata.get('')
output_variable = "collection_of_papers_1"
# "keywords_1" is the key Keyworder1 declares as its final output.
search_variable = "keywords_1"
save_dir = "/content/papers1/"
# Number of papers to retrieve in the search.
num_retrieve_paper = 5
# Number of keywords to search with.
num_keywords = 1
retriever1 = SemanticScholarRetriever(
    save_dir=save_dir,
    search_variable=search_variable,
    output_variable=output_variable,
    num_keywords=num_keywords,
    num_retrieve_paper=num_retrieve_paper,
)

# Run: bind the retriever's return value back to `memory` and pretty-print it.
memory = retriever1(memory)
pprint.pprint(memory, width=200)

* Selector1

In [None]:
# Configuration for Selector1.
# NOTE(review): unlike Keyworder1, "input"/"output" are flat lists here and
# "prompt" is a single string that contains only whitespace (prompt text still
# to be written) — confirm LLMComponent accepts this shape.
json_data = {
    "input": [
        "objective",
        "environment",
        "collection_of_papers_1",
    ],
    "output": [
        "selected_paper_1",
    ],
    "prompt": """
    """,
}
selector1 = LLMComponent(json_data=json_data)

# Run: bind the component's return value back to `memory` and pretty-print it.
memory = selector1(llm_name, memory)
pprint.pprint(memory, width=200)

- Extractor1

In [None]:
# Configuration for Extractor1 (placeholder): no input or output keys are
# declared yet and the prompt contains only whitespace.
json_data = {
    "input": [],
    "output": [],
    "prompt": """
    """,
}
extractor1 = LLMComponent(json_data=json_data)

# Run: bind the component's return value back to `memory` and pretty-print it.
memory = extractor1(llm_name, memory)
pprint.pprint(memory, width=200)

* GithubRetriever1

In [None]:
# Configuration for the first GitHub retriever.
# NOTE(review): no cell shown in this notebook declares 'github_url_1' as an
# output (Extractor1's output list is empty) — confirm where it gets set
# before running this cell.
output_variable = ['folder_structure_1', 'github_file_1']
search_variable = 'github_url_1'
save_dir = "/content/repository1/"
githubretriever1 = GithubRetriever(
    save_dir=save_dir,
    search_variable=search_variable,
    output_variable=output_variable,
)

# Run: bind the retriever's return value back to `memory` and pretty-print it.
memory = githubretriever1(memory)
pprint.pprint(memory, width=200)

- CodeExtractor1

In [None]:
# Configuration for CodeExtractor1 (placeholder): no input or output keys are
# declared yet and the prompt contains only whitespace.
json_data = {
    "input": [],
    "output": [],
    "prompt": """
    """,
}
codeextractor1 = LLMComponent(json_data=json_data)

# Run: bind the component's return value back to `memory` and pretty-print it.
memory = codeextractor1(llm_name, memory)
pprint.pprint(memory, width=200)

- Decomposer1

In [None]:
# Configuration for Decomposer1 (placeholder): no input or output keys are
# declared yet and the prompt contains only whitespace.
json_data = {
    "input": [],
    "output": [],
    "prompt": """
    """,
}
decomposer1 = LLMComponent(json_data=json_data)

# Run: bind the component's return value back to `memory` and pretty-print it.
memory = decomposer1(llm_name, memory)
pprint.pprint(memory, width=200)

- Keyworder2

In [None]:
# Configuration for Keyworder2 (placeholder): no input or output keys are
# declared yet and the prompt contains only whitespace.
json_data = {
    "input": [],
    "output": [],
    "prompt": """
    """,
}
keyworder2 = LLMComponent(json_data=json_data)

# Run: bind the component's return value back to `memory` and pretty-print it.
memory = keyworder2(llm_name, memory)
pprint.pprint(memory, width=200)

- Retriever2

In [None]:
# Configuration for the second paper retriever.
# Semantic Scholar API: https://www.semanticscholar.org/product/api
# (empty slot for an API key) os.environ[""] = userdata.get('')
# NOTE(review): no cell shown in this notebook declares "keywords_2" as an
# output (Keyworder2's output list is empty) — confirm where it gets set
# before running this cell.
output_variable = "collection_of_papers_2"
search_variable = "keywords_2"
save_dir = "/content/papers2/"
# Number of papers to retrieve in the search.
num_retrieve_paper = 5
# Number of keywords to search with.
num_keywords = 1
retriever2 = SemanticScholarRetriever(
    save_dir=save_dir,
    search_variable=search_variable,
    output_variable=output_variable,
    num_keywords=num_keywords,
    num_retrieve_paper=num_retrieve_paper,
)

# Run: bind the retriever's return value back to `memory` and pretty-print it.
memory = retriever2(memory)
pprint.pprint(memory, width=200)

- Selector2

In [None]:
# Configuration for Selector2 (placeholder): no input or output keys are
# declared yet and the prompt contains only whitespace.
json_data = {
    "input": [],
    "output": [],
    "prompt": """
    """,
}
selector2 = LLMComponent(json_data=json_data)

# Run: bind the component's return value back to `memory` and pretty-print it.
memory = selector2(llm_name, memory)
pprint.pprint(memory, width=200)

- Extractor2

In [None]:
# Configuration for Extractor2 (placeholder): no input or output keys are
# declared yet and the prompt contains only whitespace.
json_data = {
    "input": [],
    "output": [],
    "prompt": """
    """,
}
extractor2 = LLMComponent(json_data=json_data)

# Run: bind the component's return value back to `memory` and pretty-print it.
memory = extractor2(llm_name, memory)
pprint.pprint(memory, width=200)

- GithubRetriever2

In [None]:
# Configuration for the second GitHub retriever.
# NOTE(review): no cell shown in this notebook declares 'github_url_2' as an
# output (Extractor2's output list is empty) — confirm where it gets set
# before running this cell.
output_variable = ['folder_structure_2', 'github_file_2']
search_variable = 'github_url_2'
save_dir = "/content/repository2/"
githubretriever2 = GithubRetriever(
    save_dir=save_dir,
    search_variable=search_variable,
    output_variable=output_variable,
)

# Run: bind the retriever's return value back to `memory` and pretty-print it.
memory = githubretriever2(memory)
pprint.pprint(memory, width=200)

- CodeExtractor2

In [None]:
# Configuration for CodeExtractor2 (placeholder): no input or output keys are
# declared yet and the prompt contains only whitespace.
json_data = {
    "input": [],
    "output": [],
    "prompt": """
    """,
}
codeextractor2 = LLMComponent(json_data=json_data)

# Run: bind the component's return value back to `memory` and pretty-print it.
memory = codeextractor2(llm_name, memory)
pprint.pprint(memory, width=200)

- Decomposer2

In [None]:
# Configuration for Decomposer2 (placeholder): no input or output keys are
# declared yet and the prompt contains only whitespace.
json_data = {
    "input": [],
    "output": [],
    "prompt": """
    """,
}
decomposer2 = LLMComponent(json_data=json_data)

# Run: bind the component's return value back to `memory` and pretty-print it.
memory = decomposer2(llm_name, memory)
pprint.pprint(memory, width=200)

- Creator

In [None]:
# Configuration for Creator (placeholder): no input or output keys are
# declared yet and the prompt contains only whitespace.
json_data = {
    "input": [],
    "output": [],
    "prompt": """
    """,
}
creator = LLMComponent(json_data=json_data)

# Run: bind the component's return value back to `memory` and pretty-print it.
memory = creator(llm_name, memory)
pprint.pprint(memory, width=200)

* Verifier1

In [None]:
# Configuration for verifier1 (placeholder): no input or output keys are
# declared yet and the prompt contains only whitespace.
json_data = {
    "input": [],
    "output": [],
    "prompt": """
    """,
}
verifier1 = LLMComponent(json_data=json_data)

# Run: bind the component's return value back to `memory` and pretty-print it.
memory = verifier1(llm_name, memory)
pprint.pprint(memory, width=200)

- Verifier2

In [None]:
# Configuration for Verifier2 (placeholder): no input or output keys are
# declared yet and the prompt contains only whitespace.
json_data = {
    "input": [],
    "output": [],
    "prompt": """
    """,
}
verifier2 = LLMComponent(json_data=json_data)

# Run: bind the component's return value back to `memory` and pretty-print it.
memory = verifier2(llm_name, memory)
pprint.pprint(memory, width=200)