# detailed_report.py
import asyncio
from fastapi import WebSocket
from gpt_researcher.master.actions import (
add_source_urls,
extract_headers,
table_of_contents,
)
from gpt_researcher.master.agent import GPTResearcher
from gpt_researcher.utils.enum import Tone


class DetailedReport:
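    """Orchestrates a detailed research report: an initial research pass by a
    parent GPTResearcher assistant, sequential subtopic reports that share a
    global context and visited-url set, and final assembly of the introduction,
    table of contents, and referenced report body.
    """
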
    def __init__(
        self,
        query: str,
        report_type: str,
        report_source: str,
        source_urls,
        config_path: str,
        tone: Tone,
        websocket: WebSocket,
        subtopics=None,
        headers=None
    ):
        self.query = query
        self.report_type = report_type
        self.report_source = report_source
        self.source_urls = source_urls
        self.config_path = config_path
        self.tone = tone
        self.websocket = websocket
        # Avoid mutable default arguments: fall back to fresh containers here
        self.subtopics = subtopics or []
        self.headers = headers or {}
        # The parent task assistant, using research_report as the default type
        self.main_task_assistant = GPTResearcher(
            query=self.query,
            report_type="research_report",
            report_source=self.report_source,
            source_urls=self.source_urls,
            config_path=self.config_path,
            tone=self.tone,
            websocket=self.websocket,
            headers=self.headers
        )
        self.existing_headers = []
        # Global store of the entire context accumulated through searching and scraping
        self.global_context = []
        # Global store of every url accumulated through searching and scraping
        self.global_urls = set(self.source_urls) if self.source_urls else set()

    async def run(self):
        # Conduct initial research using the main assistant
        await self._initial_research()
        # Get the list of all subtopics
        subtopics = await self._get_all_subtopics()
        # Generate the report introduction
        report_introduction = await self.main_task_assistant.write_introduction()
        # Generate the subtopic reports based on the subtopics gathered
        _, report_body = await self._generate_subtopic_reports(subtopics)
        # Construct the final list of visited urls
        self.main_task_assistant.visited_urls.update(self.global_urls)
        # Construct the final detailed report
        report = await self._construct_detailed_report(report_introduction, report_body)
        return report

    async def _initial_research(self):
        # Research with the main task assistant to gather content for generating subtopics
        await self.main_task_assistant.conduct_research()
        # Update the global context with the initial research results
        self.global_context = self.main_task_assistant.context
        # Update the global url set with the urls visited during initial research
        self.global_urls = self.main_task_assistant.visited_urls

    async def _get_all_subtopics(self) -> list:
        subtopics = await self.main_task_assistant.get_subtopics()
        return subtopics.dict().get("subtopics", [])

    async def _generate_subtopic_reports(self, subtopics: list) -> tuple:
        subtopic_reports = []
        subtopics_report_body = ""

        async def fetch_report(subtopic):
            subtopic_report = await self._get_subtopic_report(subtopic)
            return {"topic": subtopic, "report": subtopic_report}
        # The reports are generated sequentially rather than concurrently
        # (e.g. with asyncio.gather). A concurrent run would be faster, but
        # avoiding duplicate information requires extracting the headers of
        # each previous subtopic report and passing them into the next report
        # generation, which forces a sequential order.
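        # For reference, a concurrent variant would look roughly like the
        # hypothetical sketch below; it is left unused because
        # self.existing_headers must be updated between reports:
        #
        #     results = await asyncio.gather(
        #         *(fetch_report(subtopic) for subtopic in subtopics)
        #     )
        #     subtopic_reports = [r for r in results if r["report"]]
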
        for subtopic in subtopics:
            result = await fetch_report(subtopic)
            if result["report"]:
                subtopic_reports.append(result)
                subtopics_report_body += "\n\n\n" + result["report"]

        return subtopic_reports, subtopics_report_body

    async def _get_subtopic_report(self, subtopic: dict) -> str:
        current_subtopic_task = subtopic.get("task")
        subtopic_assistant = GPTResearcher(
            query=current_subtopic_task,
            report_type="subtopic_report",
            report_source=self.report_source,
            websocket=self.websocket,
            headers=self.headers,
            parent_query=self.query,
            subtopics=self.subtopics,
            visited_urls=self.global_urls,
            agent=self.main_task_assistant.agent,
            role=self.main_task_assistant.role,
            tone=self.tone,
        )
        # The subtopic research starts from the context gathered so far
        subtopic_assistant.context = list(set(self.global_context))
        # Conduct research on the subtopic
        await subtopic_assistant.conduct_research()
        # The headers gathered from previous subtopic reports are passed to write_report;
        # the LLM is instructed to avoid regenerating any information already covered by them
        subtopic_report = await subtopic_assistant.write_report(self.existing_headers)
        # Update the global context with what the subtopic research added
        self.global_context = list(set(subtopic_assistant.context))
        # Update the global url set with the urls the subtopic research visited
        self.global_urls.update(subtopic_assistant.visited_urls)
        # After a subtopic report has been generated, append its headers to
        # existing_headers for the next subtopic
        self.existing_headers.append(
            {
                "subtopic task": current_subtopic_task,
                "headers": extract_headers(subtopic_report),
            }
        )
        return subtopic_report

    async def _construct_detailed_report(self, introduction: str, report_body: str):
        # Generate a table of contents from the report headers
        toc = table_of_contents(report_body)
        # Concatenate all source urls at the end of the report
        report_with_references = add_source_urls(
            report_body, self.main_task_assistant.visited_urls
        )
        return f"{introduction}\n\n{toc}\n\n{report_with_references}"