ingest.py

import json
import os
import sys
import time

import faiss
import numpy as np
import openai
import requests

import config

openai.api_key = config.OPENAI_KEY

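# config.py is assumed to sit alongside this script and to define OPENAI_KEY.
# A hypothetical, minimal example (the value shown is obviously a placeholder):
#
#   OPENAI_KEY = "sk-..."
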
def initialize_loading():
    """
    Create the indices/ directory structure and load (or create) the current
    vector index and its schema.
    """
    # create indices/ if it doesn't exist
    if not os.path.exists("indices"):
        os.makedirs("indices")

    first_start = False

    # create indices/current.json if it doesn't exist
    if not os.path.exists("indices/current.json"):
        first_start = True
        with open("indices/current.json", "w") as f:
            json.dump({"index": 1}, f)

    with open("indices/current.json", "r") as f:
        current_index = json.load(f)["index"]

    # on first start, or when "--new" is passed, move on to a fresh index number
    if first_start or "--new" in sys.argv:
        if not first_start:
            current_index = current_index + 1
        with open("indices/current.json", "w") as f:
            json.dump({"index": current_index}, f)

    # create indices/<current_index>/ if it doesn't exist
    if not os.path.exists("indices/" + str(current_index)):
        os.makedirs("indices/" + str(current_index))

    # if first start or "--new" is an argument, start from an empty index
    if first_start or "--new" in sys.argv:
        vector_index = faiss.IndexFlatL2(1536)  # ada-002 embeddings are 1536-dimensional
        schema = []
    else:
        # otherwise open the most recent index and its schema
        print("Opening index " + str(current_index))
        vector_index = faiss.read_index(
            "indices/" + str(current_index) + "/main_vector_index.bin"
        )
        with open("indices/" + str(current_index) + "/main_schema.json", "r") as f:
            schema = json.load(f)

    return vector_index, schema, current_index

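# Based on the paths used above, the resulting on-disk layout looks like:
#
#   indices/
#     current.json                # {"index": <n>} -- number of the active index
#     <n>/
#       main_vector_index.bin     # FAISS index written by save_index_and_schema()
#       main_schema.json          # list of document dicts, aligned with the vectors
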
def exponential_backoff(embedding_text: str, max_retries: int = 5):
    """
    Request an embedding, retrying with exponential backoff if the request
    fails due to rate limiting.
    """
    num_retries = 0
    while num_retries < max_retries:
        try:
            return openai.Embedding.create(
                input=embedding_text, model="text-embedding-ada-002"
            )
        except openai.error.RateLimitError:
            # back off exponentially: 1s, 2s, 4s, ...
            time.sleep(2 ** num_retries)
            num_retries += 1
            print(f"Rate limited, retrying {num_retries} of {max_retries}...")
    raise requests.exceptions.ConnectionError("Could not connect to OpenAI API")

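# Note: the call above uses the pre-1.0 openai Python SDK. If this were ported
# to openai>=1.0, a rough equivalent (a sketch, not tested here) would be:
#
#   from openai import OpenAI
#   client = OpenAI(api_key=config.OPENAI_KEY)
#   response = client.embeddings.create(
#       input=embedding_text, model="text-embedding-ada-002"
#   )
#   embedding = response.data[0].embedding
#
# with openai.RateLimitError as the exception to catch instead of
# openai.error.RateLimitError.
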
def save_index_and_schema(
    vector_index, schema: list, current_index: int, stage: str = "main"
):
    """
    Save the vector index and schema to disk under indices/<current_index>/.
    """
    print("Saving index and schema for stage " + stage)
    path_prefix = "indices/" + str(current_index) + "/" + stage
    faiss.write_index(vector_index, path_prefix + "_vector_index.bin")
    with open(path_prefix + "_schema.json", "w") as f:
        json.dump(schema, f)

def get_embedding(vector_index, document: dict, schema: list = []):
    """
    Get an embedding for a document chunk and add it to the vector index and
    the schema list.
    """
    response = exponential_backoff(document["text"])
    embedding = response["data"][0]["embedding"]
    # faiss expects float32 vectors with shape (n, d)
    vector_index.add(np.array([embedding], dtype="float32").reshape(1, 1536))
    schema.append(document)
    time.sleep(1)  # crude rate limiting between embedding calls
    return vector_index, schema

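# Because IndexFlatL2 assigns vector ids sequentially, schema[i] holds the
# metadata for the i-th vector in the index; the two stay aligned as long as
# they are only ever appended to together, as above.
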
def index_pending(
    vector_index,
    current_index: int,
    schema: list = [],
    chunking_mechanism: str = "words",
    word_count: int = 750,
) -> tuple:
    """
    Index all pending documents in pending_indexing/*.json.

    Chunking mechanism can be either "words" or "paragraphs".

    "words" will split documents into chunks of at most word_count words.
    "paragraphs" will split documents into paragraphs.

    "paragraphs" is recommended for documents whose paragraphs hold a lot of
    context. If key context is not available at the paragraph level of a
    document -- as may be the case for a wiki page, for instance -- "words"
    is recommended.
    """
    # if there is nothing to index, return unchanged
    if not os.path.exists("pending_indexing"):
        return vector_index, schema

    if not os.path.exists("indexed_docs"):
        os.mkdir("indexed_docs")

    for file in os.listdir("pending_indexing"):
        if file.endswith(".json"):
            with open("pending_indexing/" + file, "r") as f:
                data = json.load(f)

            sys.stdout.write(f"Indexing {file}\n")
            sys.stdout.flush()

            document_text = data["text"].split(" ")

            # divide the document into chunks of at most word_count words
            if chunking_mechanism == "words":
                chunks = [
                    " ".join(document_text[i : i + word_count])
                    for i in range(0, len(document_text), word_count)
                ]
            elif chunking_mechanism == "paragraphs":
                chunks = data["text"].split("\n\n")
            else:
                raise ValueError("Invalid chunking mechanism.")

            # every key except "text" is carried over as metadata on each chunk
            other_metadata = data.copy()
            del other_metadata["text"]

            fully_assembled_docs = []
            for chunk in chunks:
                fully_assembled_docs.append({"text": chunk, **other_metadata})

            print("  Indexing in " + str(len(chunks)) + " chunks")

            for doc in fully_assembled_docs:
                vector_index, schema = get_embedding(vector_index, doc, schema)

            save_index_and_schema(vector_index, schema, current_index, stage="main")

            # move the source file so it is not indexed again on the next run
            os.rename("pending_indexing/" + file, "indexed_docs/" + file)

    return vector_index, schema

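# Each file in pending_indexing/ is expected to be a JSON object with at least
# a "text" key; every other key is copied onto each chunk as metadata. A
# hypothetical example (the "url" and "title" fields are illustrative, not
# required by this script):
#
#   {
#       "text": "Full text of the document to be indexed...",
#       "url": "https://example.com/post",
#       "title": "An example post"
#   }
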
if __name__ == "__main__":
    vector_index, schema, current_index = initialize_loading()
    vector_index, schema = index_pending(
        vector_index, current_index, schema, chunking_mechanism="words"
    )