Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added reddit loader #877

Merged
merged 5 commits into from
Mar 27, 2024
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ target/

# Jupyter Notebook
.ipynb_checkpoints
**/*.ipynb

# IPython
profile_default/
Expand Down
25 changes: 25 additions & 0 deletions application/parser/remote/reddit_loader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
from application.parser.remote.base import BaseRemote
from langchain_community.document_loaders import RedditPostsLoader


class RedditPostsLoaderRemote(BaseRemote):
def load_data(self, inputs):
data = eval(inputs)
client_id = data.get("client_id")
client_secret = data.get("client_secret")
user_agent = data.get("user_agent")
categories = data.get("categories", ["new", "hot"])
mode = data.get("mode", "subreddit")
search_queries = data.get("search_queries")
self.loader = RedditPostsLoader(

Check warning on line 14 in application/parser/remote/reddit_loader.py

View check run for this annotation

Codecov / codecov/patch

application/parser/remote/reddit_loader.py#L7-L14

Added lines #L7 - L14 were not covered by tests
client_id=client_id,
client_secret=client_secret,
user_agent=user_agent,
categories=categories,
mode=mode,
search_queries=search_queries,
number_posts=10,
siiddhantt marked this conversation as resolved.
Show resolved Hide resolved
)
documents = self.loader.load()
print(f"Loaded {len(documents)} documents from Reddit")
return documents

Check warning on line 25 in application/parser/remote/reddit_loader.py

View check run for this annotation

Codecov / codecov/patch

application/parser/remote/reddit_loader.py#L23-L25

Added lines #L23 - L25 were not covered by tests
10 changes: 6 additions & 4 deletions application/parser/remote/remote_creator.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,20 @@
from application.parser.remote.sitemap_loader import SitemapLoader
from application.parser.remote.crawler_loader import CrawlerLoader
from application.parser.remote.web_loader import WebLoader
from application.parser.remote.reddit_loader import RedditPostsLoaderRemote


class RemoteCreator:
loaders = {
'url': WebLoader,
'sitemap': SitemapLoader,
'crawler': CrawlerLoader
"url": WebLoader,
"sitemap": SitemapLoader,
"crawler": CrawlerLoader,
"reddit": RedditPostsLoaderRemote,
}

@classmethod
def create_loader(cls, type, *args, **kwargs):
loader_class = cls.loaders.get(type.lower())
if not loader_class:
raise ValueError(f"No LLM class found for type {type}")
return loader_class(*args, **kwargs)
return loader_class(*args, **kwargs)

Check warning on line 20 in application/parser/remote/remote_creator.py

View check run for this annotation

Codecov / codecov/patch

application/parser/remote/remote_creator.py#L20

Added line #L20 was not covered by tests
136 changes: 82 additions & 54 deletions application/worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,23 +15,27 @@
from application.parser.token_func import group_split

try:
nltk.download('punkt', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)
nltk.download("punkt", quiet=True)
nltk.download("averaged_perceptron_tagger", quiet=True)
except FileExistsError:
pass


# Define a function to extract metadata from a given filename.
def metadata_from_filename(title):
store = '/'.join(title.split('/')[1:3])
return {'title': title, 'store': store}
store = "/".join(title.split("/")[1:3])
return {"title": title, "store": store}

Check warning on line 27 in application/worker.py

View check run for this annotation

Codecov / codecov/patch

application/worker.py#L26-L27

Added lines #L26 - L27 were not covered by tests


# Define a function to generate a random string of a given length.
def generate_random_string(length):
return ''.join([string.ascii_letters[i % 52] for i in range(length)])
return "".join([string.ascii_letters[i % 52] for i in range(length)])

Check warning on line 32 in application/worker.py

View check run for this annotation

Codecov / codecov/patch

application/worker.py#L32

Added line #L32 was not covered by tests


current_dir = os.path.dirname(
os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
)

current_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

# Define the main function for ingesting and processing documents.
def ingest_worker(self, directory, formats, name_job, filename, user):
Expand Down Expand Up @@ -62,109 +66,133 @@
token_check = True
min_tokens = 150
max_tokens = 1250
full_path = directory + '/' + user + '/' + name_job
full_path = directory + "/" + user + "/" + name_job

Check warning on line 69 in application/worker.py

View check run for this annotation

Codecov / codecov/patch

application/worker.py#L69

Added line #L69 was not covered by tests
import sys

print(full_path, file=sys.stderr)
# check if API_URL env variable is set
file_data = {'name': name_job, 'file': filename, 'user': user}
response = requests.get(urljoin(settings.API_URL, "/api/download"), params=file_data)
file_data = {"name": name_job, "file": filename, "user": user}
response = requests.get(

Check warning on line 75 in application/worker.py

View check run for this annotation

Codecov / codecov/patch

application/worker.py#L74-L75

Added lines #L74 - L75 were not covered by tests
urljoin(settings.API_URL, "/api/download"), params=file_data
)
# check if file is in the response
print(response, file=sys.stderr)
file = response.content

if not os.path.exists(full_path):
os.makedirs(full_path)
with open(full_path + '/' + filename, 'wb') as f:
with open(full_path + "/" + filename, "wb") as f:

Check warning on line 84 in application/worker.py

View check run for this annotation

Codecov / codecov/patch

application/worker.py#L84

Added line #L84 was not covered by tests
f.write(file)

# check if file is .zip and extract it
if filename.endswith('.zip'):
with zipfile.ZipFile(full_path + '/' + filename, 'r') as zip_ref:
if filename.endswith(".zip"):
with zipfile.ZipFile(full_path + "/" + filename, "r") as zip_ref:

Check warning on line 89 in application/worker.py

View check run for this annotation

Codecov / codecov/patch

application/worker.py#L88-L89

Added lines #L88 - L89 were not covered by tests
zip_ref.extractall(full_path)
os.remove(full_path + '/' + filename)

self.update_state(state='PROGRESS', meta={'current': 1})

raw_docs = SimpleDirectoryReader(input_dir=full_path, input_files=input_files, recursive=recursive,
required_exts=formats, num_files_limit=limit,
exclude_hidden=exclude, file_metadata=metadata_from_filename).load_data()
raw_docs = group_split(documents=raw_docs, min_tokens=min_tokens, max_tokens=max_tokens, token_check=token_check)
os.remove(full_path + "/" + filename)

Check warning on line 91 in application/worker.py

View check run for this annotation

Codecov / codecov/patch

application/worker.py#L91

Added line #L91 was not covered by tests

self.update_state(state="PROGRESS", meta={"current": 1})

Check warning on line 93 in application/worker.py

View check run for this annotation

Codecov / codecov/patch

application/worker.py#L93

Added line #L93 was not covered by tests

raw_docs = SimpleDirectoryReader(

Check warning on line 95 in application/worker.py

View check run for this annotation

Codecov / codecov/patch

application/worker.py#L95

Added line #L95 was not covered by tests
input_dir=full_path,
input_files=input_files,
recursive=recursive,
required_exts=formats,
num_files_limit=limit,
exclude_hidden=exclude,
file_metadata=metadata_from_filename,
).load_data()
raw_docs = group_split(

Check warning on line 104 in application/worker.py

View check run for this annotation

Codecov / codecov/patch

application/worker.py#L104

Added line #L104 was not covered by tests
documents=raw_docs,
min_tokens=min_tokens,
max_tokens=max_tokens,
token_check=token_check,
)

docs = [Document.to_langchain_format(raw_doc) for raw_doc in raw_docs]

call_openai_api(docs, full_path, self)
self.update_state(state='PROGRESS', meta={'current': 100})
self.update_state(state="PROGRESS", meta={"current": 100})

Check warning on line 114 in application/worker.py

View check run for this annotation

Codecov / codecov/patch

application/worker.py#L114

Added line #L114 was not covered by tests

if sample:
for i in range(min(5, len(raw_docs))):
print(raw_docs[i].text)

# get files from outputs/inputs/index.faiss and outputs/inputs/index.pkl
# and send them to the server (provide user and name in form)
file_data = {'name': name_job, 'user': user}
file_data = {"name": name_job, "user": user}

Check warning on line 122 in application/worker.py

View check run for this annotation

Codecov / codecov/patch

application/worker.py#L122

Added line #L122 was not covered by tests
if settings.VECTOR_STORE == "faiss":
files = {'file_faiss': open(full_path + '/index.faiss', 'rb'),
'file_pkl': open(full_path + '/index.pkl', 'rb')}
response = requests.post(urljoin(settings.API_URL, "/api/upload_index"), files=files, data=file_data)
response = requests.get(urljoin(settings.API_URL, "/api/delete_old?path=" + full_path))
files = {

Check warning on line 124 in application/worker.py

View check run for this annotation

Codecov / codecov/patch

application/worker.py#L124

Added line #L124 was not covered by tests
"file_faiss": open(full_path + "/index.faiss", "rb"),
"file_pkl": open(full_path + "/index.pkl", "rb"),
}
response = requests.post(

Check warning on line 128 in application/worker.py

View check run for this annotation

Codecov / codecov/patch

application/worker.py#L128

Added line #L128 was not covered by tests
urljoin(settings.API_URL, "/api/upload_index"), files=files, data=file_data
)
response = requests.get(

Check warning on line 131 in application/worker.py

View check run for this annotation

Codecov / codecov/patch

application/worker.py#L131

Added line #L131 was not covered by tests
urljoin(settings.API_URL, "/api/delete_old?path=" + full_path)
)
else:
response = requests.post(urljoin(settings.API_URL, "/api/upload_index"), data=file_data)
response = requests.post(

Check warning on line 135 in application/worker.py

View check run for this annotation

Codecov / codecov/patch

application/worker.py#L135

Added line #L135 was not covered by tests
urljoin(settings.API_URL, "/api/upload_index"), data=file_data
)


# delete local
shutil.rmtree(full_path)

return {
'directory': directory,
'formats': formats,
'name_job': name_job,
'filename': filename,
'user': user,
'limited': False
"directory": directory,
"formats": formats,
"name_job": name_job,
"filename": filename,
"user": user,
"limited": False,
}

def remote_worker(self, source_data, name_job, user, directory = 'temp', loader = 'url'):

def remote_worker(self, source_data, name_job, user, loader, directory="temp"):
# sample = False
token_check = True
min_tokens = 150
max_tokens = 1250
full_path = directory + '/' + user + '/' + name_job
full_path = directory + "/" + user + "/" + name_job

Check warning on line 157 in application/worker.py

View check run for this annotation

Codecov / codecov/patch

application/worker.py#L157

Added line #L157 was not covered by tests

if not os.path.exists(full_path):
os.makedirs(full_path)

self.update_state(state='PROGRESS', meta={'current': 1})
self.update_state(state="PROGRESS", meta={"current": 1})

Check warning on line 162 in application/worker.py

View check run for this annotation

Codecov / codecov/patch

application/worker.py#L162

Added line #L162 was not covered by tests

# source_data {"data": [url]} for url type task just urls

# Use RemoteCreator to load data from URL
remote_loader = RemoteCreator.create_loader(loader)
raw_docs = remote_loader.load_data(source_data)

docs = group_split(documents=raw_docs, min_tokens=min_tokens, max_tokens=max_tokens, token_check=token_check)
docs = group_split(

Check warning on line 170 in application/worker.py

View check run for this annotation

Codecov / codecov/patch

application/worker.py#L170

Added line #L170 was not covered by tests
documents=raw_docs,
min_tokens=min_tokens,
max_tokens=max_tokens,
token_check=token_check,
)

#docs = [Document.to_langchain_format(raw_doc) for raw_doc in raw_docs]
# docs = [Document.to_langchain_format(raw_doc) for raw_doc in raw_docs]

call_openai_api(docs, full_path, self)
self.update_state(state='PROGRESS', meta={'current': 100})


self.update_state(state="PROGRESS", meta={"current": 100})

Check warning on line 180 in application/worker.py

View check run for this annotation

Codecov / codecov/patch

application/worker.py#L180

Added line #L180 was not covered by tests

# Proceed with uploading and cleaning as in the original function
file_data = {'name': name_job, 'user': user}
file_data = {"name": name_job, "user": user}

Check warning on line 183 in application/worker.py

View check run for this annotation

Codecov / codecov/patch

application/worker.py#L183

Added line #L183 was not covered by tests
if settings.VECTOR_STORE == "faiss":
files = {'file_faiss': open(full_path + '/index.faiss', 'rb'),
'file_pkl': open(full_path + '/index.pkl', 'rb')}
requests.post(urljoin(settings.API_URL, "/api/upload_index"), files=files, data=file_data)
files = {

Check warning on line 185 in application/worker.py

View check run for this annotation

Codecov / codecov/patch

application/worker.py#L185

Added line #L185 was not covered by tests
"file_faiss": open(full_path + "/index.faiss", "rb"),
"file_pkl": open(full_path + "/index.pkl", "rb"),
}
requests.post(

Check warning on line 189 in application/worker.py

View check run for this annotation

Codecov / codecov/patch

application/worker.py#L189

Added line #L189 was not covered by tests
urljoin(settings.API_URL, "/api/upload_index"), files=files, data=file_data
)
requests.get(urljoin(settings.API_URL, "/api/delete_old?path=" + full_path))
else:
requests.post(urljoin(settings.API_URL, "/api/upload_index"), data=file_data)

shutil.rmtree(full_path)

return {
'urls': source_data,
'name_job': name_job,
'user': user,
'limited': False
}
return {"urls": source_data, "name_job": name_job, "user": user, "limited": False}

Check warning on line 198 in application/worker.py

View check run for this annotation

Codecov / codecov/patch

application/worker.py#L198

Added line #L198 was not covered by tests
4 changes: 2 additions & 2 deletions frontend/src/components/Dropdown.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -35,10 +35,10 @@ function Dropdown({
isOpen
? typeof selectedValue === 'string'
? 'rounded-t-xl'
: 'rounded-t-2xl'
: 'rounded-t-3xl'
: typeof selectedValue === 'string'
? 'rounded-xl'
: 'rounded-full'
: 'rounded-3xl'
}`}
>
{typeof selectedValue === 'string' ? (
Expand Down
Loading
Loading