
BITMAKER-3113: Allow data download via estela-web (#199)
* Allow data download via estela-web
* Increase estela-cli download chunk size to 2MB
* Allow downloading data in JSON, CSV, and TSV formats
* Add restart policy for Kafka in docker-compose
mgonnav committed Jun 27, 2023
1 parent 0ea7c39 commit 25f5fc5
Showing 12 changed files with 6,004 additions and 6,434 deletions.
5 changes: 5 additions & 0 deletions database_adapters/db_adapters.py
@@ -94,6 +94,11 @@ def delete_collection_data(self, database_name, collection_name):
            print(ex)
        return count

+    def get_collection_data(self, database_name, collection_name, limit=10000):
+        collection = self.client[database_name][collection_name]
+        result = collection.find({}, {"_id": False}).limit(limit)
+        return list(result)
+
    def get_all_collection_data(self, database_name, collection_name):
        collection = self.client[database_name][collection_name]
        result = collection.find({}, {"_id": False})
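For context, the new get_collection_data bounds reads with Mongo's cursor limit. A minimal usage sketch, assuming a pymongo-backed adapter; the class name, connection URI, and collection names below are illustrative, not estela's actual wiring:

    # Illustrative sketch only: a pymongo-backed adapter exposing the new method.
    from pymongo import MongoClient


    class MongoAdapter:
        def __init__(self, connection_uri):
            self.client = MongoClient(connection_uri)

        def get_collection_data(self, database_name, collection_name, limit=10000):
            collection = self.client[database_name][collection_name]
            # Project out Mongo's internal _id so results serialize cleanly to JSON.
            result = collection.find({}, {"_id": False}).limit(limit)
            return list(result)


    adapter = MongoAdapter("mongodb://localhost:27017")  # assumed URI
    items = adapter.get_collection_data("project-db", "sid-jid-job_items", limit=500)
    print(len(items))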
89 changes: 74 additions & 15 deletions estela-api/api/views/job_data.py
@@ -1,10 +1,10 @@
from django.conf import settings
from drf_yasg import openapi
from drf_yasg.utils import swagger_auto_schema
-from rest_framework import status, mixins
-from rest_framework.response import Response
-from rest_framework.exceptions import ParseError
+from rest_framework import mixins, status
+from rest_framework.decorators import action
+from rest_framework.exceptions import ParseError
+from rest_framework.response import Response
from rest_framework.utils.urls import replace_query_param

from api import errors
@@ -93,18 +93,7 @@ def list(self, request, *args, **kwargs):
            raise DataBaseError({"error": errors.UNABLE_CONNECT_DB})

        job = SpiderJob.objects.filter(jid=kwargs["jid"]).get()
-        if (
-            job.cronjob is not None
-            and job.cronjob.unique_collection
-            and data_type == "items"
-        ):
-            job_collection_name = "{}-scj{}-job_{}".format(
-                kwargs["sid"], job.cronjob.cjid, data_type
-            )
-        else:
-            job_collection_name = "{}-{}-job_{}".format(
-                kwargs["sid"], kwargs["jid"], data_type
-            )
+        job_collection_name = self.get_collection_name(job, data_type, **kwargs)

        count = spiderdata_db_client.get_estimated_document_count(
            kwargs["pid"], job_collection_name
@@ -145,6 +134,76 @@ def list(self, request, *args, **kwargs):
            }
        )

+    def get_collection_name(self, job, data_type, **kwargs):
+        if (
+            job.cronjob is not None
+            and job.cronjob.unique_collection
+            and data_type == "items"
+        ):
+            job_collection_name = "{}-scj{}-job_{}".format(
+                kwargs["sid"], job.cronjob.cjid, data_type
+            )
+        else:
+            job_collection_name = "{}-{}-job_{}".format(
+                kwargs["sid"], kwargs["jid"], data_type
+            )
+
+        return job_collection_name
+
+    @swagger_auto_schema(
+        methods=["GET"],
+        responses={
+            status.HTTP_200_OK: openapi.Response(
+                schema=openapi.Schema(
+                    type=openapi.TYPE_OBJECT,
+                    required=["results"],
+                    properties={
+                        "results": openapi.Schema(
+                            type=openapi.TYPE_ARRAY,
+                            items=openapi.Items(type=openapi.TYPE_OBJECT),
+                            description="Data items.",
+                        ),
+                    },
+                ),
+                description="",
+            ),
+        },
+        manual_parameters=[
+            openapi.Parameter(
+                "type",
+                openapi.IN_QUERY,
+                description="Spider job data type.",
+                type=openapi.TYPE_STRING,
+                required=False,
+            ),
+        ],
+    )
+    @action(detail=False, methods=["GET"])
+    def download(self, request, *args, **kwargs):
+        data_type = request.query_params.get("type", "items")
+
+        job = SpiderJob.objects.filter(jid=kwargs["jid"]).get()
+        job_collection_name = self.get_collection_name(job, data_type, **kwargs)
+
+        data = []
+        if data_type == "stats":
+            data = spiderdata_db_client.get_job_stats(
+                kwargs["pid"], job_collection_name
+            )
+        else:
+            docs_limit = max(
+                1,
+                (settings.MAX_WEB_DOWNLOAD_SIZE)
+                // spiderdata_db_client.get_estimated_document_size(
+                    kwargs["pid"], job_collection_name
+                ),
+            )
+            data = spiderdata_db_client.get_collection_data(
+                kwargs["pid"], job_collection_name, docs_limit
+            )
+
+        return Response({"results": data})

    @swagger_auto_schema(
        methods=["POST"],
        responses={status.HTTP_200_OK: DeleteJobDataSerializer()},
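The docs_limit expression above caps a web download at roughly settings.MAX_WEB_DOWNLOAD_SIZE bytes by dividing that budget by Mongo's estimated average document size. A worked sketch of the arithmetic; the 2 KiB per-document figure is an assumed example, not a measured value:

    # Worked example of the docs_limit formula; the 2 KiB estimate is assumed.
    MAX_WEB_DOWNLOAD_SIZE = 100 * 1024 * 1024  # 100 MiB budget, from settings

    estimated_document_size = 2 * 1024  # suppose Mongo reports ~2 KiB per item

    # Same expression as in download(): always return at least one document.
    docs_limit = max(1, MAX_WEB_DOWNLOAD_SIZE // estimated_document_size)
    print(docs_limit)  # 51200 documents fit the 100 MiB budget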
5 changes: 2 additions & 3 deletions estela-api/config/settings/base.py
@@ -205,9 +205,8 @@


# API limit data download settings (bytes)
-MAX_DOWNLOADED_SIZE = 8 * 1024 * 1024
-MAX_CHUNK_SIZE = 512 * 1024
+MAX_CHUNK_SIZE = 2 * 1024 * 1024
+MAX_WEB_DOWNLOAD_SIZE = 100 * 1024 * 1024

# Pagination settings used in api_app
API_PAGE_SIZE = 100 # Paginator page size
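MAX_CHUNK_SIZE grows from 512 KiB to 2 MiB, so per the commit message an estela-cli download moves the same data in roughly a quarter of the round trips. A minimal sketch of chunked reading under the new setting; the BytesIO stream is a stand-in for a real network response:

    import io

    MAX_CHUNK_SIZE = 2 * 1024 * 1024  # 2 MiB, matching the new setting

    def iter_chunks(stream, chunk_size=MAX_CHUNK_SIZE):
        # Yield fixed-size chunks from a binary stream until EOF.
        while True:
            chunk = stream.read(chunk_size)
            if not chunk:
                break
            yield chunk

    src = io.BytesIO(b"x" * (5 * 1024 * 1024))  # stand-in for a network stream
    print(sum(len(c) for c in iter_chunks(src)))  # 5242880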
48 changes: 48 additions & 0 deletions estela-api/docs/api.yaml
@@ -1194,6 +1194,54 @@ paths:
      in: path
      required: true
      type: string
+  /api/projects/{pid}/spiders/{sid}/jobs/{jid}/data/download:
+    get:
+      operationId: api_projects_spiders_jobs_data_download
+      description: ''
+      parameters:
+      - name: page
+        in: query
+        description: A page number within the paginated result set.
+        required: false
+        type: integer
+      - name: page_size
+        in: query
+        description: Number of results to return per page.
+        required: false
+        type: integer
+      - name: type
+        in: query
+        description: Spider job data type.
+        required: false
+        type: string
+      responses:
+        '200':
+          description: ''
+          schema:
+            required:
+            - results
+            type: object
+            properties:
+              results:
+                description: Data items.
+                type: array
+                items:
+                  type: object
+      tags:
+      - api
+    parameters:
+    - name: jid
+      in: path
+      required: true
+      type: string
+    - name: pid
+      in: path
+      required: true
+      type: string
+    - name: sid
+      in: path
+      required: true
+      type: string
  /api/projects/{pid}/usage:
    get:
      operationId: api_projects_usage
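The new route behaves like the other estela API endpoints. A request sketch using Python's requests library; the base URL, ids, and token header scheme are assumptions for illustration:

    # Illustrative request to the new download endpoint; URL and token are assumed.
    import requests

    BASE = "https://api.estela.example.com"  # hypothetical deployment
    pid, sid, jid = "my-project", "my-spider", "42"  # hypothetical ids

    resp = requests.get(
        f"{BASE}/api/projects/{pid}/spiders/{sid}/jobs/{jid}/data/download",
        params={"type": "items"},  # per the diff, "stats" takes a separate branch
        headers={"Authorization": "Token <your-token>"},  # assumed auth scheme
    )
    resp.raise_for_status()
    items = resp.json()["results"]
    print(len(items), "items downloaded")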
1 change: 1 addition & 0 deletions estela-web/package.json
@@ -12,6 +12,7 @@
"moment": "^2.29.4",
"node-fetch": "^2.6.7",
"nth-check": "^2.0.1",
"papaparse": "^5.4.1",
"react": "^17.0.2",
"react-chartjs-2": "^5.1.0",
"react-dom": "^17.0.2",
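papaparse is the new front-end dependency that turns downloaded JSON items into CSV or TSV in the browser. A rough Python analogue of that conversion, for illustration only; the sample fields are made up:

    # Rough Python analogue of the CSV/TSV serialization papaparse does in estela-web.
    import csv
    import io

    results = [  # sample items in the shape the API returns (fields assumed)
        {"title": "Post A", "url": "https://example.com/a"},
        {"title": "Post B", "url": "https://example.com/b"},
    ]

    def to_delimited(rows, delimiter=","):
        # Serialize a list of dicts as CSV (",") or TSV ("\t") text.
        buf = io.StringIO()
        writer = csv.DictWriter(buf, fieldnames=rows[0].keys(), delimiter=delimiter)
        writer.writeheader()
        writer.writerows(rows)
        return buf.getvalue()

    print(to_delimited(results))        # CSV
    print(to_delimited(results, "\t"))  # TSV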
