Skip to content

Commit

Permalink
feat(ingest): refactor ingest workers into separate services
Browse files Browse the repository at this point in the history
  • Loading branch information
Yakabuff committed Sep 16, 2023
1 parent f7098c5 commit 4b20c4c
Show file tree
Hide file tree
Showing 23 changed files with 176 additions and 117 deletions.
2 changes: 2 additions & 0 deletions api/.dockerignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
logs
*.log
File renamed without changes.
30 changes: 30 additions & 0 deletions default.env
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Template for the `.env` file that docker-compose.yml passes to every service
# through `env_file`. Replace the placeholder values before starting the stack.
# Every entry uses KEY=value so that all dotenv-style loaders parse the file
# the same way (YAML-style `KEY: value` is not portable across them).

# Replace redarc.mysite.org with your hostname
REDARC_API=http://redarc.mysite.org/api/
SERVER_NAME=redarc.mysite.org
ADMIN_PASSWORD="qwerty"
INGEST_PASSWORD="asdf"
IMAGE_PATH="/your/path"

INGEST_ENABLED=true
INDEX_ENABLED=true
PG_DATABASE=postgres
PG_USER=postgres
PG_PASSWORD=test1234
PG_HOST=pgsql-dev
PG_PORT=5432
PGFTS_DATABASE=postgres
PGFTS_USER=postgres
PGFTS_PASSWORD=test1234
PGFTS_HOST=pgsql-fts
PGFTS_PORT=5432

# Reddit credentials
CLIENT_ID="change me"
CLIENT_SECRET="change me"
PASSWORD="change me"
USER_AGENT="my user agent"
REDDIT_USERNAME="change me"
INDEX_DELAY=300
SUBREDDITS="asdf,asdf"
FETCH_DELAY=43200
# Compared against the literal string 'true' by the reddit worker
DOWNLOAD_IMAGES=true
# Redis runs as its own compose service named `redis`; `localhost` would point
# at the calling container itself. Change this if Redis is hosted elsewhere.
REDIS_HOST=redis
REDIS_PORT=6379
123 changes: 73 additions & 50 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -36,32 +36,14 @@ services:

redarc:
build:
context: .
context: ./api
dockerfile: Dockerfile
image: redarc
container_name: redarc
networks:
- redarc
environment:
# Replace redarc.mysite.org with your hostname
REDARC_API: http://redarc.mysite.org/api/
SERVER_NAME: redarc.mysite.org
PG_DATABASE: postgres
PG_USER: postgres
PG_PASSWORD: test1234
PG_HOST: pgsql-dev
PG_PORT: 5432
PGFTS_DATABASE: postgres
PGFTS_USER: postgres
PGFTS_PASSWORD: test1234
PGFTS_HOST: pgsql-fts
PGFTS_PORT: 5432
ADMIN_PASSWORD: "qwerty"
INGEST_ENABLED: false
INGEST_PASSWORD: "asdf"
REDIS_HOST: "redarc_ingest"
REDIS_PORT: 6379
IMAGE_PATH: "/your/path"
env_file:
- .env
volumes:
- redarc_api_logs:/redarc/api/logs
- redarc_images:/ingest/gallery-dl
Expand All @@ -72,49 +54,90 @@ services:
condition: service_healthy
postgres_fts:
condition: service_healthy
redis:
condition: service_healthy

redis:
image: redis:7.0.12-alpine3.18
networks:
- redarc
ports:
- '6379:6379'
command: redis-server
healthcheck:
test: ["CMD", "redis-cli","ping"]
timeout: 10s
retries: 10

redarc_ingest:
image_downloader:
build:
context: ./ingest
context: ./ingest/image_downloader
dockerfile: Dockerfile
image: redarc_ingest
container_name: redarc_ingest
image: image_downloader
container_name: image_downloader
networks:
- redarc
environment:
INGEST_ENABLED: true
INDEX_ENABLED: true
PG_DATABASE: postgres
PG_USER: postgres
PG_PASSWORD: test1234
PG_HOST: pgsql-dev
PG_PORT: 5432
PGFTS_DATABASE: postgres
PGFTS_USER: postgres
PGFTS_PASSWORD: test1234
PGFTS_HOST: pgsql-fts
PGFTS_PORT: 5432
# Reddit credentials
CLIENT_ID: change me
CLIENT_SECRET: change me
PASSWORD: change me
USER_AGENT: "my user agent"
REDDIT_USERNAME: "change me"
INDEX_DELAY: 300
SUBREDDITS: "asdf,asdf"
FETCH_DELAY: 43200
DOWNLOAD_IMAGES: true
REDIS_HOST: localhost
REDIS_PORT: 6379
env_file:
- .env
volumes:
- redarc_ingest_logs:/ingest/logs
- redarc_images:/ingest/gallery-dl
depends_on:
redis:
condition: service_healthy

index_worker:
build:
context: ./ingest/index_worker
dockerfile: Dockerfile
image: index_worker
container_name: index_worker
networks:
- redarc
env_file:
- .env
volumes:
- redarc_ingest_logs:/ingest/logs
depends_on:
postgres:
condition: service_healthy
postgres_fts:
condition: service_healthy

reddit_worker:
build:
context: ./ingest/reddit_worker
dockerfile: Dockerfile
image: reddit_worker
container_name: reddit_worker
networks:
- redarc
env_file:
- .env
volumes:
- redarc_ingest_logs:/ingest/logs
depends_on:
postgres:
condition: service_healthy
redis:
condition: service_healthy

subreddit_worker:
build:
context: ./ingest/subreddit_worker
dockerfile: Dockerfile
image: subreddit_worker
container_name: subreddit_worker
networks:
- redarc
env_file:
- .env
volumes:
- redarc_ingest_logs:/ingest/logs
depends_on:
redis:
condition: service_healthy

networks:
redarc:
driver: bridge
Expand Down
17 changes: 11 additions & 6 deletions ingest/README.md
Original file line number Diff line number Diff line change
@@ -1,8 +1,13 @@
# redarc-ingest

gunicorn
falcon
rq
dotenv
praw
psycogp2
### Image downloader:
- Downloads images queued by the Reddit worker

### Reddit worker:
- Fetches threads and comments and queues images to be downloaded

### Subreddit worker:
- Retrieves hot/new/rising threads and queues them for Reddit worker to process periodically

### Index worker:
- Periodically indexes the databases
22 changes: 0 additions & 22 deletions ingest/entry.sh

This file was deleted.

2 changes: 2 additions & 0 deletions ingest/image_downloader/.dockerignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
logs
*.log
15 changes: 15 additions & 0 deletions ingest/image_downloader/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Image for the image downloader worker (runs image_downloader.py).
# The base image must be written as repository:tag; `alpine3.18` without the
# colon names a different, non-existent repository and the build fails.
FROM alpine:3.18

# --no-cache fetches a fresh package index without keeping it in the layer.
RUN apk add --no-cache bash python3 py3-pip

# Python dependencies are installed before the source is copied so this layer
# is reused when only the worker code changes.
RUN pip install --no-cache-dir rq python-dotenv gallery-dl

# WORKDIR creates the directory if it does not exist yet.
WORKDIR /image_downloader
COPY . .

# Resolve python3 through PATH: the Alpine package installs it under /usr/bin,
# so the hard-coded /bin/python3 path cannot be relied on.
CMD ["python3", "image_downloader.py"]
File renamed without changes.
File renamed without changes.
2 changes: 2 additions & 0 deletions ingest/index_worker/.dockerignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
logs
*.log
14 changes: 14 additions & 0 deletions ingest/index_worker/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Image for the index worker (runs index_worker.py).
# The base image must be written as repository:tag; `alpine3.18` without the
# colon names a different, non-existent repository and the build fails.
FROM alpine:3.18

# --no-cache fetches a fresh package index without keeping it in the layer.
RUN apk add --no-cache bash python3 py3-pip postgresql-client

# Python dependencies are installed before the source is copied so this layer
# is reused when only the worker code changes.
RUN pip install --no-cache-dir python-dotenv psycopg2-binary

# WORKDIR creates the directory if it does not exist yet.
WORKDIR /index_worker
COPY . .

# Resolve python3 through PATH: the Alpine package installs it under /usr/bin,
# so the hard-coded /bin/python3 path cannot be relied on.
CMD ["python3", "index_worker.py"]
File renamed without changes.
2 changes: 2 additions & 0 deletions ingest/reddit_worker/.dockerignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
logs
*.log
10 changes: 4 additions & 6 deletions ingest/Dockerfile → ingest/reddit_worker/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,18 +1,16 @@
FROM redis:7.0.12-alpine3.18
# Base image reference needs the repository:tag form (alpine:3.18).
FROM alpine:3.18

RUN apk update

RUN apk add bash python3 py3-pip postgresql-client

RUN mkdir -p /ingest
WORKDIR /ingest
RUN mkdir -p /reddit_worker
WORKDIR /reddit_worker
COPY . .

RUN pip install rq
RUN pip install python-dotenv
RUN pip install praw
RUN pip install psycopg2-binary
RUN pip install gallery-dl

RUN chmod +x entry.sh
CMD ["/bin/bash", "entry.sh"]
CMD ["/bin/python3", "reddit_worker.py"]
Empty file.
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import traceback
from redis import Redis
import logging
from worker.validate import validate_submission, validate_comment
from validate import validate_submission, validate_comment
import time
import praw
from enum import Enum
Expand Down Expand Up @@ -109,7 +109,7 @@ def process_submission(submission):
if x != None:
try:
if 'i.redd.it' in x['url'] and os.getenv('DOWNLOAD_IMAGES') == 'true':
job = img_queue.enqueue('worker.image_downloader.download_image', subreddit=x['subreddit'], url=x['url'])
job = img_queue.enqueue('image_downloader.download_image', subreddit=x['subreddit'], url=x['url'])
if job.get_status(refresh=True) != "queued":
logging.error(f"Failed to enqueue image url: {x['url']} thread url: {x['permalink']}")
else:
Expand Down
File renamed without changes.
2 changes: 2 additions & 0 deletions ingest/subreddit_worker/.dockerignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
logs
*.log
16 changes: 16 additions & 0 deletions ingest/subreddit_worker/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Image for the subreddit worker.
# The base image must be written as repository:tag; `alpine3.18` without the
# colon names a different, non-existent repository and the build fails.
FROM alpine:3.18

# --no-cache fetches a fresh package index without keeping it in the layer.
RUN apk add --no-cache bash python3 py3-pip

# Python dependencies are installed before the source is copied so this layer
# is reused when only the worker code changes.
RUN pip install --no-cache-dir rq python-dotenv praw

# WORKDIR creates the directory if it does not exist yet.
WORKDIR /subreddit_worker
COPY . .

# No `chmod +x entry.sh` step: ingest/entry.sh was deleted in this refactor,
# so the chmod would fail the build.
# The script is started by file name (with the .py extension, matching the
# other worker images) and python3 is resolved through PATH, because the
# Alpine package installs it under /usr/bin rather than /bin.
CMD ["python3", "subreddit_worker.py"]
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,6 @@ def watch_subreddit(subreddit):
rising = reddit.subreddit(subreddit).rising(limit=25)
for r in rising:
id = hashlib.md5(r.id.encode('utf-8')).hexdigest()
exists = job_exists(id)
if not id in ids:
ids[id] = (r.id, r.permalink)
return ids
Expand All @@ -77,7 +76,7 @@ def work():
try:
if not job_exists(id):
logging.info("Queuing thread id "+ id)
job = url_queue.enqueue('worker.reddit_worker.fetch_thread', thread_id=id, url=url, job_id=i)
job = url_queue.enqueue('reddit_worker.fetch_thread', thread_id=id, url=url, job_id=i)
if job.get_status(refresh=True) != "queued":
logging.error(f"Failed to enqueue job: thread ID {id}")
except Exception as error:
Expand Down
11 changes: 0 additions & 11 deletions ingest/worker/default.env

This file was deleted.

Loading

0 comments on commit 4b20c4c

Please sign in to comment.