Error solved while running celery task
bunseokbot committed Feb 26, 2019
1 parent b3ee3b1 commit 3de9cb7
Showing 7 changed files with 53 additions and 71 deletions.
40 changes: 20 additions & 20 deletions crawler/__init__.py
@@ -1,6 +1,4 @@
from utils.network.headless import (
HeadlessBrowser, InvalidHTMLException, InvalidURLException
)
from utils.network.headless import HeadlessBrowser
from utils.network.socket import Socket
from utils.logging.log import Log
from utils.type.dynamic import DynamicObject
@@ -12,6 +10,7 @@
from pipeline.elastic import Elastic
from pipeline.elastic.documents import Webpage, Service, Port

from datetime import datetime
from urllib.parse import urlparse
from io import BytesIO

@@ -32,33 +31,32 @@ def scan(self, url):
"""Scan and crawl url which user requested."""
Log.i("Trying to crawl {} url".format(url))

domain = urlparse(url).netloc
obj = DynamicObject()

# Step 1. Visit website using headless tor browser
Log.d("Step 1. Visiting {} website using headless browser".format(url))

browser = HeadlessBrowser(
ini=self.ini,
tor_network=True
)

domain = urlparse(url).netloc
obj = DynamicObject()

try:
# Step 1. Visit website using headless tor browser
Log.d("Step 1. Visiting {} website using headless browser".format(url))
obj.webpage = browser.run(url)
report = browser.run(url)

# Step 2. Scan common service port
Log.d("Step 2. Scanning {} domain's common service port".format(domain))
obj.port = self._portscan(domain)
del browser

# Step 3. TO-DO
# if browser have an exception return from here
if not report:
return obj

except InvalidHTMLException:
Log.e("Invalid HTML returned from website")
obj.webpage = report

except InvalidURLException:
Log.e("Invalid URL or website is down")
# Step 2. Scan common service port
Log.d("Step 2. Scanning {} domain's common service port".format(domain))
obj.port = self._portscan(domain)

finally:
del browser
# Step 3. TO-DO

return obj

@@ -142,10 +140,12 @@ def save(self, id, obj):
url=obj.webpage.url,
domain=obj.webpage.domain,
title=obj.webpage.title,
time=datetime.now(),
source=obj.webpage.source,
screenshot=screenshot,
language=obj.webpage.language,
headers=obj.webpage.headers,
tree=obj.webpage.tree,
).save()

Port(
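
Since the InvalidHTMLException/InvalidURLException handlers are gone, scan() now relies on HeadlessBrowser.run() returning nothing when the visit fails, releases the browser (del browser) before the port scan, and hands back the still-empty result object on failure. A minimal sketch of that sentinel-style flow, using hypothetical fetch_page/scan_ports stand-ins rather than the project's classes:

```python
# Hypothetical stand-ins for HeadlessBrowser.run() and the port scanner;
# this only mirrors the control flow scan() now uses, not the real crawler.

def fetch_page(url):
    """Return page data, or None when the visit fails (nothing is raised)."""
    if not url.startswith("http"):
        return None                      # failure is a falsy return value
    return {"url": url, "title": "example"}


def scan_ports(url):
    """Pretend port scan standing in for Crawler._portscan()."""
    return [80, 443]


def scan(url):
    obj = {}
    report = fetch_page(url)
    if not report:                       # browser failed: return the empty object
        return obj
    obj["webpage"] = report
    obj["port"] = scan_ports(url)        # only reached after a successful visit
    return obj


print(scan("http://example.onion"))      # both keys filled
print(scan("not a url"))                 # {}
```
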
27 changes: 8 additions & 19 deletions crawler/tasks.py
@@ -1,5 +1,3 @@
from celery import Task

from utils.logging.log import Log
from utils.config.ini import Ini
from utils.config.env import Env
@@ -8,24 +6,15 @@
from .celery import app


class CrawlerTask(Task):
name = "crawler"

def __init__(self):
super(Task, self).__init__()
Log.i("Starting crawler task")
@app.task(bind=True)
def run_crawler(self, url):
Log.i(f"Starting crawler task for {url}")

self.crawler = Crawler(
ini=Ini(Env.read('CONFIG_FILE')))
crawler = Crawler(ini=Ini(Env.read("CONFIG_FILE")))

def run(self, url):
"""Run crawler task and get result."""
Log.d(f"Receive {url} url from endpoint.")
report = self.crawler.scan(url)
if not report.is_empty():
Log.i(f"Saving {url} information to server.")
self.crawler.save(self.request.id, report)
report = crawler.scan(url)

if not report.is_empty() and report.webpage.url == url:
crawler.save(self.request.id, report)

# register task into app
app.register_task(CrawlerTask())
del crawler
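
The class-based CrawlerTask ran its __init__ (and thus built the Crawler and its headless browser) only once, when app.register_task() was called; the decorator form builds a fresh Crawler on every invocation instead. With bind=True the function still receives the task instance as self, so self.request.id stays available for save(). A minimal sketch of the decorator style, where the broker URL and the do_crawl() helper are placeholders, not the project's configuration:

```python
# Minimal sketch of a bound, function-style Celery task.
# The broker URL and do_crawl() are placeholders for illustration only.
from celery import Celery

app = Celery("crawler", broker="redis://localhost:6379/0")


def do_crawl(url):
    """Stand-in for Crawler(ini=...).scan(url)."""
    return {"url": url}


@app.task(bind=True)
def run_crawler(self, url):
    # bind=True passes the task instance as `self`, so the id Celery
    # assigned to this invocation is reachable as self.request.id
    report = do_crawl(url)
    return {"task_id": self.request.id, "report": report}


# Producers enqueue it with run_crawler.delay(url) or
# run_crawler.apply_async(args=(url,), task_id=some_uuid).
```
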
9 changes: 4 additions & 5 deletions database/models.py
@@ -1,7 +1,6 @@
from urllib.parse import urlparse

from sqlalchemy import Column, ForeignKey, Integer, String, Text, DateTime
from sqlalchemy.orm import relationship, backref
from sqlalchemy import Column, Integer, String
from sqlalchemy.ext.declarative import declarative_base


@@ -16,11 +15,11 @@ class Domain(Base):
scheme = Column(String(5), nullable=False)
netloc = Column(String(255), unique=True, nullable=False)

def __init__(self, uuid, domain):
parse = urlparse(domain)
def __init__(self, uuid, url):
# onion domain condition check routine
parse = urlparse(url)
scheme, netloc = parse.scheme, parse.netloc

# onion domain condition check routine
if not netloc.endswith('.onion'):
raise ValueError("Invalid onion domain")

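Domain.__init__ now receives the full url and derives scheme and netloc itself, rejecting anything that is not a .onion host. A standalone illustration of that parse and check (standard library only; the URLs are made-up examples):

```python
# Standalone illustration of the .onion check now done in Domain.__init__;
# the URLs are made-up examples.
from urllib.parse import urlparse


def split_onion(url):
    parse = urlparse(url)
    scheme, netloc = parse.scheme, parse.netloc
    if not netloc.endswith(".onion"):
        raise ValueError("Invalid onion domain")
    return scheme, netloc


print(split_onion("http://examplehiddenservice.onion/about"))
# ('http', 'examplehiddenservice.onion')

try:
    split_onion("https://example.com")
except ValueError as err:
    print(err)                            # Invalid onion domain
```
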
2 changes: 1 addition & 1 deletion docker-compose.yml
@@ -25,7 +25,7 @@ services:
dockerfile: Dockerfile
image: darklight:latest
container_name: worker
command: ["celery", "worker", "-A", "crawler.tasks", "--concurrency=4"]
command: ["celery", "worker", "-A", "crawler.tasks", "--autoscale=1,7"]
environment:
- CONFIG_FILE=config.ini
networks:
3 changes: 2 additions & 1 deletion pipeline/elastic/documents.py
@@ -1,5 +1,5 @@
from elasticsearch_dsl import Document, Integer, Keyword, Text,\
Nested, Boolean, InnerDoc, Index
Nested, Boolean, InnerDoc, Date


class Header(InnerDoc):
@@ -18,6 +18,7 @@ class Webpage(Document):
url = Keyword()
domain = Keyword()
title = Text()
time = Date()
screenshot = Keyword()
source = Text()
language = Keyword()
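
Date() is the elasticsearch_dsl field type backing the new time attribute, so crawler/__init__.py can pass a plain datetime.now() and have it stored as an Elasticsearch date. A minimal sketch, assuming a local Elasticsearch on the default port and a trimmed-down document (the real Webpage has more fields and its index setup lives elsewhere in the project):

```python
# Minimal sketch of a Document carrying a Date() field; assumes a local
# Elasticsearch on the default port and a trimmed-down mapping.
from datetime import datetime

from elasticsearch_dsl import Date, Document, Keyword, Text, connections

connections.create_connection(hosts=["localhost"])


class Webpage(Document):
    url = Keyword()
    title = Text()
    time = Date()                          # stored as an Elasticsearch date

    class Index:
        name = "webpage"


Webpage.init()                             # create the index and mapping
Webpage(url="http://example.onion",
        title="example",
        time=datetime.now()).save()        # datetime serialises automatically
```
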
14 changes: 8 additions & 6 deletions source/base.py
@@ -6,7 +6,7 @@

from uuid import uuid4

from crawler.tasks import CrawlerTask
from crawler.tasks import run_crawler

from database.session import Session
from database.engine import Engine
@@ -41,12 +41,14 @@ def save(self):
task_id = uuid4().hex

try:
session.add(Domain(task_id, url))
task = CrawlerTask().apply_async(args=[url], task_id=task_id)
Log.i("CrawlerTask issued a new task id: {}".format(task.task_id))
# add url into database
session.add(Domain(uuid=task_id, url=url))
session.commit()

task = run_crawler.apply_async(args=(url, ), task_id=task_id)
Log.i("Crawler issued a new task id {} at {}".format(
task.task_id, url))
except:
Log.d("This {} url already saved into database.".format(url))
finally:
self.urls.remove(url)

session.commit()
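
apply_async accepts an explicit task_id, which is why the same uuid4().hex can serve both as the Domain row's uuid and as the id the worker later reads from self.request.id when saving. A small sketch of that call; the broker URL and the task body are placeholders:

```python
# Sketch of issuing a task under a caller-chosen id; the broker URL and the
# task body are placeholders, not the project's configuration.
from uuid import uuid4

from celery import Celery

app = Celery("crawler", broker="redis://localhost:6379/0")


@app.task(bind=True)
def run_crawler(self, url):
    # self.request.id equals the task_id passed to apply_async below
    return {"id": self.request.id, "url": url}


task_id = uuid4().hex                      # also stored as the Domain row's uuid
result = run_crawler.apply_async(args=("http://example.onion",),
                                 task_id=task_id)
assert result.task_id == task_id           # Celery keeps the id we supplied
```
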
29 changes: 10 additions & 19 deletions utils/network/headless.py
@@ -12,25 +12,11 @@
from io import BytesIO

from utils.type.dynamic import DynamicObject
from utils.logging.log import Log

import json


class InvalidURLException(Exception):
# Invalid URL or Website is closed
pass


class InvalidHTMLException(Exception):
# Invalid HTML Format
pass


class BrowserException(Exception):
# Browser got exception when headless browser return as adnormal
pass


class HeadlessBrowser:
"""Selenium headless browser for crawling information."""

@@ -62,24 +48,26 @@ def run(self, url):
self.driver.get(url)
except:
# browser scan failed
raise BrowserException
Log.e("Browser has an error.")
return

# if driver source is none
if not self.get_source():
raise InvalidURLException
return

# run HTML parser for parse data from source
try:
# beautifulsoup object for parse html source
self.soup = BeautifulSoup(self.driver.page_source, 'html.parser')
except:
# website source code is not HTML
raise InvalidHTMLException
Log.e("Invalid HTML Source code.")
return

# get HAR from driver
self.har = json.loads(self.driver.get_log('har')[0]['message'])

return DynamicObject({
report = DynamicObject({
'url': url,
'domain': urlparse(url).netloc,
'title': self.get_title(),
@@ -91,6 +79,8 @@
'tree': self.get_website_tree(),
})

return report

def get_website_tree(self):
"""Get webpage tree (entries) and load status."""
tree = []
@@ -175,3 +165,4 @@ def get_screenshot(self):

def __del__(self):
self.driver.quit()
del self
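
HeadlessBrowser still quits the Selenium driver in __del__, and scan() now triggers that with del browser before port scanning: in CPython, dropping the last reference to the object runs __del__ immediately (the trailing del self inside __del__ only unbinds a local name). A tiny illustration of that behaviour, with a fake driver standing in for Selenium:

```python
# Tiny illustration: when the last reference to an object is dropped,
# CPython runs __del__ right away, which is how `del browser` ends up
# quitting the driver. FakeDriver stands in for the Selenium driver.

class FakeDriver:
    def quit(self):
        print("driver.quit() called")


class HeadlessBrowser:
    def __init__(self):
        self.driver = FakeDriver()

    def __del__(self):
        self.driver.quit()


browser = HeadlessBrowser()
del browser                 # last reference gone -> __del__ -> driver.quit()
```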
