Skip to content

Commit

Permalink
Merge e69f5de into 3fccfab
Browse files Browse the repository at this point in the history
  • Loading branch information
tunstek committed Nov 8, 2019
2 parents 3fccfab + e69f5de commit dd0a72f
Show file tree
Hide file tree
Showing 45 changed files with 1,023 additions and 399 deletions.
25 changes: 14 additions & 11 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,11 @@ sudo: required
language: python
cache: pip
python:
- 3.3
- 3.4
- 3.5
- 3.6
matrix:
allow_failures:
- python: 2.7
- python: 3.7
dist: xenial
- 3.7
#- 3.8
services:
- docker
- mongodb
Expand All @@ -21,23 +17,30 @@ services:
- postgresql
addons:
postgresql: "9.4"
apt:
packages:
- rabbitmq-server

before_install:
- echo "deb https://apache.bintray.com/couchdb-deb xenial main" | sudo tee -a /etc/apt/sources.list
- curl -L https://couchdb.apache.org/repo/bintray-pubkey.asc | sudo apt-key add -
- sudo apt-get update -qq
- sudo apt-get install -y beanstalkd
- echo "START=yes" | sudo tee -a /etc/default/beanstalkd > /dev/null
- sudo service beanstalkd start
- sudo apt-get install -y couchdb
- sudo systemctl start couchdb
- curl -O https://download.elastic.co/elasticsearch/release/org/elasticsearch/distribution/deb/elasticsearch/2.4.0/elasticsearch-2.4.0.deb && sudo dpkg -i --force-confnew elasticsearch-2.4.0.deb && sudo service elasticsearch restart
- npm install express puppeteer
- sudo docker pull scrapinghub/splash
- sudo docker run -d --net=host scrapinghub/splash
before_script:
- curl -X PUT http://127.0.0.1:5984/_users
- curl -X PUT http://127.0.0.1:5984/_replicator
- psql -c "CREATE DATABASE pyspider_test_taskdb ENCODING 'UTF8' TEMPLATE=template0;" -U postgres
- psql -c "CREATE DATABASE pyspider_test_projectdb ENCODING 'UTF8' TEMPLATE=template0;" -U postgres
- psql -c "CREATE DATABASE pyspider_test_resultdb ENCODING 'UTF8' TEMPLATE=template0;" -U postgres
- sleep 10
install:
- pip install https://github.com/marcus67/easywebdav/archive/master.zip
- if [[ $TRAVIS_PYTHON_VERSION == '2.7' ]]; then sudo apt-get install libc6; fi
- if [[ $TRAVIS_PYTHON_VERSION == '3.7' ]]; then sudo apt-get install libgnutls28-dev; fi
- sudo apt-get install libgnutls28-dev
- pip install -e .[all,test]
- pip install coveralls
script:
Expand Down
13 changes: 7 additions & 6 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
FROM python:2.7
FROM python:3.6
MAINTAINER binux <roy@binux.me>

# install phantomjs
Expand All @@ -8,19 +8,19 @@ RUN mkdir -p /opt/phantomjs \
&& tar xavf phantomjs.tar.bz2 --strip-components 1 \
&& ln -s /opt/phantomjs/bin/phantomjs /usr/local/bin/phantomjs \
&& rm phantomjs.tar.bz2
# Fix Error: libssl_conf.so: cannot open shared object file: No such file or directory
ENV OPENSSL_CONF=/etc/ssl/

# install nodejs
ENV NODEJS_VERSION=8.15.0 \
PATH=$PATH:/opt/node/bin

WORKDIR "/opt/node"

RUN apt-get -qq update && apt-get -qq install -y curl ca-certificates libx11-xcb1 libxtst6 libnss3 libasound2 libatk-bridge2.0-0 libgtk-3-0 --no-install-recommends && \
curl -sL https://nodejs.org/dist/v${NODEJS_VERSION}/node-v${NODEJS_VERSION}-linux-x64.tar.gz | tar xz --strip-components=1 && \
rm -rf /var/lib/apt/lists/*
RUN npm install puppeteer express

# install requirements
RUN pip install 'https://dev.mysql.com/get/Downloads/Connector-Python/mysql-connector-python-2.1.5.zip#md5=ce4a24cb1746c1c8f6189a97087f21c1'
COPY requirements.txt /opt/pyspider/requirements.txt
RUN pip install -r /opt/pyspider/requirements.txt

Expand All @@ -31,9 +31,10 @@ ADD ./ /opt/pyspider
WORKDIR /opt/pyspider
RUN pip install -e .[all]

RUN npm i puppeteer express
# Create a symbolic link to node_modules
RUN ln -s /opt/node/node_modules ./node_modules

VOLUME ["/opt/pyspider"]
#VOLUME ["/opt/pyspider"]
ENTRYPOINT ["pyspider"]

EXPOSE 5000 23333 24444 25555 22222
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@ A Powerful Spider(Web Crawler) System in Python. **[TRY IT NOW!][Demo]**

- Write script in Python
- Powerful WebUI with script editor, task monitor, project manager and result viewer
- [MySQL](https://www.mysql.com/), [MongoDB](https://www.mongodb.org/), [Redis](http://redis.io/), [SQLite](https://www.sqlite.org/), [Elasticsearch](https://www.elastic.co/products/elasticsearch); [PostgreSQL](http://www.postgresql.org/) with [SQLAlchemy](http://www.sqlalchemy.org/) as database backend
- [RabbitMQ](http://www.rabbitmq.com/), [Beanstalk](http://kr.github.com/beanstalkd/), [Redis](http://redis.io/) and [Kombu](http://kombu.readthedocs.org/) as message queue
- [MySQL](https://www.mysql.com/), [CouchDB](https://couchdb.apache.org), [MongoDB](https://www.mongodb.org/), [Redis](http://redis.io/), [SQLite](https://www.sqlite.org/), [Elasticsearch](https://www.elastic.co/products/elasticsearch); [PostgreSQL](http://www.postgresql.org/) with [SQLAlchemy](http://www.sqlalchemy.org/) as database backend
- [RabbitMQ](http://www.rabbitmq.com/), [Redis](http://redis.io/) and [Kombu](http://kombu.readthedocs.org/) as message queue
- Task priority, retry, periodical, recrawl by age, etc...
- Distributed architecture, Crawl Javascript pages, Python 2.{6,7}, 3.{3,4,5,6} support, etc...

Expand Down
11 changes: 11 additions & 0 deletions config_example.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
{
"taskdb": "couchdb+taskdb://couchdb:5984",
"projectdb": "couchdb+projectdb://couchdb:5984",
"resultdb": "couchdb+resultdb://couchdb:5984",
"message_queue": "amqp://rabbitmq:5672/%2F",
"webui": {
"username": "username",
"password": "password",
"need-auth": true
}
}
117 changes: 117 additions & 0 deletions docker-compose.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
version: "3.7"

# docker build ./ -t pyspider:latest
# replace /path/to/dir/ to point to config_example.json

services:
rabbitmq:
image: rabbitmq:alpine
container_name: rabbitmq
networks:
- pyspider
command: rabbitmq-server
couchdb:
image: couchdb:latest
container_name: couchdb
networks:
- pyspider
ports:
- "5984:5984"
environment:
- COUCHDB_NAME=couchdb
- COUCHDB_USER=user
- COUCHDB_PASSWORD=password
- COUCHDB_HTTPS=true
# OR we can replace couchdb with mysql
#mysql:
# image: mysql:latest
# container_name: mysql
# volumes:
# - /tmp:/var/lib/mysql
# environment:
# - MYSQL_ALLOW_EMPTY_PASSWORD=yes
# networks:
# - pyspider
phantomjs:
image: pyspider:latest
container_name: phantomjs
networks:
- pyspider
volumes:
- /path/to/dir/config_example.json:/opt/pyspider/config.json
command: -c config.json phantomjs
depends_on:
- couchdb
- rabbitmq
restart: unless-stopped
result:
image: pyspider:latest
container_name: result
networks:
- pyspider
volumes:
- /path/to/dir/config_example.json:/opt/pyspider/config.json
command: -c config.json result_worker
depends_on:
- couchdb
- rabbitmq
restart: unless-stopped # Sometimes we'll get a connection refused error because couchdb has yet to fully start
processor:
container_name: processor
image: pyspider:latest
networks:
- pyspider
volumes:
- /path/to/dir/config_example.json:/opt/pyspider/config.json
command: -c config.json processor
depends_on:
- couchdb
- rabbitmq
restart: unless-stopped
fetcher:
image: pyspider:latest
container_name: fetcher
networks:
- pyspider
volumes:
- /path/to/dir/config_example.json:/opt/pyspider/config.json
command : -c config.json fetcher
depends_on:
- couchdb
- rabbitmq
restart: unless-stopped
scheduler:
image: pyspider:latest
container_name: scheduler
networks:
- pyspider
volumes:
- /path/to/dir/config_example.json:/opt/pyspider/config.json
command: -c config.json scheduler
depends_on:
- couchdb
- rabbitmq
restart: unless-stopped
webui:
image: pyspider:latest
container_name: webui
ports:
- "5050:5000"
networks:
- pyspider
volumes:
- /path/to/dir/config_example.json:/opt/pyspider/config.json
environment:
- SCHEDULER_PORT_23333_TCP_ADDR=scheduler
command: -c config.json webui
depends_on:
- couchdb
- rabbitmq
restart: unless-stopped

networks:
pyspider:
external:
name: pyspider
default:
driver: bridge
4 changes: 2 additions & 2 deletions docs/Command-Line.md
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,8 @@ sqlite:
mongodb:
mongodb+type://[username:password@]host1[:port1][,host2[:port2],...[,hostN[:portN]]][/[database][?options]]
more: http://docs.mongodb.org/manual/reference/connection-string/
couchdb:
couchdb+type://[username:password@]host[:port]
sqlalchemy:
sqlalchemy+postgresql+type://user:passwd@host:port/database
sqlalchemy+mysql+mysqlconnector+type://user:passwd@host:port/database
Expand All @@ -90,8 +92,6 @@ type:
rabbitmq:
amqp://username:password@host:5672/%2F
see https://www.rabbitmq.com/uri-spec.html
beanstalk:
beanstalk://host:11300/
redis:
redis://host:6379/db
redis://host1:port1,host2:port2,...,hostn:portn (for redis 3.x in cluster mode)
Expand Down
8 changes: 4 additions & 4 deletions docs/Deployment.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,9 @@ To deploy pyspider in product environment, running component in each process and
Installation
------------

To deploy pyspider components in each single processes, you need at least one database service. pyspider now supports [MySQL](http://www.mysql.com/), [MongoDB](http://www.mongodb.org/) and [PostgreSQL](http://www.postgresql.org/). You can choose one of them.
To deploy pyspider components in each single processes, you need at least one database service. pyspider now supports [MySQL](http://www.mysql.com/), [CouchDB](https://couchdb.apache.org), [MongoDB](http://www.mongodb.org/) and [PostgreSQL](http://www.postgresql.org/). You can choose one of them.

And you need a message queue service to connect the components together. You can use [RabbitMQ](http://www.rabbitmq.com/), [Beanstalk](http://kr.github.io/beanstalkd/) or [Redis](http://redis.io/) as message queue.
And you need a message queue service to connect the components together. You can use [RabbitMQ](http://www.rabbitmq.com/) or [Redis](http://redis.io/) as message queue.

`pip install --allow-all-external pyspider[all]`

Expand Down Expand Up @@ -63,6 +63,8 @@ sqlite:
mongodb:
mongodb+type://[username:password@]host1[:port1][,host2[:port2],...[,hostN[:portN]]][/[database][?options]]
more: http://docs.mongodb.org/manual/reference/connection-string/
couchdb:
couchdb+type://[username:password@]host[:port][?options]
sqlalchemy:
sqlalchemy+postgresql+type://user:passwd@host:port/database
sqlalchemy+mysql+mysqlconnector+type://user:passwd@host:port/database
Expand All @@ -81,8 +83,6 @@ You can use connection URL to specify the message queue:
rabbitmq:
amqp://username:password@host:5672/%2F
Refer: https://www.rabbitmq.com/uri-spec.html
beanstalk:
beanstalk://host:11300/
redis:
redis://host:6379/db
redis://host1:port1,host2:port2,...,hostn:portn (for redis 3.x in cluster mode)
Expand Down
4 changes: 2 additions & 2 deletions docs/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@ A Powerful Spider(Web Crawler) System in Python. **[TRY IT NOW!][Demo]**

- Write script in Python
- Powerful WebUI with script editor, task monitor, project manager and result viewer
- [MySQL](https://www.mysql.com/), [MongoDB](https://www.mongodb.org/), [Redis](http://redis.io/), [SQLite](https://www.sqlite.org/), [Elasticsearch](https://www.elastic.co/products/elasticsearch); [PostgreSQL](http://www.postgresql.org/) with [SQLAlchemy](http://www.sqlalchemy.org/) as database backend
- [RabbitMQ](http://www.rabbitmq.com/), [Beanstalk](http://kr.github.com/beanstalkd/), [Redis](http://redis.io/) and [Kombu](http://kombu.readthedocs.org/) as message queue
- [MySQL](https://www.mysql.com/), [CouchDB](https://couchdb.apache.org), [MongoDB](https://www.mongodb.org/), [Redis](http://redis.io/), [SQLite](https://www.sqlite.org/), [Elasticsearch](https://www.elastic.co/products/elasticsearch); [PostgreSQL](http://www.postgresql.org/) with [SQLAlchemy](http://www.sqlalchemy.org/) as database backend
- [RabbitMQ](http://www.rabbitmq.com/), [Redis](http://redis.io/) and [Kombu](http://kombu.readthedocs.org/) as message queue
- Task priority, retry, periodical, recrawl by age, etc...
- Distributed architecture, Crawl Javascript pages, Python 2&3, etc...

Expand Down
48 changes: 48 additions & 0 deletions pyspider/database/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
# http://binux.me
# Created on 2014-10-08 15:04:08

import os, requests, json
from six.moves.urllib.parse import urlparse, parse_qs


Expand Down Expand Up @@ -32,6 +33,8 @@ def connect_database(url):
redis+taskdb://host:port/db
elasticsearch:
elasticsearch+type://host:port/?index=pyspider
couchdb:
couchdb+type://[username:password@]host[:port]
local:
local+projectdb://filepath,filepath
Expand Down Expand Up @@ -89,6 +92,9 @@ def _connect_database(url): # NOQA
elif engine == 'elasticsearch' or engine == 'es':
return _connect_elasticsearch(parsed, dbtype)

elif engine == 'couchdb':
return _connect_couchdb(parsed, dbtype, url)

else:
raise Exception('unknown engine: %s' % engine)

Expand Down Expand Up @@ -198,3 +204,45 @@ def _connect_elasticsearch(parsed, dbtype):
elif dbtype == 'taskdb':
from .elasticsearch.taskdb import TaskDB
return TaskDB([parsed.netloc], index=index)


def _connect_couchdb(parsed, dbtype, url):
    """Connect to a CouchDB backend and return the matching DB wrapper.

    Parameters:
        parsed: urlparse result of the original connection URL; its netloc may
                carry credentials ('user:pass@host:port').
        dbtype: one of 'taskdb', 'projectdb', 'resultdb'.
        url:    the original connection URL (unused; the request URL is rebuilt
                from `parsed` below).

    Returns:
        TaskDB / ProjectDB / ResultDB instance for the chosen dbtype.

    Raises:
        LookupError: if dbtype is not one of the three known database types.
    """
    # Use HTTPS when COUCHDB_HTTPS is set to any non-empty value, else HTTP.
    scheme = 'https' if os.environ.get('COUCHDB_HTTPS') else 'http'

    # Strip any 'user:pass@' prefix from the netloc so credentials do not end
    # up in the request URL.  (The previous code replaced `url` with the bare
    # netloc here, losing the scheme and trailing slash and breaking the
    # requests.put() calls below.)
    netloc = parsed.netloc.rsplit('@', 1)[-1]
    url = '%s://%s/' % (scheme, netloc)

    # Credential precedence: environment variables, then the URL, then the
    # hard-coded defaults.  urlparse's .username/.password handle the
    # 'user@host' (no password) case correctly, unlike manual find(':').
    params = {
        'username': os.environ.get('COUCHDB_USER') or parsed.username or 'user',
        'password': os.environ.get('COUCHDB_PASSWORD') or parsed.password or 'password',
    }

    # Create the CouchDB system databases if not already present (a PUT on an
    # existing database is harmless here; the response is intentionally ignored).
    requests.put(url + '_users')
    requests.put(url + '_replicator')

    # Create the admin user.
    # NOTE: Over docker, this user is already created when COUCHDB_USER and
    # COUCHDB_PASSWORD are set.
    requests.put(url + '_node/_local/_config/admins/' + params['username'],
                 data=params['password'])

    if dbtype == 'taskdb':
        from .couchdb.taskdb import TaskDB
        return TaskDB(url, **params)
    elif dbtype == 'projectdb':
        from .couchdb.projectdb import ProjectDB
        return ProjectDB(url, **params)
    elif dbtype == 'resultdb':
        from .couchdb.resultdb import ResultDB
        return ResultDB(url, **params)
    else:
        raise LookupError('unknown dbtype: %r' % dbtype)
Empty file.

0 comments on commit dd0a72f

Please sign in to comment.