diff --git a/docs/conf.py b/docs/conf.py index e53f785fe..dc7e6ece3 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -12,7 +12,7 @@ class Mock(MagicMock): @classmethod def __getattr__(cls, name): - return Mock() + return Mock() MOCK_MODULES = ['pycurl', 'lxml', 'psycopg2'] sys.modules.update((mod_name, Mock()) for mod_name in MOCK_MODULES) diff --git a/pyspider/database/__init__.py b/pyspider/database/__init__.py index 04755b904..a21638257 100644 --- a/pyspider/database/__init__.py +++ b/pyspider/database/__init__.py @@ -5,7 +5,9 @@ # http://binux.me # Created on 2014-10-08 15:04:08 -import os, requests, json +import os +import json +import requests from six.moves.urllib.parse import urlparse, parse_qs @@ -55,9 +57,8 @@ def _connect_database(url): # NOQA scheme = parsed.scheme.split('+') if len(scheme) == 1: raise Exception('wrong scheme format: %s' % parsed.scheme) - else: - engine, dbtype = scheme[0], scheme[-1] - other_scheme = "+".join(scheme[1:-1]) + engine, dbtype = scheme[0], scheme[-1] + other_scheme = "+".join(scheme[1:-1]) if dbtype not in ('taskdb', 'projectdb', 'resultdb'): raise LookupError('unknown database type: %s, ' @@ -65,38 +66,29 @@ def _connect_database(url): # NOQA if engine == 'mysql': return _connect_mysql(parsed,dbtype) - - elif engine == 'sqlite': + if engine == 'sqlite': return _connect_sqlite(parsed,dbtype) - elif engine == 'mongodb': + if engine == 'mongodb': return _connect_mongodb(parsed,dbtype,url) - - elif engine == 'sqlalchemy': + if engine == 'sqlalchemy': return _connect_sqlalchemy(parsed, dbtype, url, other_scheme) - - - elif engine == 'redis': + if engine == 'redis': if dbtype == 'taskdb': from .redis.taskdb import TaskDB return TaskDB(parsed.hostname, parsed.port, int(parsed.path.strip('/') or 0)) - else: - raise LookupError('not supported dbtype: %s', dbtype) - elif engine == 'local': + raise LookupError('not supported dbtype: %s' % dbtype) + if engine == 'local': scripts = url.split('//', 1)[1].split(',') if dbtype == 'projectdb': from .local.projectdb import ProjectDB return ProjectDB(scripts) - else: - raise LookupError('not supported dbtype: %s', dbtype) - elif engine == 'elasticsearch' or engine == 'es': + raise LookupError('not supported dbtype: %s' % dbtype) + if engine == 'elasticsearch' or engine == 'es': return _connect_elasticsearch(parsed, dbtype) - - elif engine == 'couchdb': + if engine == 'couchdb': return _connect_couchdb(parsed, dbtype, url) - - else: - raise Exception('unknown engine: %s' % engine) + raise Exception('unknown engine: %s' % engine) def _connect_mysql(parsed,dbtype): @@ -115,14 +107,13 @@ def _connect_mysql(parsed,dbtype): if dbtype == 'taskdb': from .mysql.taskdb import TaskDB return TaskDB(**parames) - elif dbtype == 'projectdb': + if dbtype == 'projectdb': from .mysql.projectdb import ProjectDB return ProjectDB(**parames) - elif dbtype == 'resultdb': + if dbtype == 'resultdb': from .mysql.resultdb import ResultDB return ResultDB(**parames) - else: - raise LookupError + raise LookupError def _connect_sqlite(parsed,dbtype): @@ -138,14 +129,13 @@ def _connect_sqlite(parsed,dbtype): if dbtype == 'taskdb': from .sqlite.taskdb import TaskDB return TaskDB(path) - elif dbtype == 'projectdb': + if dbtype == 'projectdb': from .sqlite.projectdb import ProjectDB return ProjectDB(path) - elif dbtype == 'resultdb': + if dbtype == 'resultdb': from .sqlite.resultdb import ResultDB return ResultDB(path) - else: - raise LookupError + raise LookupError def _connect_mongodb(parsed,dbtype,url): @@ -157,14 +147,13 @@ def 
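The `_connect_database` rewrite above is the guard-clause pattern applied throughout this patch: every branch of the old `if/elif/else` ladder ends in a `return` or `raise`, so the `elif`/`else` keywords carry no control flow and can be dropped. A minimal sketch of the pattern, with hypothetical engine names rather than pyspider's real connectors:

```python
def connect(engine):
    """Dispatch with guard clauses instead of an elif ladder."""
    if engine == 'mysql':
        return 'mysql-connection'   # each branch leaves the function,
    if engine == 'sqlite':
        return 'sqlite-connection'  # so no elif/else nesting is needed
    raise Exception('unknown engine: %s' % engine)
```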
_connect_mongodb(parsed,dbtype,url): if dbtype == 'taskdb': from .mongodb.taskdb import TaskDB return TaskDB(url, **parames) - elif dbtype == 'projectdb': + if dbtype == 'projectdb': from .mongodb.projectdb import ProjectDB return ProjectDB(url, **parames) - elif dbtype == 'resultdb': + if dbtype == 'resultdb': from .mongodb.resultdb import ResultDB return ResultDB(url, **parames) - else: - raise LookupError + raise LookupError def _connect_sqlalchemy(parsed, dbtype,url, other_scheme): @@ -174,14 +163,13 @@ def _connect_sqlalchemy(parsed, dbtype,url, other_scheme): if dbtype == 'taskdb': from .sqlalchemy.taskdb import TaskDB return TaskDB(url) - elif dbtype == 'projectdb': + if dbtype == 'projectdb': from .sqlalchemy.projectdb import ProjectDB return ProjectDB(url) - elif dbtype == 'resultdb': + if dbtype == 'resultdb': from .sqlalchemy.resultdb import ResultDB return ResultDB(url) - else: - raise LookupError + raise LookupError def _connect_elasticsearch(parsed, dbtype): @@ -198,10 +186,10 @@ def _connect_elasticsearch(parsed, dbtype): if dbtype == 'projectdb': from .elasticsearch.projectdb import ProjectDB return ProjectDB([parsed.netloc], index=index) - elif dbtype == 'resultdb': + if dbtype == 'resultdb': from .elasticsearch.resultdb import ResultDB return ResultDB([parsed.netloc], index=index) - elif dbtype == 'taskdb': + if dbtype == 'taskdb': from .elasticsearch.taskdb import TaskDB return TaskDB([parsed.netloc], index=index) @@ -220,11 +208,10 @@ def _connect_couchdb(parsed, dbtype, url): if dbtype == 'taskdb': from .couchdb.taskdb import TaskDB return TaskDB(url, **params) - elif dbtype == 'projectdb': + if dbtype == 'projectdb': from .couchdb.projectdb import ProjectDB return ProjectDB(url, **params) - elif dbtype == 'resultdb': + if dbtype == 'resultdb': from .couchdb.resultdb import ResultDB return ResultDB(url, **params) - else: - raise LookupError + raise LookupError diff --git a/pyspider/database/base/projectdb.py b/pyspider/database/base/projectdb.py index 7f02c7426..e0e920834 100644 --- a/pyspider/database/base/projectdb.py +++ b/pyspider/database/base/projectdb.py @@ -25,7 +25,7 @@ } -class ProjectDB(object): +class ProjectDB(): status_str = [ 'TODO', 'STOP', @@ -55,8 +55,7 @@ def check_update(self, timestamp, fields=None): def split_group(self, group, lower=True): if lower: return re.split("\W+", (group or '').lower()) - else: - return re.split("\W+", group or '') + return re.split("\W+", group or '') def verify_project_name(self, name): if len(name) > 64: diff --git a/pyspider/database/base/resultdb.py b/pyspider/database/base/resultdb.py index aa29afd35..c80bff6de 100644 --- a/pyspider/database/base/resultdb.py +++ b/pyspider/database/base/resultdb.py @@ -17,7 +17,7 @@ } -class ResultDB(object): +class ResultDB(): """ database for result """ diff --git a/pyspider/database/base/taskdb.py b/pyspider/database/base/taskdb.py index b698a8210..d9703a9e7 100644 --- a/pyspider/database/base/taskdb.py +++ b/pyspider/database/base/taskdb.py @@ -56,7 +56,7 @@ } -class TaskDB(object): +class TaskDB(): ACTIVE = 1 SUCCESS = 2 FAILED = 3 diff --git a/pyspider/database/basedb.py b/pyspider/database/basedb.py index ca71d6d2c..b471220d5 100644 --- a/pyspider/database/basedb.py +++ b/pyspider/database/basedb.py @@ -8,10 +8,9 @@ from __future__ import unicode_literals, division, absolute_import import logging -logger = logging.getLogger('database.basedb') - from six import itervalues from pyspider.libs import utils +logger = logging.getLogger('database.basedb') class BaseDB: diff --git 
a/pyspider/database/couchdb/couchdbbase.py b/pyspider/database/couchdb/couchdbbase.py index 13eb7fb57..244bf165a 100644 --- a/pyspider/database/couchdb/couchdbbase.py +++ b/pyspider/database/couchdb/couchdbbase.py @@ -1,7 +1,9 @@ -import time, requests, json +import time + +import requests from requests.auth import HTTPBasicAuth -class SplitTableMixin(object): +class SplitTableMixin(): UPDATE_PROJECTS_TIME = 10 * 60 def __init__(self): @@ -13,8 +15,7 @@ def __init__(self): def _collection_name(self, project): if self.collection_prefix: return "%s_%s" % (self.collection_prefix, project) - else: - return project + return project @property @@ -92,4 +93,3 @@ def update_doc(self, db_name, doc_id, new_doc): def delete(self, url): return self.session.delete(url).json() - diff --git a/pyspider/database/couchdb/projectdb.py b/pyspider/database/couchdb/projectdb.py index 17c1f6ff3..e0201f3d1 100644 --- a/pyspider/database/couchdb/projectdb.py +++ b/pyspider/database/couchdb/projectdb.py @@ -1,4 +1,5 @@ -import time, requests, json +import time +import requests from requests.auth import HTTPBasicAuth from pyspider.database.base.projectdb import ProjectDB as BaseProjectDB diff --git a/pyspider/database/couchdb/resultdb.py b/pyspider/database/couchdb/resultdb.py index 163a6c17b..f9fde979f 100644 --- a/pyspider/database/couchdb/resultdb.py +++ b/pyspider/database/couchdb/resultdb.py @@ -1,4 +1,4 @@ -import time, json +import time from pyspider.database.base.resultdb import ResultDB as BaseResultDB from .couchdbbase import SplitTableMixin @@ -105,4 +105,4 @@ def drop(self, project): # drop the project collection_name = self._get_collection_name(project) url = self.base_url + collection_name - return self.delete(url) \ No newline at end of file + return self.delete(url) diff --git a/pyspider/database/couchdb/taskdb.py b/pyspider/database/couchdb/taskdb.py index 9110be82a..ae9b8afe3 100644 --- a/pyspider/database/couchdb/taskdb.py +++ b/pyspider/database/couchdb/taskdb.py @@ -1,4 +1,4 @@ -import json, time +import time from pyspider.database.base.taskdb import TaskDB as BaseTaskDB from .couchdbbase import SplitTableMixin @@ -107,4 +107,4 @@ def drop_database(self): def drop(self, project): collection_name = self._get_collection_name(project) url = self.base_url + collection_name - return self.delete(url) \ No newline at end of file + return self.delete(url) diff --git a/pyspider/database/local/projectdb.py b/pyspider/database/local/projectdb.py index 835fe5a56..22dafb13b 100644 --- a/pyspider/database/local/projectdb.py +++ b/pyspider/database/local/projectdb.py @@ -7,9 +7,10 @@ import os import re -import six import glob import logging +import six + from pyspider.database.base.projectdb import ProjectDB as BaseProjectDB @@ -44,7 +45,7 @@ def load_scripts(self): def _build_project(self, filename): try: - with open(filename) as fp: + with open(filename, encoding='utf-8') as fp: script = fp.read() m = self.rate_re.search(script) if m: diff --git a/pyspider/database/mongodb/mongodbbase.py b/pyspider/database/mongodb/mongodbbase.py index 5815904b3..bd6b248d9 100644 --- a/pyspider/database/mongodb/mongodbbase.py +++ b/pyspider/database/mongodb/mongodbbase.py @@ -8,14 +8,13 @@ import time -class SplitTableMixin(object): +class SplitTableMixin(): UPDATE_PROJECTS_TIME = 10 * 60 def _collection_name(self, project): if self.collection_prefix: return "%s.%s" % (self.collection_prefix, project) - else: - return project + return project @property def projects(self): diff --git a/pyspider/database/mysql/mysqlbase.py 
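The `open(filename, encoding='utf-8')` change in `local/projectdb.py` above pins the script encoding instead of relying on the platform default (`locale.getpreferredencoding()`), which is not UTF-8 on many Windows setups. A minimal, self-contained sketch of the failure mode it avoids (hypothetical file name):

```python
from pathlib import Path

# Write a project script containing non-ASCII text.
Path('script.py').write_text('rate = 1  # 速率', encoding='utf-8')

# Without encoding=, read() decodes with the locale's preferred encoding
# and can raise UnicodeDecodeError on e.g. cp1252; pinning it is portable.
with open('script.py', encoding='utf-8') as fp:
    script = fp.read()
```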
b/pyspider/database/mysql/mysqlbase.py index 9dfc1aa0e..a23afc63b 100644 --- a/pyspider/database/mysql/mysqlbase.py +++ b/pyspider/database/mysql/mysqlbase.py @@ -9,7 +9,7 @@ import mysql.connector -class MySQLMixin(object): +class MySQLMixin(): maxlimit = 18446744073709551615 @property @@ -26,14 +26,13 @@ def dbcur(self): return self.conn.cursor() -class SplitTableMixin(object): +class SplitTableMixin(): UPDATE_PROJECTS_TIME = 10 * 60 def _tablename(self, project): if self.__tablename__: return '%s_%s' % (self.__tablename__, project) - else: - return project + return project @property def projects(self): diff --git a/pyspider/database/mysql/resultdb.py b/pyspider/database/mysql/resultdb.py index 3fb50b68f..e1f8cb968 100644 --- a/pyspider/database/mysql/resultdb.py +++ b/pyspider/database/mysql/resultdb.py @@ -6,9 +6,9 @@ # Created on 2014-10-13 22:02:57 import re -import six import time import json +import six import mysql.connector from pyspider.libs import utils diff --git a/pyspider/database/mysql/taskdb.py b/pyspider/database/mysql/taskdb.py index 90e97a8ac..8764f8245 100644 --- a/pyspider/database/mysql/taskdb.py +++ b/pyspider/database/mysql/taskdb.py @@ -7,9 +7,10 @@ import re -import six import time import json +import six + import mysql.connector from pyspider.libs import utils @@ -97,7 +98,7 @@ def get_task(self, project, taskid, fields=None): return None def status_count(self, project): - result = dict() + result = {} if project not in self.projects: self._list_project() if project not in self.projects: diff --git a/pyspider/database/redis/__init__.py b/pyspider/database/redis/__init__.py index 181c4e734..043e3b3f3 100644 --- a/pyspider/database/redis/__init__.py +++ b/pyspider/database/redis/__init__.py @@ -4,4 +4,3 @@ # Author: Binux # http://binux.me # Created on 2015-05-17 01:34:21 - diff --git a/pyspider/database/redis/taskdb.py b/pyspider/database/redis/taskdb.py index c6125b6ea..b9de5a340 100644 --- a/pyspider/database/redis/taskdb.py +++ b/pyspider/database/redis/taskdb.py @@ -5,12 +5,12 @@ # http://binux.me # Created on 2015-05-16 21:01:52 -import six import time import json -import redis import logging import itertools +import redis +import six from pyspider.libs import utils from pyspider.database.base.taskdb import TaskDB as BaseTaskDB @@ -99,8 +99,7 @@ def get_method(key): if not obj: #self.redis.srem(status_key, taskid) continue - else: - yield self._parse(obj) + yield self._parse(obj) def get_task(self, project, taskid, fields=None): if fields: diff --git a/pyspider/database/sqlalchemy/__init__.py b/pyspider/database/sqlalchemy/__init__.py index d0548d60e..204a4da10 100644 --- a/pyspider/database/sqlalchemy/__init__.py +++ b/pyspider/database/sqlalchemy/__init__.py @@ -4,4 +4,3 @@ # Author: Binux # http://binux.me # Created on 2014-12-04 20:11:04 - diff --git a/pyspider/database/sqlalchemy/projectdb.py b/pyspider/database/sqlalchemy/projectdb.py index 18e323c1d..c4be1fd1e 100644 --- a/pyspider/database/sqlalchemy/projectdb.py +++ b/pyspider/database/sqlalchemy/projectdb.py @@ -5,13 +5,11 @@ # http://binux.me # Created on 2014-12-04 23:25:10 -import six import time import sqlalchemy.exc from sqlalchemy import create_engine, MetaData, Table, Column, String, Float, Text from sqlalchemy.engine.url import make_url -from pyspider.libs import utils from pyspider.database.base.projectdb import ProjectDB as BaseProjectDB from .sqlalchemybase import result2dict diff --git a/pyspider/database/sqlalchemy/resultdb.py b/pyspider/database/sqlalchemy/resultdb.py index 
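Several hunks here (`mysql/resultdb.py`, `mysql/taskdb.py`, `redis/taskdb.py`) reorder imports into the conventional PEP 8 groups: standard library first, then third-party, then first-party, with blank lines between groups. Roughly, using names that appear in this patch (illustrative only, not runnable without pyspider installed):

```python
import json      # 1. standard library, one import per line
import time

import requests  # 2. third-party packages (six, requests, ...)
import six

from pyspider.libs import utils  # 3. first-party / local imports last
```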
8f91f6b49..e9c7c0a9f 100644 --- a/pyspider/database/sqlalchemy/resultdb.py +++ b/pyspider/database/sqlalchemy/resultdb.py @@ -6,9 +6,9 @@ # Created on 2014-12-04 18:48:15 import re -import six import time import json +import six import sqlalchemy.exc from sqlalchemy import (create_engine, MetaData, Table, Column, @@ -93,9 +93,8 @@ def save(self, project, taskid, url, result): return self.engine.execute(self.table.update() .where(self.table.c.taskid == taskid) .values(**self._stringify(obj))) - else: - return self.engine.execute(self.table.insert() - .values(**self._stringify(obj))) + return self.engine.execute(self.table.insert() + .values(**self._stringify(obj))) def select(self, project, fields=None, offset=0, limit=None): if project not in self.projects: diff --git a/pyspider/database/sqlalchemy/sqlalchemybase.py b/pyspider/database/sqlalchemy/sqlalchemybase.py index 8fc100d21..411633542 100644 --- a/pyspider/database/sqlalchemy/sqlalchemybase.py +++ b/pyspider/database/sqlalchemy/sqlalchemybase.py @@ -12,14 +12,13 @@ def result2dict(columns, task): return dict(task) -class SplitTableMixin(object): +class SplitTableMixin(): UPDATE_PROJECTS_TIME = 10 * 60 def _tablename(self, project): if self.__tablename__: return '%s_%s' % (self.__tablename__, project) - else: - return project + return project @property def projects(self): diff --git a/pyspider/database/sqlalchemy/taskdb.py b/pyspider/database/sqlalchemy/taskdb.py index b298d608b..04834f8ce 100644 --- a/pyspider/database/sqlalchemy/taskdb.py +++ b/pyspider/database/sqlalchemy/taskdb.py @@ -6,9 +6,10 @@ # Created on 2014-12-04 22:33:43 import re -import six import time import json +import six + import sqlalchemy.exc from sqlalchemy import (create_engine, MetaData, Table, Column, Index, @@ -118,7 +119,7 @@ def get_task(self, project, taskid, fields=None): return self._parse(result2dict(columns, each)) def status_count(self, project): - result = dict() + result = {} if project not in self.projects: self._list_project() if project not in self.projects: diff --git a/pyspider/database/sqlite/projectdb.py b/pyspider/database/sqlite/projectdb.py index 282ce5305..de8f92491 100644 --- a/pyspider/database/sqlite/projectdb.py +++ b/pyspider/database/sqlite/projectdb.py @@ -7,9 +7,10 @@ import time -from .sqlitebase import SQLiteMixin from pyspider.database.base.projectdb import ProjectDB as BaseProjectDB from pyspider.database.basedb import BaseDB +from .sqlitebase import SQLiteMixin + class ProjectDB(SQLiteMixin, BaseProjectDB, BaseDB): diff --git a/pyspider/database/sqlite/resultdb.py b/pyspider/database/sqlite/resultdb.py index 0314eaf2d..5b51123f9 100644 --- a/pyspider/database/sqlite/resultdb.py +++ b/pyspider/database/sqlite/resultdb.py @@ -9,9 +9,9 @@ import time import json -from .sqlitebase import SQLiteMixin, SplitTableMixin from pyspider.database.base.resultdb import ResultDB as BaseResultDB from pyspider.database.basedb import BaseDB +from .sqlitebase import SQLiteMixin, SplitTableMixin class ResultDB(SQLiteMixin, SplitTableMixin, BaseResultDB, BaseDB): diff --git a/pyspider/database/sqlite/sqlitebase.py b/pyspider/database/sqlite/sqlitebase.py index 9a652b9f7..0a64d594b 100644 --- a/pyspider/database/sqlite/sqlitebase.py +++ b/pyspider/database/sqlite/sqlitebase.py @@ -11,7 +11,7 @@ import threading -class SQLiteMixin(object): +class SQLiteMixin(): @property def dbcur(self): @@ -22,14 +22,13 @@ def dbcur(self): return self.conn.cursor() -class SplitTableMixin(object): +class SplitTableMixin(): UPDATE_PROJECTS_TIME = 10 * 60 def 
_tablename(self, project): if self.__tablename__: return '%s_%s' % (self.__tablename__, project) - else: - return project + return project @property def projects(self): diff --git a/pyspider/database/sqlite/taskdb.py b/pyspider/database/sqlite/taskdb.py index 5a0095d5a..9fab15ffa 100644 --- a/pyspider/database/sqlite/taskdb.py +++ b/pyspider/database/sqlite/taskdb.py @@ -9,9 +9,10 @@ import time import json -from .sqlitebase import SQLiteMixin, SplitTableMixin from pyspider.database.base.taskdb import TaskDB as BaseTaskDB from pyspider.database.basedb import BaseDB +from .sqlitebase import SQLiteMixin, SplitTableMixin + class TaskDB(SQLiteMixin, SplitTableMixin, BaseTaskDB, BaseDB): @@ -86,7 +87,7 @@ def status_count(self, project): ''' return a dict ''' - result = dict() + result = {} if project not in self.projects: self._list_project() if project not in self.projects: diff --git a/pyspider/fetcher/cookie_utils.py b/pyspider/fetcher/cookie_utils.py index e486fa8af..a57320c8a 100644 --- a/pyspider/fetcher/cookie_utils.py +++ b/pyspider/fetcher/cookie_utils.py @@ -8,7 +8,7 @@ from requests.cookies import MockRequest -class MockResponse(object): +class MockResponse(): def __init__(self, headers): self._headers = headers diff --git a/pyspider/fetcher/tornado_fetcher.py b/pyspider/fetcher/tornado_fetcher.py index d64169351..71b87d4b1 100644 --- a/pyspider/fetcher/tornado_fetcher.py +++ b/pyspider/fetcher/tornado_fetcher.py @@ -9,7 +9,6 @@ import os import sys -import six import copy import time import json @@ -17,19 +16,22 @@ import traceback import functools import threading +import six import tornado.ioloop import tornado.httputil import tornado.httpclient -import pyspider from six.moves import queue, http_cookies from six.moves.urllib.robotparser import RobotFileParser -from requests import cookies from six.moves.urllib.parse import urljoin, urlsplit +from requests import cookies + from tornado import gen from tornado.curl_httpclient import CurlAsyncHTTPClient from tornado.simple_httpclient import SimpleAsyncHTTPClient +import pyspider + from pyspider.libs import utils, dataurl, counter from pyspider.libs.url import quote_chinese from .cookie_utils import extract_cookies_to_jar @@ -63,7 +65,7 @@ def size(self): } -class Fetcher(object): +class Fetcher(): user_agent = "pyspider/%s (+http://pyspider.org/)" % pyspider.__version__ default_options = { 'method': 'GET', @@ -75,7 +77,7 @@ class Fetcher(object): } phantomjs_proxy = None splash_endpoint = None - splash_lua_source = open(os.path.join(os.path.dirname(__file__), "splash_fetcher.lua")).read() + splash_lua_source = open(os.path.join(os.path.dirname(__file__), "splash_fetcher.lua"), encoding='utf-8').read() robot_txt_age = 60*60 # 1h def __init__(self, inqueue, outqueue, poolsize=100, proxy=None, async_mode=True): @@ -116,8 +118,7 @@ def send_result(self, type, task, result): def fetch(self, task, callback=None): if self.async_mode: return self.async_fetch(task, callback) - else: - return self.async_fetch(task, callback).result() + return self.async_fetch(task, callback).result() @gen.coroutine def async_fetch(self, task, callback=None): diff --git a/pyspider/libs/ListIO.py b/pyspider/libs/ListIO.py index e48d42edd..b9d5bd4bb 100644 --- a/pyspider/libs/ListIO.py +++ b/pyspider/libs/ListIO.py @@ -6,7 +6,7 @@ # Created on 2014-02-26 23:41:51 -class ListO(object): +class ListO(): """A StringO write to list.""" diff --git a/pyspider/libs/base_handler.py b/pyspider/libs/base_handler.py index d2ebe9584..39745555a 100644 --- 
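The recurring `class Fetcher(object)` → `class Fetcher()` edits drop the explicit `object` base, which is redundant on Python 3 where every class is new-style. The empty parentheses this patch keeps are legal, though `class Fetcher:` is the more idiomatic spelling; all three forms are equivalent on Python 3:

```python
class A(object):   # Python 2/3 compatible spelling
    pass

class B():         # what this patch uses: parens kept but empty
    pass

class C:           # most idiomatic on Python 3
    pass

# On Python 3 all three are the same kind of class, rooted at object.
assert A.__mro__[-1] is B.__mro__[-1] is C.__mro__[-1] is object
```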
a/pyspider/libs/base_handler.py +++ b/pyspider/libs/base_handler.py @@ -61,7 +61,7 @@ def wrapper(func): return wrapper -class NOTSET(object): +class NOTSET(): pass @@ -120,7 +120,7 @@ def __new__(cls, name, bases, attrs): @add_metaclass(BaseHandlerMeta) -class BaseHandler(object): +class BaseHandler(): """ BaseHandler for all scripts. @@ -391,7 +391,7 @@ def crawl(self, url, **kwargs): if isinstance(url, six.string_types): return self._crawl(url, **kwargs) - elif hasattr(url, "__iter__"): + if hasattr(url, "__iter__"): result = [] for each in url: result.append(self._crawl(each, **kwargs)) diff --git a/pyspider/libs/bench.py b/pyspider/libs/bench.py index 9e7bfd6e9..c9400c24c 100644 --- a/pyspider/libs/bench.py +++ b/pyspider/libs/bench.py @@ -9,7 +9,6 @@ import time import logging -logger = logging.getLogger('bench') from six.moves import queue as Queue from pyspider.scheduler import ThreadBaseScheduler as Scheduler @@ -18,6 +17,8 @@ from pyspider.result import ResultWorker from pyspider.libs.utils import md5string +logger = logging.getLogger('bench') + def bench_test_taskdb(taskdb): project_name = '__bench_test__' @@ -187,7 +188,7 @@ def test_get(n): pass -class BenchMixin(object): +class BenchMixin(): """Report to logger for bench test""" def _bench_init(self): self.done_cnt = 0 @@ -212,42 +213,42 @@ def _bench_report(self, name, prefix=0, rjust=0): class BenchScheduler(Scheduler, BenchMixin): def __init__(self, *args, **kwargs): - super(BenchScheduler, self).__init__(*args, **kwargs) + super().__init__(*args, **kwargs) self._bench_init() def on_task_status(self, task): self._bench_report('Crawled') - return super(BenchScheduler, self).on_task_status(task) + return super().on_task_status(task) class BenchFetcher(Fetcher, BenchMixin): def __init__(self, *args, **kwargs): - super(BenchFetcher, self).__init__(*args, **kwargs) + super().__init__(*args, **kwargs) self._bench_init() def on_result(self, type, task, result): self._bench_report("Fetched", 0, 75) - return super(BenchFetcher, self).on_result(type, task, result) + return super().on_result(type, task, result) class BenchProcessor(Processor, BenchMixin): def __init__(self, *args, **kwargs): - super(BenchProcessor, self).__init__(*args, **kwargs) + super().__init__(*args, **kwargs) self._bench_init() def on_task(self, task, response): self._bench_report("Processed", 75) - return super(BenchProcessor, self).on_task(task, response) + return super().on_task(task, response) class BenchResultWorker(ResultWorker, BenchMixin): def __init__(self, *args, **kwargs): - super(BenchResultWorker, self).__init__(*args, **kwargs) + super().__init__(*args, **kwargs) self._bench_init() def on_result(self, task, result): self._bench_report("Saved", 0, 150) - super(BenchResultWorker, self).on_result(task, result) + super().on_result(task, result) from pyspider.libs.base_handler import BaseHandler diff --git a/pyspider/libs/counter.py b/pyspider/libs/counter.py index 88ff60eeb..265ee889e 100644 --- a/pyspider/libs/counter.py +++ b/pyspider/libs/counter.py @@ -13,14 +13,14 @@ try: from UserDict import DictMixin except ImportError: - from collections import Mapping as DictMixin + from collections.abc import Mapping as DictMixin import six from six import iteritems from six.moves import cPickle -class BaseCounter(object): +class BaseCounter(): def __init__(self): pass @@ -52,7 +52,7 @@ class TotalCounter(BaseCounter): """Total counter""" def __init__(self): - super(TotalCounter, self).__init__() + super().__init__() self.cnt = 0 def event(self, value=1): @@ 
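The `super(BenchScheduler, self).__init__(...)` → `super().__init__(...)` rewrites in `bench.py` use Python 3's zero-argument `super()`, which resolves the class and instance from the enclosing method's scope. A minimal sketch of the same shape (hypothetical class names):

```python
class Base:
    def __init__(self, *args, **kwargs):
        self.args = args

class Bench(Base):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)  # py3: no (Bench, self) needed
        self.done_cnt = 0

Bench(1, 2)  # behaves exactly like super(Bench, self).__init__(1, 2)
```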
-79,7 +79,7 @@ class AverageWindowCounter(BaseCounter): """ def __init__(self, window_size=300): - super(AverageWindowCounter, self).__init__() + super().__init__() self.window_size = window_size self.values = deque(maxlen=window_size) @@ -109,7 +109,7 @@ class TimebaseAverageEventCounter(BaseCounter): """ def __init__(self, window_size=30, window_interval=10): - super(TimebaseAverageEventCounter, self).__init__() + super().__init__() self.max_window_size = window_size self.window_size = 0 self.window_interval = window_interval @@ -195,7 +195,7 @@ class TimebaseAverageWindowCounter(BaseCounter): """ def __init__(self, window_size=30, window_interval=10): - super(TimebaseAverageWindowCounter, self).__init__() + super().__init__() self.max_window_size = window_size self.window_size = 0 self.window_interval = window_interval @@ -278,8 +278,7 @@ def __getitem__(self, key): if key == '__value__': key = self._keys return self.manager.counters[key] - else: - key = self._keys + (key, ) + key = self._keys + (key, ) available_keys = [] for _key in list(self.manager.counters.keys()): @@ -288,13 +287,11 @@ def __getitem__(self, key): if len(available_keys) == 0: raise KeyError - elif len(available_keys) == 1: + if len(available_keys) == 1: if available_keys[0] == key: return self.manager.counters.get(key) - else: - return CounterValue(self.manager, key) - else: return CounterValue(self.manager, key) + return CounterValue(self.manager, key) def __len__(self): return len(self.keys()) @@ -378,13 +375,11 @@ def __getitem__(self, key): if len(available_keys) == 0: raise KeyError - elif len(available_keys) == 1: + if len(available_keys) == 1: if available_keys[0] == key: return self.counters.get(key) - else: - return CounterValue(self, key) - else: return CounterValue(self, key) + return CounterValue(self, key) def __delitem__(self, key): key = (key, ) @@ -435,7 +430,7 @@ def load(self, filename): try: with open(filename, 'rb') as fp: self.counters = cPickle.load(fp) - except: + except Exception: logging.debug("can't load counter from file: %s", filename) return False return True diff --git a/pyspider/libs/dataurl.py b/pyspider/libs/dataurl.py index 3f75095e4..47d84a20e 100644 --- a/pyspider/libs/dataurl.py +++ b/pyspider/libs/dataurl.py @@ -5,10 +5,11 @@ # http://binux.me # Created on 2012-11-16 10:33:20 -import six from base64 import b64encode, b64decode -from . import utils +import six from six.moves.urllib.parse import quote, unquote +from . 
import utils + def encode(data, mime_type='', charset='utf-8', base64=True): diff --git a/pyspider/libs/log.py b/pyspider/libs/log.py index 770ff20f4..d2b468792 100644 --- a/pyspider/libs/log.py +++ b/pyspider/libs/log.py @@ -15,12 +15,12 @@ from tornado.log import LogFormatter as _LogFormatter -class LogFormatter(_LogFormatter, object): +class LogFormatter(_LogFormatter): """Init tornado.log.LogFormatter from logging.config.fileConfig""" def __init__(self, fmt=None, datefmt=None, color=True, *args, **kwargs): if fmt is None: fmt = _LogFormatter.DEFAULT_FORMAT - super(LogFormatter, self).__init__(color=color, fmt=fmt, *args, **kwargs) + super().__init__(color=color, fmt=fmt, *args, **kwargs) class SaveLogHandler(logging.Handler): diff --git a/pyspider/libs/multiprocessing_queue.py b/pyspider/libs/multiprocessing_queue.py index 96525225e..b627035ab 100644 --- a/pyspider/libs/multiprocessing_queue.py +++ b/pyspider/libs/multiprocessing_queue.py @@ -1,4 +1,3 @@ -import six import platform import multiprocessing from multiprocessing.queues import Queue as BaseQueue @@ -7,7 +6,7 @@ # The SharedCounter and Queue classes come from: # https://github.com/vterron/lemon/commit/9ca6b4b -class SharedCounter(object): +class SharedCounter(): """ A synchronized shared counter. The locking done by multiprocessing.Value ensures that only a single process or thread may read or write the in-memory ctypes object. However, @@ -45,15 +44,15 @@ class MultiProcessingQueue(BaseQueue): qsize() and empty(). """ def __init__(self, *args, **kwargs): - super(MultiProcessingQueue, self).__init__(*args, **kwargs) + super().__init__(*args, **kwargs) self.size = SharedCounter(0) def put(self, *args, **kwargs): self.size.increment(1) - super(MultiProcessingQueue, self).put(*args, **kwargs) + super().put(*args, **kwargs) def get(self, *args, **kwargs): - v = super(MultiProcessingQueue, self).get(*args, **kwargs) + v = super().get(*args, **kwargs) self.size.increment(-1) return v diff --git a/pyspider/libs/pprint.py b/pyspider/libs/pprint.py index 0ff21716e..63596c53d 100644 --- a/pyspider/libs/pprint.py +++ b/pyspider/libs/pprint.py @@ -36,10 +36,11 @@ from __future__ import print_function -import six + import sys as _sys from io import BytesIO, StringIO +import six __all__ = ["pprint", "pformat", "isreadable", "isrecursive", "saferepr", "PrettyPrinter"] @@ -268,7 +269,7 @@ def _safe_repr(object, context, maxlevels, level): try: string.decode('utf8').encode('gbk', 'replace') return ("%s%s%s" % (closure, string, closure)), True, False - except: + except Exception: pass qget = quotes.get sio = StringIO() diff --git a/pyspider/libs/response.py b/pyspider/libs/response.py index 8975781b2..ae2a4a248 100644 --- a/pyspider/libs/response.py +++ b/pyspider/libs/response.py @@ -7,8 +7,9 @@ import cgi import re -import six import json +import six + import chardet import lxml.html import lxml.etree @@ -19,7 +20,7 @@ from pyspider.libs import utils -class Response(object): +class Response(): def __init__(self, status_code=None, url=None, orig_url=None, headers=CaseInsensitiveDict(), content='', cookies=None, error=None, traceback=None, save=None, js_script_result=None, time=0): @@ -38,7 +39,7 @@ def __init__(self, status_code=None, url=None, orig_url=None, headers=CaseInsens self.time = time def __repr__(self): - return u'' % self.status_code + return '' % self.status_code def __bool__(self): """Returns true if `status_code` is 200 and no error""" @@ -53,7 +54,7 @@ def ok(self): """Return true if `status_code` is 200 and no error.""" try: 
self.raise_for_status() - except: + except Exception: return False return True @@ -105,7 +106,7 @@ def text(self): if hasattr(self, '_text') and self._text: return self._text if not self.content: - return u'' + return '' if isinstance(self.content, six.text_type): return self.content @@ -167,7 +168,7 @@ def raise_for_status(self, allow_redirects=True): if self.status_code == 304: return - elif self.error: + if self.error: if self.traceback: six.reraise(Exception, Exception(self.error), Traceback.from_string(self.traceback).as_traceback()) http_error = HTTPError(self.error) @@ -187,7 +188,7 @@ def isok(self): try: self.raise_for_status() return True - except: + except Exception: return False diff --git a/pyspider/libs/result_dump.py b/pyspider/libs/result_dump.py index 5e7dd45a6..29d4e8c77 100644 --- a/pyspider/libs/result_dump.py +++ b/pyspider/libs/result_dump.py @@ -5,11 +5,11 @@ # http://binux.me # Created on 2015-03-27 20:12:11 -import six import csv import json import itertools from io import StringIO, BytesIO +import six from six import iteritems @@ -74,18 +74,14 @@ def toString(obj): if isinstance(obj, six.binary_type): if six.PY2: return obj - else: - return obj.decode('utf8') - elif isinstance(obj, six.text_type): + return obj.decode('utf8') + if isinstance(obj, six.text_type): if six.PY2: return obj.encode('utf8') - else: - return obj - else: - if six.PY2: - return json.dumps(obj, ensure_ascii=False).encode('utf8') - else: - return json.dumps(obj, ensure_ascii=False) + return obj + if six.PY2: + return json.dumps(obj, ensure_ascii=False).encode('utf8') + return json.dumps(obj, ensure_ascii=False) # python2 needs byes when python3 needs unicode if six.PY2: diff --git a/pyspider/libs/url.py b/pyspider/libs/url.py index c1c99a59f..8d82b85f4 100644 --- a/pyspider/libs/url.py +++ b/pyspider/libs/url.py @@ -7,8 +7,8 @@ import mimetypes -import six import shlex +import six from six.moves.urllib.parse import urlparse, urlunparse from requests.models import RequestEncodingMixin @@ -94,7 +94,7 @@ def curl_to_arguments(curl): # option if current_opt is None: raise TypeError('Unknow curl argument: %s' % part) - elif current_opt in ('-H', '--header'): + if current_opt in ('-H', '--header'): key_value = part.split(':', 1) if len(key_value) == 2: key, value = key_value diff --git a/pyspider/libs/utils.py b/pyspider/libs/utils.py index 336021a03..3961d98ee 100644 --- a/pyspider/libs/utils.py +++ b/pyspider/libs/utils.py @@ -31,7 +31,7 @@ def getitem(obj, key=0, default=None): """Get first element of list or return default""" try: return obj[key] - except: + except Exception: return default @@ -111,8 +111,6 @@ def format_date(date, gmt_offset=0, relative=True, shorter=False, full_format=Fa format = fff_format if ret_: return format - else: - format = format if format is None: format = "%(month_name)s %(day)s, %(year)s" if shorter else \ @@ -218,10 +217,9 @@ def utf8(string): """ if isinstance(string, six.text_type): return string.encode('utf8') - elif isinstance(string, six.binary_type): + if isinstance(string, six.binary_type): return string - else: - return six.text_type(string).encode('utf8') + return six.text_type(string).encode('utf8') def text(string, encoding='utf8'): @@ -232,10 +230,9 @@ def text(string, encoding='utf8'): """ if isinstance(string, six.text_type): return string - elif isinstance(string, six.binary_type): + if isinstance(string, six.binary_type): return string.decode(encoding) - else: - return six.text_type(string) + return six.text_type(string) def 
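Replacing bare `except:` with `except Exception:` (as in `getitem`, `isok`, and `Response.ok` above) keeps `KeyboardInterrupt` and `SystemExit` from being silently swallowed, since those derive from `BaseException` rather than `Exception`. For example, using the `getitem` helper from this patch:

```python
def getitem(obj, key=0, default=None):
    """Get an element or return default -- but let Ctrl-C propagate."""
    try:
        return obj[key]
    except Exception:   # a bare `except:` would also trap KeyboardInterrupt
        return default

assert getitem([], 0, default='fallback') == 'fallback'
```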
pretty_unicode(string): @@ -289,19 +286,18 @@ def unicode_obj(obj): """ if isinstance(obj, dict): return unicode_dict(obj) - elif isinstance(obj, (list, tuple)): + if isinstance(obj, (list, tuple)): return unicode_list(obj) - elif isinstance(obj, six.string_types): + if isinstance(obj, six.string_types): return unicode_string(obj) - elif isinstance(obj, (int, float)): + if isinstance(obj, (int, float)): return obj - elif obj is None: + if obj is None: return obj - else: - try: - return text(obj) - except: - return text(repr(obj)) + try: + return text(obj) + except Exception: + return text(repr(obj)) def decode_unicode_string(string): @@ -322,15 +318,14 @@ def decode_unicode_obj(obj): for k, v in iteritems(obj): r[decode_unicode_string(k)] = decode_unicode_obj(v) return r - elif isinstance(obj, six.string_types): + if isinstance(obj, six.string_types): return decode_unicode_string(obj) - elif isinstance(obj, (list, tuple)): + if isinstance(obj, (list, tuple)): return [decode_unicode_obj(x) for x in obj] - else: - return obj + return obj -class Get(object): +class Get(): """ Lazy value calculate for object """ @@ -434,7 +429,4 @@ def python_console(namespace=None): def check_port_open(port, addr='127.0.0.1'): with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock: result = sock.connect_ex((addr, port)) - if result == 0: - return True - else: - return False + return result == 0 diff --git a/pyspider/libs/wsgi_xmlrpc.py b/pyspider/libs/wsgi_xmlrpc.py index 37b6eafa4..3654b118b 100644 --- a/pyspider/libs/wsgi_xmlrpc.py +++ b/pyspider/libs/wsgi_xmlrpc.py @@ -15,13 +15,14 @@ # Origin: https://code.google.com/p/wsgi-xmlrpc/ -from six.moves.xmlrpc_server import SimpleXMLRPCDispatcher import logging +from six.moves.xmlrpc_server import SimpleXMLRPCDispatcher + logger = logging.getLogger(__name__) -class WSGIXMLRPCApplication(object): +class WSGIXMLRPCApplication(): """Application to handle requests to the XMLRPC service""" def __init__(self, instance=None, methods=None): @@ -50,9 +51,8 @@ def handler(self, environ, start_response): if environ['REQUEST_METHOD'] == 'POST': return self.handle_POST(environ, start_response) - else: - start_response("400 Bad request", [('Content-Type', 'text/plain')]) - return [''] + start_response("400 Bad request", [('Content-Type', 'text/plain')]) + return [b''] def handle_POST(self, environ, start_response): """Handles the HTTP POST request. 
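`check_port_open` above now returns the comparison directly: `result == 0` already evaluates to a `bool`, so the `if/else` returning `True`/`False` was redundant. The same function, self-contained:

```python
import socket

def check_port_open(port, addr='127.0.0.1'):
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
        # connect_ex returns 0 on success; the comparison is the bool we want
        return sock.connect_ex((addr, port)) == 0

print(check_port_open(65000))  # False unless something listens on that port
```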
diff --git a/pyspider/message_queue/__init__.py b/pyspider/message_queue/__init__.py index 86592f6fb..2ec7a3b7d 100644 --- a/pyspider/message_queue/__init__.py +++ b/pyspider/message_queue/__init__.py @@ -41,7 +41,7 @@ def connect_message_queue(name, url=None, maxsize=0, lazy_limit=True): if parsed.scheme == 'amqp': from .rabbitmq import Queue return Queue(name, url, maxsize=maxsize, lazy_limit=lazy_limit) - elif parsed.scheme == 'redis': + if parsed.scheme == 'redis': from .redis_queue import Queue if ',' in parsed.netloc: """ @@ -53,21 +53,18 @@ def connect_message_queue(name, url=None, maxsize=0, lazy_limit=True): cluster_nodes.append({'host': netloc.split(':')[0], 'port': int(netloc.split(':')[1])}) return Queue(name=name, maxsize=maxsize, lazy_limit=lazy_limit, cluster_nodes=cluster_nodes) + db = parsed.path.lstrip('/').split('/') + try: + db = int(db[0]) + except Exception: + logging.warning('redis DB must be a zero-based numeric index, using 0 instead') + db = 0 - else: - db = parsed.path.lstrip('/').split('/') - try: - db = int(db[0]) - except: - logging.warning('redis DB must zero-based numeric index, using 0 instead') - db = 0 + password = parsed.password or None - password = parsed.password or None - - return Queue(name=name, host=parsed.hostname, port=parsed.port, db=db, maxsize=maxsize, password=password, lazy_limit=lazy_limit) - elif url.startswith('kombu+'): + return Queue(name=name, host=parsed.hostname, port=parsed.port, db=db, maxsize=maxsize, password=password, lazy_limit=lazy_limit) + if url.startswith('kombu+'): url = url[len('kombu+'):] from .kombu_queue import Queue return Queue(name, url, maxsize=maxsize, lazy_limit=lazy_limit) - else: - raise Exception('unknown connection url: %s' % url) + raise Exception('unknown connection url: %s' % url) diff --git a/pyspider/message_queue/kombu_queue.py b/pyspider/message_queue/kombu_queue.py index e16f7b8c0..035e80ff1 100644 --- a/pyspider/message_queue/kombu_queue.py +++ b/pyspider/message_queue/kombu_queue.py @@ -17,7 +17,7 @@ enable_insecure_serializers(['umsgpack']) -class KombuQueue(object): +class KombuQueue(): """ kombu is a high-level interface for multiple message queue backends. 
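Note the difference between the two message styles in this hunk: `logging.warning('... %s ...', x)` defers interpolation to the logging framework, but `Exception('... %s', url)` merely stores a two-element args tuple and the `%s` is never filled in, hence the eager `%` interpolation in the `raise` above. A quick illustration with a sample value:

```python
import logging

url = 'redis://localhost:6379/0'  # sample value
logging.warning('unknown connection url: %s', url)   # logging interpolates lazily

err = Exception('unknown connection url: %s', url)   # tuple of args, %s never filled
assert str(err) == "('unknown connection url: %s', 'redis://localhost:6379/0')"

err = Exception('unknown connection url: %s' % url)  # interpolate eagerly instead
assert str(err) == 'unknown connection url: redis://localhost:6379/0'
```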
@@ -57,14 +57,10 @@ def qsize(self): def empty(self): if self.qsize() == 0: return True - else: - return False + return False def full(self): - if self.maxsize and self.qsize() >= self.maxsize: - return True - else: - return False + return bool(self.maxsize and self.qsize() >= self.maxsize) def put(self, obj, block=True, timeout=None): if not block: diff --git a/pyspider/message_queue/rabbitmq.py b/pyspider/message_queue/rabbitmq.py index 9e4e72595..4af81ef6d 100644 --- a/pyspider/message_queue/rabbitmq.py +++ b/pyspider/message_queue/rabbitmq.py @@ -9,15 +9,16 @@ import socket import select import logging -import umsgpack import threading +import umsgpack import amqp -from six.moves.urllib.parse import unquote + try: from urllib import parse as urlparse except ImportError: import urlparse +from six.moves.urllib.parse import unquote from six.moves import queue as BaseQueue @@ -49,7 +50,7 @@ def wrap(self, *args, **kwargs): return wrap -class PikaQueue(object): +class PikaQueue(): """ A Queue like rabbitmq connector """ @@ -111,14 +112,12 @@ def qsize(self): def empty(self): if self.qsize() == 0: return True - else: - return False + return False def full(self): if self.maxsize and self.qsize() >= self.maxsize: return True - else: - return False + return False @catch_error def put(self, obj, block=True, timeout=None): diff --git a/pyspider/message_queue/redis_queue.py b/pyspider/message_queue/redis_queue.py index dc24924c1..3c47d78ac 100644 --- a/pyspider/message_queue/redis_queue.py +++ b/pyspider/message_queue/redis_queue.py @@ -11,7 +11,7 @@ from six.moves import queue as BaseQueue -class RedisQueue(object): +class RedisQueue(): """ A Queue like message built over redis """ @@ -31,7 +31,7 @@ def __init__(self, name, host='localhost', port=6379, db=0, for better performance. 
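One caution when collapsing `if/else: return True/False` into a bare expression: `self.maxsize and self.qsize() >= self.maxsize` evaluates to `0` (not `False`) when `maxsize` is 0. Truthiness is preserved, but callers that compare identity or serialize the value would see the type change; the `bool()` wrapper in `KombuQueue.full` above keeps the original contract. In isolation, with sample values:

```python
maxsize, qsize = 0, 5            # sample values: an unbounded queue

raw = maxsize and qsize >= maxsize
assert raw == 0 and raw is not False   # falsy, but an int, not a bool

safe = bool(maxsize and qsize >= maxsize)
assert safe is False                   # same truthiness, stable type
```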
""" self.name = name - if(cluster_nodes is not None): + if cluster_nodes is not None: from rediscluster import StrictRedisCluster self.redis = StrictRedisCluster(startup_nodes=cluster_nodes) else: @@ -47,14 +47,12 @@ def qsize(self): def empty(self): if self.qsize() == 0: return True - else: - return False + return False def full(self): if self.maxsize and self.qsize() >= self.maxsize: return True - else: - return False + return False def put_nowait(self, obj): if self.lazy_limit and self.last_qsize < self.maxsize: diff --git a/pyspider/processor/processor.py b/pyspider/processor/processor.py index ae0de1f46..0a919d7dc 100644 --- a/pyspider/processor/processor.py +++ b/pyspider/processor/processor.py @@ -6,21 +6,21 @@ # Created on 2014-02-16 22:59:56 import sys -import six import time import logging import traceback -logger = logging.getLogger("processor") - from six.moves import queue as Queue +import six from pyspider.libs import utils from pyspider.libs.log import LogFormatter from pyspider.libs.utils import pretty_unicode, hide_me from pyspider.libs.response import rebuild_response from .project_module import ProjectManager, ProjectFinder +logger = logging.getLogger("processor") + -class ProcessorResult(object): +class ProcessorResult(): """The result and logs producted by a callback""" def __init__(self, result=None, follows=(), messages=(), @@ -55,11 +55,11 @@ def logstr(self): tb = hide_me(tb, globals()) record.exc_info = a, b, tb result.append(pretty_unicode(formater.format(record))) - result.append(u'\n') - return u''.join(result) + result.append('\n') + return ''.join(result) -class Processor(object): +class Processor(): PROCESS_TIME_LIMIT = 30 EXCEPTION_LIMIT = 3 diff --git a/pyspider/processor/project_module.py b/pyspider/processor/project_module.py index 7adfe708c..ae80c282a 100644 --- a/pyspider/processor/project_module.py +++ b/pyspider/processor/project_module.py @@ -6,21 +6,22 @@ # Created on 2014-02-16 22:24:20 import os -import six import sys -import imp +import importlib import time import weakref import logging import inspect import traceback import linecache +import six + from pyspider.libs import utils from pyspider.libs.log import SaveLogHandler, LogFormatter logger = logging.getLogger("processor") -class ProjectManager(object): +class ProjectManager(): """ load projects from projectdb, update project """ @@ -97,11 +98,11 @@ def _need_update(self, project_name, updatetime=None, md5sum=None): '''Check if project_name need update''' if project_name not in self.projects: return True - elif md5sum and md5sum != self.projects[project_name]['info'].get('md5sum'): + if md5sum and md5sum != self.projects[project_name]['info'].get('md5sum'): return True - elif updatetime and updatetime > self.projects[project_name]['info'].get('updatetime', 0): + if updatetime and updatetime > self.projects[project_name]['info'].get('updatetime', 0): return True - elif time.time() - self.projects[project_name]['load_time'] > self.RELOAD_PROJECT_INTERVAL: + if time.time() - self.projects[project_name]['load_time'] > self.RELOAD_PROJECT_INTERVAL: return True return False @@ -154,7 +155,7 @@ def get(self, project_name, updatetime=None, md5sum=None): return self.projects.get(project_name, None) -class ProjectLoader(object): +class ProjectLoader(): '''ProjectLoader class for sys.meta_path''' def __init__(self, project, mod=None): @@ -165,7 +166,7 @@ def __init__(self, project, mod=None): def load_module(self, fullname): if self.mod is None: - self.mod = mod = imp.new_module(fullname) + self.mod = 
mod = importlib.new_module(fullname) else: mod = self.mod mod.__file__ = '<%s>' % self.name @@ -193,7 +194,7 @@ def get_source(self, fullname): if six.PY2: - class ProjectFinder(object): + class ProjectFinder(): '''ProjectFinder class for sys.meta_path''' def __init__(self, projectdb): @@ -216,7 +217,7 @@ def find_module(self, fullname, path=None): return ProjectLoader(info) def load_module(self, fullname): - mod = imp.new_module(fullname) + mod = importlib.new_module(fullname) mod.__file__ = '' mod.__loader__ = self mod.__path__ = [''] @@ -257,7 +258,7 @@ def find_module(self, fullname, path): class ProjectsLoader(importlib.abc.InspectLoader): def load_module(self, fullname): - mod = imp.new_module(fullname) + mod = importlib.new_module(fullname) mod.__file__ = '' mod.__loader__ = self mod.__path__ = [''] diff --git a/pyspider/result/result_worker.py b/pyspider/result/result_worker.py index 16935fa18..43140a2b0 100644 --- a/pyspider/result/result_worker.py +++ b/pyspider/result/result_worker.py @@ -12,7 +12,7 @@ logger = logging.getLogger("result") -class ResultWorker(object): +class ResultWorker(): """ do with result @@ -37,9 +37,8 @@ def on_result(self, task, result): url=task['url'], result=result ) - else: - logger.warning('result UNKNOW -> %.30r' % result) - return + logger.warning('result UNKNOW -> %.30r' % result) + return def quit(self): self._quit = True diff --git a/pyspider/run.py b/pyspider/run.py index 7e3333c5f..26fb73056 100755 --- a/pyspider/run.py +++ b/pyspider/run.py @@ -8,12 +8,12 @@ import os import sys -import six import copy import time import shutil import logging import logging.config +import six import click import pyspider diff --git a/pyspider/scheduler/scheduler.py b/pyspider/scheduler/scheduler.py index 084baff28..e0d8470d9 100644 --- a/pyspider/scheduler/scheduler.py +++ b/pyspider/scheduler/scheduler.py @@ -23,7 +23,7 @@ logger = logging.getLogger('scheduler') -class Project(object): +class Project(): ''' project for scheduler ''' @@ -66,8 +66,7 @@ def paused(self): logger.error('process not in task, %r', task) if task['track']['process']['ok']: break - else: - fail_cnt += 1 + fail_cnt += 1 if fail_cnt >= self.scheduler.FAIL_PAUSE_NUM: break if fail_cnt >= self.scheduler.FAIL_PAUSE_NUM: @@ -90,8 +89,7 @@ def paused(self): # break with enough check cnt cnt = max(cnt, self.scheduler.UNPAUSE_CHECK_NUM) break - else: - fail_cnt += 1 + fail_cnt += 1 if cnt >= self.scheduler.UNPAUSE_CHECK_NUM: if fail_cnt == cnt: self._paused = True @@ -137,7 +135,7 @@ def active(self): return self.db_status in ('RUNNING', 'DEBUG') -class Scheduler(object): +class Scheduler(): UPDATE_PROJECT_INTERVAL = 5 * 60 default_schedule = { 'priority': 0, @@ -180,7 +178,7 @@ def __init__(self, taskdb, projectdb, newtask_queue, status_queue, self._send_buffer = deque() self._quit = False self._exceptions = 0 - self.projects = dict() + self.projects = {} self._force_update_project = False self._last_update_project = 0 self._last_tick = int(time.time()) @@ -361,7 +359,7 @@ def _check_task_done(self): '%s on_get_info %r', task['project'], task['track'].get('save', {}) ) continue - elif not self.task_verify(task): + if not self.task_verify(task): continue self.on_task_status(task) cnt += 1 @@ -476,11 +474,11 @@ def _check_select(self): taskids = [] cnt = 0 - cnt_dict = dict() + cnt_dict = {} limit = self.LOOP_LIMIT # dynamic assign select limit for each project, use qsize as weight - project_weights, total_weight = dict(), 0 + project_weights, total_weight = {}, 0 for project in 
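On the `imp` removal in `project_module.py`: `importlib` exposes no public `new_module` function, so the drop-in replacement for `imp.new_module(fullname)` is `types.ModuleType`, as the hunks above use (this assumes the file's Python 3 branch already imports `importlib.abc` for `ProjectsLoader`, as the untouched context lines suggest). `importlib.util.module_from_spec` is the heavier alternative when a loader spec is available. A minimal sketch with a hypothetical module name:

```python
import types

# modern equivalent of the removed imp.new_module(fullname)
mod = types.ModuleType('fake_project')      # hypothetical project name
mod.__file__ = '<fake_project>'

# compile and execute source inside the fresh module namespace
exec(compile('VALUE = 42', mod.__file__, 'exec'), mod.__dict__)
assert mod.VALUE == 42
```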
itervalues(self.projects): # type:Project if not project.active: continue @@ -714,7 +712,7 @@ def xmlrpc_run(self, port=23333, bind='127.0.0.1', logRequests=False): def dump_counter(_time, _type): try: return self._cnt[_time].to_dict(_type) - except: + except Exception: logger.exception('') application.register_function(dump_counter, 'counter') @@ -819,8 +817,7 @@ def on_request(self, task): fields=self.merge_task_fields) if oldtask: return self.on_old_request(task, oldtask) - else: - return self.on_new_request(task) + return self.on_new_request(task) def on_new_request(self, task): '''Called when a new request is arrived''' @@ -971,21 +968,20 @@ def on_task_failed(self, task): self._cnt['all'].event((project, 'failed'), +1).event((project, 'pending'), -1) logger.info('task failed %(project)s:%(taskid)s %(url)s' % task) return task - else: - task['schedule']['retried'] = retried + 1 - task['schedule']['exetime'] = time.time() + next_exetime - task['lastcrawltime'] = time.time() - self.update_task(task) - self.put_task(task) + task['schedule']['retried'] = retried + 1 + task['schedule']['exetime'] = time.time() + next_exetime + task['lastcrawltime'] = time.time() + self.update_task(task) + self.put_task(task) - project = task['project'] - self._cnt['5m'].event((project, 'retry'), +1) - self._cnt['1h'].event((project, 'retry'), +1) - self._cnt['1d'].event((project, 'retry'), +1) - # self._cnt['all'].event((project, 'retry'), +1) - logger.info('task retry %d/%d %%(project)s:%%(taskid)s %%(url)s' % ( - retried, retries), task) - return task + project = task['project'] + self._cnt['5m'].event((project, 'retry'), +1) + self._cnt['1h'].event((project, 'retry'), +1) + self._cnt['1d'].event((project, 'retry'), +1) + # self._cnt['all'].event((project, 'retry'), +1) + logger.info('task retry %d/%d %%(project)s:%%(taskid)s %%(url)s' % ( + retried, retries), task) + return task def on_select_task(self, task): '''Called when a task is selected to fetch & process''' @@ -1024,7 +1020,7 @@ def _check_select(self): interactive mode of select tasks """ if not self.interactive: - return super(OneScheduler, self)._check_select() + return super()._check_select() # waiting for running tasks if self.running_task > 0: @@ -1112,7 +1108,7 @@ def __getattr__(self, name): def on_task_status(self, task): """Ignore not processing error in interactive mode""" if not self.interactive: - super(OneScheduler, self).on_task_status(task) + super().on_task_status(task) try: procesok = task['track']['process']['ok'] @@ -1187,7 +1183,7 @@ class ThreadBaseScheduler(Scheduler): def __init__(self, threads=4, *args, **kwargs): self.local = threading.local() - super(ThreadBaseScheduler, self).__init__(*args, **kwargs) + super().__init__(*args, **kwargs) if isinstance(self.taskdb, SQLiteMixin): self.threads = 1 @@ -1263,8 +1259,7 @@ def _run_in_thread(self, method, *args, **kwargs): if block: time.sleep(0.1) continue - else: - queue = self.thread_queues[random.randint(0, len(self.thread_queues)-1)] + queue = self.thread_queues[random.randint(0, len(self.thread_queues)-1)] break else: queue = self.thread_queues[i % len(self.thread_queues)] @@ -1296,5 +1291,5 @@ def _load_put_task(self, project, taskid): self._run_in_thread(Scheduler._load_put_task, self, project, taskid, _i=i) def run_once(self): - super(ThreadBaseScheduler, self).run_once() + super().run_once() self._wait_thread() diff --git a/pyspider/scheduler/task_queue.py b/pyspider/scheduler/task_queue.py index a6d02e3a5..fd68e6ef3 100644 --- a/pyspider/scheduler/task_queue.py +++ 
b/pyspider/scheduler/task_queue.py @@ -13,9 +13,10 @@ try: from UserDict import DictMixin except ImportError: - from collections import Mapping as DictMixin -from .token_bucket import Bucket + from collections.abc import Mapping as DictMixin from six.moves import queue as Queue +from .token_bucket import Bucket + logger = logging.getLogger('scheduler') @@ -25,7 +26,7 @@ cmp = lambda x, y: (x > y) - (x < y) -class AtomInt(object): +class AtomInt(): __value__ = 0 __mutex__ = threading.RLock() @@ -75,7 +76,7 @@ class PriorityTaskQueue(Queue.Queue): def _init(self, maxsize): self.queue = [] - self.queue_dict = dict() + self.queue_dict = {} def _qsize(self, len=len): return len(self.queue_dict) @@ -128,7 +129,7 @@ def __delitem__(self, taskid): self.queue_dict.pop(taskid).taskid = None -class TaskQueue(object): +class TaskQueue(): ''' task queue for scheduler, have a priority queue and a time queue for delayed tasks ''' @@ -190,16 +191,16 @@ def _check_processing(self): def put(self, taskid, priority=0, exetime=0): """ Put a task into task queue - + when use heap sort, if we put tasks(with the same priority and exetime=0) into queue, the queue is not a strict FIFO queue, but more like a FILO stack. - It is very possible that when there are continuous big flow, the speed of select is + It is very possible that when there are continuous big flow, the speed of select is slower than request, resulting in priority-queue accumulation in short time. - In this scenario, the tasks more earlier entering the priority-queue will not get - processed until the request flow becomes small. - - Thus, we store a global atom self increasing value into task.sequence which represent - the task enqueue sequence. When the comparison of exetime and priority have no + In this scenario, the tasks more earlier entering the priority-queue will not get + processed until the request flow becomes small. + + Thus, we store a global atom self increasing value into task.sequence which represent + the task enqueue sequence. When the comparison of exetime and priority have no difference, we compare task.sequence to ensure that the entire queue is ordered. 
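The `from collections import Mapping` → `from collections.abc import Mapping` fixes in `task_queue.py` and `counter.py` matter because the ABC aliases in `collections` were deprecated from Python 3.3 and removed in 3.10; only `collections.abc` works on current interpreters. The try/except fallback keeps the Python 2 path alive, and on Python 3 it reduces to the second import:

```python
try:                                    # Python 2: DictMixin lived in UserDict
    from UserDict import DictMixin
except ImportError:                     # Python 3: ABCs live in collections.abc
    from collections.abc import Mapping as DictMixin

# The ABC is usable for isinstance checks; dict is registered as a Mapping.
assert isinstance({}, DictMixin)
```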
""" now = time.time() diff --git a/pyspider/scheduler/token_bucket.py b/pyspider/scheduler/token_bucket.py index e7bb1b308..9149af0d4 100644 --- a/pyspider/scheduler/token_bucket.py +++ b/pyspider/scheduler/token_bucket.py @@ -12,7 +12,7 @@ import dummy_threading as _threading -class Bucket(object): +class Bucket(): ''' traffic flow control with token bucket diff --git a/pyspider/webui/app.py b/pyspider/webui/app.py index 2261fd6e6..b70ed0a50 100644 --- a/pyspider/webui/app.py +++ b/pyspider/webui/app.py @@ -8,7 +8,6 @@ import os import sys import logging -logger = logging.getLogger("webui") from six import reraise from six.moves import builtins @@ -16,6 +15,8 @@ from flask import Flask from pyspider.fetcher import tornado_fetcher +logger = logging.getLogger("webui") + if os.name == 'nt': import mimetypes mimetypes.add_type("text/css", ".css", True) @@ -96,7 +97,7 @@ def quit(self): 'taskdb': None, 'projectdb': None, 'scheduler_rpc': None, - 'queues': dict(), + 'queues': {}, 'process_time_limit': 30, }) @@ -108,10 +109,9 @@ def cdn_url_handler(error, endpoint, kwargs): # cdn = app.config.get('cdn', '//cdnjs.cloudflare.com/ajax/libs/') cdn = app.config.get('cdn', '//cdnjscn.b0.upaiyun.com/libs/') return urljoin(cdn, path) + exc_type, exc_value, tb = sys.exc_info() + if exc_value is error: + reraise(exc_type, exc_value, tb) else: - exc_type, exc_value, tb = sys.exc_info() - if exc_value is error: - reraise(exc_type, exc_value, tb) - else: - raise error + raise error app.handle_url_build_error = cdn_url_handler diff --git a/pyspider/webui/debug.py b/pyspider/webui/debug.py index 6a0694139..8308c5353 100644 --- a/pyspider/webui/debug.py +++ b/pyspider/webui/debug.py @@ -74,7 +74,7 @@ def run(project): except Exception: result = { 'fetch_result': "", - 'logs': u'task json error', + 'logs': 'task json error', 'follows': [], 'messages': [], 'result': None, @@ -95,7 +95,7 @@ def run(project): if not info: result = { 'fetch_result': "", - 'logs': u' in wevdav mode, cannot load script', + 'logs': ' in wevdav mode, cannot load script', 'follows': [], 'messages': [], 'result': None, @@ -143,7 +143,7 @@ def run(project): 'time': time.time() - start_time, } result['fetch_result']['content'] = response.text - if (response.headers.get('content-type', '').startswith('image')): + if response.headers.get('content-type', '').startswith('image'): result['fetch_result']['dataurl'] = dataurl.encode( response.content, response.headers['content-type']) diff --git a/pyspider/webui/index.py b/pyspider/webui/index.py index 381131d09..d62228a51 100644 --- a/pyspider/webui/index.py +++ b/pyspider/webui/index.py @@ -7,14 +7,12 @@ import socket -from six import iteritems, itervalues -from flask import render_template, request, json - +from six import iteritems try: import flask_login as login except ImportError: from flask.ext import login - +from flask import render_template, request, json from .app import app index_fields = ['name', 'group', 'status', 'comments', 'rate', 'burst', 'updatetime'] @@ -86,9 +84,8 @@ def project_update(): app.logger.warning('connect to scheduler rpc error: %r', e) return 'rpc error', 200 return 'ok', 200 - else: - app.logger.warning("[webui index] projectdb.update() error - res: {}".format(ret)) - return 'update error', 500 + app.logger.warning("[webui index] projectdb.update() error - res: {}".format(ret)) + return 'update error', 500 @app.route('/counter') diff --git a/pyspider/webui/login.py b/pyspider/webui/login.py index d32d5b73a..6e1eed55d 100644 --- a/pyspider/webui/login.py +++ 
b/pyspider/webui/login.py @@ -6,11 +6,11 @@ # Created on 2014-12-10 20:36:27 import base64 -from flask import Response try: import flask_login as login except ImportError: from flask.ext import login +from flask import Response from .app import app login_manager = login.LoginManager() diff --git a/pyspider/webui/result.py b/pyspider/webui/result.py index 84305bb31..44e0fc2aa 100644 --- a/pyspider/webui/result.py +++ b/pyspider/webui/result.py @@ -9,9 +9,10 @@ from flask import render_template, request, json from flask import Response -from .app import app from pyspider.libs import result_dump +from .app import app + @app.route('/results') def result(): @@ -46,9 +47,9 @@ def dump_result(project, _format): valid = request.args.get('style', 'rows') == 'full' return Response(result_dump.dump_as_json(results, valid), mimetype='application/json') - elif _format == 'txt': + if _format == 'txt': return Response(result_dump.dump_as_txt(results), mimetype='text/plain') - elif _format == 'csv': + if _format == 'csv': return Response(result_dump.dump_as_csv(results), mimetype='text/csv') diff --git a/pyspider/webui/webdav.py b/pyspider/webui/webdav.py index 5483dbf19..41aa04bb0 100644 --- a/pyspider/webui/webdav.py +++ b/pyspider/webui/webdav.py @@ -32,8 +32,7 @@ def check_user(environ): if username == app.config['webui_username'] \ and password == app.config['webui_password']: return True - else: - return False + return False class ContentIO(BytesIO): @@ -44,7 +43,7 @@ def close(self): class ScriptResource(DAVNonCollection): def __init__(self, path, environ, app, project=None): - super(ScriptResource, self).__init__(path, environ) + super().__init__(path, environ) self.app = app self.new_project = False @@ -102,14 +101,14 @@ def getContent(self): def beginWrite(self, contentType=None): if self.readonly: self.app.logger.error('webdav.beginWrite readonly') - return super(ScriptResource, self).beginWrite(contentType) + return super().beginWrite(contentType) self.writebuffer = ContentIO() return self.writebuffer def endWrite(self, withErrors): if withErrors: self.app.logger.error('webdav.endWrite error: %r', withErrors) - return super(ScriptResource, self).endWrite(withErrors) + return super().endWrite(withErrors) if not self.writebuffer: return projectdb = self.app.config['projectdb'] @@ -126,13 +125,12 @@ def endWrite(self, withErrors): self.project.update(info) self.new_project = False return projectdb.insert(self.project_name, self.project) - else: - return projectdb.update(self.project_name, info) + return projectdb.update(self.project_name, info) class RootCollection(DAVCollection): def __init__(self, path, environ, app): - super(RootCollection, self).__init__(path, environ) + super().__init__(path, environ) self.app = app self.projectdb = self.app.config['projectdb'] @@ -164,7 +162,7 @@ def getMemberNames(self): class ScriptProvider(DAVProvider): def __init__(self, app): - super(ScriptProvider, self).__init__() + super().__init__() self.app = app def __repr__(self): @@ -175,11 +173,10 @@ def getResourceInst(self, path, environ): if path in ('/', '.', ''): path = '/' return RootCollection(path, environ, self.app) - else: - return ScriptResource(path, environ, self.app) + return ScriptResource(path, environ, self.app) -class NeedAuthController(object): +class NeedAuthController(): def __init__(self, app): self.app = app diff --git a/setup.py b/setup.py index 2512f4708..e205f5ca9 100644 --- a/setup.py +++ b/setup.py @@ -6,16 +6,17 @@ # Created on 2014-11-24 22:27:45 -import sys -from setuptools 
import setup, find_packages from codecs import open from os import path +from setuptools import setup, find_packages + +import pyspider here = path.abspath(path.dirname(__file__)) with open(path.join(here, 'README.md'), encoding='utf-8') as f: long_description = f.read() -import pyspider + install_requires = [ 'Flask==0.10', diff --git a/tests/data_handler.py b/tests/data_handler.py index 3f77235c7..1f3fed04f 100644 --- a/tests/data_handler.py +++ b/tests/data_handler.py @@ -9,7 +9,7 @@ import time from pyspider.libs.base_handler import BaseHandler, catch_status_code_error, every -class IgnoreHandler(object): +class IgnoreHandler(): pass class TestHandler(BaseHandler): @@ -42,7 +42,7 @@ def raise_exception(self): raise Exception('exception') def add_task(self, response): - self.crawl('http://www.google.com', callback='echo', params={'wd': u'中文'}) + self.crawl('http://www.google.com', callback='echo', params={'wd': '中文'}) self.send_message('some_project', {'some': 'message'}) @every @@ -59,4 +59,3 @@ def generator(self, response): def sleep(self, response): time.sleep(response.save) - diff --git a/tests/test_base_handler.py b/tests/test_base_handler.py index 317e12a60..65fd06301 100644 --- a/tests/test_base_handler.py +++ b/tests/test_base_handler.py @@ -43,7 +43,7 @@ def test_task_join_crawl_config(self): 'c': 'd', # should add header c } } - + ret = BaseHandler.task_join_crawl_config(task, crawl_config) self.assertDictEqual(ret, { 'taskid': 'taskid', @@ -67,4 +67,4 @@ def test_task_join_crawl_config(self): 'callback': 'callback', 'save': [1, 2, 3], }, - }); + }) diff --git a/tests/test_bench.py b/tests/test_bench.py index 9b584700f..08299691c 100644 --- a/tests/test_bench.py +++ b/tests/test_bench.py @@ -6,9 +6,6 @@ # Created on 2014-12-10 01:34:09 import os -import sys -import time -import click import shutil import inspect import unittest diff --git a/tests/test_counter.py b/tests/test_counter.py index 03ceb4203..4d0f697e2 100644 --- a/tests/test_counter.py +++ b/tests/test_counter.py @@ -5,7 +5,6 @@ # http://binux.me # Created on 2015-04-05 00:05:58 -import sys import time import unittest @@ -44,7 +43,7 @@ def test_020_delete(self): c.event(('a', 'b'), 1) c.event(('a', 'c'), 1) c.event(('b', 'c'), 1) - + self.assertIsNotNone(c['a']) self.assertIsNotNone(c['b']) diff --git a/tests/test_database.py b/tests/test_database.py index f9d563a3b..3504b9fa7 100644 --- a/tests/test_database.py +++ b/tests/test_database.py @@ -8,7 +8,6 @@ from __future__ import unicode_literals, division import os -import six import time import unittest @@ -16,7 +15,7 @@ from pyspider.database.base.taskdb import TaskDB -class TaskDBCase(object): +class TaskDBCase(): sample_task = { 'taskid': 'taskid', 'project': 'project', @@ -58,7 +57,7 @@ class TaskDBCase(object): 'time': 10, 'follows': 3, 'outputs': 5, - 'exception': u"中文", + 'exception': "中文", }, }, 'lastcrawltime': time.time(), @@ -66,7 +65,7 @@ class TaskDBCase(object): } @classmethod - def setUpClass(self): + def setUpClass(cls): raise NotImplementedError # this test not works for mongodb @@ -155,7 +154,7 @@ def test_z20_update_projects(self): self.taskdb.UPDATE_PROJECTS_TIME = saved -class ProjectDBCase(object): +class ProjectDBCase(): sample_project = { 'name': 'name', 'script': 'import time\nprint(time.time(), "!@#$%^&*()\';:<>?/|")', @@ -170,7 +169,7 @@ def setUpClass(self): def test_10_insert(self): self.projectdb.insert('abc', self.sample_project) - self.projectdb.insert(u'name中文', self.sample_project) + self.projectdb.insert('name中文', 
self.sample_project) project = self.projectdb.get('abc') self.assertIsNotNone(project) @@ -183,7 +182,7 @@ def test_20_get_all(self): for key in ('name', 'group', 'status', 'script', 'comments', 'rate', 'burst', 'updatetime'): self.assertIn(key, project) - self.assertEqual(project['name'], u'abc') + self.assertEqual(project['name'], 'abc') self.assertEqual(project['status'], self.sample_project['status']) self.assertEqual(project['script'], self.sample_project['script']) self.assertEqual(project['rate'], self.sample_project['rate']) @@ -232,20 +231,20 @@ def test_50_get(self): self.assertEqual(project['name'], 'abc') self.assertEqual(project['status'], 'RUNNING') - project = self.projectdb.get(u'name中文', ['group', 'status', 'name']) - self.assertEqual(project['name'], u'name中文') + project = self.projectdb.get('name中文', ['group', 'status', 'name']) + self.assertEqual(project['name'], 'name中文') self.assertIn('status', project) self.assertNotIn('gourp', project) def test_z10_drop(self): - self.projectdb.insert(u'drop_project2', self.sample_project) - self.projectdb.insert(u'drop_project3', self.sample_project) + self.projectdb.insert('drop_project2', self.sample_project) + self.projectdb.insert('drop_project3', self.sample_project) self.projectdb.drop('drop_project3') self.assertIsNotNone(self.projectdb.get('drop_project2')) self.assertIsNone(self.projectdb.get('drop_project3')) -class ResultDBCase(object): +class ResultDBCase(): @classmethod def setUpClass(self): diff --git a/tests/test_fetcher.py b/tests/test_fetcher.py index 02ace999c..2d2ba6733 100644 --- a/tests/test_fetcher.py +++ b/tests/test_fetcher.py @@ -10,22 +10,25 @@ import copy import time import socket -import umsgpack import subprocess import unittest import logging import logging.config +import umsgpack + +from pyspider.libs import utils +from pyspider.libs.multiprocessing_queue import Queue +from pyspider.libs.response import rebuild_response +from pyspider.fetcher.tornado_fetcher import Fetcher + logging.config.fileConfig("pyspider/logging.conf") try: from six.moves import xmlrpc_client except ImportError: import xmlrpclib as xmlrpc_client -from pyspider.libs import utils -from pyspider.libs.multiprocessing_queue import Queue -from pyspider.libs.response import rebuild_response -from pyspider.fetcher.tornado_fetcher import Fetcher + class TestFetcher(unittest.TestCase): @@ -173,7 +176,7 @@ def test_50_base64_data(self): self.assertEqual(response.status_code, 200, response.error) self.assertIsNotNone(response.json, response.content) - self.assertIn(u'中文', response.json['form'], response.json) + self.assertIn('中文', response.json['form'], response.json) def test_55_base64_data(self): request = copy.deepcopy(self.sample_task_http) @@ -459,7 +462,7 @@ def setUpClass(self): '--password=123456', '--port=14830', '--debug'], close_fds=True) self.proxy = socket.gethostbyname(socket.gethostname()) + ':14830' - + @classmethod def tearDownClass(self): self.rpc("close")() diff --git a/tests/test_fetcher_processor.py b/tests/test_fetcher_processor.py index 44cf2c1d3..5956c1bb5 100644 --- a/tests/test_fetcher_processor.py +++ b/tests/test_fetcher_processor.py @@ -7,15 +7,16 @@ import os import time -import httpbin import subprocess import unittest +import httpbin + +from six.moves.queue import Queue from pyspider.database.local.projectdb import ProjectDB from pyspider.fetcher import Fetcher from pyspider.processor import Processor from pyspider.libs import utils, dataurl -from six.moves.queue import Queue from 
tests.data_fetcher_processor_handler import Handler @@ -154,22 +155,22 @@ def test_40_method(self): def test_50_params(self): status, newtasks, result = self.crawl(self.httpbin + '/get', params={ 'roy': 'binux', - u'中文': '.', + '中文': '.', }, callback=self.json) self.assertStatusOk(status) self.assertFalse(newtasks) - self.assertEqual(result['args'], {'roy': 'binux', u'中文': '.'}) + self.assertEqual(result['args'], {'roy': 'binux', '中文': '.'}) def test_60_data(self): status, newtasks, result = self.crawl(self.httpbin + '/post', data={ 'roy': 'binux', - u'中文': '.', + '中文': '.', }, callback=self.json) self.assertStatusOk(status) self.assertFalse(newtasks) - self.assertEqual(result['form'], {'roy': 'binux', u'中文': '.'}) + self.assertEqual(result['form'], {'roy': 'binux', '中文': '.'}) def test_70_redirect(self): status, newtasks, result = self.crawl(self.httpbin + '/redirect-to?url=/get', callback=self.json) @@ -189,7 +190,7 @@ def test_80_redirect_too_many(self): def test_90_files(self): status, newtasks, result = self.crawl(self.httpbin + '/put', method='PUT', - files={os.path.basename(__file__): open(__file__).read()}, + files={os.path.basename(__file__): open(__file__, encoding='utf-8').read()}, callback=self.json) self.assertStatusOk(status) @@ -198,7 +199,7 @@ def test_90_files(self): def test_a100_files_with_data(self): status, newtasks, result = self.crawl(self.httpbin + '/put', method='PUT', - files={os.path.basename(__file__): open(__file__).read()}, + files={os.path.basename(__file__): open(__file__, encoding='utf-8').read()}, data={ 'roy': 'binux', # '中文': '.', # FIXME: not work @@ -293,11 +294,11 @@ def test_a170_last_modified(self): def test_a180_save(self): status, newtasks, result = self.crawl(callback=self.get_save, - save={'roy': 'binux', u'中文': 'value'}) + save={'roy': 'binux', '中文': 'value'}) self.assertStatusOk(status) self.assertFalse(newtasks) - self.assertEqual(result, {'roy': 'binux', u'中文': 'value'}) + self.assertEqual(result, {'roy': 'binux', '中文': 'value'}) def test_a190_taskid(self): status, newtasks, result = self.crawl(callback=self.get_save, @@ -434,7 +435,7 @@ def test_zzz_etag_not_working(self): self.assertTrue(result) def test_zzz_unexpected_crawl_argument(self): - with self.assertRaisesRegexp(TypeError, "unexpected keyword argument"): + with self.assertRaisesRegex(TypeError, "unexpected keyword argument"): self.crawl(self.httpbin + '/cache', cookie={}, callback=self.json) def test_zzz_curl_get(self): @@ -465,18 +466,18 @@ def test_zzz_curl_put(self): self.assertIn('fileUpload1', result['files'], result) def test_zzz_curl_no_url(self): - with self.assertRaisesRegexp(TypeError, 'no URL'): + with self.assertRaisesRegex(TypeError, 'no URL'): status, newtasks, result = self.crawl( '''curl -X PUT -H 'Origin: chrome-extension://hgmloofddffdnphfgcellkdfbfbjeloo' --compressed''', callback=self.json) def test_zzz_curl_bad_option(self): - with self.assertRaisesRegexp(TypeError, 'Unknow curl option'): + with self.assertRaisesRegex(TypeError, 'Unknow curl option'): status, newtasks, result = self.crawl( '''curl '%s/put' -X PUT -H 'Origin: chrome-extension://hgmloofddffdnphfgcellkdfbfbjeloo' -v''' % self.httpbin, callback=self.json) - with self.assertRaisesRegexp(TypeError, 'Unknow curl option'): + with self.assertRaisesRegex(TypeError, 'Unknow curl option'): status, newtasks, result = self.crawl( '''curl '%s/put' -X PUT -v -H 'Origin: chrome-extension://hgmloofddffdnphfgcellkdfbfbjeloo' ''' % self.httpbin, callback=self.json) @@ -490,4 +491,4 @@ def 
test_zzz_connect_timeout(self): start_time = time.time() status, newtasks, result = self.crawl('http://240.0.0.1/', connect_timeout=5, callback=self.catch_http_error) end_time = time.time() - self.assertTrue(5 <= end_time - start_time <= 6) \ No newline at end of file + self.assertTrue(5 <= end_time - start_time <= 6) diff --git a/tests/test_message_queue.py b/tests/test_message_queue.py index d5e19559b..03bd8cfc1 100644 --- a/tests/test_message_queue.py +++ b/tests/test_message_queue.py @@ -6,15 +6,15 @@ # Created on 2014-10-07 10:33:38 import os -import six import time import unittest +import six -from pyspider.libs import utils from six.moves import queue as Queue +from pyspider.libs import utils -class TestMessageQueue(object): +class TestMessageQueue(): @classmethod def setUpClass(self): @@ -41,9 +41,9 @@ def test_30_full(self): self.assertEqual(self.q1.qsize(), 0) self.assertEqual(self.q2.qsize(), 0) for i in range(2): - self.q1.put_nowait('TEST_DATA%d' % i) + self.q1.put_nowait(f'TEST_DATA{i}') for i in range(3): - self.q2.put('TEST_DATA%d' % i) + self.q2.put(f'TEST_DATA{i}') with self.assertRaises(Queue.Full): self.q1.put('TEST_DATA6', timeout=0.01) @@ -101,9 +101,9 @@ def test_30_full(self): self.assertEqual(self.q1.qsize(), 0) self.assertEqual(self.q2.qsize(), 0) for i in range(2): - self.q1.put_nowait('TEST_DATA%d' % i) + self.q1.put_nowait(f'TEST_DATA{i}') for i in range(3): - self.q2.put('TEST_DATA%d' % i) + self.q2.put(f'TEST_DATA{i}') print(self.q1.__dict__) print(self.q1.qsize()) diff --git a/tests/test_processor.py b/tests/test_processor.py index 1a07960cb..db4364429 100644 --- a/tests/test_processor.py +++ b/tests/test_processor.py @@ -6,16 +6,15 @@ # Created on 2014-02-22 14:00:05 import os -import six -import copy import time import unittest import logging.config -logging.config.fileConfig("pyspider/logging.conf") from pyspider.libs import utils from pyspider.processor.project_module import ProjectManager +logging.config.fileConfig("pyspider/logging.conf") + class TestProjectModule(unittest.TestCase): @@ -65,7 +64,7 @@ def fetch_result(self): def setUp(self): self.project = "test.project" - self.script = open(os.path.join(os.path.dirname(__file__), 'data_handler.py')).read() + self.script = open(os.path.join(os.path.dirname(__file__), 'data_handler.py'), encoding='utf-8').read() self.env = { 'test': True, } diff --git a/tests/test_response.py b/tests/test_response.py index 4b9bbf094..cd47b6c4f 100644 --- a/tests/test_response.py +++ b/tests/test_response.py @@ -6,20 +6,23 @@ # Created on 2015-01-18 11:10:27 -import os import copy import time -import httpbin import unittest import logging import logging.config -logging.config.fileConfig("pyspider/logging.conf") + +import httpbin from pyspider.libs import utils from pyspider.libs.response import rebuild_response from pyspider.fetcher.tornado_fetcher import Fetcher +logging.config.fileConfig("pyspider/logging.conf") + + + class TestResponse(unittest.TestCase): sample_task_http = { 'taskid': 'taskid', diff --git a/tests/test_result_dump.py b/tests/test_result_dump.py index 0d6e933e7..00ff9defc 100644 --- a/tests/test_result_dump.py +++ b/tests/test_result_dump.py @@ -7,7 +7,6 @@ from __future__ import unicode_literals, division -import six import csv import time import json @@ -25,7 +24,7 @@ results2 = results1 + [ {'taskid': 'taskid1', 'url': 'http://example.org/url1', 'pdatetime': time.time(), - 'result': [1, 2, '中文', u'中文'] }, + 'result': [1, 2, '中文', '中文'] }, ] results_error = results2 + [ diff --git 
a/tests/test_result_worker.py b/tests/test_result_worker.py index 9933cfed8..e80670080 100644 --- a/tests/test_result_worker.py +++ b/tests/test_result_worker.py @@ -9,7 +9,6 @@ import time import unittest import logging.config -logging.config.fileConfig("pyspider/logging.conf") import shutil from pyspider.database.sqlite import resultdb @@ -17,6 +16,9 @@ from pyspider.libs.multiprocessing_queue import Queue from pyspider.libs.utils import run_in_thread +logging.config.fileConfig("pyspider/logging.conf") + + class TestProcessor(unittest.TestCase): resultdb_path = './data/tests/result.db' diff --git a/tests/test_run.py b/tests/test_run.py index 490844ee4..81075a9e1 100644 --- a/tests/test_run.py +++ b/tests/test_run.py @@ -8,15 +8,14 @@ from __future__ import print_function import os -import sys -import six import time import json import signal import shutil import inspect -import requests import unittest +import requests + from pyspider import run from pyspider.libs import utils @@ -29,7 +28,6 @@ def setUpClass(self): shutil.rmtree('./data/tests', ignore_errors=True) os.makedirs('./data/tests') - import tests.data_test_webpage import httpbin self.httpbin_thread = utils.run_in_subprocess(httpbin.app.run, port=14887, passthrough_errors=False) self.httpbin = 'http://127.0.0.1:14887' @@ -59,7 +57,7 @@ def test_10_cli(self): self.assertEqual(len(ctx.obj.instances), 0) def test_20_cli_config(self): - with open('./data/tests/config.json', 'w') as fp: + with open('./data/tests/config.json', 'w', encoding='utf-8') as fp: json.dump({ 'debug': True, 'taskdb': 'mysql+taskdb://localhost:23456/taskdb', @@ -263,7 +261,7 @@ def test_a100_all(self): rv = requests.get('http://localhost:5000/results?project=data_sample_handler') self.assertIn('url', rv.text) self.assertIn('class=url', rv.text) - except: + except Exception: raise finally: time.sleep(1) @@ -312,18 +310,18 @@ def wait_text(timeout=1): text = wait_text() self.assertIn('task done data_sample_handler:on_start', text) - os.write(fd, utils.utf8('crawl("%s/pyspider/test.html")\n' % self.httpbin)) + os.write(fd, utils.utf8(f'crawl("{self.httpbin}/pyspider/test.html")\n')) text = wait_text() self.assertIn('/robots.txt', text) - os.write(fd, utils.utf8('crawl("%s/links/10/0")\n' % self.httpbin)) + os.write(fd, utils.utf8(f'crawl("{self.httpbin}/links/10/0")\n')) text = wait_text() if '"title": "Links"' not in text: - os.write(fd, utils.utf8('crawl("%s/links/10/1")\n' % self.httpbin)) + os.write(fd, utils.utf8(f'crawl("{self.httpbin}/links/10/1")\n')) text = wait_text() self.assertIn('"title": "Links"', text) - os.write(fd, utils.utf8('crawl("%s/404")\n' % self.httpbin)) + os.write(fd, utils.utf8(f'crawl("{self.httpbin}/404")\n')) text = wait_text() self.assertIn('task retry', text) @@ -379,4 +377,3 @@ def test_10_send_message(self): if task['url'] == 'data:,on_message': break self.assertEqual(task['process']['callback'], '_on_message') - diff --git a/tests/test_scheduler.py b/tests/test_scheduler.py index 66ac000eb..41a86ff87 100644 --- a/tests/test_scheduler.py +++ b/tests/test_scheduler.py @@ -11,11 +11,13 @@ import unittest import logging import logging.config -logging.config.fileConfig("pyspider/logging.conf") from pyspider.scheduler.task_queue import TaskQueue from pyspider.libs import utils +logging.config.fileConfig("pyspider/logging.conf") + + class TestTaskQueue(unittest.TestCase): @@ -129,7 +131,7 @@ def get_resultdb(): self.newtask_queue = Queue(10) self.status_queue = Queue(10) self.scheduler2fetcher = Queue(10) - self.rpc = 
xmlrpc_client.ServerProxy('http://localhost:%d' % self.scheduler_xmlrpc_port) + self.rpc = xmlrpc_client.ServerProxy(f'http://localhost:{self.scheduler_xmlrpc_port}') def run_scheduler(): scheduler = Scheduler(taskdb=get_taskdb(), projectdb=get_projectdb(), @@ -329,7 +331,7 @@ def test_60_taskdone_failed_retry(self): }, } }) # task retry 0/3 test_project:taskid url - from six.moves import queue as Queue + #from six.moves import queue as Queue # with self.assertRaises(Queue.Empty): # task = self.scheduler2fetcher.get(timeout=4) task = self.scheduler2fetcher.get(timeout=5) # select test_project:taskid url @@ -702,7 +704,7 @@ def test_x10_inqueue_limit(self): pre_size = self.rpc.size() for i in range(20): self.newtask_queue.put({ - 'taskid': 'taskid%d' % i, + 'taskid': f'taskid{i}', 'project': 'test_inqueue_project', 'url': 'url', 'schedule': { diff --git a/tests/test_task_queue.py b/tests/test_task_queue.py index a84fc98e6..193314cbe 100644 --- a/tests/test_task_queue.py +++ b/tests/test_task_queue.py @@ -18,8 +18,8 @@ class TestTaskQueue(unittest.TestCase): def test_task_queue_in_time_order(self): tq = TaskQueue(rate=300, burst=1000) - queues = dict() - tasks = dict() + queues = {} + tasks = {} for i in range(0, 100): it = InQueueTask(str(i), priority=int(i // 10), exetime=0) @@ -81,8 +81,8 @@ def test_time_queue(self): self.assertEqual(tq.processing.qsize(), 0) self.assertEqual(tq.time_queue.qsize(), 0) - queues = dict() - tasks = dict() + queues = {} + tasks = {} for i in range(0, 20): priority = int(i // 10) it = InQueueTask(str(i), priority=priority, exetime=time.time() + (i + 1) * interval) diff --git a/tests/test_utils.py b/tests/test_utils.py index b64a3baad..e250e8779 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -5,7 +5,6 @@ # http://binux.me # Created on 2015-01-18 16:53:49 -import sys import time import unittest diff --git a/tests/test_webdav.py b/tests/test_webdav.py index ccb40a6e6..d8330a6a5 100644 --- a/tests/test_webdav.py +++ b/tests/test_webdav.py @@ -7,11 +7,9 @@ import os import sys -import six import time import shutil import inspect -import unittest from six import BytesIO from pyspider import run diff --git a/tests/test_webui.py b/tests/test_webui.py index 1e232cee8..bceae1bf2 100644 --- a/tests/test_webui.py +++ b/tests/test_webui.py @@ -329,7 +329,7 @@ def test_a22_active_tasks(self): self.assertIn('ok', task['track']['process']) self.assertIn('time', task['track']['process']) self.assertTrue(track) - + def test_a24_task(self): rv = self.app.get(self.task_url) diff --git a/tests/test_xmlrpc.py b/tests/test_xmlrpc.py index 736d94e8d..52d83173a 100644 --- a/tests/test_xmlrpc.py +++ b/tests/test_xmlrpc.py @@ -24,16 +24,16 @@ class TestXMLRPCServer(unittest.TestCase): @classmethod def setUpClass(self): from pyspider.libs import wsgi_xmlrpc - + def test_1(): return 'test_1' - - class Test2(object): + + class Test2(): def test_3(self, obj): return obj - + test = Test2() - + application = wsgi_xmlrpc.WSGIXMLRPCApplication() application.register_instance(Test2()) application.register_function(test_1) @@ -48,11 +48,11 @@ def test_3(self, obj): def tearDownClass(self): self.io_loop.add_callback(self.io_loop.stop) self.thread.join() - + def test_xmlrpc_server(self, uri='http://127.0.0.1:3423'): from six.moves.xmlrpc_client import ServerProxy - + client = ServerProxy(uri) - + assert client.test_1() == 'test_1' assert client.test_3({'asdf':4}) == {'asdf':4} diff --git a/tools/migrate.py b/tools/migrate.py index f092daa6b..567b69965 100755 --- a/tools/migrate.py 
+++ b/tools/migrate.py @@ -5,14 +5,16 @@ # http://binux.me # Created on 2015-09-30 23:22:46 -import click import logging +from multiprocessing.pool import ThreadPool as Pool +import click + from pyspider.database.base.projectdb import ProjectDB from pyspider.database.base.taskdb import TaskDB from pyspider.database.base.resultdb import ResultDB from pyspider.database import connect_database from pyspider.libs.utils import unicode_obj -from multiprocessing.pool import ThreadPool as Pool + logging.getLogger().setLevel(logging.INFO)
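
[Editor's note] tools/migrate.py is touched here only to reorder imports, but it is the one place the ThreadPool import matters, so a short sketch of the pattern it enables may be useful: copying records between two connect_database() URLs with one worker per project. This is a hedged illustration of the approach, not the tool's verbatim implementation; migrate_project, migrate, and their arguments are invented for the example, while connect_database, load_tasks, insert, and the status constants are the base TaskDB API shown earlier in this patch.

from multiprocessing.pool import ThreadPool as Pool

from pyspider.database import connect_database

def migrate_project(args):
    project, src_url, dst_url = args
    # each worker opens its own connections, e.g.
    #   src_url = 'sqlite+taskdb:///data/task.db'
    #   dst_url = 'mysql+taskdb://localhost:3306/taskdb'
    src = connect_database(src_url)
    dst = connect_database(dst_url)
    for status in (src.ACTIVE, src.SUCCESS, src.FAILED, src.BAD):
        for task in src.load_tasks(status, project=project):
            dst.insert(project, task['taskid'], task)

def migrate(src_url, dst_url, projects, pool_size=10):
    # projects migrate in parallel; tasks within a project stay sequential
    Pool(pool_size).map(migrate_project,
                        [(p, src_url, dst_url) for p in projects])

A thread pool (rather than a process pool) fits here because the work is I/O-bound database traffic, and passing plain tuples to map avoids any pickling concerns.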