-
Notifications
You must be signed in to change notification settings - Fork 106
/
index.py
119 lines (96 loc) · 3.79 KB
/
index.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
"""
DocSearch scraper main entry point
"""
import os
import json
import requests
from requests_iap import IAPAuth
from scrapy.crawler import CrawlerProcess
from .algolia_helper import AlgoliaHelper
from .config.config_loader import ConfigLoader
from .documentation_spider import DocumentationSpider
from .strategies.default_strategy import DefaultStrategy
from .custom_downloader_middleware import CustomDownloaderMiddleware
from .custom_dupefilter import CustomDupeFilter
from .config.browser_handler import BrowserHandler
from .strategies.algolia_settings import AlgoliaSettings
# Opt out of scrapy's optional boto (S3 download) feature when the installed
# scrapy exposes `optional_features`; when the import fails there is nothing
# to disable.
try:
    from scrapy import optional_features
except ImportError:
    pass
else:
    if 'boto' in optional_features:
        optional_features.remove('boto')

# Process exit status used by run_config when the crawl indexed no record.
EXIT_CODE_NO_RECORD = 3
def _build_request_headers():
    """Return the default request headers, plus auth headers taken from the environment.

    Starts from scrapy's documented default request headers. Then:

    * if both CF_ACCESS_CLIENT_ID and CF_ACCESS_CLIENT_SECRET are set, the
      two CF-Access-Client-* headers are added;
    * otherwise, if both IAP_AUTH_CLIENT_ID and IAP_AUTH_SERVICE_ACCOUNT_JSON
      are set, an Authorization header produced by IAPAuth is added.

    Raises whatever json.loads raises when IAP_AUTH_SERVICE_ACCOUNT_JSON is
    not valid JSON.
    """
    # Defaults for scrapy https://docs.scrapy.org/en/latest/topics/settings.html#default-request-headers
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en",
    }

    cf_client_id = os.getenv("CF_ACCESS_CLIENT_ID")
    cf_client_secret = os.getenv("CF_ACCESS_CLIENT_SECRET")
    iap_client_id = os.getenv("IAP_AUTH_CLIENT_ID")
    iap_service_account = os.getenv("IAP_AUTH_SERVICE_ACCOUNT_JSON")

    if cf_client_id and cf_client_secret:
        headers["CF-Access-Client-Id"] = cf_client_id
        headers["CF-Access-Client-Secret"] = cf_client_secret
    elif iap_client_id and iap_service_account:
        iap_auth = IAPAuth(
            client_id=iap_client_id,
            service_account_secret_dict=json.loads(iap_service_account),
        )
        # Apply the auth object to an empty request only to read back the
        # Authorization header it sets.
        headers["Authorization"] = iap_auth(requests.Request()).headers["Authorization"]

    return headers


def run_config(config):
    """Crawl the documentation described by `config` and publish the records.

    `config` is handed unchanged to ConfigLoader (the script entry point
    passes the value of the CONFIG environment variable). The function then
    builds the Algolia helper, runs a DocumentationSpider inside a scrapy
    CrawlerProcess, pushes any extra records, and commits the temporary
    index when at least one record was indexed.

    Raises:
        SystemExit: with code EXIT_CODE_NO_RECORD when
            DocumentationSpider.NB_INDEXED is still 0 after the crawl.
    """
    config = ConfigLoader(config)
    CustomDownloaderMiddleware.driver = config.driver
    # Reset the class-level counter before this crawl starts.
    DocumentationSpider.NB_INDEXED = 0

    strategy = DefaultStrategy(config)

    algolia_helper = AlgoliaHelper(
        config.app_id,
        config.api_key,
        config.index_name,
        config.index_name_tmp,
        AlgoliaSettings.get(config, strategy.levels),
        config.query_rules
    )

    # Scrapy receives dotted paths to the classes; the package prefix differs
    # depending on whether this module is executed as a script.
    root_module = 'src.' if __name__ == '__main__' else 'scraper.src.'
    downloader_middlewares_path = (
        root_module + 'custom_downloader_middleware.' + CustomDownloaderMiddleware.__name__
    )
    dupefilter_class_path = root_module + 'custom_dupefilter.' + CustomDupeFilter.__name__

    try:
        process = CrawlerProcess({
            'LOG_ENABLED': '1',
            'LOG_LEVEL': 'ERROR',
            'USER_AGENT': config.user_agent,
            # Need to be > 600 to be after the redirectMiddleware
            'DOWNLOADER_MIDDLEWARES': {downloader_middlewares_path: 900},
            'DUPEFILTER_USE_ANCHORS': config.use_anchors,
            # Use our custom dupefilter in order to be scheme agnostic regarding link provided
            'DUPEFILTER_CLASS': dupefilter_class_path,
            'DEFAULT_REQUEST_HEADERS': _build_request_headers(),
            'TELNETCONSOLE_ENABLED': False
        })

        process.crawl(
            DocumentationSpider,
            config=config,
            algolia_helper=algolia_helper,
            strategy=strategy
        )

        process.start()
        process.stop()
    finally:
        # Kill browser if needed -- also when the crawl raised, so the
        # browser is not left running after a failure.
        BrowserHandler.destroy(config.driver)

    if config.extra_records:
        algolia_helper.add_records(config.extra_records, "Extra records", False)

    print("")

    if DocumentationSpider.NB_INDEXED > 0:
        algolia_helper.commit_tmp_index()
        print('Nb hits: {}'.format(DocumentationSpider.NB_INDEXED))
        config.update_nb_hits_value(DocumentationSpider.NB_INDEXED)
    else:
        print('Crawling issue: nbHits 0 for ' + config.index_name)
        # Raise SystemExit directly: the `exit` builtin is injected by the
        # `site` module for interactive use and is not always available.
        raise SystemExit(EXIT_CODE_NO_RECORD)
    print("")
if __name__ == '__main__':
    # Script entry point: the CONFIG environment variable supplies the
    # argument handed to run_config; a missing variable raises KeyError.
    run_config(os.environ['CONFIG'])