Permalink
Browse files

Update proxy crawler: discover paginated proxy-list URLs from spys.ru dynamically instead of crawling a fixed range of five pages; pass the discovered URL path to crawl_spys_ru.

1 parent 8f75668 commit 00ad6608ee98313777d3a3e095f4b320da24b657 Jiang Bian committed Nov 22, 2014
Showing with 22 additions and 5 deletions.
  1. +22 −5 scripts/crawl_proxies.py
View
@@ -11,11 +11,11 @@
import argparse, pickle, os, json, sys, time
sys.path.append("..")
-def crawl_spys_ru(page):
+def crawl_spys_ru(p):
import requests, re, lxml.html, cStringIO
from lxml import etree
- url = 'http://spys.ru/en/http-proxy-list/%d/'%page
+ url = 'http://spys.ru%s'%p
# payload = {
# 'sto': 'View+150+per+page'
@@ -70,6 +70,8 @@ def crawl_spys_ru(page):
if cnt == 3:
hh = lxml.html.tostring(td)
country = re.findall(r'<font class="spy14">(.*?)<\/font>', hh)[0]
+ # if (country == 'China'):
+ # break
cnt += 1
@@ -91,9 +93,25 @@ def crawl_spys_ru(page):
parser.add_argument('-o', '--output', help="define the location of the output;", default="proxies.json")
args = parser.parse_args()
+ # get a list of valid urls...
+ import requests, re, lxml.html, cStringIO
+ from lxml import etree
+ url = 'http://spys.ru/en/http-proxy-list/'
+
+ r = requests.get(url)
+
+ html = r.text.encode('utf8')
+
+ # the port numbers are coded...
+ urls = re.findall(r'<a href=\'(/en/http-proxy-list/\d+/.*?)\'>', html)
+
+ urls = set(urls)
+
+ urls.add('/en/http-proxy-list/')
+
proxies = []
- for i in range(5):
- proxies.extend(crawl_spys_ru(i))
+ for url in urls:
+ proxies.extend(crawl_spys_ru(url))
# check if there is a proxies.json locally, merge the check results rather than overwrite it
if (os.path.exists(os.path.abspath(args.output))):
@@ -111,7 +129,6 @@ def crawl_spys_ru(page):
ips.append(ip)
proxy_list.append({ip: proxy_type})
-
proxies = [p['proxy'] for p in proxy_checker(proxy_list)]
logger.info("number of proxies that are still alive: %d"%len(proxies))

0 comments on commit 00ad660

Please sign in to comment.