From 8cf4d94ab1cb3b0ec38e4810014d7edef5566ed8 Mon Sep 17 00:00:00 2001 From: Jiang Bian Date: Tue, 12 Nov 2013 20:20:35 -0600 Subject: [PATCH] only keeps http proxies, free https proxies normally just won't work... --- scripts/crawl_proxies.py | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/scripts/crawl_proxies.py b/scripts/crawl_proxies.py index 0bd1a03..afa577b 100644 --- a/scripts/crawl_proxies.py +++ b/scripts/crawl_proxies.py @@ -76,7 +76,7 @@ def crawl_spys_ru(page): if cnt > 3: break - if (proxy): + if (proxy and proxy_type == 'http'): #proxies.append((proxy, proxy_type, country)) proxies.append({proxy: proxy_type}) @@ -95,9 +95,26 @@ def crawl_spys_ru(page): for i in range(5): proxies.extend(crawl_spys_ru(i)) - proxies = [p['proxy'] for p in proxy_checker(proxies)] + # check if there is a proxies.json locally, merge the check results rather than overwrite it + if (os.path.exists(os.path.abspath(args.output))): + with open(os.path.abspath(args.output), 'rb') as proxy_f: + proxies.extend(json.load(proxy_f)['proxies']) - logger.info(len(proxies)) + + ips = [] + proxy_list = [] + for proxy in proxies: + ip = proxy.keys()[0] + proxy_type = proxy.values()[0] + + if (ip not in ips): + ips.append(ip) + proxy_list.append({ip: proxy_type}) + + + proxies = [p['proxy'] for p in proxy_checker(proxy_list)] + + logger.info("number of proxies that are still alive: %d"%len(proxies)) with open(os.path.abspath(args.output), 'wb') as proxy_f: json.dump({'proxies':proxies}, proxy_f)