# 反爬：代理伺服器/IP

* 了解「IP 黑/白名單」的反爬蟲機制
* 「IP 黑/白名單」反爬蟲的因應策略

## 作業目標

* 目前程式中的 proxy_ips 是手動輸入的；請改寫程式，從 https://free-proxy-list.net/ 自動抓取可用的 Proxy，並以抓取結果自動產生 proxy_ips 清單。




In [4]:
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select, WebDriverWait
from selenium.webdriver.common.keys import Keys
import requests
import re
import math
import pandas as pd
import random

In [6]:
# Scrape every page of the proxy table at free-proxy-list.net with Selenium.
# Results:
#   proxy_ips - list of "ip:port" strings (first two cells of each row)
#   data      - list of rows, each a list with the text of every cell
#   columns   - header texts of the table, used as DataFrame column names
proxy_url = "https://free-proxy-list.net/"
entries_per_page = '80'
proxy_ips = []
data = []

# Lookup criteria reused on every page; built once instead of per iteration.
table_attrs = {'class': 'table table-striped table-bordered dataTable'}
row_class = re.compile('odd|even')

# Filled in from the first page that is parsed.
page_total = None
columns = []

# No executable_path argument: Selenium 3 already defaults to "chromedriver",
# and the argument no longer exists in current Selenium 4 releases.
browser = webdriver.Chrome()
try:
    browser.get(proxy_url)  # open the browser and load the page
    time.sleep(2)  # give the page some time to finish rendering

    # Make the hidden "entries per page" control visible so it can be used.
    browser.execute_script(
        "document.getElementById('proxylisttable_length')"
        ".style.display='inline-block';"
    )
    WebDriverWait(browser, 2).until(
        EC.visibility_of_element_located((By.ID, 'proxylisttable_length'))
    )

    # Switch the table to 80 entries per page.
    options = Select(browser.find_element(
        By.XPATH,
        "//div[@id='proxylisttable_length']"
        "//select[@name='proxylisttable_length']",
    ))
    options.select_by_value(entries_per_page)

    while True:
        time.sleep(5)  # wait for the table to be redrawn
        html = browser.page_source
        soup = BeautifulSoup(html, 'lxml')

        active_page = soup.find(
            'li',
            attrs={'class': 'fg-button ui-button ui-state-default active'}
        ).a.text

        table = soup.find('table', attrs=table_attrs)

        # Work out the page count once: the third number in the info text is
        # used as the total number of entries. Header names are read here too.
        if page_total is None:
            entries_info = re.findall(
                r'\d+',
                soup.find('div', attrs={'id': 'proxylisttable_info'}).text
            )
            page_total = str(math.ceil(int(entries_info[2])
                                       / int(entries_per_page)))
            columns = [th.text for th in table.thead.find_all('th')]

        print('目前在爬取第{}頁/共{}頁...'.format(active_page, page_total))

        # Collect the proxies listed on this page. Only <td> cells are read so
        # that stray whitespace nodes inside a row cannot shift the columns.
        for row in table.find_all('tr', attrs={'class': row_class}):
            ip_info = [cell.text for cell in row.find_all('td')]
            data.append(ip_info)
            proxy_ips.append(ip_info[0] + ':' + ip_info[1])

        if active_page == page_total:
            print('沒有下一頁了...')
            break

        print('切換下一頁中...')
        browser.find_element(By.LINK_TEXT, 'Next').click()
finally:
    # Always close the browser, even when scraping fails part-way, so no
    # browser/driver process is left running.
    browser.quit()

# Show all scraped proxy data.
pd.DataFrame(data, columns=columns)

目前在爬取第1頁/共4頁...
切換下一頁中...
目前在爬取第2頁/共4頁...
切換下一頁中...
目前在爬取第3頁/共4頁...
切換下一頁中...
目前在爬取第4頁/共4頁...
沒有下一頁了...


Unnamed: 0,IP Address,Port,Code,Country,Anonymity,Google,Https,Last Checked
0,95.85.36.236,8080,NL,Netherlands,anonymous,no,no,9 seconds ago
1,139.59.169.246,8080,GB,United Kingdom,anonymous,no,no,9 seconds ago
2,200.89.178.210,3128,AR,Argentina,anonymous,no,no,9 seconds ago
3,103.111.182.44,80,NZ,New Zealand,anonymous,no,yes,9 seconds ago
4,27.147.136.178,47678,BD,Bangladesh,elite proxy,no,no,9 seconds ago
...,...,...,...,...,...,...,...,...
295,182.253.174.202,8080,ID,Indonesia,transparent,no,no,13 minutes ago
296,180.246.202.98,80,ID,Indonesia,transparent,no,no,13 minutes ago
297,41.65.181.133,8080,EG,Egypt,transparent,no,no,13 minutes ago
298,109.166.89.125,8080,SA,Saudi Arabia,transparent,no,no,13 minutes ago


In [7]:
# Try 10 randomly chosen proxies: request an IP-echo page through each one and
# print the text of its <h1 id="ipd"> element, or 'Fail' when the attempt
# does not succeed.
for attempt in range(10):
    ip = random.choice(proxy_ips)
    print('Use', ip)
    # Spell out the scheme, matching the proxy URL form shown in the Requests
    # documentation, instead of passing a bare "ip:port".
    proxy_endpoint = 'http://' + ip
    try:
        resp = requests.get(
            'http://ip.filefab.com/index.php',
            proxies={'http': proxy_endpoint, 'https': proxy_endpoint},
            timeout=10,
        )
        soup = BeautifulSoup(resp.text, 'html5lib')
        print(soup.find('h1', id='ipd').text.strip())
    except (requests.RequestException, AttributeError):
        # RequestException: the request through the proxy failed or timed out.
        # AttributeError: find() returned None, i.e. the expected element is
        # missing from the response. Anything else (including Ctrl-C) is no
        # longer swallowed.
        print('Fail')

Use 200.89.174.64:8080
Your IP address: 200.89.174.64
Use 45.7.132.86:999
Fail
Use 41.65.201.164:8080
Fail
Use 78.38.111.243:8080
Fail
Use 125.25.197.97:8080
Fail
Use 185.201.5.159:8080
Fail
Use 186.220.249.23:8080
Fail
Use 45.32.177.5:31285
Your IP address: 45.32.177.5
Use 200.89.178.216:80
Your IP address: 200.89.178.216
Use 138.68.161.60:8080
Your IP address: 138.68.161.60
