# 反爬：代理伺服器/IP

* 了解「IP 黑/白名單」的反爬蟲機制
* 「IP 黑/白名單」反爬蟲的因應策略

## 作業目標

* 目前程式中的 proxy_ips 是手動輸入的，請根據 https://free-proxy-list.net/ 寫一個可自動化抓取可用 Proxy 的 proxy_ips。




In [13]:
# proxy_ips = []

'''
Your Code
'''
from bs4 import BeautifulSoup
import requests
import random

In [14]:
proxy_ips = ['122.176.65.143:39859','115.124.86.105:37600','91.226.35.93:53281','94.74.190.52:80','68.183.105.214:8080']
for i in range(10):
    ip = random.choice(proxy_ips)
    print('Use', ip)
    try:
        resp = requests.get('http://ip.filefab.com/index.php',
                        proxies={'http': ip, 'https': ip}, timeout=10)
        soup = BeautifulSoup(resp.text, 'html5lib')
        print(soup.find('h1', id='ipd').text.strip())
    except:
        print('Fail')

Use 91.226.35.93:53281
Fail
Use 115.124.86.105:37600
Fail
Use 68.183.105.214:8080
Fail
Use 94.74.190.52:80
Fail
Use 91.226.35.93:53281
Your IP address: 91.226.35.93
Use 91.226.35.93:53281
Your IP address: 91.226.35.93
Use 122.176.65.143:39859
Your IP address: 122.176.65.143
Use 122.176.65.143:39859
Fail
Use 91.226.35.93:53281
Your IP address: 91.226.35.93
Use 115.124.86.105:37600
Fail


In [1]:
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select, WebDriverWait
from selenium.webdriver.common.keys import Keys
import requests
import re
import math
import pandas as pd
import random

In [3]:
proxy_url = "https://free-proxy-list.net/"
entries_per_page = '80'
proxy_ips = []

browser = webdriver.Chrome(executable_path='chromedriver')
# browser = webdriver.Chrome(executable_path='./Data/chromedriver')
browser.get(proxy_url)  # 打開瀏覽器並連到網頁
time.sleep(2)  # delay一段時間等待網頁更新完成

# 顯示設定每頁幾筆選項
browser.execute_script("document.getElementById('proxylisttable_length') \
                        .style.display='inline-block';")
WebDriverWait(browser,
              2).until(EC.visibility_of_element_located((By.ID, 'proxylisttable_length')))
    
# 設定為每頁80筆
options = Select(browser.find_element_by_xpath("//div[@id='proxylisttable_length'] \
                                                //select[@name='proxylisttable_length']"))
options.select_by_value(entries_per_page)

data = []

while True:
    time.sleep(5)  # delay一段時間等待網頁更新完成
    html = browser.page_source
    soup = BeautifulSoup(html, 'lxml')
    
    active_page = soup.find('li',
                            attrs={'class':'fg-button ui-button ui-state-default active'}
                           ).a.text

    # 直接用計算的方式取得總頁數
    if active_page == '1':
        entries_info = re.findall(r'\d+', soup.find('div', attrs={'id':'proxylisttable_info'}).text)
        page_total = str(math.ceil(int(entries_info[2]) / int(entries_per_page)))
        proxy_columns = soup.find('table',
                                  attrs={'class':'table table-striped table-bordered dataTable'}
                                 ).thead.find_all('th')
        columns = [name for ele in proxy_columns for name in ele]

    print('目前在爬取第{}頁/共{}頁...'.format(active_page, page_total))

    ips_list = soup.find('table',
                         attrs={'class':'table table-striped table-bordered dataTable'}
                        ).find_all('tr', attrs={'class':re.compile('odd|even')})

    # 擷取proxies
    for ips in ips_list:
        ip_info = []        
        for info in ips:
            ip_info.append(info.text)
        data.append(ip_info)
        proxy_ips.append(ip_info[0]+':'+ip_info[1])

    if active_page == page_total:
        print('沒有下一頁了...')
        break
    else:
        print('切換下一頁中...')
        browser.find_element_by_link_text('Next').click()
        continue

# 關閉瀏覽器
browser.quit();

# 列出全部proxy資料
pd.DataFrame(data, columns=columns)


目前在爬取第1頁/共4頁...
切換下一頁中...
目前在爬取第2頁/共4頁...
切換下一頁中...
目前在爬取第3頁/共4頁...
切換下一頁中...
目前在爬取第4頁/共4頁...
沒有下一頁了...


Unnamed: 0,IP Address,Port,Code,Country,Anonymity,Google,Https,Last Checked
0,183.88.16.95,8080,TH,Thailand,transparent,no,no,11 seconds ago
1,188.51.31.109,8080,SA,Saudi Arabia,transparent,no,no,11 seconds ago
2,36.37.139.2,50938,KH,Cambodia,elite proxy,no,yes,11 seconds ago
3,159.192.253.235,8080,TH,Thailand,transparent,no,no,11 seconds ago
4,138.0.230.49,63141,HN,Honduras,elite proxy,no,yes,11 seconds ago
...,...,...,...,...,...,...,...,...
295,85.117.61.186,49929,GE,Georgia,elite proxy,no,yes,21 minutes ago
296,103.35.132.50,36555,IN,India,elite proxy,no,yes,21 minutes ago
297,187.17.145.237,30279,BR,Brazil,elite proxy,no,yes,21 minutes ago
298,190.151.94.3,46615,CL,Chile,elite proxy,no,yes,21 minutes ago


In [4]:
for i in range(10):
    ip = random.choice(proxy_ips)
    print('Use', ip)
    try:
        resp = requests.get('http://ip.filefab.com/index.php',
                            proxies={'http': 'http://' + ip,
                                     'https': 'https://' + ip},
                            timeout=10)
        soup = BeautifulSoup(resp.text, 'html5lib')
        print(soup.find('h1', id='ipd').text.strip())
    except:
        print('Fail')

Use 67.205.146.29:8080
Your IP address: 67.205.146.29
Use 45.174.152.14:8080
Fail
Use 196.3.97.34:23500
Fail
Use 145.239.81.69:3128
Your IP address: 145.239.81.69
Use 109.232.106.236:47524
Fail
Use 124.41.240.203:55948
Fail
Use 114.6.197.254:8080
Fail
Use 193.242.151.45:8080
Fail
Use 110.34.28.31:42890
Your IP address: 110.34.28.31
Use 139.99.91.70:8080
Fail


In [5]:
import requests
from bs4 import BeautifulSoup
import random


In [6]:
proxy_ips = []

url = 'https://free-proxy-list.net/'
resp = requests.get(url)
soup = BeautifulSoup(resp.text,'html.parser')
table = soup.find(name='table',attrs={'id':'proxylisttable'})
tbody = table.find('tbody')
for tr in tbody:
    ip = tr.td.string
    port = tr.td.next_sibling.string
    proxy_ips.append(ip + ":" + port)
proxy_ips[:5]

['202.51.110.158:32247',
 '103.117.213.74:59283',
 '124.41.211.211:46709',
 '181.57.198.102:46960',
 '181.211.38.62:47911']

In [7]:
for i in range(10):
    ip = random.choice(proxy_ips)
    print('Use', ip)
    try:
        resp = requests.get('http://ip.filefab.com/index.php',
                        proxies={'http': 'http://'+ip, 'https':'https://'+ ip}, timeout=10)
        soup = BeautifulSoup(resp.text, 'html5lib')
        print(soup.find('h1', id='ipd').text.strip())
    except:
        print('-Fail')

Use 183.91.87.45:3128
Your IP address: 183.91.87.35
Use 103.143.196.26:8080
Your IP address: 103.143.196.2
Use 186.228.20.194:80
-Fail
Use 144.217.163.138:8080
Your IP address: 144.217.163.138
Use 114.6.87.177:60811
Your IP address: 114.6.88.238
Use 176.9.75.42:8080
Your IP address: 176.9.75.42
Use 177.66.221.5:8080
-Fail
Use 88.200.63.190:80
-Fail
Use 103.25.167.200:42375
-Fail
Use 169.0.92.100:8080
Your IP address: 169.0.92.100


In [8]:
from bs4 import BeautifulSoup
import requests
import random
import re

In [9]:
r = requests.get('https://free-proxy-list.net/')
soup = BeautifulSoup(r.text, 'html5lib')

regex = '(\d{1,3}).(\d{1,3}).(\d{1,3}).(\d{1,3})'
pattern = re.compile(regex)

proxy_ips = []
for td in soup.find('table',id='proxylisttable').find_all('td'):
    td = td.text
    result = re.search(pattern, td)
    if result:
        proxy_ips.append(td)

len(proxy_ips)

300

In [10]:
for i in range(10):
    ip = random.choice(proxy_ips)
    print('Use', ip)
    try:
        resp = requests.get('http://ip.filefab.com/index.php',
                        proxies={'http': ip, 'https': ip}, timeout=10)
        soup = BeautifulSoup(resp.text, 'html5lib')
        print(soup.find('h1', id='ipd').text.strip())
    except:
        print('Fail')

Use 14.143.168.230
Fail
Use 103.216.48.83
Fail
Use 110.34.28.31
Fail
Use 185.134.23.171
Your IP address: 185.134.23.171
Use 118.137.146.95
Fail
Use 134.35.134.145
Fail
Use 24.172.225.122
Fail
Use 200.152.78.48
Fail
Use 200.89.178.216
Your IP address: 200.89.178.216
Use 185.134.23.172
Fail


In [11]:
from bs4 import BeautifulSoup
import requests
import random

proxy_ips = []

res = requests.get('https://free-proxy-list.net/')
soup = BeautifulSoup(res.text, 'html5lib')
table=soup.find(id='proxylisttable')
trs=table.find_all('tr')
print(len(trs))
for tr in trs:
    tds=tr.find_all('td')
    if (len(tds)>0):
        proxy_ips.append(tds[0].text)

302


In [12]:

for i in range(10):
    ip = random.choice(proxy_ips)
    print('Use', ip)
    try:
        resp = requests.get('http://ip.filefab.com/index.php',
                        proxies={'http': ip, 'https': ip}, timeout=10)
        soup = BeautifulSoup(resp.text, 'html5lib')
        print(soup.find('h1', id='ipd').text.strip())
    except:
        print('Fail')

Use 182.53.197.22
Fail
Use 196.3.97.34
Fail
Use 195.230.115.115
Fail
Use 46.19.100.28
Fail
Use 109.167.207.72
Fail
Use 118.97.188.50
Fail
Use 103.105.77.38
Fail
Use 118.137.146.95
Fail
Use 109.167.207.72
Fail
Use 212.24.148.234
Fail
