# 利用urllib提出請求

會用到的:
1. url
2. useragent
3. headers
4. request.Request(url = , headers = )
5. request.urlopen()

In [None]:
from urllib import request

# Placeholders: fill in the target URL and a real User-Agent string.
url = '目標網址'
useragent = 'User-Agent:'

# Send the User-Agent as a request header.
headers = {'User-Agent': useragent}

# Build the request object.
req = request.Request(url=url, headers=headers)

# Open the URL. The with-block closes the HTTP response (and its socket)
# as soon as the body has been read, instead of leaving it open.
with request.urlopen(req) as res:
    # Read the raw body and decode it as UTF-8 before printing.
    print(res.read().decode('utf-8'))

# 利用requests提出請求

會用到的:
1. url
2. useragent
3. headers
4. requests.get(url, headers = )
5. .text

In [None]:
import requests

url = 'https://www.ptt.cc/bbs/movie/index.html'
useragent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36 Edg/126.0.0.0'

headers = {'User-Agent': useragent}

# Issue the GET request. requests applies no timeout by default, so one is
# passed explicitly to keep a stalled server from blocking the notebook.
res = requests.get(url, headers=headers, timeout=10)

#print(res.text)

# 將html轉成BeautifulSoup物件

會用到的:
1. BeautifulSoup(res, 'html.parser')
---

## find()、findAll()介紹

* find() 
: 只會找出第一個 
* findAll() find_all()
: 列出所有符合條件的標籤  
* 回傳值為list

1. findAll(tag, attribute)  
2. soup.findAll('div', {'id': '  ', 'class': ['cl1', 'cl2']})
---

## select_one()、select()介紹

* select_one() 只會找出第一個 
* select() 列出所有符合條件的標籤，可以使用CSS選擇器  
* 回傳值為list

1. select('tag[attribute]')  
    #找a標籤   
2. select('tag#id.class a')  


In [None]:
from bs4 import BeautifulSoup

# Parse with the 'html.parser' backend.
# For a urllib response, pass the response object itself:
#soup = BeautifulSoup(res, 'html.parser')

# For a requests response, parse the decoded body text.
soup = BeautifulSoup(markup=res.text, features='html.parser')

#print(soup)
#print(soup.prettify())

In [None]:
# Inspect the HTML of a specific tag: every <div> whose id is
# 'action-bar-container'. find_all() is the current method name;
# findAll() is only kept as a deprecated alias.
bar = soup.find_all('div', {'id': 'action-bar-container'})
# bar = soup.find_all('div', id='  ')
print(bar)
# The items in the returned list are BeautifulSoup objects themselves,
# so find() / find_all() can be called on them again.

# Look at the first <a> tag inside the first match.
# The return value is again a BeautifulSoup object.
bar_a = bar[0].find('a')
print("第一個a標籤：")
print(bar_a)

# Text content of that <a> tag.
bar_a_text = bar_a.text
print("文字：")
print(bar_a_text)

# The href attribute (link target) of that <a> tag.
bar_a_url = bar_a['href']
print("網址：")
print(bar_a_url)

In [None]:
# Alternative approach: look the first <a> tag up and print it.
# bar[0].find('a') is equivalent to bar[0].a
first_link = bar[0].find('a')
print(first_link)

# Use .next_sibling to step to the next sibling; to visit several,
# iterate over .next_siblings with a for loop.
for sibling in first_link.next_siblings:
    print(sibling)

In [None]:
# List every <a> tag under the first match using a CSS selector.
print("用select找a：")
container = bar[0]
all_links = container.select('a')
print(all_links)

# href of the first <a> tag found by select_one().
print("網址：")
first_link = container.select_one('a')
print(first_link['href'])

# POST方式請求

1. 使用urllib
2. 使用requests

## 取得Hidden value

- 取得hidden標籤

In [None]:
# 使用urllib
from urllib import request, parse

# URL of the query-result page that receives the form (placeholder).
url = " "
headers = {'User-Agent' : ''}
# Form data (placeholder; fill in the real, distinct field names and values).
data = {'' : '', '': ''}
# urllib needs the POST body as bytes: url-encode the fields, then encode.
data = parse.urlencode(data).encode('utf-8')
# The class is request.Request (request.Requests does not exist).
# Supplying `data` makes urllib send the request as a POST.
req = request.Request(url=url, data=data, headers=headers)
# Close the response as soon as the body has been read and decoded.
with request.urlopen(req) as response:
    res = response.read().decode('utf-8')

In [None]:
# 使用requests
import requests

# Placeholders: fill in the form's target URL and a User-Agent string.
url = ''
headers = {'User-Agent': ''}

# Form data to submit.
data = {}

# Issue the POST request. requests applies no timeout by default, so one is
# passed explicitly to keep a stalled server from blocking the notebook.
res = requests.post(url, data=data, headers=headers, timeout=10)

#print(res.text)

In [None]:
# Collect the hidden <input> tags from the response.
soup = BeautifulSoup(res.text, 'html.parser')

for hidden_input in soup.select("input[type='hidden']"):
    # Inputs missing either attribute are skipped silently.
    if not (hidden_input.has_attr('name') and hidden_input.has_attr('value')):
        continue
    print(f"{hidden_input['name']}: {hidden_input['value']}")

# 帶cookies發出請求

- urllib
    - 直接將cookies放在headers  
    ```
    例如PTT Gossiping的「已滿18歲」驗證，Cookie為 over18=1
    headers = {'User-Agent' : '', 'Cookie' : 'name=value'}
    ```
- requests
    - 直接將cookies放在requests.get()的參數中  
    ```
    cookies = {'key' : 'value'}  
    res = requests.get(url, headers = headers, cookies = cookies)
    ```

# Session

- 沒有建立session的情況，所有requests都是新的session
- 事先建立session，可以讓同一連線的所有請求共用cookies
```
ss = requests.session()  
ss.cookies['name'] = 'value'  
res = ss.get(url, headers = headers)
```

In [None]:
headers = {'User-Agent' : ''}
# Page shown before the POST is sent (placeholder).
url_landing_page = ''
# Will hold the form's action URL.
url_tmp = ''
# Target page to fetch afterwards (placeholder).
url_index = ''

# Empty dict that accumulates the POST data.
data = {}

# Create the session so every request below shares its cookies.
ss = requests.session()

res_landing_page = ss.get(url_landing_page, headers=headers)
soup_landing_page = BeautifulSoup(res_landing_page.text, 'html.parser')

# Copy name/value from the first matching confirmation button
# (the "over 18" button) and then from the first hidden input.
for selector in ('button[class=" "]', 'input[type="hidden"]'):
    field = soup_landing_page.select(selector)[0]
    field_key, field_value = field['name'], field['value']
    data[field_key] = field_value

# Form action path appended to the site's domain (placeholder prefix).
form = soup_landing_page.select('form')[0]
url_tmp = '目標網域' + form['action']

# Submit the form through the session so the resulting cookies are stored.
ss.post(url_tmp, data=data, headers=headers)

# Later requests on the same session carry those cookies.
res_index = ss.get(url_index, headers=headers)

print(res_index.text)

# json資料處理
```
json.loads()
```

In [None]:
#取得json資料
import requests
import json
# Placeholder: fill in the URL that serves the JSON data.
url = ''
res = requests.get(url)
# json.loads() parses a JSON string into the matching Python object
# (for example a list or a dict).
json_data = json.loads(res.text)

# Print each item of the parsed JSON.
for record in json_data:
    print(record)

# Print the key names of the first item.
for key in json_data[0]:
    print(key)

# 下載圖片

In [None]:
from urllib import request
# Placeholders: the image's URL and the local file path to save it to.
# (Plain variables replace the original undefined pseudo-identifiers.)
image_url = ''
save_path = ''
request.urlretrieve(image_url, save_path)
# Equivalent alternative:
#   from urllib.request import urlretrieve
#   urlretrieve(image_url, save_path)

# jieba斷詞
- 三種斷詞模式
    - 全模式 cut(cut_all = True)
    - 精確模式(預設模式) cut(cut_all = False) 
    - 搜尋引擎模式 cut_for_search()

In [None]:
import jieba
import os

# Load a user-defined dictionary (placeholder path).
jieba.load_userdict('檔案路徑')

# Load the stop words, one per line (placeholder path).
stopword_path = '檔案位置'
with open(stopword_path, 'r', encoding='utf-8') as file:
    # Iterate the handle bound by the with-statement (`file`; the original
    # referenced an undefined `f`) and strip each line's trailing newline.
    stopword_list = [each_line.rstrip('\n') for each_line in file]
# Set copy of the stop words for constant-time membership tests below.
stopword_set = set(stopword_list)

# Segment the text with jieba.cut() and count each token's occurrences.
s_list = jieba.cut("文本")
word_count = {}
for word in s_list:
    word_count[word] = word_count.get(word, 0) + 1

# Keep only words longer than one character that are not stop words,
# stored as a list of (word, count) tuples.
word_list = [
    (word, count)
    for word, count in word_count.items()
    if len(word) > 1 and word not in stopword_set
]

# pandas
- 將資料結構化

In [None]:
import pandas as pd

# Declare an empty DataFrame and define its columns.
df = pd.DataFrame(columns=['Name', 'Age', 'Height'])

# Insert rows by index label.
df.loc[0] = ['Alan', '18', '170']
df.loc[1] = ['Wade', '17', '175']
df.loc[2] = ['Alice', '17', '162']

# Add a column.
df['Weight'] = ['68', '85', '51']

# Remove a column.
df = df.drop(columns=['Weight'])

# Remove a row by index label.
df = df.drop(index=2)

# Modify one cell with the pattern: df.loc[row_label, 'column'] = value
# A single .loc call replaces chained indexing (df['column'][row] = ...),
# which may assign into a temporary copy instead of df.
df.loc[1, 'Age'] = '18'

# Method two: supply the data when the DataFrame is declared.
columns = ['Name', 'Age', 'Height']
data = [
    ['Ted', '17', '171'],
    ['Judy', '17', '158'],
]
new_df = pd.DataFrame(data=data, columns=columns)

# Combine the tables. DataFrame.append was removed in pandas 2.0;
# pd.concat is the supported replacement.
df = pd.concat([df, new_df])
# Rebuild a 0..n-1 index and discard the old labels.
df = df.reset_index(drop=True)

# Save as CSV.
df.to_csv('./檔案名稱.csv', encoding='utf-8')
# Do not write the index (explicit booleans instead of 0 / None).
df.to_csv('./檔案名稱.csv', index=False, encoding='utf-8')
# Do not write the index or the column header row.
df.to_csv('./檔案名稱.csv', index=False, encoding='utf-8', header=False)
# Alternative encoding that avoids garbled text when opened in Excel.
df.to_csv('./檔案名稱.csv', index=False, encoding='utf-8-sig')

# Read a CSV file (placeholder path).
pd.read_csv('檔案路徑')
# If the file has no header row.
pd.read_csv('檔案路徑', header=None)

# Selenium
- 自動化網路瀏覽器操作

In [None]:
# Static pages: use the browser only to obtain cookies, then fetch the
# page with requests.
from selenium.webdriver import Chrome
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
import requests
from bs4 import BeautifulSoup

# Selenium 4 takes the driver executable through a Service object
# (placeholder path) rather than as a positional argument.
driver = Chrome(service=Service(executable_path='driver檔案位置'))
try:
    url = ''
    driver.get(url)

    # Click the element with the given class name (placeholder).
    # find_element(By.CLASS_NAME, ...) replaces the removed
    # find_element_by_class_name().
    driver.find_element(By.CLASS_NAME, '').click()

    # Get the cookies the browser session has collected.
    cookies = driver.get_cookies()
finally:
    # Shut the browser down exactly once; the original called close() twice,
    # and the second call runs against an already-closed window.
    driver.quit()

ss = requests.session()

# Copy the browser cookies into the requests session.
for c in cookies:
    ss.cookies.set(c['name'], c['value'])

res = ss.get('')
soup = BeautifulSoup(res.text, 'html.parser')

In [None]:
# Dynamic pages: drive the browser, then parse the rendered HTML.
from selenium.webdriver import Chrome
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
import time
import requests
from bs4 import BeautifulSoup

# Selenium 4 takes the driver executable through a Service object
# (placeholder path) rather than as a positional argument.
driver = Chrome(service=Service(executable_path='driver檔案位置'))
try:
    url = ''
    driver.get(url)
    time.sleep(3)

    # Type the keyword into the search box (the first <input> tag).
    driver.find_element(By.TAG_NAME, 'input').send_keys('關鍵字')
    time.sleep(3)

    # Press the search button, located by XPath (placeholder).
    driver.find_element(By.XPATH, '').click()
    time.sleep(3)

    # Simulate scrolling the page.
    driver.execute_script('document.documentElement.scrollTop=5000')
    time.sleep(5)

    # Take the current HTML.
    #html = driver.execute_script("return document.getElementsByTagName('html')[0].outerHTML")
    soup = BeautifulSoup(driver.page_source, 'html.parser')
finally:
    # Always shut the browser down, even if a step above raised.
    driver.quit()