## Twint vs. Tweepy
### 使用Tweepy的主要優點是它使用官方的Twitter API。這比下載推文提供了更多的功能。可以發佈推文，阻止人員，發送DM和編寫腳本，以自動轉發或關注您的人。Tweepy最大的限制還在於它使用官方的Twitter API。該API限制了可以下載的數據量，並且使用免費帳戶只能在過去7天內下載幾千條推文

### Twint是一個更年輕的庫(library)，文檔有點少。它不使用官方的Twitter API來獲取數據，而是從公共網頁位址(public webpage addresses)中抓取Twitter。它的性能似乎接近Tweepy，下載數據時沒有太大的速度差異。但不會像使用官方API那樣快，但它不使用API的最大優點是，您可以有效地下載推文，只要你想，沒有任何速率限制。

### Twint 
需要先安裝twint的套件，可參考以下網址

'pip install twint'
</n>
1. 若有Error，先執行'pip install --upgrade git+https://github.com/twintproject/twint.git@origin/master#egg=twint'，更新至最新版twint
2. 再嘗試後若有Error，依序執行'git clone --depth=1 https://github.com/twintproject/twint.git', 'cd twint', 'pip3 install . -r requirements.txt'
3. 再嘗試後若有Error，將requirements.txt中aiohttp改為aiohttp==3.7.0，或是把url.py中第92行'('query_source', 'typed_query')'取消註記


參考：

https://www.cammcl.com/post/downloading-twitter-data-in-python/

https://medium.com/@pragya_paudyal/scraping-tweet-using-twint-and-analyzing-with-nlp-932e01ad5587


In [53]:
# 安裝套件
import twint
# Use to handle notebook & Runtime Error
# 允許是事件迴圈已經在執行的情況下，再執行一次事件迴圈
import nest_asyncio
nest_asyncio.apply()

import os
import datetime as dt

# 如果要將資料存成csv
import pandas as pd
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)

先簡單測試是否可以執行！

In [64]:
# 設定twint的配置
t = twint.Config()
# Scraping Data
# Keyword -> can use OR to search multiple words
t.Search = "Covid OR Covid-19"
# Language
t.Lang = 'en'
# start_date and end_date
t.Since = '2022-04-07 00:00:00'
t.Until = '2022-04-08 00:00:00'
# Limit tweets per times
t.Limit = 1000
# hide the scraper showing each tweet
t.Hide_output = True
# set the output file as json
t.Output = "./test.json"
t.Store_json = True
twint.run.Search(t)


In [68]:
# check there has saved data
df = pd.read_csv('./test.csv')
# print(df.columns)
df.head()

Unnamed: 0,id,conversation_id,created_at,date,time,timezone,user_id,username,name,place,tweet,language,mentions,urls,photos,replies_count,retweets_count,likes_count,hashtags,cashtags,link,retweet,quote_url,video,thumbnail,near,geo,source,user_rt_id,user_rt,retweet_id,reply_to,retweet_date,translate,trans_src,trans_dest
0,1514864342847864832,1514864342847864832,2022-04-15 15:12:58 台北標準時間,2022-04-15,15:12:58,800,121371581,taiwan_today,Taiwan Today,,.@MOFA_Taiwan welcomes visit of high-level #US🇺🇸 congressional delegation to #Taiwan🇹🇼 https://t.co/W5Tol69lUC,en,"[{'screen_name': 'mofa_taiwan', 'name': '外交部 ministry of foreign affairs, roc (taiwan) 🇹🇼', 'id': '976995099170062338'}]",['https://www.taiwantoday.tw/news.php?unit=2&post=217655&unitname=Politics-Top-News&postname=MOFA-welcomes-visit-of-high-level-US-congressional-delegation-to-Taiwan'],[],9,45,305,"['us', 'taiwan']",[],https://twitter.com/Taiwan_Today/status/1514864342847864832,False,,0,,,,,,,,[],,,,
1,1514863289750396930,1514863289750396930,2022-04-15 15:08:47 台北標準時間,2022-04-15,15:08:47,800,121371581,taiwan_today,Taiwan Today,,#Taiwan🇹🇼 shares marine conservation experience at #OurOcean Conference in #Palau🇵🇼 https://t.co/ikkQAbI5qC,en,[],"['https://www.taiwantoday.tw/news.php?unit=2,6&post=217643&unitname=Politics-Top-News&postname=Taiwan-shares-marine-conservation-experience-at-Our-Ocean-Conference-in-Palau']",[],3,17,82,"['taiwan', 'ourocean', 'palau']",[],https://twitter.com/Taiwan_Today/status/1514863289750396930,False,,0,,,,,,,,[],,,,
2,1514855327963160577,1514855327963160577,2022-04-15 14:37:09 台北標準時間,2022-04-15,14:37:09,800,121371581,taiwan_today,Taiwan Today,,#Taiwan🇹🇼 and #Lithuania🇱🇹 central government #Agriculture officials take part April 13 in a virtual meeting on expanding cooperation and trade exchanges between the free trading nations. (Courtesy Council of Agriculture) https://t.co/2ShIV1iaKt,en,[],[],['https://pbs.twimg.com/media/FQXSzt3aQAEsUTf.png'],7,16,178,"['taiwan', 'lithuania', 'agriculture']",[],https://twitter.com/Taiwan_Today/status/1514855327963160577,False,,1,https://pbs.twimg.com/media/FQXSzt3aQAEsUTf.png,,,,,,,[],,,,
3,1514768768236658692,1514768768236658692,2022-04-15 08:53:12 台北標準時間,2022-04-15,08:53:12,800,121371581,taiwan_today,Taiwan Today,,Traditional bamboo craft blooms again in #Taiwan🇹🇼 https://t.co/D0MFgG2dK1,en,[],['https://www.taiwantoday.tw/news.php?unit=19&post=217634&unitname=Politics-Top-News&postname=Taiwan-artist-revives-traditional-bamboo-craft'],[],2,18,117,['taiwan'],[],https://twitter.com/Taiwan_Today/status/1514768768236658692,False,,0,,,,,,,,[],,,,
4,1514507026311020545,1514507026311020545,2022-04-14 15:33:07 台北標準時間,2022-04-14,15:33:07,800,121371581,taiwan_today,Taiwan Today,,#Taiwan🇹🇼 attends #OurOcean Conference opening ceremony in #Palau🇵🇼 https://t.co/YGieyCFgSD,en,[],"['https://www.taiwantoday.tw/news.php?unit=2,6&post=217616&unitname=Politics-Top-News&postname=Taiwan-attends-Our-Ocean-Conference-opening-ceremony-in-Palau']",[],2,29,138,"['taiwan', 'ourocean', 'palau']",[],https://twitter.com/Taiwan_Today/status/1514507026311020545,False,,0,,,,,,,,[],,,,


In [69]:
# get top 10 most frequent hashtags
df['hashtags'].value_counts()[:10].sort_values(ascending=False)

['taiwan']                                     9
[]                                             6
['taoyuan', 'taiwan']                          5
['us', 'taiwan']                               4
['taiwan', 'europe']                           3
['taiwan', 'taitra', 'lithuania', 'taipei']    3
['taiwan', 'lithuania', 'taichung']            3
['yangmingshan', 'taiwan', 'taipei']           3
['taiwan', 'palau', 'ourocean']                3
['newtaipei', 'taiwan']                        3
Name: hashtags, dtype: int64

正式開始撈資料

In [None]:
def twint_search(search_term, since, until, save_path):
    c = twint.Config()
    c.Search = search_term
    # Language -> only English
    c.Lang = "en"
    # Location
    # c.Location = 
    c.Since = since.strftime('%Y-%m-%d %H:%M:%S')
    c.Until = until.strftime('%Y-%m-%d %H:%M:%S')
    # Limit for each data
    # c.Limit = 10000
    c.Hide_output = True
    # store data as csv
    c.Store_csv = True
    # set path to save data
    c.Output = save_path
    twint.run.Search(c)
    
def twint_search_loop(search_term, start_date, end_date, save_dir):
    # create file to save data
    try:
        os.makedirs(os.path.join(os.getcwd(),save_dir,search_term))
        print(f'Successfully created the directory {os.path.join(os.getcwd(),save_dir,search_term)}')
    except FileExistsError:
        print(f'Directory {os.path.join(os.getcwd(),save_dir,search_term)} already exists')
    
    date_range = pd.date_range(start_date, end_date)
    
    for single_date in date_range:
        since = single_date
        until = single_date + dt.timedelta(days=1)
        save_path = os.path.join(save_dir, search_term, f'{single_date:%Y%m%d}.csv')
        print(f"Searching for tweets containing '{search_term}' from {single_date:%Y-%m-%d} and saving into {save_path}")
        twint_search(search_term, since, until, save_path)

In [None]:
# Keyword -> can use OR to search multiple words
search_term = 'Covid'
# Scrape how many dates' data
# test
start_date = dt.datetime(2022, 4, 13)
end_date = dt.datetime(2022, 4, 14)
# official
# start_date = dt.datetime(2019, 11, 1)
# end_date = dt.datetime(2022, 3, 31)
# dir to save data
save_dir = 'D:/Twitter data/raw/'

# run search
twint_search_loop(search_term, start_date, end_date, save_dir)