In [None]:
!pip install tweepy

## Scraping Twitter

In this notebook, we make use of [Tweepy](https://www.tweepy.org/) to download tweets from [Twitter](https://twitter.com/).  
Do note that at the time this notebook was created, the Twitter API was transitioning to v2.  

  
<div style="text-align:center"><a href="https://www.atoti.io/?utm_source=gallery&utm_content=twitter1" target="_blank" rel="noopener noreferrer"><img src="https://data.atoti.io/notebooks/banners/Discover+Atoti+now.jpg" alt="atoti" /></a></div>

In [1]:
import datetime
import time

import pandas as pd
import tweepy

In [2]:
# Show the installed tweepy version; the saved output of this cell is '3.9.0'.
tweepy.__version__

'3.9.0'

Below are the data that we are going to capture from the downloaded tweets.

In [None]:
def get_df():
    """Return an empty DataFrame holding the schema of a mined tweet.

    The columns are the keys of the record built for every tweet, including
    `retweet_text`, which is always filled in for a mined tweet. Declaring it
    here keeps the header of an empty CSV identical to that of a populated one.

    Returns:
        pandas.DataFrame: a frame with no rows and one column per captured field.
    """
    return pd.DataFrame(
        columns=[
            "tweet_id",
            "name",
            "screen_name",
            "retweet_count",
            "text",
            "mined_at",
            "created_at",
            "favourite_count",
            "hashtags",
            "status_count",
            "followers_count",
            "location",
            "source_device",
            "retweet_text",
        ]
    )

### Twitter developer account and authentication

Before starting out, remember to get a [Twitter developer account](https://developer.twitter.com/en/docs/apps/overview) from its [Developer portal](https://developer.twitter.com/en) if you haven't.  
Refer to the [Twitter API documentation](https://developer.twitter.com/en/docs/authentication/oauth-1-0a) on how to get the access tokens to be set under the `twitter_keys` below:

The [standard search API](https://developer.twitter.com/en/docs/twitter-api/v1/tweets/search/overview) from Twitter API v1.1 searches against a sampling of recent Tweets published in the past 7 days. This will be replaced by the [recent search](https://developer.twitter.com/en/docs/twitter-api/tweets/search/introduction) endpoint in v2.  

This search is not exhaustive. Alternatively, if you have the tweet ids, you can pass an array of ids to [`api.statuses_lookup()`](http://docs.tweepy.org/en/v3.5.0/api.html#API.statuses_lookup) to retrieve historical tweets. You can find the list of tweets used in this series of notebooks [here](https://s3.eu-west-3.amazonaws.com/data.atoti.io/notebooks/twitter/tweets_sentiments.csv), along with their sentiments at the point of data collection.  
Remember to set `wait_on_rate_limit` to `True` so that an exception isn't thrown when the rate limit is hit.

In [None]:
class TweetMiner(object):
    """Mine tweets for a hashtag with tweepy and dump them to CSV files.

    Tweets are fetched page by page through `tweepy.Cursor` over `api.search`
    and flattened into one record per tweet. Records are written to
    `<query>_<page_num>_<timestamp>.csv` in the current working directory.
    """

    # Not read by any method of this class; kept so existing references still resolve.
    result_limit = 20
    data = []
    # Default for the `api` argument of `__init__`: a falsy value means
    # "build the client from the keys".
    api = False

    # Placeholder credentials; replace them (or pass `keys_dict`) before mining.
    twitter_keys = {
        "consumer_key": "<To be replace>",
        "consumer_secret": "<To be replace>",
        "access_token_key": "<To be replace>",
        "access_token_secret": "<To be replace>",
    }

    def __init__(self, keys_dict=twitter_keys, api=api):
        """Create the miner and its tweepy API client.

        Args:
            keys_dict: mapping with the keys `consumer_key`, `consumer_secret`,
                `access_token_key` and `access_token_secret`.
            api: an already-built API client to use as is. When falsy (the
                default), a client is built from `keys_dict` with
                `wait_on_rate_limit` enabled.
        """
        self.twitter_keys = keys_dict

        if api:
            # Honour an injected client instead of silently discarding it.
            self.api = api
            return

        auth = tweepy.OAuthHandler(
            keys_dict["consumer_key"], keys_dict["consumer_secret"]
        )
        auth.set_access_token(
            keys_dict["access_token_key"], keys_dict["access_token_secret"]
        )

        self.api = tweepy.API(
            auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True
        )

    @staticmethod
    def _tweet_to_record(item):
        """Flatten one tweet object into the dict stored as a CSV row."""
        record = {
            "tweet_id": item.id,
            "name": item.user.name,
            "screen_name": item.user.screen_name,
            "retweet_count": item.retweet_count,
            "text": item.full_text,
            "mined_at": datetime.datetime.now(),
            "created_at": item.created_at,
            "favourite_count": item.favorite_count,
            "hashtags": item.entities["hashtags"],
            "status_count": item.user.statuses_count,
            "followers_count": item.user.followers_count,
            "location": item.place,
            "source_device": item.source,
        }

        # A missing `retweeted_status` (or `full_text` on it) is recorded as the
        # string "None". Only the missing-attribute case is swallowed, so
        # unrelated errors still surface.
        try:
            record["retweet_text"] = item.retweeted_status.full_text
        except AttributeError:
            record["retweet_text"] = "None"

        return record

    @staticmethod
    def _write_csv(records, query, page_num):
        """Write `records` to `<query>_<page_num>_<timestamp>.csv`."""
        frame = pd.DataFrame(records)
        # Keep the `get_df()` columns first, followed by any extra record keys,
        # so the header is the same whether or not there are rows.
        columns = list(get_df().columns)
        extra_columns = [col for col in frame.columns if col not in columns]
        frame = frame.reindex(columns=columns + extra_columns)

        date_label = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
        frame.to_csv(f"{query}_{page_num}_{date_label}.csv", index=False)

    def mine_crypto_currency_tweets(self, query="BTC"):
        """Download the English tweets matching `#<query>` and save them as CSV.

        Records are collected in a list and turned into a DataFrame only when
        they are written out, instead of appending to a DataFrame row by row.
        Whenever the page counter reaches a multiple of 180 the buffered
        records are flushed to their own file and the buffer is reset;
        whatever remains after the last page is written at the end.

        Args:
            query: hashtag text without the leading `#`.
        """
        page_num = 1
        records = []

        cypto_query = f"#{query}"
        print(" ===== ", query, cypto_query)
        # NOTE(review): the standard search endpoint is documented with a
        # maximum of 100 tweets per page — confirm whether 200 is clamped.
        for page in tweepy.Cursor(
            self.api.search,
            q=cypto_query,
            lang="en",
            tweet_mode="extended",
            count=200,
        ).pages():
            print(" ...... new page", page_num)
            page_num += 1

            records.extend(self._tweet_to_record(item) for item in page)

            if page_num % 180 == 0:
                print("....... outputting to csv", page_num, len(records))
                self._write_csv(records, query, page_num)
                print("  ..... resetting df")
                records = []

        self._write_csv(records, query, page_num)

In [None]:
# Uses the placeholder `twitter_keys` defined on the class; fill in real credentials first.
miner = TweetMiner()

Below are the cryptocurrency hashtags that will be used in the queries. We have the option of combining queries, but in this case we download the tweets for each cryptocurrency separately.

In [None]:
# Symbols to mine, in order; each one is passed as `query` and searched as "#<symbol>".
handle_list = "BTC ETH USDT XRP BCH ADA BSV LTC LINK BNB EOS TRON".split()

We create threads to queue the mining of each cryptocurrency hashtag so that it can run unattended for 10 iterations.  
Since the tweets are sampled, we figured we should repeat the mining to gather as much data as possible.

In [None]:
import queue
import threading

# Gate for the publisher: while this event is cleared, `start_publisher` blocks at `should_publish.wait()`.
should_publish = threading.Event()
# Hands `(poll_iteration, name)` tuples from the publisher thread to the listener thread.
update_queue = queue.Queue()


def start_publisher(iterations=10, delay_seconds=900):
    """Queue every hashtag in `handle_list` for mining, `iterations` times over.

    After queuing a hashtag the publisher sleeps for `delay_seconds`, then
    blocks until `should_publish` is set and until the queued item has been
    marked done, so at most one hashtag is in flight at a time.

    Args:
        iterations: number of passes over `handle_list`.
        delay_seconds: pause after queuing each hashtag. The default of 900 s
            presumably matches Twitter's 15-minute rate-limit window — confirm.
    """
    starttime = time.time()
    print("Start polling", starttime)
    poll_iteration = 1

    for i in range(iterations):
        for name in handle_list:
            print(i, poll_iteration, "\rpublishing update ", end="")
            update_queue.put((poll_iteration, name))
            poll_iteration += 1
            time.sleep(delay_seconds)
            print("\rawaiting for publishing update", end="")
            # Hold here while publishing is paused, then wait for the queued
            # item to be marked done before queuing the next one.
            should_publish.wait()
            update_queue.join()


def start_update_listener():
    """Consume `(poll_iteration, name)` items from `update_queue` forever and mine each hashtag.

    A failure while mining is reported and the loop carries on with the next
    item. The item is marked done whether or not mining succeeded; otherwise a
    single failure would leave `update_queue.join()` in the publisher blocked
    forever.
    """
    while True:
        poll_iteration, name = update_queue.get()

        print(" --- ", name)
        try:
            miner.mine_crypto_currency_tweets(query=name)
        except Exception as e:
            # Best effort: one failing hashtag must not stop the listener thread.
            print(f"Failed to mine tweets for {name}: {e}")
        finally:
            update_queue.task_done()


# Daemon threads do not keep the interpreter alive at exit; the threads are only created here, not started.
listener_thread = threading.Thread(target=start_update_listener, daemon=True)
publisher_thread = threading.Thread(target=start_publisher, daemon=True)

In [None]:
# Start both workers; a Thread object can be started only once, so re-create the
# threads before running this cell a second time.
publisher_thread.start()
listener_thread.start()
# start publishing: release the publisher from `should_publish.wait()`
should_publish.set()

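If you would like to pause the data polling before the 10 iterations end, run the cell below. A hashtag that has already been queued is still mined; call `should_publish.set()` again to resume.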

In [None]:
# pause publishing: the publisher blocks at its next `should_publish.wait()` until the event is set again
should_publish.clear()

  
<div style="text-align:center"><a href="https://www.atoti.io/?utm_source=gallery&utm_content=twitter1" target="_blank" rel="noopener noreferrer"><img src="https://data.atoti.io/notebooks/banners/Try+Atoti.jpg" alt="atoti" /></a></div>