In [10]:
# Collect relevant tweets through the Twitter API.
import json
import tweepy as tw

In [11]:
# IMPORTANT: enter proper access credential in config_twitter.py file
import config_twitter

In [12]:
# function to establish an initial API connection, respecting the rate limit
def connect_api_client():
    """Create an authenticated tweepy API client and verify its credentials.

    The four OAuth credentials are read from the ``config_twitter`` module and
    the client is built with ``wait_on_rate_limit=True``.

    Returns:
        The ``tw.API`` client, after a successful credential check.

    Raises:
        RuntimeError: if ``verify_credentials()`` returns a falsy value.
        Exception: any error raised by tweepy while verifying propagates
            unchanged to the caller.
    """
    auth = tw.OAuthHandler(config_twitter.consumer_key, config_twitter.consumer_secret)
    auth.set_access_token(config_twitter.access_token, config_twitter.access_token_secret)
    # https://docs.tweepy.org/en/stable/getting_started.html#api
    api = tw.API(auth, wait_on_rate_limit=True)
    # Verify once and reuse the result (each call is a separate API request).
    # https://docs.tweepy.org/en/stable/api.html#API.verify_credentials
    user = api.verify_credentials()
    if not user:
        # Raise a real exception type: raising a bare string is itself a TypeError.
        raise RuntimeError("Credentials could not be verified: Please check config_twitter.py")
    print(f"Connected to Twitter API as {user.name}")
    return api

In [13]:
# open the API session; connect_api_client() verifies the credentials and prints the account name
api = connect_api_client()

Connected to Twitter API as Allan Bravos


In [14]:
# search query: any of the three key phrases, with retweets filtered out
query = (
    '"artificial intelligence" OR "machine learning" OR "deep learning" '
    '-filter:retweets'
)

In [15]:
# decide how many tweets to query
# (used as the item limit for the search cursor and embedded in the output file name;
#  lower it for quick test runs, 2000 is the value used to collect the dataset)
ntweets = 2000

In [16]:
# run the search through a paginating cursor and keep each result's raw JSON payload
# https://docs.tweepy.org/en/stable/cursor_tutorial.html
# https://docs.tweepy.org/en/stable/code_snippet.html
tweets = [
    status._json
    for status in tw.Cursor(
        api.search_tweets,
        q=query,
        lang="en",
        tweet_mode='extended',
    ).items(ntweets)
]
len(tweets)

2000

In [17]:
# example tweet content: display the first collected tweet to inspect its JSON structure
tweets[0]

{'created_at': 'Tue Aug 02 00:31:03 +0000 2022',
 'id': 1554263471298068480,
 'id_str': '1554263471298068480',
 'full_text': 'In this post, we have added a facial identity-based authentication user interface to show a complete end-to-end identity verification solution. \n\n#AWS #AWSBlog #Serverless #APIGateway #StepFunction #Rekognition #Cloud #CloudComputing\n https://t.co/FfWEPQcb9a https://t.co/qbaMxmLaMx',
 'truncated': False,
 'display_text_range': [0, 258],
 'entities': {'hashtags': [{'text': 'AWS', 'indices': [146, 150]},
   {'text': 'AWSBlog', 'indices': [151, 159]},
   {'text': 'Serverless', 'indices': [160, 171]},
   {'text': 'APIGateway', 'indices': [172, 183]},
   {'text': 'StepFunction', 'indices': [184, 197]},
   {'text': 'Rekognition', 'indices': [198, 210]},
   {'text': 'Cloud', 'indices': [211, 217]},
   {'text': 'CloudComputing', 'indices': [218, 233]}],
  'symbols': [],
  'user_mentions': [],
  'urls': [{'url': 'https://t.co/FfWEPQcb9a',
    'expanded_url': 'https://

In [18]:
# write the collected tweets to a pretty-printed JSON file named after the requested count
file_out = f"raw_tweet_data_{ntweets}.json"
with open(file_out, 'w') as f:
    json.dump(tweets, f, indent=2)