<b>Libraries</b>

In [1]:
import time
import warnings

import pandas as pd
import tweepy

<b>Twitter API access for academic research</b>

In [2]:
from config import api_key, api_key_secret, access_token, access_token_secret, bearer_token

<b>Definition of search queries</b>

In [3]:
# Data extracted on 20 March 2023.
#
# Exactly one (search_query, save_file) pair must be active at a time: the
# query selects the tweets to download and the path is where the resulting
# dataframe is pickled. To switch hashtag, comment out the active pair and
# uncomment the other one.

# --- #climatescam (active) ---
save_file = '../data/twitter_climatescam_hashtag.pkl'
search_query = "lang:en (#climatescam)"

# --- #americafirst (inactive) ---
# search_query = 'lang:en (#americansfirst OR #americafirst)'
# save_file = "../data/twitter_americafirst_hashtag.pkl"

<b>Extract data</b>

In [4]:
# API v2 client authenticated with the bearer token only.
# wait_on_rate_limit=True makes tweepy sleep and retry when the rate limit is
# hit instead of raising (see the "Rate limit exceeded. Sleeping ..." output).
client = tweepy.Client(
    bearer_token=bearer_token,
    wait_on_rate_limit=True,
)

In [5]:
# Warning: because of the Twitter API rate limits this cell takes a long time to run.

# All request parameters in one place; they are forwarded unchanged to
# client.search_all_tweets for every page the paginator requests.
search_kwargs = dict(
    query=search_query,
    # Attributes requested for the expanded author objects.
    user_fields=['username', 'public_metrics', 'description', 'location'],
    # Attributes requested for each matching tweet.
    tweet_fields=['created_at',
                  'geo',
                  'public_metrics',
                  'text',
                  'conversation_id',
                  'possibly_sensitive'],
    # Ask the API to attach the author and the referenced tweets to each page.
    expansions=['author_id', 'referenced_tweets.id'],
    # Time window of the search.
    start_time='2022-07-01T00:00:00Z',
    end_time='2023-01-01T00:00:00Z',
    max_results=500,
)

# One entry per page returned by the paginator (not per tweet).
tweet_object_list = []

pages = tweepy.Paginator(client.search_all_tweets, **search_kwargs)
for tweet_object in pages:
    # Pause one second after each page before storing it and requesting the next.
    # NOTE(review): presumably this keeps the loop under a per-second request
    # limit of the endpoint - confirm against the API documentation.
    time.sleep(1)

    tweet_object_list.append(tweet_object)

Rate limit exceeded. Sleeping for 9 seconds.
Rate limit exceeded. Sleeping for 18 seconds.


In [6]:
# Exceptions that signal a missing or malformed field on a single API object.
# Catching only these (instead of a bare ``except``) keeps KeyboardInterrupt,
# SystemExit and unrelated programming errors visible.
_RECOVERABLE_ERRORS = (AttributeError, KeyError, TypeError, IndexError)

# Column order of the returned dataframe; also applied when nothing was extracted
# so that downstream code always finds the same schema.
_TWEET_COLUMNS = [
    'author_id',
    'id',
    'status',
    'username',
    'conversation_id',
    'author_followers',
    'author_tweets',
    'author_description',
    'author_location',
    'text',
    'created_at',
    'retweets',
    'replies',
    'likes',
    'quote_count',
    'possibly_sensitive',
]


def _collect_users(users, user_dict):
    """Store the author attributes of each user in ``user_dict`` (keyed by user id).

    Returns the number of users that were skipped because a field was unreadable.
    """
    skipped = 0
    for user in users:
        try:
            user_dict[user.id] = {'username': user.username,
                                  'followers': user.public_metrics['followers_count'],
                                  'tweets': user.public_metrics['tweet_count'],
                                  'description': user.description,
                                  'location': user.location}
        except _RECOVERABLE_ERRORS:
            # Skip only this user; the remaining users of the page are still read.
            skipped += 1
    return skipped


def _collect_referenced_texts(tweets, referenced_dict):
    """Store the text of each expanded (referenced) tweet in ``referenced_dict`` by tweet id.

    Returns the number of referenced tweets that were skipped.
    """
    skipped = 0
    for tweet in tweets:
        try:
            referenced_dict[tweet.id] = tweet.text
        except _RECOVERABLE_ERRORS:
            skipped += 1
    return skipped


def _build_tweet_record(tweet, user_dict, referenced_dict):
    """Return the flat row (dict) for one tweet, joined with its author's attributes.

    Raises one of ``_RECOVERABLE_ERRORS`` when the author is unknown or a
    required field is missing; the caller decides how to handle that.
    """
    author_info = user_dict[tweet.author_id]

    referenced = tweet.referenced_tweets
    if not referenced:
        status = 'post'
        text = tweet.text
    else:
        # Only the first referenced tweet determines the status and the text.
        first_reference = referenced[0]
        status = first_reference.type
        # Prefer the text of the referenced tweet when it was expanded, and
        # fall back to the tweet's own text otherwise.
        # NOTE(review): this applies to every reference type, so replies and
        # quotes also end up with the referenced tweet's text - confirm that
        # this is intended and not meant for retweets only.
        text = referenced_dict.get(first_reference.id, tweet.text)

    metrics = tweet.public_metrics
    return {'author_id': tweet.author_id,
            'id': tweet.id,
            'status': status,
            'username': author_info['username'],
            'conversation_id': tweet.conversation_id,
            'author_followers': author_info['followers'],
            'author_tweets': author_info['tweets'],
            'author_description': author_info['description'],
            'author_location': author_info['location'],
            'text': text,
            'created_at': tweet.created_at,
            'retweets': metrics['retweet_count'],
            'replies': metrics['reply_count'],
            'likes': metrics['like_count'],
            'quote_count': metrics['quote_count'],
            'possibly_sensitive': tweet.possibly_sensitive}


def extract_user_tweet_attributes(
    tweet_object_list: list
) -> pd.DataFrame:
    """
    Generate a dataframe that includes tweet and author (user) information.

    Parameters
    ----------
    tweet_object_list : list
        Response pages as collected from the paginator. Each page is expected
        to expose ``data`` (the tweets, or None) and ``includes`` (a dict that
        may hold the expanded 'users' and 'tweets'). ``None`` entries are
        ignored.

    Returns
    -------
    pd.DataFrame
        One row per tweet whose author information is available, with the
        columns listed in ``_TWEET_COLUMNS``. The dataframe has these columns
        even when no tweet could be extracted.

    Warns
    -----
    RuntimeWarning
        Emitted once, after all pages were processed, if any user, referenced
        tweet or tweet had to be skipped because of missing or malformed
        fields (for example a tweet whose author was not part of the
        expansions). Skipped items never abort the extraction.
    """
    result = []
    # Users and referenced tweets accumulate over all pages seen so far.
    user_dict = {}
    referenced_dict = {}
    skipped = {'user(s)': 0, 'referenced tweet(s)': 0, 'tweet(s)': 0}

    for response in tweet_object_list:
        if response is None:
            continue

        # Pages without expansions may carry no 'includes' or lack a key.
        includes = getattr(response, 'includes', None) or {}
        skipped['user(s)'] += _collect_users(
            includes.get('users') or [], user_dict)
        skipped['referenced tweet(s)'] += _collect_referenced_texts(
            includes.get('tweets') or [], referenced_dict)

        tweets = response.data if response.data is not None else []
        for tweet in tweets:
            try:
                result.append(_build_tweet_record(tweet, user_dict, referenced_dict))
            except _RECOVERABLE_ERRORS:
                # Best effort: drop the tweet, but keep track of it.
                skipped['tweet(s)'] += 1

    if any(skipped.values()):
        summary = ', '.join(f'{count} {kind}' for kind, count in skipped.items() if count)
        warnings.warn(
            f'extract_user_tweet_attributes skipped {summary} '
            'with missing or malformed fields',
            RuntimeWarning,
            stacklevel=2,
        )

    df = pd.DataFrame(result, columns=_TWEET_COLUMNS)

    return df

# Flatten the collected response pages into one dataframe (one row per tweet).
df = extract_user_tweet_attributes(tweet_object_list)

In [7]:
# Persist the extracted dataframe at the configured path.
df.to_pickle(path=save_file)