<h2>Appendix 1 - Data Acquisition</h2>

This program makes calls to the Twitter standard search API through the [Twython](https://twython.readthedocs.io/en/latest/) module. It was used to scrape over 300,000 unique tweets made during the 2018 State of the Union address, along with some useful metadata for each tweet.

Because the Twitter standard search API can only be used to search for tweets made within the last 7-10 days, the program can no longer retrieve these tweets and is therefore not currently functional.

In [1]:
# Import necessary modules, including Twython module to handle interface with Twitter standard search API
from twython import Twython
import json
import time
import pandas as pd

# Read the Twitter API credentials (consumer key and secret) from a local JSON file
with open("twitter_credentials.json", "r") as credentials_file:
    creds = json.loads(credentials_file.read())

In [2]:
# Build the Twython client used for every search request, authenticating
# with the consumer key and secret loaded from the credentials file
python_tweets = Twython(
    app_key=creds["CONSUMER_KEY"],
    app_secret=creds["CONSUMER_SECRET"],
)

In [3]:
# Upper bound on the number of tweets to collect before the scrape stops
tweets_to_fetch_count = 300000

# Number of collected tweets after which the program next pauses, so that the
# Twitter standard search API call limit is not breached
request_reset_mark = 10000

# Column store for the scraped tweets: one independent, initially empty list
# per field, keyed by column name. It is turned into a dataframe later.
data = {
    column: []
    for column in (
        "user",
        "date",
        "text",
        "favorite_count",
        "id_str",
        "location",
        "retweet_count",
        "followers_count",
        "statuses_count",
        "verified",
        "description",
        "coordinates",
    )
}

In [None]:
# Function determines the fields of scraped tweets that will be appended to our data
def run_query(tweet=None, store=None):
    """Append the fields of interest from one tweet to the column store.

    Retweets (tweets whose text contains "RT @") are skipped so that only
    unique tweets are recorded. For every other tweet exactly one value is
    appended to each column, including "coordinates", so all columns keep
    the same length and can be turned into a DataFrame.

    Args:
        tweet: Tweet dictionary taken from the "statuses" list of a search
            response. When omitted, the module-level ``status`` loop variable
            is used, so existing ``run_query()`` calls keep working.
        store: Dictionary mapping column names to lists. When omitted, the
            module-level ``data`` dictionary is used.

    Returns:
        None. The lists in ``store`` are extended in place.
    """
    if tweet is None:
        tweet = status
    if store is None:
        store = data

    # Ignore retweets, so only unique tweets are included
    if "RT @" in tweet["text"]:
        return

    user = tweet["user"]

    store["user"].append(user["screen_name"])
    store["date"].append(tweet["created_at"])
    store["text"].append(tweet["text"])
    store["favorite_count"].append(tweet["favorite_count"])
    store["id_str"].append(tweet["id_str"])
    store["location"].append(user["location"])
    store["retweet_count"].append(tweet["retweet_count"])
    store["followers_count"].append(user["followers_count"])
    store["statuses_count"].append(user["statuses_count"])
    store["verified"].append(user["verified"])
    store["description"].append(user["description"])
    # Previously never filled, which left this column shorter than the others.
    # .get() records None when the tweet carries no "coordinates" entry.
    store["coordinates"].append(tweet.get("coordinates"))

# The maximum id of the first tweet to scrape. The tweet id is a unique identifier for all tweets
# This max_id value represents a point shortly after the end of the SOTU address
# It was attained through trial and error
max_id = 958543500000000000

# Page backwards through the search results, 100 tweets per call, for at most 30,000 calls
for page in range(0, 30000):

    # Stop program following the call that exceeds 300,000 tweets scraped
    if len(data["id_str"]) > tweets_to_fetch_count:
        break

    # Pause the program for 15 minutes for every 10,000 tweets scraped, to avoid exceeding API call limit
    # NOTE(review): the pause is driven by stored (non-retweet) tweets, not by the
    # number of API calls made - confirm this stays within the rate limit
    if len(data["id_str"]) > request_reset_mark:
        time.sleep(60 * 15)
        request_reset_mark += 10000

    # The same query is used for every page; only max_id moves
    query = {"q": "#sotu",
             "lang": "en",
             "until": "2018-02-01",
             "max_id": max_id
            }

    # Call a search with the twython object
    statuses = python_tweets.search(**query, count=100)["statuses"]

    # No more tweets are available for this query, so further calls would only
    # repeat the same empty request
    if not statuses:
        break

    # Append relevant information to data as specified in run_query
    # (run_query reads the module-level loop variable `status`)
    for status in statuses:
        run_query()

    # Move the cursor to just below the lowest id returned on this page.
    # All returned tweets are considered (including skipped retweets) so a page
    # consisting only of retweets still advances the cursor. Ids are compared
    # numerically, and 1 is subtracted because max_id is inclusive and would
    # otherwise return the boundary tweet again on the next call.
    max_id = min(int(tweet["id_str"]) for tweet in statuses) - 1

In [None]:
# Build a DataFrame with one column per key of the data dictionary,
# then show it as the cell output
frame = pd.DataFrame(data=data)
frame

In [6]:
# Write data out to an excel file.
# The writer is used as a context manager so the workbook is saved and the
# file handle is closed even if to_excel raises. This also replaces
# writer.save(), which was removed in pandas 2.0.
with pd.ExcelWriter('output.xlsx') as writer:
    frame.to_excel(writer, sheet_name='Sheet1')