#### Scraping Tweets to Study the Military & Race Relations During the 2020 #BlackLivesMatter Protests 

Brandon Kramer, Chase Dawson, Ed Gitre, Gizem Korkmaz 

In [20]:
# load modules 
import requests
import os
import json
import pandas as pd
from dotenv import load_dotenv
# to install dotenv run "conda install -c conda-forge python-dotenv" if using a conda env or "pip install python-dotenv" if in a regular python env
# link to python-dotenv documentation: https://github.com/theskumar/python-dotenv/

# load variables stored in .env (via python-dotenv, imported above);
# BEARER_TOKEN is later read from the environment with os.environ.get
load_dotenv()

# set functions to scrape our query for set time period 
def create_headers(bearer_token):
    """Return the request headers that carry the API bearer token.

    Parameters
    ----------
    bearer_token : the access token to place in the Authorization header.

    Returns
    -------
    dict with a single "Authorization" entry of the form "Bearer <token>".
    """
    return {"Authorization": f"Bearer {bearer_token}"}

def connect_to_endpoint(url, headers, params):
    """Send a GET request to `url` and return the decoded JSON body.

    Parameters
    ----------
    url : endpoint to query.
    headers : dict of HTTP headers (e.g. the output of create_headers).
    params : dict of query-string parameters for the request.

    Returns
    -------
    The response body parsed by `response.json()`.

    Raises
    ------
    Exception
        With (status_code, response text) as its args when the response
        status code is anything other than 200.
    """
    # Use the `url` argument rather than a notebook-level global so the
    # function queries the endpoint the caller actually asked for.
    response = requests.request("GET", url, headers=headers, params=params)
    # echo the status code so it shows up in the cell output
    print(response.status_code)
    if response.status_code != 200:
        raise Exception(response.status_code, response.text)
    return response.json()

# query API and return json given a url, query parameters, and API access token (or bearer token)
def make_api_request(url, query_params, bearer_token):
    """Query the API and return the JSON response.

    Builds the authorization headers from `bearer_token` with
    create_headers, then hands `url`, those headers, and `query_params`
    to connect_to_endpoint and returns whatever it returns.
    """
    return connect_to_endpoint(url, create_headers(bearer_token), query_params)

def make_dataframe(json, query_params, columns=('id', 'text')):
    """Turn a search response into a DataFrame tagged with its query.

    Parameters
    ----------
    json : dict-like API response; tweet records are read from its
        'data' entry. (The name shadows the `json` module inside this
        function only; it is kept for backward compatibility.)
    query_params : the query parameters that produced the response;
        stored on every row in a 'query_params' column.
    columns : names of the tweet fields to keep as columns.

    Returns
    -------
    pandas.DataFrame with the requested columns plus 'query_params'.
    The frame is empty (but has all columns) when the response holds
    no 'data' entry.
    """
    # A search with zero matches returns no 'data' key (only 'meta'),
    # so fall back to an empty record list instead of raising KeyError.
    records = json.get('data', [])
    df = pd.DataFrame(records, columns=list(columns))
    # one reference to the query dict per row
    df['query_params'] = [query_params] * len(df)
    return df

In [13]:
# Twitter's archive search is the target url (academic research only);
# the bearer_token is read from the BEARER_TOKEN environment variable.
search_url = "https://api.twitter.com/2/tweets/search/all"
bearer_token = os.environ.get("BEARER_TOKEN")

# Optional params: start_time,end_time,since_id,until_id,max_results,next_token,
# expansions,tweet.fields,media.fields,poll.fields,place.fields,user.fields
# Example of another query:
#   {'query': '(from:twitterdev -is:retweet) OR #twitterdev','tweet.fields': 'author_id'}
query_params = dict(
    query='#blacklivesmatter #fortbragg -is:retweet',
    start_time='2020-05-01T01:00:00.01Z',
    end_time='2020-09-01T01:00:00.01Z',
    max_results='100',
)

In [21]:
# Run the query and build the DataFrame. The response is kept under its own
# name (not `json`) so the `json` module imported at the top of the notebook
# is not overwritten for the rest of the session.
tweets_json = make_api_request(search_url, query_params, bearer_token)
df = make_dataframe(tweets_json, query_params)
df.head()

200


Unnamed: 0,id,text,query_params
0,1286419707805851649,#MohacsiSteven works for the 849th Quarter Mas...,{'query': '#blacklivesmatter #fortbragg -is:re...
1,1285047328999641089,#CHINA #CCPVirus #COVIDー19 \n#ChrisWallace #Fa...,{'query': '#blacklivesmatter #fortbragg -is:re...
2,1284958539014971394,#CHINA #CCPVirus #COVIDー19 \n#ChrisWallace #Fa...,{'query': '#blacklivesmatter #fortbragg -is:re...
3,1284958363957239809,#CHINA #CCPVirus #COVIDー19 \n#ChrisWallace #Fa...,{'query': '#blacklivesmatter #fortbragg -is:re...
4,1284957980191002625,#CHINA #CCPVirus #COVIDー19 \n#ChrisWallace #Fa...,{'query': '#blacklivesmatter #fortbragg -is:re...


In [22]:
# To save the data, run one of:
#   df.to_pickle(filepath + '.pkl')   # save as a pickle
#   df.to_csv(filepath + '.csv')      # save as a csv
# where filepath is a string giving the location where you want to save the file (e.g. '../data/twitter_scrape1')
# and df is a pandas DataFrame object.

# Text data is sometimes awkward to handle in csvs, so while we are working in Python, saving as a pickle (.pkl) might be better.