# Twitter Covid Stream Tristate

* From the docs: The streaming API uses the following heuristic to determine whether a given Tweet falls within a bounding box:
    * If the coordinates field is populated, the values there will be tested against the bounding box. Note that this field uses geoJSON order (longitude, latitude).
    * If coordinates is empty but place is populated, the region defined in place is checked for intersection against the locations bounding box. Any overlap will match.
    * If none of the rules listed above match, the Tweet does not match the location query. Note that the geo field is deprecated, and ignored by the streaming API.

In [None]:
import tweepy

import pandas as pd
import math
import numpy as np
import scipy as sp

import re
import os
import sys
import json
import time

# Notebook display settings: show up to 100 columns / 200 rows before pandas
# truncates the output, and render floats with 4 decimal places.
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 200)
# Use the fully qualified key: option names are matched as patterns, and the
# short form 'precision' can match more than one option in newer pandas
# releases, which makes set_option raise OptionError.
pd.set_option('display.precision', 4)

## Program Parameters

In [None]:
# Access credentials
# Read the Twitter API credentials from environment variables so that real
# secrets never have to be saved inside the notebook. The "REDACTED"
# placeholders are kept as fallbacks; export the variables (or replace the
# placeholders locally) before running the stream.
CONSUMER_KEY = os.environ.get("TWITTER_CONSUMER_KEY", "REDACTED")
CONSUMER_SECRET = os.environ.get("TWITTER_CONSUMER_SECRET", "REDACTED")
ACCESS_KEY = os.environ.get("TWITTER_ACCESS_KEY", "REDACTED")
ACCESS_SECRET = os.environ.get("TWITTER_ACCESS_SECRET", "REDACTED")

In [None]:
# Paths
# Each directory string keeps its trailing slash so that a sub-directory or
# file name can be appended to it directly.
BASE_PATH = '/home/ubuntu/covid/twitter/'
DATA_PATH = f'{BASE_PATH}data/'
RAW_DATA_PATH = f'{DATA_PATH}raw_data/'

In [None]:
# Constants
# Pause length used after an interrupted connection (value handed to time.sleep, i.e. seconds).
DEFAULT_SLEEP = 60

# Filter labels. NOTE(review): 'bb' / 'kw' presumably stand for "bounding box"
# and "keyword" -- confirm against the code that consumes these lists.
STREAM_FILTERS = [
    'bb',
    'kw',
]
DATA_FILTERS = [
    'kw',
]

In [None]:
# Constants
# Keyword lists used to decide whether a streamed tweet is COVID related.
# Every list is stored lower-cased because matching is done against
# lower-cased tweet text.


def _lowercase_unique(terms):
    """Return *terms* lower-cased, with duplicates removed and first-seen order kept.

    Several spellings differ only by case (e.g. 'handwashing' / 'Handwashing');
    without de-duplication such a term would be reported twice per matching tweet.
    """
    return list(dict.fromkeys(term.lower() for term in terms))


GENERAL_TERMS = _lowercase_unique([
    'COVID', 'COVID19', 'covid_19', 'COVIDー19', 'COVID2019', 'Covid19us', '2019-nCoV', 'COVD19', 'CODVID19',
    'Covid-19', 'nCoV2019', 'covid2020', 'Covid19pandemic', 'covid19out', 'covidkindness', 'covidiot',
    'covidiots', 'Knowcovid', 'SARSCoV2',
    'corona', 'coronavirus', 'coronovirus', 'coronaviruspandemic', 'Coronaeffect', 'coronavirususa',
    'coronavirusoutbreak', 'coronavirusupdate', 'Coronapocalypse', 'coronapocolypse', 'CoronaOutbreak',
    # The original entry was written as 'the \’rona': the backslash is not a
    # valid escape, so it stayed in the string and the term could never match.
    # Both the straight and the typographic apostrophe are listed instead.
    "the 'rona", 'the ’rona',
    'the roni', 'virus', 'Pandemic',
    'MyPandemicSurvivalPlan', 'PreventEpidemics', 'publichealth', 'flattenthecurve', 'Quarantine',
    'Quarantinelife', 'quarantineactivities', 'Quarentineandchill', 'QuarantineAndChill', 'SocialDistancing',
    'socialdistancingnow', 'selfisolating', 'HarmReduction', 'LockDownSA', 'lockdowneffect',
    'lockdownextension', 'lockdowndiaries', 'Stayhome', 'istayhome', 'istayathome', 'Stayathome',
    'StayTheFHome', 'stayhomestaysafe', 'StayAtHomeSaveLives', 'stayhomesavelives',
    'iwillsurvivechallenge', 'StayAtHomeChallenge', 'ViewFromMyWindow', 'TogetherAtHome', 'Withme',
    'Alonetogether', 'inthistogether', 'untiltomorrow', 'ImDoingFineBecause', 'Staysafe', 'see10send10',
    'seeapupsendapup', 'Safehands', 'handwashing', 'Handwashing', 'washyourhands', 'workfromhome',
    'Peoplehavetowork', 'essentialworkers', 'ThanksHealthHeroes', 'healthcareheroes', 'GetMePPE', 'Mask',
    'facemasks', 'Pdoh', 'Sdoh', 'hiap',
])

FOOD_TERMS = _lowercase_unique([
    'TooSmallToFail', 'SaveAmericanHospitality', 'SaveRestaurants', 'RestaurantRecovery',
    'ReliefForRestaurants', 'RallyForRestaurants', 'SupportLocalRestaurants', 'SupportLocal',
    'SupportLocalBusiness', 'TheGreatAmericanTakeout', 'CarryOut', 'OrderIn', 'CurbSide', 'CurbSidePickup',
    'DineLocal', 'StillOpen', 'WereOpen',
])

POLITICAL_TERMS = _lowercase_unique([
    'trumpownseverydeath', 'Trumpliedpeopledied', 'Trumpliesamericansdie', 'trumpliespeopledie',
    'Wisconsinpandemicvoting', 'Trumpgenocide', 'trumppandemic', 'gopgenocide',
])

STIGMA_TERMS = _lowercase_unique([
    'HateIsAVirus', 'WashTheHate', 'RacismIsAVirus', 'IAmNotCOVID19',
])

CONSPIRACY_TERMS = _lowercase_unique([
    'filmyourhospital', 'filmyourhospitals', 'filmyourhospitalchallenge', 'emptyhospital', 'dempanic',
    'plandemic', '5gkills', '5gconspiracy',
])

# Combined list (de-duplicated across categories as well), a lower-cased copy
# kept under its original name, and a '|'-joined string of all terms.
SEARCH_TERMS = _lowercase_unique(
    GENERAL_TERMS + FOOD_TERMS + POLITICAL_TERMS + STIGMA_TERMS + CONSPIRACY_TERMS
)
SEARCH_TERMS_LOWER = [word.lower() for word in SEARCH_TERMS]
SEARCH_STRING = '|'.join(SEARCH_TERMS).lower()

In [None]:
# Authentication
# Build the handler from the consumer credentials, attach the access token,
# then create the API client from that handler.
auth = tweepy.OAuthHandler(
    consumer_key=CONSUMER_KEY,
    consumer_secret=CONSUMER_SECRET,
)
auth.set_access_token(ACCESS_KEY, ACCESS_SECRET)
api = tweepy.API(auth)

## Class and Function Definitions

In [None]:
class CustomStreamListener(tweepy.StreamListener):
    """Stream listener that keeps only statuses whose text mentions a search term."""

    def __init__(self):
        super().__init__()

    def on_status(self, status):
        """Pass the status to ``process_tweet`` if its text contains any search term.

        The text is lower-cased and each entry of ``SEARCH_TERMS_LOWER`` is
        tested as a plain substring; every matching term is forwarded.
        """
        payload = status._json
        # Use the 'full_text' of the 'extended_tweet' object when the payload
        # has one; otherwise fall back to the status' ``text`` attribute.
        if 'extended_tweet' in payload:
            tweet_text = payload['extended_tweet']['full_text'].lower()
        else:
            tweet_text = status.text.lower()

        matched_terms = [term for term in SEARCH_TERMS_LOWER if term in tweet_text]
        if matched_terms:
            process_tweet(status, matched_terms)

    def on_error(self, status_code):
        """Report the error status code on stderr and keep the stream alive."""
        print(f'Encountered error with status code: {status_code}', file=sys.stderr)
        # Don't kill the stream
        return True

    def on_timeout(self):
        """Report the timeout on stderr and keep the stream alive."""
        print('Timeout...', file=sys.stderr)
        # Don't kill the stream
        return True

In [None]:
def process_tweet(status, keywords=None):
    """Append one collected tweet to the raw-data CSV file.

    A single header-less row is appended to ``RAW_DATA_PATH + 'raw_data.csv'``.
    It holds the DataFrame row index (always 0 for a one-row frame), the tweet
    payload serialised as JSON, and the matched keywords joined by commas.

    Args:
        status: Object whose ``_json`` attribute is the JSON-serialisable
            tweet payload.
        keywords: Iterable of matched keyword strings. ``None`` (the default)
            or an empty iterable is stored as an empty keyword field.
    """
    print('Collecting status data...')
    status_json = json.dumps(status._json)
    # The signature advertises keywords=None, which ','.join() cannot accept.
    keyword_str = ','.join(keywords or [])

    target = RAW_DATA_PATH + 'raw_data.csv'
    # Appending fails when the output directory is missing, so create it on demand.
    os.makedirs(os.path.dirname(target) or '.', exist_ok=True)
    pd.DataFrame([[status_json, keyword_str]]).to_csv(target, mode='a', header=False)

## Main Program
* Bounding-box filter for the Philadelphia tri-state area

In [None]:
# Keep the stream running: whenever the connection fails, wait DEFAULT_SLEEP
# seconds and open a fresh stream.
while True:
    try:
        sapi = tweepy.streaming.Stream(auth, CustomStreamListener())
        # Bounding box as longitude/latitude pairs: south-west corner first,
        # then north-east corner.
        sapi.filter(locations=[-75.9814453125, 39.53370327008705,
                               -74.37744140625, 40.43858586704331])
    except Exception as exc:
        # Catch Exception rather than using a bare ``except``: a bare clause
        # also swallows KeyboardInterrupt/SystemExit, which made it impossible
        # to stop the collector by interrupting it.
        print('Connection interrupted! Sleeping for', DEFAULT_SLEEP, 'seconds...')
        # Record the cause so repeated failures can be diagnosed.
        print('Cause:', repr(exc), file=sys.stderr)
        time.sleep(DEFAULT_SLEEP)

In [1]:
# AWS cost calculations for US stream
days = 365
ingress = 150  # gb/day

# Storage prices per tier.
gb_month_glacier = 0.00099  # $/gb-month
gb_month_standard = 0.023

# Daily price of one gb, treating a month as 30 days.
gb_day_cost_glacier = gb_month_glacier / 30
gb_day_cost_standard = gb_month_standard / 30

# Daily price of holding a single day's worth of ingested data.
day_cost_glacier = ingress * gb_day_cost_glacier
day_cost_standard = ingress * gb_day_cost_standard

# Accumulate the bill while the stored volume grows: the term for step k
# charges for k days' worth of ingested data, with k running from 1 to days - 1.
glacier_cost_ammortized = 0
standard_cost_ammortized = 0
for stored_days in range(1, days):
    glacier_cost_ammortized += stored_days * day_cost_glacier
    standard_cost_ammortized += stored_days * day_cost_standard

In [3]:
# Glacier tier, accumulated day by day: the total and the total divided by 12.
print('Glacier: Ammortized')
print(f'Avg. Monthly Cost: {glacier_cost_ammortized / 12}')
print(f'Total Cost: {glacier_cost_ammortized}')

Glacier: Ammortized
Avg. Monthly Cost: 27.402375000000006
Total Cost: 328.8285000000001


In [5]:
# Glacier tier, priced as if the whole period's data (ingress * days) were
# stored from the start: one month's charge, and that charge times 12.
print('Glacier: Bulk')
print(f'Avg. Monthly Cost: {ingress*days*gb_month_glacier}')
print(f'Total Cost: {ingress*days*gb_month_glacier*12}')

Glacier: Bulk
Avg. Monthly Cost: 54.2025
Total Cost: 650.4300000000001


In [7]:
# Standard tier, accumulated day by day: the total and the total divided by 12.
print('Standard: Ammortized')
print(f'Avg. Monthly Cost: {standard_cost_ammortized / 12}')
print(f'Total Cost: {standard_cost_ammortized}')

Standard: Ammortized
Avg. Monthly Cost: 636.6208333333333
Total Cost: 7639.45


In [9]:
# Standard tier, priced as if the whole period's data (ingress * days) were
# stored from the start: one month's charge, and that charge times 12.
print('Standard: Bulk')
print(f'Avg. Monthly Cost: {ingress*days*gb_month_standard}')
print(f'Total Cost: {ingress*days*gb_month_standard*12}')

Standard: Bulk
Avg. Monthly Cost: 1259.25
Total Cost: 15111.0


## Unused / Deprecated Code