In [48]:
import os
import json
import re
from configparser import RawConfigParser

import requests
import pandas as pd


In [5]:
# The project root is taken to be the parent of the kernel's working directory.
CUR_DIR = os.getcwd()
PROJ_DIR = os.path.join(CUR_DIR, os.pardir)
# Displayed only to confirm the resolved location; PROJ_DIR itself keeps the
# un-normalised '<cwd>/..' form because the abspath() result is not assigned.
os.path.abspath(PROJ_DIR)

'C:\\Users\\kweni\\PycharmProjects\\dc_ndoch_2021'

In [11]:
# load config.ini and read twitter secrets
CONFIG_PATH = os.path.join(PROJ_DIR, 'config.ini')
config_parser = RawConfigParser()
# RawConfigParser.read() silently skips files it cannot open and returns the
# list of files it actually parsed. An empty list therefore means config.ini
# was not found/readable; fail here with the path instead of letting the
# .get() below raise a misleading "No section: 'twitter'" error.
if not config_parser.read(CONFIG_PATH):
    raise FileNotFoundError(f'Could not read config file: {CONFIG_PATH}')

# The bearer token is sent verbatim in the Authorization header of every request.
TWIT_TOKEN = f'Bearer {config_parser.get("twitter", "bearer_token")}'
REQ_HEADER = {
    'Authorization': TWIT_TOKEN
}

In [13]:
# Handle of the account whose numeric user id gets looked up
TWITTER_HANDLE = 'dcfireems'


def create_user_url(user_name=TWITTER_HANDLE):
    """Build the request pieces for a username lookup.

    Args:
        user_name: Twitter handle to look up; defaults to TWITTER_HANDLE.

    Returns:
        A ``(url, params)`` tuple: the lookup endpoint for ``user_name`` and a
        new, empty query-parameter dict.
    """
    endpoint = 'https://api.twitter.com/2/users/by/username/{}'.format(user_name)
    query = dict()
    return endpoint, query

In [30]:
def create_tweets_url(user_id, max_results=100):
    """Build the request pieces for fetching a user's tweets.

    Args:
        user_id: Id of the user, interpolated into the endpoint path.
        max_results: Value sent as the ``max_results`` query parameter.

    Returns:
        A ``(url, params)`` tuple: the tweets endpoint for ``user_id`` and a
        new dict of query parameters (page size, expansions, tweet fields).
    """
    endpoint = 'https://api.twitter.com/2/users/{}/tweets'.format(user_id)

    query = dict(
        max_results=max_results,
        expansions='author_id,in_reply_to_user_id',
    )
    # Not a valid keyword name (contains a dot), so it is set by subscript.
    query['tweet.fields'] = 'id,text,author_id,created_at'

    return endpoint, query

In [21]:
def api_req(url, headers, params, next_token='', timeout=30):
    """Send a GET request and return the decoded JSON body.

    Args:
        url: Endpoint to request.
        headers: HTTP headers to send (e.g. the Authorization header).
        params: Query parameters; may be None. The dict is never modified.
        next_token: If non-empty, sent as the ``pagination_token`` query
            parameter.
        timeout: Seconds passed to ``requests.get`` as its timeout so a
            stalled connection cannot block the notebook indefinitely.

    Returns:
        The response body parsed from JSON.

    Raises:
        Exception: If the status code is not 200; ``args`` holds
            ``(status_code, response_text)``.
    """
    # Work on a copy: writing the pagination token into the caller's dict
    # would leak a stale token into any later request reusing that dict.
    query = dict(params or {})
    if next_token:
        query['pagination_token'] = next_token

    resp = requests.get(url, headers=headers, params=query, timeout=timeout)
    print('Response code: ', str(resp.status_code))
    if resp.status_code != 200:
        raise Exception(resp.status_code, resp.text)
    return resp.json()

In [23]:
# get id from username
# create_user_url() is called without arguments, so its default handle
# (TWITTER_HANDLE) is the account being resolved.
req_url, req_params = create_user_url()
resp = api_req(req_url, REQ_HEADER, req_params)
# The id is read from the 'data' object of the lookup response.
TWIT_ID = resp['data']['id']
TWIT_ID

Response code:  200


'20069434'

In [31]:
# verify recent tweets requests provide sufficient data
# Only 5 tweets are requested so the raw JSON shown as the cell output stays
# small enough to inspect by eye.
req_url, req_params = create_tweets_url(user_id=TWIT_ID, max_results=5)
api_req(req_url, REQ_HEADER, req_params)

Response code:  200


{'data': [{'id': '1446627599028035587',
   'created_at': '2021-10-09T00:04:31.000Z',
   'author_id': '20069434',
   'in_reply_to_user_id': '20069434',
   'text': 'https://t.co/IzDOC4Fbtu'},
  {'id': '1446627469570760710',
   'created_at': '2021-10-09T00:04:00.000Z',
   'author_id': '20069434',
   'text': 'Students at the Sunshine Day Care Center in Ward 8 had a very special visit with #DCsBravest during #FirePreventionWeek. Not only did they learn the sounds of fire safety, they also were able to spend time getting to know the ladder truck and our Firefighters. #SaferStrongerDC https://t.co/pKQnnr4vvJ'},
  {'id': '1446619775132704768',
   'created_at': '2021-10-08T23:33:25.000Z',
   'author_id': '20069434',
   'text': '#DCsBravest have responded to a Box Alarm on the 1200 Block of Taylor Street NW. Upon arrival, @dcfireems found smoke coming from the front door of a 2 story commercial building. A fire on the first floor is contained. All searches negative. There are no injuries to repo

In [67]:
def get_all_tweets(user_id='', to_csv=True) -> pd.DataFrame:
    """Page through a user's tweets and collect them into one DataFrame.

    Args:
        user_id: Id of the user whose tweets are fetched. When empty, the id
            is looked up first via ``create_user_url()`` (default handle).
        to_csv: When True and at least one tweet was returned, the result is
            also written to ``raw_dcfireems_tweets_<newest_id>.csv``.

    Returns:
        A DataFrame with one row per tweet and a fresh 0..n-1 index, or an
        empty DataFrame when no page contained any tweets.

    Raises:
        Exception: Propagated from ``api_req`` on a non-200 response.
    """
    if not user_id:
        print('Fetching user_id...')
        req_url, req_params = create_user_url()
        resp = api_req(req_url, REQ_HEADER, req_params)
        user_id = resp['data']['id']

    # get first set of tweets
    print('Fetching initial tweets...')
    req_url, req_params = create_tweets_url(user_id=user_id)
    resp = api_req(req_url, REQ_HEADER, req_params)
    meta = resp.get('meta', {})
    count = meta.get('result_count', 0)
    # The last page carries no 'next_token'; that may already be the first
    # page, so it is read defensively here exactly as inside the loop.
    next_token = meta.get('next_token', '')
    print(f'\tReturned {count} results and next token: {next_token}')
    newest_id = meta.get('newest_id')

    # Collect one frame per page and concatenate once at the end instead of
    # re-copying the growing frame on every iteration.
    page_frames = []
    if resp.get('data'):
        page_frames.append(pd.DataFrame.from_records(resp['data']))

    # fetch remaining tweets
    while next_token:
        print(f'Fetching tweeks for {next_token}...')
        resp = api_req(req_url, REQ_HEADER, req_params, next_token)

        # update counts, next token, and collected pages
        meta = resp.get('meta', {})
        count += meta.get('result_count', 0)
        next_token = meta.get('next_token', '')
        print(f'\tReturned {count} results and next token: {next_token}')

        # A page without a 'data' list contributes no rows and is skipped.
        if resp.get('data'):
            page_frames.append(pd.DataFrame.from_records(resp['data']))
        print('Total tweets in df:', sum(len(frame) for frame in page_frames))

    if not page_frames:
        # Nothing was returned: there is no newest_id to name a file after.
        return pd.DataFrame()

    tweets_df = pd.concat(page_frames, ignore_index=True)

    if to_csv:
        file_name = f'raw_dcfireems_tweets_{newest_id}.csv'
        tweets_df.to_csv(file_name, index=False)

    return tweets_df


In [68]:
# Download every page of tweets for the id looked up earlier. to_csv is left
# at its default (True), so this call also writes a raw_dcfireems_tweets_*.csv.
tweets_df = get_all_tweets(user_id=TWIT_ID)
tweets_df.info()

Fetching initial tweets...
Response code:  200
	Returned 100 results and next token: 7140dibdnow9c7btw3z2ggali9jgvohlrbaxxcdisa69e
Fetching tweeks for 7140dibdnow9c7btw3z2ggali9jgvohlrbaxxcdisa69e...
Response code:  200
	Returned 200 results and next token: 7140dibdnow9c7btw3z21ngprtlry0oon0utkn083pwdw
Total tweets in df: (200, 5)
Fetching tweeks for 7140dibdnow9c7btw3z21ngprtlry0oon0utkn083pwdw...
Response code:  200
	Returned 300 results and next token: 7140dibdnow9c7btw3z21j625e4ey1rbwksahtgapku9e
Total tweets in df: (300, 5)
Fetching tweeks for 7140dibdnow9c7btw3z21j625e4ey1rbwksahtgapku9e...
Response code:  200
	Returned 400 results and next token: 7140dibdnow9c7btw3z21ctdv3somk37dmt0u94a4cd2w
Total tweets in df: (400, 5)
Fetching tweeks for 7140dibdnow9c7btw3z21ctdv3somk37dmt0u94a4cd2w...
Response code:  200
	Returned 500 results and next token: 7140dibdnow9c7btw3z218i4ntkc1qu8tlwxq360q9fzt
Total tweets in df: (500, 5)
Fetching tweeks for 7140dibdnow9c7btw3z218i4ntkc1qu8tlwxq360q

In [101]:
# apply regex to find dispatch tweets
# Raw string: \d, \D and \w are regex character classes, not Python string
# escapes. In a plain literal they are invalid escape sequences that trigger
# a warning; the compiled pattern is identical.
# NOTE(review): the '.' after \w+ is unescaped, so it matches any character,
# not only a literal full stop — confirm that is intended.
REGEX = r'responded to \d+ calls on \D+\d+\w+.\D+\d+ critical and \d+ non-critical EMS'
regex = re.compile(REGEX)
# search() returns a match object or None; bool() turns that into the flag.
tweets_df['dispatch'] = tweets_df.text.apply(
    lambda x: bool(regex.search(x))
)
tweets_df.dispatch.value_counts()

False    3019
True      231
Name: dispatch, dtype: int64

In [107]:
# get dataframe of only dispatch tweets
# reset_index(drop=True) renumbers the kept rows from 0 and discards the old
# index directly, so there is no temporary 'index' column to drop in place
# afterwards (and no failure if a column named 'index' already exists).
dispatch_df = tweets_df.loc[tweets_df.dispatch, :].reset_index(drop=True)
dispatch_df.dispatch.value_counts()

True    231
Name: dispatch, dtype: int64

In [115]:
# save to file with most recent dispatch tweet as timestamp
# created_at[0] is the row labelled 0; its first 10 characters are kept and the
# dashes removed, e.g. '2021-10-09T00:04:31.000Z' -> '20211009'.
# NOTE(review): assumes row 0 is the newest dispatch tweet — confirm ordering.
time_stamp = dispatch_df.created_at[0][:10].replace('-', '')
dispatch_df.to_csv(f'dispatch_tweets_{time_stamp}.csv', index=False)