In [4]:
import os
import json
from configparser import RawConfigParser

import requests
import pandas as pd


In [5]:
# Locate the project root as the parent of the notebook's working directory.
CUR_DIR = os.getcwd()
PROJ_DIR = os.path.join(CUR_DIR, os.pardir)
# Display only: shows the normalised absolute path as the cell output.
# PROJ_DIR itself keeps the un-normalised '<cwd>/..' form.
os.path.abspath(PROJ_DIR)

'C:\\Users\\kweni\\PycharmProjects\\dc_ndoch_2021'

In [11]:
# Pull the Twitter bearer token out of <PROJ_DIR>/config.ini (section
# "twitter", option "bearer_token") and build the Authorization header that
# every API request in this notebook sends.
config_parser = RawConfigParser()
config_parser.read(os.path.join(PROJ_DIR, 'config.ini'))
TWIT_TOKEN = f'Bearer {config_parser.get("twitter", "bearer_token")}'
REQ_HEADER = {'Authorization': TWIT_TOKEN}

In [13]:
# Handle of the account whose numeric user id is looked up by default.
TWITTER_HANDLE = 'dcfireems'
def create_user_url(user_name=TWITTER_HANDLE):
    """Build the request pieces for a user lookup by handle.

    Args:
        user_name: Twitter handle to look up; defaults to TWITTER_HANDLE.

    Returns:
        A ``(url, params)`` tuple. ``params`` is a new empty dict on every
        call, since no query parameters are sent for this lookup.
    """
    return f'https://api.twitter.com/2/users/by/username/{user_name}', {}

In [30]:
def create_tweets_url(user_id, max_results=100):
    """Build the request pieces for fetching a user's tweets.

    Args:
        user_id: User id interpolated into the endpoint path.
        max_results: Value sent as the ``max_results`` query parameter.

    Returns:
        A ``(url, params)`` tuple; ``params`` is a new dict on every call.
    """
    query = {'max_results': max_results}
    # Expand author / replied-to user references alongside the tweet fields.
    query['expansions'] = 'author_id,in_reply_to_user_id'
    query['tweet.fields'] = 'id,text,author_id,created_at'
    return f'https://api.twitter.com/2/users/{user_id}/tweets', query

In [21]:
def api_req(url, headers, params, next_token=''):
    """Send a GET request and return the decoded JSON body.

    Args:
        url: Endpoint URL.
        headers: HTTP headers to send with the request.
        params: Query parameters. The caller's object is never modified.
        next_token: When non-empty, sent as the ``pagination_token`` query
            parameter in addition to ``params``.

    Returns:
        The response body as parsed by ``resp.json()``.

    Raises:
        Exception: with args ``(status_code, response_text)`` when the
            response status is not 200.
    """
    if next_token:
        # Build a fresh dict instead of writing into the caller's one, so a
        # token from one call cannot leak into later requests that reuse the
        # same params object.
        params = dict(params or {}, pagination_token=next_token)

    resp = requests.get(url, headers=headers, params=params)
    print('Response code: ', str(resp.status_code))
    if resp.status_code != 200:
        raise Exception(resp.status_code, resp.text)
    return resp.json()

In [23]:
# Resolve the numeric user id for the default handle (TWITTER_HANDLE).
req_url, req_params = create_user_url()
resp = api_req(req_url, REQ_HEADER, req_params)
# The id is read from the 'data' object of the lookup response.
TWIT_ID = resp['data']['id']
# Bare expression so the notebook shows the id as the cell output.
TWIT_ID

Response code:  200


'20069434'

In [31]:
# Sanity check: request only 5 tweets and display the raw JSON response so the
# returned fields can be inspected before downloading the whole timeline.
req_url, req_params = create_tweets_url(user_id=TWIT_ID, max_results=5)
api_req(req_url, REQ_HEADER, req_params)

Response code:  200


{'data': [{'id': '1446627599028035587',
   'created_at': '2021-10-09T00:04:31.000Z',
   'author_id': '20069434',
   'in_reply_to_user_id': '20069434',
   'text': 'https://t.co/IzDOC4Fbtu'},
  {'id': '1446627469570760710',
   'created_at': '2021-10-09T00:04:00.000Z',
   'author_id': '20069434',
   'text': 'Students at the Sunshine Day Care Center in Ward 8 had a very special visit with #DCsBravest during #FirePreventionWeek. Not only did they learn the sounds of fire safety, they also were able to spend time getting to know the ladder truck and our Firefighters. #SaferStrongerDC https://t.co/pKQnnr4vvJ'},
  {'id': '1446619775132704768',
   'created_at': '2021-10-08T23:33:25.000Z',
   'author_id': '20069434',
   'text': '#DCsBravest have responded to a Box Alarm on the 1200 Block of Taylor Street NW. Upon arrival, @dcfireems found smoke coming from the front door of a 2 story commercial building. A fire on the first floor is contained. All searches negative. There are no injuries to repo

In [43]:
def get_all_tweets(user_id='', to_csv=True) -> pd.DataFrame:
    """Download every page of a user's tweets into a single DataFrame.

    Follows ``meta.next_token`` from page to page until the API stops
    returning one.

    Args:
        user_id: User id to fetch tweets for. When empty, the id is looked up
            first via ``create_user_url()`` (i.e. for its default handle).
        to_csv: When true and at least one page reported a ``newest_id``,
            write the result to ``raw_dcfireems_tweets_<newest_id>.csv`` in
            the current working directory (without the index column).

    Returns:
        DataFrame with one row per tweet and a fresh 0..n-1 index; empty when
        no tweets were returned.

    Raises:
        Exception: propagated from ``api_req`` on a non-200 response.
    """
    if not user_id:
        print('Fetching user_id...')
        req_url, req_params = create_user_url()
        resp = api_req(req_url, REQ_HEADER, req_params)
        user_id = resp['data']['id']

    # get first set of tweets
    print('Fetching initial tweets...')
    req_url, req_params = create_tweets_url(user_id=user_id)
    resp = api_req(req_url, REQ_HEADER, req_params)
    meta = resp.get('meta', {})
    count = meta.get('result_count', 0)
    # The last page carries no next_token; the first page may already be the
    # last one, so it must be read as tolerantly as the later pages.
    next_token = meta.get('next_token', '')
    print(f'\tReturned {count} results and next token: {next_token}')
    # NOTE(review): assumes the API omits 'newest_id' and 'data' when a page
    # has no results - confirm against the API reference.
    newest_id = meta.get('newest_id')

    # Collect raw records and build the frame once at the end, instead of
    # concatenating a growing DataFrame on every page (which also left
    # duplicate index labels in the result).
    records = list(resp.get('data', []))

    # fetch remaining tweets
    while next_token:
        print(f'Fetching tweets for {next_token}...')
        resp = api_req(req_url, REQ_HEADER, req_params, next_token)

        # update counts, next token, and collected records
        meta = resp.get('meta', {})
        count += meta.get('result_count', 0)
        next_token = meta.get('next_token', '')
        print(f'\tReturned {count} results and next token: {next_token}')

        records.extend(resp.get('data', []))
        print('Total tweets in df:', len(records))

    tweets_df = pd.DataFrame.from_records(records)

    # Without a newest_id there is nothing meaningful to name the file after.
    if to_csv and newest_id is not None:
        file_name = f'raw_dcfireems_tweets_{newest_id}.csv'
        tweets_df.to_csv(file_name, index=False)

    return tweets_df


In [39]:
# Download the timeline for the user id resolved earlier. With the default
# to_csv=True this also writes a CSV file to the current working directory.
tweets_df = get_all_tweets(user_id=TWIT_ID)
# Summarise the columns, dtypes and non-null counts of the result.
tweets_df.info()

Fetching initial tweets...
Response code:  200
	Returned 100 results and next token: 7140dibdnow9c7btw3z2ggaarcslgbe23z7u97ya02pf7
Fetching tweeks for 7140dibdnow9c7btw3z2ggaarcslgbe23z7u97ya02pf7...
Response code:  200
	Returned 200 results and next token: 7140dibdnow9c7btw3z21nftg2yhnnxas0yzr0d41o4z7
Total tweets in df: 1000
Fetching tweeks for 7140dibdnow9c7btw3z21nftg2yhnnxas0yzr0d41o4z7...
Response code:  200
	Returned 300 results and next token: 7140dibdnow9c7btw3z21j5rfz52ugxlxwhdbmcqrfwok
Total tweets in df: 1500
Fetching tweeks for 7140dibdnow9c7btw3z21j5rfz52ugxlxwhdbmcqrfwok...
Response code:  200
	Returned 400 results and next token: 7140dibdnow9c7btw3z21ctdnih15s0xrptminjru2zav
Total tweets in df: 2000
Fetching tweeks for 7140dibdnow9c7btw3z21ctdnih15s0xrptminjru2zav...
Response code:  200
	Returned 500 results and next token: 7140dibdnow9c7btw3z216fhzxjfyd58tuuhdqwyl9gl1
Total tweets in df: 2500
Fetching tweeks for 7140dibdnow9c7btw3z216fhzxjfyd58tuuhdqwyl9gl1...
Response

Unnamed: 0,created_at,author_id,in_reply_to_user_id,text,id
0,2021-10-09T00:04:31.000Z,20069434,20069434.0,https://t.co/IzDOC4Fbtu,1446627599028035587
1,2021-10-09T00:04:00.000Z,20069434,,Students at the Sunshine Day Care Center in Wa...,1446627469570760710
2,2021-10-08T23:33:25.000Z,20069434,,#DCsBravest have responded to a Box Alarm on t...,1446619775132704768
3,2021-10-08T23:10:27.000Z,20069434,,#DCsBravest were proud to help wish Ward 7 res...,1446613993381154823
4,2021-10-08T14:36:28.000Z,20069434,,"Happy Friday DC from #DCsBravest Engine 15, Re...",1446484645739175940
