In [110]:
import csv
import json
import random
import time
from urllib.parse import quote

import numpy as np

import requests
from requests_oauthlib import OAuth1

In [2]:
# load API access tokens
# (named explicitly rather than with `*`, so it is clear where each credential comes from)
from SECRET_DO_NOT_PUBLISH import (
    CONSUMER_KEY,
    CONSUMER_SECRET,
    ACCESS_TOKEN,
    ACCESS_TOKEN_SECRET,
)

In [3]:
# OAuth 1 credentials object built from the four API tokens; get_history passes it
# as `auth=` on every request.
auth = OAuth1(CONSUMER_KEY, CONSUMER_SECRET, ACCESS_TOKEN, ACCESS_TOKEN_SECRET)

In [23]:
# URL template for the user_timeline request; `{}` is filled in with the screen name.
#
# Query parameters:
#   screen_name : user to fetch
#   count       : number of tweets to limit to (max 200) -- 60 here, as in the main paper
#   tweet_mode  : `extended` is needed to return non-truncated tweets
#   trim_user   : `true` because the complete users' details are not needed
endpoint = (
    'https://api.twitter.com/1.1/statuses/user_timeline.json'
    '?screen_name={}'
    '&count=60'
    '&tweet_mode=extended'
    '&trim_user=true'
)

In [72]:
def get_history(screen_name, timeout=30):
    """Fetch one user's timeline and return just the text of each post.

    Parameters
    ----------
    screen_name : str
        Account to request; substituted into the module-level `endpoint` template.
    timeout : float, optional
        Seconds to wait on the server before giving up (default 30). Without a
        timeout, `requests` can block indefinitely on a stalled connection.

    Returns
    -------
    list of dict
        One `{'body': <full_text of the post>}` entry per post in the JSON response.

    Raises
    ------
    requests.HTTPError
        If the response status is an error (via `raise_for_status`).
    requests.Timeout
        If the server does not answer within `timeout` seconds.
    """
    # percent-encode the name so an unexpected character cannot alter the query string
    url = endpoint.format(quote(screen_name, safe=''))

    response = requests.get(url, auth=auth, timeout=timeout)
    response.raise_for_status()

    # parse out just the text of the posts
    return [{'body': post['full_text']} for post in response.json()]

# load users to request

In [73]:
# Collect the screen names to scrape: the first column of every row in the CSV.
# newline='' hands line-ending handling to the csv module, as its documentation asks.
with open('random_users_full.csv', newline='') as f:
    # csv.reader yields [] for a blank line; skip those instead of failing on row[0]
    users_to_request = [row[0] for row in csv.reader(f) if row]

print(f'Loaded {len(users_to_request):,d} usernames from file.')

Loaded 9,817 usernames from file.


In [74]:
# Set TESTING_MODE to True to try the scrape on a small prefix of the user list.
TESTING_MODE = False
TESTING_USER_LIMIT = 20   # how many users to keep when TESTING_MODE is on

if TESTING_MODE:
    users_to_request = users_to_request[:TESTING_USER_LIMIT]
    print('TESTING MODE: ignoring most users')

TESTING MODE: ignoring most users


In [75]:
# Request every user's history once. Successful histories are appended to
# twitter_posts.jsonl as one {screen_name: history} JSON object per line;
# screen names whose request failed are listed in error_names.txt.
success_count = error_count = tweet_count = 0
total_users = len(users_to_request)

start_time = time.monotonic()

with open('twitter_posts.jsonl', 'w') as f, open('error_names.txt', 'w') as errorfile:
    for i, screen_name in enumerate(users_to_request):
        label = screen_name + "..."
        print(f'[{i+1:04d} / {total_users:04d}] Requesting history for {label:<20}', end='')

        try:
            history = get_history(screen_name)
            f.write(json.dumps({screen_name: history}) + '\n')
            print(f' wrote {len(history):02d} tweets to file.')
        except Exception as e:
            # best effort: report the failure, record the name, carry on with the next user
            print(' Error!', '', e, '', sep='\n')
            errorfile.write(screen_name + '\n')
            error_count += 1
        else:
            success_count += 1
            tweet_count += len(history)

        # wait one second before the next request
        time.sleep(1)

elapsed_hours = (time.monotonic() - start_time) / 3600
print(f'Finished in {elapsed_hours:6.2f} hours. Saved {tweet_count:,d} tweets from {success_count:,d} users.')
print(f'Failed on {error_count:,d} users.')

[0001 / 0020] Requesting history for Nik____ka...         wrote 60 tweets to file.
[0002 / 0020] Requesting history for EmbraceMeHere...     wrote 60 tweets to file.
[0003 / 0020] Requesting history for realmattcooke...     wrote 60 tweets to file.
[0004 / 0020] Requesting history for ItsTexasT03...       Error!

401 Client Error: Authorization Required for url: https://api.twitter.com/1.1/statuses/user_timeline.json?screen_name=ItsTexasT03&count=60&tweet_mode=extended&trim_user=true

[0005 / 0020] Requesting history for CjWilson850...       wrote 60 tweets to file.
[0006 / 0020] Requesting history for TotalTrafficSEA...   wrote 60 tweets to file.
[0007 / 0020] Requesting history for TheHobbySpyder...    Error!

404 Client Error: Not Found for url: https://api.twitter.com/1.1/statuses/user_timeline.json?screen_name=TheHobbySpyder&count=60&tweet_mode=extended&trim_user=true

[0008 / 0020] Requesting history for 35mmPapi...          wrote 60 tweets to file.
[0009 / 0020] Requesting histo

# load and resave posts as numpy

now that we've finished scraping.

In [103]:
# Rebuild the {screen_name: history} mapping from the JSON-lines file
# (each line holds one JSON object) and count the tweets along the way.
data = {}
num_tweets = 0

with open('twitter_posts.jsonl') as f:
    for record in map(json.loads, f):
        data.update(record)
        num_tweets += sum(map(len, record.values()))

print(f'Loaded {len(data):,d} histories from file. {num_tweets:,d} tweets in total.')


Loaded 8,927 histories from file. 507,113 tweets in total.


In [108]:
# Keep only the users whose history holds at least 10 tweets, then report what is left.
data = dict(entry for entry in data.items() if len(entry[1]) >= 10)

remaining_tweets = sum(map(len, data.values()))
print(f'Filtered down to {len(data):,d} histories with at least 10 tweets.')
print(f'{remaining_tweets:,d} tweets total.')

Filtered down to 8,764 histories with at least 10 tweets.
506,378 tweets total.


In [114]:
# Randomly down-sample to a fixed number of histories.
SAMPLE_SIZE = 8000   # number of histories to keep
SAMPLE_SEED = 0      # fixed seed so that re-running on the same input picks the same users

# random.sample() requires a sequence: passing the dict's items view directly is
# deprecated since Python 3.9 and raises TypeError from Python 3.11 on.
histories = list(data.items())

# asking for more items than exist raises ValueError (e.g. on a TESTING_MODE run)
sample_size = min(SAMPLE_SIZE, len(histories))

# a private generator leaves the global `random` state untouched
data = dict(random.Random(SAMPLE_SEED).sample(histories, k=sample_size))

print(f'Randomly sampled down to {len(data):,d} histories.')
print(f'{sum([len(h) for h in data.values()]):,d} tweets total.')

Randomly sampled down to 8,000 histories.
462,108 tweets total.


In [116]:
# Persist the histories: the dict is wrapped in a one-element object array,
# which np.save stores by pickling it inside the .npy file.
posts_array = np.array([data], dtype=object)
np.save('twitter_posts.npy', posts_array)
print('Rewrote posts to .npy format.')

Rewrote posts to .npy format.


In [94]:
# Example of reading the file back. allow_pickle=True is required because the saved
# array holds a Python dict; only use it on files you produced yourself, since
# unpickling can execute arbitrary code. The dict itself is the array's single element.
# posts = np.load('twitter_posts.npy', allow_pickle=True)