Permalink
Cannot retrieve contributors at this time
# Author: Alexis Perrier <alexis.perrier@gmail.com>
# License: BSD 3 clause
# Python 3
'''
Gets data from Twitter.
Requires Twython and pymongo (with a MongoDB server running).
Python 3.4

Usage:
    Get the follower IDs of an account that has 5000 followers:
        python twitter.py --followers --screen_name berkleecollege --n_followers=5000
    Get the timelines of the followers (limited to 100 timelines):
        python twitter.py --timelines --screen_name berkleecollege --n_followers=100

The script assumes the account name exists.
'''
from __future__ import print_function | |
import logging | |
import numpy as np | |
import sys | |
from optparse import OptionParser | |
from configparser import ConfigParser | |
import time | |
from pymongo import MongoClient | |
from twython import Twython, TwythonRateLimitError | |
from dateutil import parser | |
import datetime as dt | |
from datetime import datetime | |
def is_recent(twt):
    '''Check that the tweet is more recent than n_days.

    twt is a tweet dictionary with a 'created_at' timestamp string.
    The cutoff is computed from the module-level ``n_days`` setting.
    Returns True when the tweet was created less than ``n_days`` days ago.
    '''
    created_at = parser.parse(twt['created_at'])
    if created_at.tzinfo is None:
        # No UTC offset in the timestamp: treat it as UTC, which is what the
        # previous implementation implicitly did by dropping the offset.
        # NOTE(review): confirm the API never sends local times.
        created_at = created_at.replace(tzinfo=dt.timezone.utc)
    # Compare timezone-aware values so that the result does not depend on the
    # local timezone of the machine running the script.
    cutoff = dt.datetime.now(dt.timezone.utc) - dt.timedelta(days=n_days)
    return created_at > cutoff
def corpus_status(screen_name):
    ''' Print the state of the stored corpus for ``screen_name``.

    Reads every document of ``db.tweets`` whose 'parent' is ``screen_name``
    and prints: the number of documents, the total and average number of
    tweets per document, the mean and standard deviation of the text length,
    and how many documents are longer than that mean.

    Returns nothing. An empty corpus, or a corpus where no document has any
    text, is reported without computing the averages.
    '''
    print("\n-------- Corpus --------")
    # Read the cursor once instead of rewinding it for every statistic
    timelines = list(db.tweets.find({'parent': screen_name}))
    documents = [tw['raw_text'] for tw in timelines]
    print(" We have " + str(len(documents)) + " documents ")
    if not documents:
        # Nothing stored yet: the averages below would divide by zero
        return

    n_tweets = sum(tw['n_tweets'] for tw in timelines
                   if tw.get('n_tweets', 0) > 0)
    print()
    print(" Total number of tweets: ", n_tweets)
    print(" On average #tweets per document: %0.2f" %
          (n_tweets / len(documents)))

    # Only documents that actually contain text take part in the statistics
    len_text = [tw['len_text'] for tw in timelines
                if tw.get('len_text', 0) > 0]
    if not len_text:
        # np.mean of an empty list is nan (with a runtime warning)
        print(" No document has any text")
        return

    m_len_text = np.mean(len_text)
    print(" Text length: Mean: %0.2f STD: %0.2f"
          % (m_len_text, np.std(len_text)))
    print()
    above_avg = [length for length in len_text if length > m_len_text]
    # A count is an integer: print it as such
    print(" => %d documents above average: " % len(above_avg))
def followers_status(screen_name):
    '''Print how many follower IDs are stored for ``screen_name``.

    Looks up the record of ``screen_name`` in ``db.followers`` and prints the
    length of its 'ids' list. A missing record is reported as 0 IDs.
    '''
    followers = db.followers.find_one({"screen_name": screen_name})
    # find_one returns None when no record matches the query
    ids = followers.get('ids', []) if followers else []
    print("We have %s follower IDs for %s" % (len(ids), screen_name))
def wait_for_awhile():
    '''Sleep until the rate limit of the last Twitter call is reset.

    Reads the 'x-rate-limit-reset' header (a Unix timestamp) of the last
    request made through the module-level ``twitter`` client, prints the
    waiting time and sleeps until that time plus a 10 second margin.
    '''
    reset = twitter.get_lastfunction_header('x-rate-limit-reset')
    if reset is None:
        # Header missing from the last response: wait a full 15 minutes.
        # NOTE(review): 15 minutes is Twitter's documented rate-limit window
        # for the v1.1 API - confirm it still applies.
        wait = 15 * 60 + 10
    else:
        wait = max(int(reset) - time.time(), 0) + 10
    # Split whole seconds so the seconds part can never be displayed as "60s"
    minutes, seconds = divmod(int(wait), 60)
    print("Rate limit exceeded waiting: %sm %0.0fs" % (minutes, seconds))
    time.sleep(wait)
print(__doc__)

# Display progress logs on stdout
logging.basicConfig(level=logging.INFO,
                    format='>>> %(asctime)s %(levelname)s %(message)s')

# ---------------------------------------------------------
#  parse commandline arguments
# ---------------------------------------------------------
op = OptionParser()
op.add_option("-s", "--screen_name",
              dest="screen_name", type="string",
              help="Screen name of the main account")
op.add_option("-f", "--followers",
              action="store_true", dest="followers", default=False,
              help="Extracts IDs of screen_name followers from Twitter")
op.add_option("-t", "--timelines",
              action="store_true", dest="timelines", default=False,
              help="Extracts timelines of the followers from Twitter")
op.add_option("-d", "--dbname", dest="dbname", default='twitter',
              help="Name of the MongoDB database")
# type="int" lets optparse reject a non numeric value with a usage message
# instead of a ValueError traceback
op.add_option("-n", "--n_followers", dest="n_followers", type="int",
              default=5000,
              help="Number of follower IDs; 5000 at a time")

# Initialize
(opts, args) = op.parse_args()
print(opts)

if not opts.screen_name:
    # Without this check the .lower() call below fails on None
    op.error("--screen_name is required")

screen_name = opts.screen_name.lower()  # The main twitter account
n_days = 180                            # Only tweets more recent than n_days are kept
n_followers = int(opts.n_followers)
# ---------------------------------------------------------
#  Twitter Connection: credentials stored in twitter.cfg
# ---------------------------------------------------------
config = ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the
# list of files it parsed: fail early with a readable message instead of a
# KeyError on the lookups below.
if not config.read('twitter.cfg') or not config.has_section('credentials'):
    sys.exit("Unable to read the [credentials] section of twitter.cfg")

# for py27 change config to get_config
APP_KEY = config['credentials']['app_key']
APP_SECRET = config['credentials']['app_secret']

# Exchange the application key / secret for an OAuth 2 access token, then
# build the client that is used for every API call of the script.
twitter = Twython(APP_KEY, APP_SECRET, oauth_version=2)
ACCESS_TOKEN = twitter.obtain_access_token()
twitter = Twython(APP_KEY, access_token=ACCESS_TOKEN)

# MongoDB connection (pymongo default connection settings)
client = MongoClient()
db = client[opts.dbname]
# -----------------------------------------------------------
#  Follower IDs
# -----------------------------------------------------------
if opts.followers:
    followers_query_size = min(n_followers, 5000)  # at most 5000 IDs per request
    print("Retrieving %s followers" % n_followers)

    # ------------------------------------------------------------------
    #  1) get follower ids
    #  see https://dev.twitter.com/rest/reference/get/followers/ids
    # ------------------------------------------------------------------
    # Initialize the database followers record for that screen_name
    if db.followers.find_one({"screen_name": screen_name}) is None:
        db.followers.insert_one({"screen_name": screen_name, "ids": []})

    # cursor is used to navigate a twitter collection
    # https://dev.twitter.com/overview/api/cursoring
    next_cursor = -1
    follower_ids = list()
    ids_count = 0

    # A cursor equal to 0 means that the last page has been read
    while (next_cursor != 0) and (ids_count < n_followers):
        try:
            print("Followers %s to %s: cursor: %s" %
                  (ids_count, ids_count + followers_query_size, next_cursor))
            result = twitter.get_followers_ids(screen_name=screen_name,
                                               count=followers_query_size,
                                               cursor=next_cursor)
            next_cursor = result['next_cursor']
            ids_count += len(result['ids'])

            # make sure the list only has unique IDs and sort
            follower_ids = sorted(set(follower_ids).union(result['ids']))
            print("Retrieved %s follower IDs from twitter" % len(follower_ids))

            # store what we've got so far
            # insert follower_ids in the followers collection
            res = db.followers.update_one(
                {"screen_name": screen_name},
                {'$set': {"ids": follower_ids}}
            )
            if res.matched_count == 0:
                print("Unable to update IDs for: ", screen_name)
            elif res.modified_count == 0:
                print("%s IDs not modified" % screen_name)
            else:
                print("%s now has %s IDs " % (screen_name, str(len(follower_ids))))

            followers_status(screen_name)

        except TwythonRateLimitError:
            # Wait if we hit the Rate limit; the cursor is unchanged so the
            # same page is requested again on the next iteration
            followers_status(screen_name)
            wait_for_awhile()
        except Exception:
            # Not a bare except: Ctrl-C must still be able to stop the script.
            # Stop instead of retrying: a failed request leaves the cursor
            # unchanged, so retrying would repeat the same failing request
            # forever. The IDs stored so far are kept.
            print(" FAILED: Unexpected error:", sys.exc_info()[0])
            break
# -----------------------------------------------------------
#  Timelines
# -----------------------------------------------------------
if opts.timelines:
    # catch IDs that error out
    error_ids = list()

    # List of follower IDs
    followers = db.followers.find_one({"screen_name": screen_name})
    # find_one returns None when no follower record has been stored
    stored_ids = followers['ids'] if followers else []
    print("Retrieving timelines of %s followers" % len(stored_ids))

    # Get all timelines or limited to n_followers
    if n_followers is None:
        follower_ids = stored_ids
    else:
        # [:n_followers] keeps exactly n_followers IDs
        follower_ids = stored_ids[:n_followers]

    # Index based loop so that a follower can be retried after a rate limit
    position = 0
    while position < len(follower_ids):
        user_id = follower_ids[position]
        try:
            # get the tweets for that account's timeline
            params = {'user_id': user_id, 'count': 200,
                      'contributor_details': 'true'}
            timeline = twitter.get_user_timeline(**params)

            # keep only recent_tweets
            recent_tweets = [twt for twt in timeline if is_recent(twt)]

            # Aggregate the tweets to create the document.
            # An empty timeline raises IndexError on timeline[0]; it is
            # recorded as a failed ID by the generic handler below.
            text = ' '.join([tw['text'] for tw in recent_tweets])
            item = {
                'raw_text': text,
                'user_id': user_id,
                'len_text': len(text),
                'n_tweets': len(recent_tweets),
                'screen_name': timeline[0]['user']['screen_name'],
                'lang': timeline[0]['lang'],
                'parent': screen_name,
            }

            # Create the account record or replace the existing one in a
            # single call (Cursor.count() is no longer available in recent
            # pymongo releases)
            res = db.tweets.replace_one(
                {'user_id': user_id, 'parent': screen_name}, item,
                upsert=True
            )

            # result of the update
            if res.upserted_id is not None:
                print("New account:", timeline[0]['user']['screen_name'],
                      user_id, len(recent_tweets), timeline[0]['lang'])
            elif res.modified_count == 0:
                print("no modification for id: ", user_id)
            else:
                print("replaced ", timeline[0]['user']['screen_name'],
                      user_id, len(recent_tweets), timeline[0]['lang'])

        except TwythonRateLimitError:
            # Wait if we hit the Rate limit, then retry the same follower
            # instead of silently dropping it
            corpus_status(screen_name)
            wait_for_awhile()
            continue
        except Exception:
            # Keep track of the ID that errored out (not a bare except:
            # Ctrl-C must still be able to stop the script)
            error_ids.append(user_id)
            print(" FAILED:", user_id)
            print("Unexpected error:", sys.exc_info()[0])

        position += 1

    # ---------------------------------------------------------
    #  check how many documents we now have in the Database
    # ---------------------------------------------------------
    print("The following IDs errored out:", str(error_ids))
    corpus_status(screen_name)