Permalink
Switch branches/tags
Nothing to show
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
274 lines (229 sloc) 10.2 KB
# Author: Alexis Perrier <alexis.perrier@gmail.com>
# License: BSD 3 clause
# Python 3
'''
Gets data from Twitter.
Requires Twython and pymongo (and mongo Db running)
Python 3.4
Usage:
Get follower IDs for an account that has 5000 followers
python twitter.py --followers --screen_name berkleecollege --n_followers=5000
Get timelines of followers (limit to 100 TLs):
python twitter.py --timelines --screen_name berkleecollege --n_followers=100
The script assumes the account name exists
'''
from __future__ import print_function
import logging
import numpy as np
import sys
from optparse import OptionParser
from configparser import ConfigParser
import time
from pymongo import MongoClient
from twython import Twython, TwythonRateLimitError
from dateutil import parser
import datetime as dt
from datetime import datetime
def is_recent(twt):
    '''Tell whether a tweet was created less than ``n_days`` days ago.

    Parameters
    ----------
    twt : dict
        A tweet as returned by the Twitter API; only its ``'created_at'``
        string is read.

    Returns
    -------
    bool
        True when the tweet is newer than now minus ``n_days`` (module-level
        setting), False otherwise.
    '''
    created = parser.parse(twt['created_at'])
    if created.tzinfo is not None:
        # Convert to UTC *before* dropping the offset; merely stripping the
        # tzinfo would keep the wall-clock value of whatever zone was parsed.
        created = created.astimezone(dt.timezone.utc).replace(tzinfo=None)
    # NOTE(review): a timestamp parsed without any offset is assumed to
    # already be expressed in UTC -- confirm if other sources are fed in.

    # Build the cutoff in UTC as well, so both sides of the comparison are
    # naive-UTC values instead of mixing UTC with the machine's local time.
    now_utc = dt.datetime.now(dt.timezone.utc).replace(tzinfo=None)
    return created > now_utc - dt.timedelta(days=n_days)
def corpus_status(screen_name):
    '''Print summary statistics of the corpus stored for ``screen_name``.

    Reads every document of ``db.tweets`` whose ``'parent'`` equals
    ``screen_name`` and prints: the number of documents, the total and
    per-document average number of tweets, the mean / standard deviation of
    the text length, and how many documents are longer than that mean.

    Parameters
    ----------
    screen_name : str
        Value matched against the ``'parent'`` field of the stored documents.

    Returns
    -------
    None
        Results are printed only.
    '''
    print("\n-------- Corpus --------")
    # Read the cursor once; the statistics below need several passes and a
    # list is cheaper and simpler than rewinding the cursor each time.
    docs = list(db.tweets.find({'parent': screen_name}))
    print(" We have " + str(len(docs)) + " documents ")
    if not docs:
        # Nothing stored yet: the averages below would divide by zero.
        return

    # Only strictly positive counts contribute, as documents may lack the key.
    n_tweets = sum(tw['n_tweets'] for tw in docs
                   if tw.get('n_tweets', 0) > 0)
    print()
    print(" Total number of tweets: ", n_tweets)
    print(" On average #tweets per document: %0.2f" %
          (n_tweets / len(docs)))

    lengths = [tw['len_text'] for tw in docs
               if tw.get('len_text', 0) > 0]
    if lengths:
        mean_len, std_len = np.mean(lengths), np.std(lengths)
    else:
        # np.mean([]) would emit a RuntimeWarning; report NaN explicitly.
        mean_len = std_len = float('nan')
    print(" Text length: Mean: %0.2f STD: %0.2f"
          % (mean_len, std_len))
    print()

    # Any length above the (positive) mean is necessarily in ``lengths``;
    # comparisons against NaN are False, giving 0 when no length is known.
    n_above = sum(1 for length in lengths if length > mean_len)
    print(" => %0.2f documents above average: " % n_above)
def followers_status(screen_name):
    '''Print how many follower IDs are stored for ``screen_name``.

    Parameters
    ----------
    screen_name : str
        Value matched against the ``'screen_name'`` field of ``db.followers``.

    Returns
    -------
    None
        The count is printed only.
    '''
    record = db.followers.find_one({"screen_name": screen_name})
    # find_one returns None when no record matches; report zero IDs in that
    # case instead of failing on None['ids'].
    ids = (record.get('ids') or []) if record else []
    print("We have %s follower IDs for %s" % (len(ids), screen_name))
def wait_for_awhile():
    '''Sleep until the rate limit of the last Twitter call has been reset.

    Reads the ``x-rate-limit-reset`` header (an epoch timestamp) of the last
    request made through the module-level ``twitter`` client, then sleeps
    until that time plus a 10 second safety margin. The waiting time is
    printed before sleeping.

    Returns
    -------
    None
    '''
    reset = twitter.get_lastfunction_header('x-rate-limit-reset')
    if reset is None:
        # Header not present in the last response: fall back to a full
        # 15 minute wait (the length of a Twitter REST rate-limit window)
        # rather than failing on int(None).
        wait = 15 * 60
    else:
        wait = max(int(reset) - time.time(), 0) + 10

    # Split one rounded value so the display can never read "0m 60s".
    minutes, seconds = divmod(int(round(wait)), 60)
    print("Rate limit exceeded waiting: %sm %0.0fs" %
          (minutes, seconds))
    time.sleep(wait)
print(__doc__)

# Configure the format of the progress logs. Without an explicit stream,
# logging.basicConfig attaches a handler that writes to stderr.
logging.basicConfig(level=logging.INFO,
                    format='>>> %(asctime)s %(levelname)s %(message)s')

# ---------------------------------------------------------
#  parse commandline arguments
# ---------------------------------------------------------
op = OptionParser()
op.add_option("-s", "--screen_name",
              dest="screen_name", type="string",
              help="Screen name of the main account")
op.add_option("-f", "--followers",
              action="store_true", dest="followers", default=False,
              help="Extracts IDs of screen_name followers from Twitter")
op.add_option("-t", "--timelines",
              action="store_true", dest="timelines", default=False,
              help="Extracts timelines of the followers from Twitter")
# The value is used to select a database on the MongoDB client.
op.add_option("-d", "--dbname", dest="dbname", default='twitter',
              help="Name of the MongoDB database")
# type="int" lets optparse reject non-numeric values with a usage message.
op.add_option("-n", "--n_followers", dest="n_followers",
              type="int", default=5000,
              help="Number of follower IDs; 5000 at a time")

# Initialize
(opts, args) = op.parse_args()
if not opts.screen_name:
    # Every action needs the account name; op.error prints the usage and
    # exits instead of failing later on None.lower().
    op.error("--screen_name is required")
print(opts)

screen_name = opts.screen_name.lower()  # The main twitter account
n_days = 180                            # Only tweets more recent than n_days are kept
n_followers = int(opts.n_followers)
# ---------------------------------------------------------
# Twitter Connection: credentials stored in twitter.cfg
# ---------------------------------------------------------
# Load the application credentials from the [credentials] section of
# twitter.cfg (for py27 change config to get_config).
config = ConfigParser()
config.read('twitter.cfg')
APP_KEY, APP_SECRET = (config['credentials'][key]
                       for key in ('app_key', 'app_secret'))

# OAuth 2 flow: a first client built from the key/secret pair obtains the
# access token, then the working client is created from that token.
twitter = Twython(APP_KEY, APP_SECRET, oauth_version=2)
ACCESS_TOKEN = twitter.obtain_access_token()
twitter = Twython(APP_KEY, access_token=ACCESS_TOKEN)

# MongoDB connection: default client, database chosen with --dbname
client = MongoClient()
db = client[opts.dbname]
# -----------------------------------------------------------
# Follower IDs
# -----------------------------------------------------------
if opts.followers:
    # A single followers/ids request returns at most 5000 IDs.
    followers_query_size = min(n_followers, 5000)
    print("Retrieving %s followers" % n_followers)

    # ------------------------------------------------------------------
    #  1) get follower ids
    #  see https://dev.twitter.com/rest/reference/get/followers/ids
    # ------------------------------------------------------------------
    # Make sure a followers record exists for that screen_name, so the
    # update_one calls below always have a document to match.
    if db.followers.find_one({"screen_name": screen_name}) is None:
        db.followers.insert_one({"screen_name": screen_name, "ids": []})

    # cursor is used to navigate a twitter collection
    # https://dev.twitter.com/overview/api/cursoring
    # -1 asks for the first page; Twitter answers 0 after the last page.
    next_cursor = -1
    follower_ids = list()
    ids_count = 0

    # An unexpected error leaves the cursor untouched, so retrying blindly
    # could loop forever; give up after a few consecutive failures.
    max_consecutive_failures = 3
    consecutive_failures = 0

    while (next_cursor != 0) and (ids_count < n_followers):
        try:
            print("Followers %s to %s: cursor: %s" %
                  (ids_count, ids_count + followers_query_size, next_cursor))
            result = twitter.get_followers_ids(screen_name=screen_name,
                                               count=followers_query_size,
                                               cursor=next_cursor)
            next_cursor = result['next_cursor']
            ids_count += len(result['ids'])

            # make sure the list only has unique IDs and sort
            follower_ids = sorted(set(follower_ids + result['ids']))
            print("Retrieved %s follower IDs from twitter" % len(follower_ids))

            # store what we've got so far
            # insert follower_ids in the followers collection
            res = db.followers.update_one(
                {"screen_name": screen_name},
                {'$set': {"ids": follower_ids}}
            )
            if res.matched_count == 0:
                print("Unable to update IDs for: ", screen_name)
            elif res.modified_count == 0:
                print("%s IDs not modified" % screen_name)
            else:
                print("%s now has %s IDs " % (screen_name, str(len(follower_ids))))
            followers_status(screen_name)
            consecutive_failures = 0
        except TwythonRateLimitError:
            # Wait if we hit the Rate limit, then retry the same cursor
            followers_status(screen_name)
            wait_for_awhile()
        except Exception:
            # Exception (not a bare except) so Ctrl-C still stops the script.
            consecutive_failures += 1
            print(" FAILED: Unexpected error:", sys.exc_info()[0])
            if consecutive_failures >= max_consecutive_failures:
                print(" Giving up after %s consecutive errors" %
                      consecutive_failures)
                break
# followers_status(screen_name)
# -----------------------------------------------------------
# Timelines
# -----------------------------------------------------------
if opts.timelines:
    # catch IDs that error out
    error_ids = list()

    # List of follower IDs; find_one returns None when nothing was stored
    # for that screen_name, in which case there is nothing to retrieve.
    followers = db.followers.find_one({"screen_name": screen_name})
    stored_ids = followers['ids'] if followers else []
    print("Retrieving timelines of %s followers" % len(stored_ids))

    # Get all timelines or limited to n_followers: a slice bound of None
    # keeps everything, an int keeps exactly that many IDs.
    follower_ids = stored_ids[:n_followers]

    for user_id in follower_ids:
        try:
            # get the tweets for that account's timeline
            params = {'user_id': user_id, 'count': 200,
                      'contributor_details': 'true'}
            while True:
                try:
                    timeline = twitter.get_user_timeline(**params)
                    break
                except TwythonRateLimitError:
                    # Wait if we hit the Rate limit, then retry the same
                    # follower so that it is not silently skipped.
                    corpus_status(screen_name)
                    wait_for_awhile()

            if not timeline:
                # No tweet at all: there is no screen name or language to
                # read from timeline[0], and nothing to store.
                print("No tweets for id: ", user_id)
                continue

            # keep only recent_tweets
            recent_tweets = [twt for twt in timeline if is_recent(twt)]
            # Aggregate the tweets to create the document
            text = ' '.join(tw['text'] for tw in recent_tweets)
            # Screen name and language are read from the first tweet returned
            account_name = timeline[0]['user']['screen_name']
            lang = timeline[0]['lang']
            item = {
                'raw_text': text,
                'user_id': user_id,
                'len_text': len(text),
                'n_tweets': len(recent_tweets),
                'screen_name': account_name,
                'lang': lang,
                'parent': screen_name,
            }

            # Replace the existing record of that account, or insert it when
            # there is none (upsert), in a single call.
            res = db.tweets.replace_one(
                {'user_id': user_id, 'parent': screen_name}, item,
                upsert=True
            )
            # result of the update
            if res.upserted_id is not None:
                print("New account:", account_name,
                      user_id, len(recent_tweets), lang)
            elif res.modified_count == 0:
                print("no modification for id: ", user_id)
            else:
                print("replaced ", account_name,
                      user_id, len(recent_tweets), lang)
        except Exception:
            # Keep track of the ID that errored out and move on to the next
            # one. Exception (not a bare except) lets Ctrl-C stop the script.
            error_ids.append(user_id)
            print(" FAILED:", user_id)
            print("Unexpected error:", sys.exc_info()[0])

    # ---------------------------------------------------------
    #  check how many documents we now have in the Database
    # ---------------------------------------------------------
    # Kept inside the --timelines branch: error_ids only exists here.
    print("The following IDs errored out:", str(error_ids))
    corpus_status(screen_name)