Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
datatalks/twitter/twitter_data.py
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
273 lines (229 sloc)
10.2 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Author: Alexis Perrier <alexis.perrier@gmail.com> | |
# License: BSD 3 clause | |
# Python 3 | |
'''
Gets data from Twitter.
Requires Twython and pymongo (and a running MongoDB instance)
Python 3.4
Usage:
Get follower IDs for an account that has 5000 followers
    python twitter_data.py --followers --screen_name berkleecollege --n_followers=5000
Get timelines of followers (limit to 100 TLs):
    python twitter_data.py --timelines --screen_name berkleecollege --n_followers=100
The script assumes the account name exists
'''
from __future__ import print_function | |
import logging | |
import numpy as np | |
import sys | |
from optparse import OptionParser | |
from configparser import ConfigParser | |
import time | |
from pymongo import MongoClient | |
from twython import Twython, TwythonRateLimitError | |
from dateutil import parser | |
import datetime as dt | |
from datetime import datetime | |
def is_recent(twt):
    '''Checks that the tweet is more recent than n_days.

    twt: tweet dict; only its 'created_at' timestamp string is read.
    Returns True when the tweet was created less than n_days (module-level
    setting) before the current time, False otherwise.
    '''
    created = parser.parse(twt['created_at'])
    if created.tzinfo is not None:
        # Convert to naive UTC rather than merely dropping the offset, so
        # that both sides of the comparison are expressed in UTC.
        created = created.astimezone(dt.timezone.utc).replace(tzinfo=None)
    # NOTE(review): a timestamp without any offset is treated as UTC here;
    # confirm that is what the data source emits.
    now_utc = dt.datetime.now(dt.timezone.utc).replace(tzinfo=None)
    return created > now_utc - dt.timedelta(days=n_days)
def corpus_status(screen_name):
    '''Print the state of the stored corpus for screen_name.

    Reports the number of documents in db.tweets whose 'parent' is
    screen_name, the total and average number of tweets per document, the
    mean and standard deviation of the text length, and how many documents
    are longer than that mean.

    Returns early (after printing the document count) when nothing is
    stored yet, so the function is safe to call from the rate-limit handler
    before the first document has been written.
    '''
    print("\n-------- Corpus --------")
    # Materialise the cursor once instead of rewinding it for every statistic
    timelines = list(db.tweets.find({'parent': screen_name}))
    n_documents = len(timelines)
    print(" We have " + str(n_documents) + " documents ")
    if n_documents == 0:
        # Nothing to average over: avoid dividing by zero below
        return

    n_tweets = sum(tw['n_tweets'] for tw in timelines
                   if tw.get('n_tweets', 0) > 0)
    print()
    print(" Total number of tweets: ", n_tweets)
    print(" On average #tweets per document: %0.2f" %
          (n_tweets / n_documents))

    # Only documents that actually contain text take part in the length stats
    len_text = [tw['len_text'] for tw in timelines
                if tw.get('len_text', 0) > 0]
    if not len_text:
        # np.mean / np.std of an empty list would only produce nan + warnings
        return
    m_len_text = np.mean(len_text)
    print(" Text length: Mean: %0.2f STD: %0.2f"
          % (m_len_text, np.std(len_text)))
    print()
    # Anything above the (positive) mean is necessarily part of len_text
    above_avg = sum(1 for length in len_text if length > m_len_text)
    # A count is an integer: print it as one rather than as "3.00"
    print(" => %d documents above average: " % above_avg)
def followers_status(screen_name):
    '''Print how many follower IDs are stored in db.followers for screen_name.'''
    record = db.followers.find_one({"screen_name": screen_name})
    n_ids = len(record['ids'])
    print("We have %s follower IDs for %s" % (n_ids, screen_name))
def wait_for_awhile():
    '''Sleep until the rate limit reported by the last Twitter call resets.

    Reads the 'x-rate-limit-reset' header of the last Twython call, computes
    the time left until that moment (never less than zero), adds a 10 second
    margin, prints the resulting delay and sleeps for it.
    '''
    reset_at = int(twitter.get_lastfunction_header('x-rate-limit-reset'))
    remaining = reset_at - time.time()
    if remaining < 0:
        # The reset moment is already behind us: only the margin is left
        remaining = 0
    wait = remaining + 10
    minutes = int(wait) // 60
    seconds = wait % 60
    print("Rate limit exceeded waiting: %sm %0.0fs" % (minutes, seconds))
    time.sleep(wait)
print(__doc__)
# Configure progress logging (basicConfig's default handler writes to stderr)
logging.basicConfig(level=logging.INFO,
                    format='>>> %(asctime)s %(levelname)s %(message)s')
# ---------------------------------------------------------
#  parse commandline arguments
# ---------------------------------------------------------
op = OptionParser()
op.add_option("-s", "--screen_name",
              dest="screen_name", type="string",
              help="Screen name of the main account")
op.add_option("-f", "--followers",
              action="store_true", dest="followers", default=False,
              help="Extracts IDs of screen_name followers from Twitter")
op.add_option("-t", "--timelines",
              action="store_true", dest="timelines", default=False,
              help="Extracts timelines of the followers from Twitter")
op.add_option("-d", "--dbname", dest="dbname", default='twitter',
              help="Name of the MongoDB database")
# type="int" lets optparse reject non-numeric values with a usage message
op.add_option("-n", "--n_followers", dest="n_followers",
              type="int", default=5000,
              help="Number of follower IDs; 5000 at a time")
# Initialize
(opts, args) = op.parse_args()
print(opts)
# Everything below is keyed on the screen name: fail early with a usage
# message instead of an AttributeError on None.lower()
if not opts.screen_name:
    op.error("--screen_name is required")
if opts.n_followers < 1:
    op.error("--n_followers must be a positive integer")
screen_name = opts.screen_name.lower()  # The main twitter account
n_days = 180  # Only tweets more recent than n_days are kept
n_followers = opts.n_followers
# --------------------------------------------------------- | |
# Twitter Connection: credentials stored in twitter.cfg | |
# --------------------------------------------------------- | |
config = ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the
# list of files it did parse, so an empty result means twitter.cfg is missing
if not config.read('twitter.cfg'):
    sys.exit("Unable to read twitter.cfg in the current directory")
# for py27 change config to get_config
try:
    APP_KEY = config['credentials']['app_key']
    APP_SECRET = config['credentials']['app_secret']
except KeyError as missing:
    sys.exit("twitter.cfg: missing [credentials] entry %s" % missing)
# Application-only (OAuth 2) authentication: exchange the app key/secret for
# an access token, then rebuild the client with that token
twitter = Twython(APP_KEY, APP_SECRET, oauth_version=2)
ACCESS_TOKEN = twitter.obtain_access_token()
twitter = Twython(APP_KEY, access_token=ACCESS_TOKEN)
# MongoDB connection (MongoClient() called without arguments)
client = MongoClient()
db = client[opts.dbname]
# ----------------------------------------------------------- | |
# Follower IDs | |
# ----------------------------------------------------------- | |
if opts.followers:
    # Download the follower IDs of screen_name page by page and store the
    # de-duplicated, sorted list in db.followers after every page.
    followers_query_size = min(n_followers, 5000)  # Twitter default
    print("Retrieving %s followers" % n_followers)
    # ------------------------------------------------------------------
    #  1) get follower ids
    #  see https://dev.twitter.com/rest/reference/get/followers/ids
    # ------------------------------------------------------------------
    # Initialize the database followers record for that screen_name
    if db.followers.find_one({"screen_name": screen_name}) is None:
        db.followers.insert_one({"screen_name": screen_name, "ids": []})
    # cursor is used to navigate a twitter collection
    # https://dev.twitter.com/overview/api/cursoring
    next_cursor = -1
    follower_ids = []
    unique_ids = set()  # accumulates IDs across pages without duplicates
    ids_count = 0
    while next_cursor != 0 and ids_count < n_followers:
        try:
            print("Followers %s to %s: cursor: %s" %
                  (ids_count, ids_count + followers_query_size, next_cursor))
            result = twitter.get_followers_ids(screen_name=screen_name,
                                               count=followers_query_size,
                                               cursor=next_cursor)
            unique_ids.update(result['ids'])
            next_cursor = result['next_cursor']
            ids_count += len(result['ids'])
            # make sure the list only has unique IDs and sort
            follower_ids = sorted(unique_ids)
            print("Retrieved %s follower IDs from twitter" % len(follower_ids))
            # store what we've got so far
            # insert follower_ids in the followers collection
            res = db.followers.update_one(
                {"screen_name": screen_name},
                {'$set': {"ids": follower_ids}}
            )
            if res.matched_count == 0:
                print("Unable to update IDs for: ", screen_name)
            elif res.modified_count == 0:
                print("%s IDs not modified" % screen_name)
            else:
                print("%s now has %s IDs " %
                      (screen_name, str(len(follower_ids))))
                followers_status(screen_name)
        except TwythonRateLimitError:
            # Wait if we hit the Rate limit, then retry the same cursor
            followers_status(screen_name)
            wait_for_awhile()
        except Exception as err:
            # The cursor has not advanced, so looping again would repeat the
            # very same failing request forever: report and stop instead.
            print(" FAILED: Unexpected error:", type(err))
            break
# ----------------------------------------------------------- | |
# Timelines | |
# ----------------------------------------------------------- | |
if opts.timelines:
    # For each stored follower ID, download the user's timeline, keep the
    # recent tweets, and store them as one document in db.tweets.
    # catch IDs that error out
    error_ids = []
    # List of follower IDs
    followers = db.followers.find_one({"screen_name": screen_name})
    if followers is None:
        sys.exit("No follower IDs stored for %s; run with --followers first"
                 % screen_name)
    # Get all timelines or limited to n_followers
    if n_followers is None:
        follower_ids = followers['ids']
    else:
        # [:n] keeps exactly n IDs (the former [0:n-1] dropped one)
        follower_ids = followers['ids'][:n_followers]
    print("Retrieving timelines of %s followers" % len(follower_ids))
    for user_id in follower_ids:
        # Retry loop: a rate-limit hit waits and then retries the same ID
        # instead of silently skipping it.
        while True:
            try:
                # get the tweets for that account's timeline
                timeline = twitter.get_user_timeline(
                    user_id=user_id, count=200, contributor_details='true')
                if not timeline:
                    # No tweets at all: nothing to store, and not an error
                    print("No tweets for id: ", user_id)
                    break
                # keep only recent_tweets
                recent_tweets = [twt for twt in timeline if is_recent(twt)]
                # Aggregate the tweets to create the document
                text = ' '.join(tw['text'] for tw in recent_tweets)
                account = timeline[0]['user']['screen_name']
                lang = timeline[0]['lang']
                item = {
                    'raw_text': text,
                    'user_id': user_id,
                    'len_text': len(text),
                    'n_tweets': len(recent_tweets),
                    'screen_name': account,
                    'lang': lang,
                    'parent': screen_name,
                }
                # Insert the document, or replace the one already stored for
                # this account, in a single upsert
                res = db.tweets.replace_one(
                    {'user_id': user_id, 'parent': screen_name},
                    item, upsert=True
                )
                # result of the upsert
                if res.upserted_id is not None:
                    print("New account:", account,
                          user_id, len(recent_tweets), lang)
                elif res.modified_count == 0:
                    print("no modification for id: ", user_id)
                else:
                    print("replaced ", account,
                          user_id, len(recent_tweets), lang)
            except TwythonRateLimitError:
                # Wait if we hit the Rate limit, then retry this ID
                corpus_status(screen_name)
                wait_for_awhile()
                continue
            except Exception as err:
                # Keep track of the ID that errored out
                error_ids.append(user_id)
                print(" FAILED:", user_id)
                print("Unexpected error:", type(err))
            break
    # ---------------------------------------------------------
    #  check how many documents we now have in the Database
    # ---------------------------------------------------------
    print("The following IDs errored out:", str(error_ids))
    corpus_status(screen_name)