-
Notifications
You must be signed in to change notification settings - Fork 35
/
twitter_data.py
273 lines (229 loc) · 10.2 KB
/
twitter_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
# Author: Alexis Perrier <alexis.perrier@gmail.com>
# License: BSD 3 clause
# Python 3
'''
Gets data from Twitter.
Requires Twython and pymongo (and a running MongoDB instance).
Python 3.4

Usage:
    Get follower IDs for an account that has 5000 followers:
        python twitter_data.py --followers --screen_name berkleecollege --n_followers=5000
    Get timelines of followers (limit to 100 timelines):
        python twitter_data.py --timelines --screen_name berkleecollege --n_followers=100

The script assumes the account name exists.
'''
from __future__ import print_function
import logging
import numpy as np
import sys
from optparse import OptionParser
from configparser import ConfigParser
import time
from pymongo import MongoClient
from twython import Twython, TwythonRateLimitError
from dateutil import parser
import datetime as dt
from datetime import datetime
def is_recent(twt):
    '''Tell whether a tweet is more recent than n_days.

    twt: tweet dict; its 'created_at' value must be a date string that
        dateutil's parser understands.

    Returns True when the tweet was created less than n_days (module-level
    setting) before the current time, False otherwise.
    '''
    created_at = parser.parse(twt['created_at'])
    window = dt.timedelta(days=n_days)
    if created_at.tzinfo is None:
        # The string carried no UTC offset: all we can do is compare it
        # with the local wall clock.
        return created_at > dt.datetime.today() - window
    # Compare two timezone-aware values so the answer does not depend on
    # the timezone of the machine running the script.
    return created_at > dt.datetime.now(dt.timezone.utc) - window
def corpus_status(screen_name):
    '''Print the state of the corpus stored for screen_name.

    Reads the documents of db.tweets whose 'parent' field equals
    screen_name and prints: the number of documents, the total and
    per-document average number of tweets, the mean and standard deviation
    of the text length, and how many documents are longer than that mean.

    screen_name: value of the 'parent' field to report on.

    Returns None; the report is written to stdout. When no document is
    stored only the document count is printed.
    '''
    print("\n-------- Corpus --------")
    # Load the matching documents once; every statistic below is computed
    # from this list instead of replaying the query.
    records = list(db.tweets.find({'parent': screen_name}))
    n_documents = len(records)
    print(" We have " + str(n_documents) + " documents ")
    if n_documents == 0:
        # Nothing to average over: stop here rather than divide by zero.
        return

    # Only documents that actually hold tweets contribute to the total.
    n_tweets = sum(tw['n_tweets'] for tw in records
                   if 'n_tweets' in tw and tw['n_tweets'] > 0)
    print()
    print(" Total number of tweets: ", n_tweets)
    print(" On average #tweets per document: %0.2f" %
          (n_tweets / n_documents))

    # Empty texts are left out of the length statistics.
    len_text = [tw['len_text'] for tw in records
                if 'len_text' in tw and tw['len_text'] > 0]
    if len_text:
        m_len_text = np.mean(len_text)
        s_len_text = np.std(len_text)
    else:
        # np.mean([]) would return nan and emit a RuntimeWarning.
        m_len_text = 0.0
        s_len_text = 0.0
    print(" Text length: Mean: %0.2f STD: %0.2f"
          % (m_len_text, s_len_text))
    print()

    n_above_avg = sum(1 for tw in records
                      if 'len_text' in tw and tw['len_text'] > m_len_text)
    print(" => %0.2f documents above average: " % n_above_avg)
def followers_status(screen_name):
    '''Print how many follower IDs are stored for screen_name.

    screen_name: value of the 'screen_name' field of the record to look up
        in db.followers.

    Returns None; the count is written to stdout. A missing record, or a
    record without an 'ids' field, is reported as 0 IDs.
    '''
    record = db.followers.find_one({"screen_name": screen_name})
    # find_one() returns None when no record matches the query.
    stored_ids = record.get('ids', []) if record else []
    print("We have %s follower IDs for %s" %
          (len(stored_ids), screen_name))
def wait_for_awhile():
    '''Sleep until the rate limit of the last Twitter call is lifted.

    Reads the 'x-rate-limit-reset' header of the last call made through
    the module-level twitter client, treats it as a time in epoch seconds,
    and sleeps until then plus a 10 second margin. The waiting time is
    printed before sleeping.
    '''
    reset_at = int(twitter.get_lastfunction_header('x-rate-limit-reset'))
    remaining = reset_at - time.time()
    if remaining < 0:
        # The reset time is already behind us: only the margin is left.
        remaining = 0
    pause = remaining + 10
    minutes, seconds = divmod(pause, 60)
    print("Rate limit exceeded waiting: %sm %0.0fs" % (int(minutes), seconds))
    time.sleep(pause)
# Show the usage notes of the module docstring at every run
print(__doc__)

# Emit log records from INFO level up, with a timestamp prefix.
# (basicConfig installs a handler that writes to stderr by default.)
logging.basicConfig(level=logging.INFO,
                    format='>>> %(asctime)s %(levelname)s %(message)s')

# ---------------------------------------------------------
#  parse commandline arguments
# ---------------------------------------------------------
op = OptionParser()
op.add_option("-s", "--screen_name",
              dest="screen_name", type="string",
              help="Screen name of the main account")
op.add_option("-f", "--followers",
              action="store_true", dest="followers", default=False,
              help="Extracts IDs of screen_name followers from Twitter")
op.add_option("-t", "--timelines",
              action="store_true", dest="timelines", default=False,
              help="Extracts timelines of the followers from Twitter")
op.add_option("-d", "--dbname", dest="dbname", default='twitter',
              help="Name of the MongDB database")
# type="int" lets optparse reject a non numeric value with a usage message
# instead of a ValueError traceback further down.
op.add_option("-n", "--n_followers", dest="n_followers",
              type="int", default=5000,
              help="Number of follower IDs; 5000 at a time")

# Initialize
(opts, args) = op.parse_args()
print(opts)

# The screen name drives every query below; stop with a usage message when
# it is missing rather than failing on None.lower().
if not opts.screen_name:
    op.error("--screen_name is required")
if opts.n_followers < 1:
    op.error("--n_followers must be a positive integer")

screen_name = opts.screen_name.lower()  # The main twitter account
n_days = 180                            # Only tweets more recent than n_days are kept
n_followers = int(opts.n_followers)
# ---------------------------------------------------------
#  Twitter Connection: credentials stored in twitter.cfg
# ---------------------------------------------------------
config = ConfigParser()
# ConfigParser.read() silently skips a file it cannot open and returns the
# list of files it parsed, so an empty result means twitter.cfg was not
# found in the current directory.
if not config.read('twitter.cfg'):
    sys.exit("Unable to read twitter.cfg: the file holding the Twitter "
             "credentials must be in the current directory")
if not config.has_section('credentials'):
    sys.exit("twitter.cfg has no [credentials] section")

# Mapping style access to the options requires the Python 3 configparser
APP_KEY = config['credentials']['app_key']
APP_SECRET = config['credentials']['app_secret']

# OAuth 2: a first client trades the application key and secret for an
# access token, the client used for the API calls is then built from it.
twitter = Twython(APP_KEY, APP_SECRET, oauth_version=2)
ACCESS_TOKEN = twitter.obtain_access_token()
twitter = Twython(APP_KEY, access_token=ACCESS_TOKEN)

# MongoDB connection (MongoClient default settings), database chosen
# with --dbname
client = MongoClient()
db = client[opts.dbname]
# -----------------------------------------------------------
#  Follower IDs
# -----------------------------------------------------------
if opts.followers:
    # A single request is asked for at most 5000 IDs
    followers_query_size = min(n_followers, 5000)
    print("Retrieving %s followers" % n_followers)

    # ------------------------------------------------------------------
    #  1) get follower ids
    #  see https://dev.twitter.com/rest/reference/get/followers/ids
    # ------------------------------------------------------------------

    # Make sure a followers record exists for that screen_name, so that
    # the update_one() calls below have a document to match.
    if db.followers.find_one({"screen_name": screen_name}) is None:
        db.followers.insert_one({"screen_name": screen_name, "ids": []})

    # cursor is used to navigate a twitter collection
    # https://dev.twitter.com/overview/api/cursoring
    # The loop starts with cursor -1 and ends when next_cursor comes back
    # as 0 or once enough IDs have been received.
    next_cursor = -1
    follower_ids = []
    ids_count = 0

    # Consecutive unexpected errors tolerated before giving up. Without a
    # bound, a persistent failure would retry the same cursor forever.
    max_failures = 3
    failures = 0

    while next_cursor != 0 and ids_count < n_followers:
        try:
            print("Followers %s to %s: cursor: %s" %
                  (ids_count, ids_count + followers_query_size, next_cursor))
            result = twitter.get_followers_ids(screen_name=screen_name,
                                               count=followers_query_size,
                                               cursor=next_cursor)
            next_cursor = result['next_cursor']
            ids_count += len(result['ids'])
            # keep only unique IDs, sorted
            follower_ids = sorted(set(follower_ids).union(result['ids']))
            print("Retrieved %s follower IDs from twitter" % len(follower_ids))

            # store what we've got so far in the followers collection
            res = db.followers.update_one(
                {"screen_name": screen_name},
                {'$set': {"ids": follower_ids}}
            )
            if res.matched_count == 0:
                print("Unable to update IDs for: ", screen_name)
            elif res.modified_count == 0:
                print("%s IDs not modified" % screen_name)
            else:
                print("%s now has %s IDs " % (screen_name, str(len(follower_ids))))
            followers_status(screen_name)
            failures = 0
        except TwythonRateLimitError:
            # Wait if we hit the Rate limit, then ask for the same cursor
            followers_status(screen_name)
            wait_for_awhile()
        except Exception as err:
            # Exception (not a bare except) so that Ctrl-C still stops
            # the script.
            failures += 1
            print(" FAILED: Unexpected error:", sys.exc_info()[0], err)
            if failures >= max_failures:
                print("Giving up after %s consecutive errors" % failures)
                break
# -----------------------------------------------------------
#  Timelines
# -----------------------------------------------------------
if opts.timelines:
    # catch IDs that error out
    error_ids = []

    # List of follower IDs stored for screen_name
    followers = db.followers.find_one({"screen_name": screen_name})
    if followers is None:
        # find_one() returns None when no record matches
        print("No follower IDs stored for %s" % screen_name)
        stored_ids = []
    else:
        stored_ids = followers['ids']
    print("Retrieving timelines of %s followers" % len(stored_ids))

    # Limit to the first n_followers IDs (all of them if n_followers is None)
    follower_ids = stored_ids[:n_followers]

    for user_id in follower_ids:
        # A follower is processed again when the rate limit was hit, so
        # that waiting does not make the script skip that account.
        done = False
        while not done:
            done = True
            try:
                # get the tweets for that account's timeline
                params = {'user_id': user_id, 'count': 200,
                          'contributor_details': 'true'}
                timeline = twitter.get_user_timeline(**params)

                # keep only recent_tweets
                recent_tweets = [twt for twt in timeline if is_recent(twt)]

                # Aggregate the tweets to create the document
                text = ' '.join(tw['text'] for tw in recent_tweets)

                # An empty timeline raises IndexError here; the ID then
                # ends up in error_ids through the handler below.
                account = timeline[0]['user']['screen_name']
                lang = timeline[0]['lang']

                item = {
                    'raw_text': text,
                    'user_id': user_id,
                    'len_text': len(text),
                    'n_tweets': len(recent_tweets),
                    'screen_name': account,
                    'lang': lang,
                    'parent': screen_name,
                }

                # Replace the record of this account, or create it:
                # upsert=True inserts the document when nothing matches,
                # so no separate existence query is needed.
                res = db.tweets.replace_one(
                    {'user_id': user_id, 'parent': screen_name},
                    item,
                    upsert=True
                )

                # result of the update
                if res.upserted_id is not None:
                    print("New account:", account,
                          user_id, len(recent_tweets), lang)
                elif res.modified_count == 0:
                    print("no modification for id: ", user_id)
                else:
                    print("replaced ", account,
                          user_id, len(recent_tweets), lang)

            except TwythonRateLimitError:
                # Wait if we hit the Rate limit, then retry this follower
                corpus_status(screen_name)
                wait_for_awhile()
                done = False
            except Exception as err:
                # Keep track of the ID that errored out and move on.
                # Exception (not a bare except) so that Ctrl-C still
                # stops the script.
                error_ids.append(user_id)
                print(" FAILED:", user_id)
                print("Unexpected error:", sys.exc_info()[0], err)

    # ---------------------------------------------------------
    #  check how many documents we now have in the Database
    # ---------------------------------------------------------
    print("The following IDs errored out:", str(error_ids))
    corpus_status(screen_name)