#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Follow the Twitter stream in a given time window and extract
tweets containing keywords for word statistics.
"""
# for saving objects to files and reading them back in
import pickle
# for retrieving and parsing the Twitter stream
import simplejson
from requests_oauthlib import OAuth1
from itertools import islice
import requests
# for natural language processing
from nltk.corpus import stopwords
# for statistics on tweeted words
from collections import Counter
import heapq
# for regular expressions
import re
# for scheduling at a given time
import datetime
# I/O of Python objects via pickle files
def save_obj(obj, name):
    with open('obj/' + name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(name):
    with open('obj/' + name + '.pkl', 'rb') as f:
        return pickle.load(f)
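# A minimal round-trip sketch (assumes an existing obj/ directory;
# 'example' is a hypothetical name used only for illustration):
#   save_obj([('word', 3)], 'example')   # writes obj/example.pkl
#   load_obj('example')                  # -> [('word', 3)]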
# access the Twitter stream with this generator
# (auth and US_BOUNDING_BOX are module-level globals defined below)
def tweet_generator():
    stream = requests.post('https://stream.twitter.com/1.1/statuses/filter.json',
                           auth=auth,
                           stream=True,
                           data={"locations": US_BOUNDING_BOX,  # set API filters (see Twitter docs)
                                 "language": "en",
                                 "filter_level": "none"})
    for line in stream.iter_lines():
        if not line:  # filter out keep-alive new lines
            continue
        tweet = simplejson.loads(line)
        if 'text' in tweet:
            yield tweet['text']
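# Example: pull a small sample of tweet texts from the stream
# (requires valid credentials; 5 is an arbitrary sample size):
#   for text in islice(tweet_generator(), 5):
#       print(text)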
# get the n largest elements (by count) from a list of (word, count) pairs
def nlargest(n, word_scores):
    return heapq.nlargest(n, word_scores, key=lambda x: x[1])
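# Example: nlargest(2, [('a', 5), ('b', 9), ('c', 1)]) -> [('b', 9), ('a', 5)]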
# initialize the Twitter stream credentials
with open("twitter_secrets.json.nogit") as fh:
    secrets = simplejson.loads(fh.read())
auth = OAuth1(
    secrets["api_key"],
    secrets["api_secret"],
    secrets["access_token"],
    secrets["access_token_secret"]
)
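# The secrets file is assumed to be flat JSON with the four keys read above
# (values here are placeholders, not real credentials):
#   {"api_key": "...", "api_secret": "...",
#    "access_token": "...", "access_token_secret": "..."}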
#Pacific west of Mexico to Atlantic east of Canada
US_BOUNDING_BOX = "-125.00,24.94,-66.93,49.59"
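# The streaming API's "locations" filter takes comma-separated lon/lat pairs,
# southwest corner first, i.e. "west,south,east,north" as above.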
# stop words to filter out of the word counts
stop = set(stopwords.words('english'))
# initialize the word counters, one per keyword
counter_1 = Counter()
counter_2 = Counter()
counter_3 = Counter()
counter_4 = Counter()
# define keywords
search_word_1 = "trump"
search_word_2 = "rubio"
search_word_3 = "cruz"
search_word_4 = "kasich"
# seed each counter with dummy words (German terms unlikely to
# appear in English tweets)
safe_words = ['abwasserversorgung', 'anhalter', 'gruppendynamik', 'altersheim', 'lilaloe']
for word in safe_words:
    counter_1[word] += 1
    counter_2[word] += 1
    counter_3[word] += 1
    counter_4[word] += 1
# initialize counter for completed batches
k = 0
# loop for reading the stream
while True:
    # stop time: break once the collection window has closed
    if datetime.datetime.now() > datetime.datetime(2016, 3, 4, 6, 0, 0):
        break
    # start time: idle until the collection window opens
    elif datetime.datetime.now() < datetime.datetime(2016, 3, 4, 3, 0, 0):
        pass
    # inside the window: try taking data
    else:
        print(k, datetime.datetime.now())
        try:
            for tweet in islice(tweet_generator(), 1000):
                # replace punctuation with spaces before splitting into words
                words = re.sub(r'[.,_@#!?&:;"/+-]', ' ', tweet.lower())
                if search_word_1 in words.split():  # check here for keyword
                    for word in words.split():
                        if word not in stop:
                            counter_1[word] += 1
                if search_word_2 in words.split():  # check here for keyword
                    for word in words.split():
                        if word not in stop:
                            counter_2[word] += 1
                if search_word_3 in words.split():  # check here for keyword
                    for word in words.split():
                        if word not in stop:
                            counter_3[word] += 1
                if search_word_4 in words.split():  # check here for keyword
                    for word in words.split():
                        if word not in stop:
                            counter_4[word] += 1
            # save the 1000 most frequent words per keyword to pickle files
            save_obj(nlargest(1000, counter_1.items()), search_word_1)
            save_obj(nlargest(1000, counter_2.items()), search_word_2)
            save_obj(nlargest(1000, counter_3.items()), search_word_3)
            save_obj(nlargest(1000, counter_4.items()), search_word_4)
            print("\n" + str(nlargest(10, counter_1.items())) + "\n")
            k += 1
        # swallow transient network/JSON errors and reconnect on the next pass;
        # a bare "except:" would also trap KeyboardInterrupt, so catch Exception
        except Exception:
            pass
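# To inspect the results offline later, e.g. in a separate session
# (a sketch, assuming the run above has written obj/trump.pkl):
#   top_words = load_obj("trump")
#   print(top_words[:10])   # ten most frequent co-occurring words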