Find trending topics on Twitter for a list of locations and save them to a tab-separated file (written with a .csv extension)

In [1]:
import configparser
from TwitterAPI import TwitterAPI
import sys
import time
#import yagmail
from datetime import datetime
from collections import Counter

In [2]:
def get_twitter(config_file):
    """ Build a TwitterAPI client from credentials stored in a config file.
    Args:
      config_file ... Path to a ConfigParser-format file whose [twitter] section
                      holds consumer_key, consumer_secret, access_token and
                      access_token_secret.
    Returns:
      A TwitterAPI instance constructed from those four values, in that order.
    """
    parser = configparser.ConfigParser()
    parser.read(config_file)
    # Positional order expected by the TwitterAPI constructor call.
    credential_names = ('consumer_key', 'consumer_secret',
                        'access_token', 'access_token_secret')
    credentials = [parser.get('twitter', option) for option in credential_names]
    return TwitterAPI(*credentials)

def robust_request(twitter, resource, params, max_tries=5):
    """ Issue a Twitter request, retrying after a pause when it fails.
    A request counts as failed when its status code is not 200. Between
    attempts the function sleeps for a little over 15 minutes; it does not
    sleep after the final attempt, since nothing would follow the wait.
    Args:
      twitter .... A TwitterAPI object.
      resource ... A resource string to request.
      params ..... A parameter dictionary for the request.
      max_tries .. The maximum number of tries to attempt.
    Returns:
      A TwitterResponse object, or None if every attempt failed
      (or max_tries is less than 1).
    """
    for attempt in range(max_tries):
        request = twitter.request(resource, params)
        if request.status_code == 200:
            return request
        if attempt == max_tries - 1:
            # Last attempt: report and give up immediately instead of
            # sleeping for 15 minutes only to return None afterwards.
            print('Got error:', request.text, '\ngiving up after', max_tries, 'tries.', file=sys.stderr)
            sys.stderr.flush()
            break
        print('Got error:', request.text, '\nsleeping for 15 minutes.', file=sys.stderr)
        sys.stderr.flush()
        # 61 * 15 seconds = 15 minutes plus a 15-second margin.
        time.sleep(61 * 15)
    return None

def find_trends(twitter, location):
    """ Fetch the trending topics for one location as tab-separated lines.
    Args:
      twitter .... A TwitterAPI object.
      location ... The integer WOEID passed as the 'id' parameter of 'trends/place'.
    Returns:
      A list of strings, one per trend, each of the form
      "<location>\\t<name>\\t<url>\\t<tweet_volume>\\t<promoted_content>\\t<query>\\n".
      An empty list if the request failed on every retry.
    """
    topics = robust_request(twitter, 'trends/place', {'id': location}, 20)
    if topics is None:
        # robust_request gave up; report no trends for this location rather
        # than raising TypeError by iterating over None.
        return []
    return [
        "%d\t%s\t%s\t%s\t%s\t%s\n" % (location, t['name'], t['url'], str(t['tweet_volume']), t['promoted_content'], t['query'])
        for t in topics
    ]

def find_place_ids(twitter):
    """ List the WOEID of every entry returned by 'trends/available'.
    Args:
      twitter ... A TwitterAPI object.
    Returns:
      A list with the 'woeid' value of each returned place, in response order.
    """
    response = robust_request(twitter, 'trends/available', {}, 20)
    return [place['woeid'] for place in response]

def find_places(twitter):
    """ Collect every entry returned by 'trends/available' into a list.
    Args:
      twitter ... A TwitterAPI object.
    Returns:
      A list of the items yielded by iterating the response, in order.
    """
    response = robust_request(twitter, 'trends/available', {}, 20)
    return list(response)

def extract_topics(infile, outfile, keyword):
    """ Copy the rows whose topic name contains a keyword into a new file.
    Args:
      infile .... Path of a tab-separated file with the columns
                  Location Name, WOE ID, Name, URL, Events, Promoted?, Query.
      outfile ... Path of the file to write; it receives a header row followed
                  by the matching rows.
      keyword ... Substring to look for in the Name column (third column).
    Returns:
      None. Prints 'topics filtered.' when done.
    """
    header = 'Location Name\tWOE ID\tName\tURL\tEvents\tPromoted?\tQuery\n'
    topics = []
    with open(infile, 'r') as tsv_infile:
        for line in tsv_infile:
            # Split on tabs only: location and topic names may contain spaces,
            # so a whitespace split would shift the columns.
            row = line.rstrip('\n').split('\t')
            if len(row) < 7:
                # Blank or malformed line; it cannot supply all seven columns.
                continue
            if row[:3] == ['Location Name', 'WOE ID', 'Name']:
                # The input's own header row; a fresh header is written below.
                continue
            if keyword in row[2]:
                topics.append(row)

    with open(outfile, 'w') as tsv_outfile:
        tsv_outfile.write(header)
        for topic in topics:
            tsv_outfile.write('\t'.join(topic[:7]) + '\n')

    print('topics filtered.')

def count_topics(filename):
    """ Tally the values found in the third tab-separated column of a file.
    Every line is counted, including a header line if the file has one.
    Args:
      filename ... Path of the tab-separated file to read.
    Returns:
      A Counter mapping each third-column value to its number of occurrences.
    """
    with open(filename, 'r') as tsv_file:
        return Counter(line.split('\t')[2] for line in tsv_file)

def email_file(config, filename):
    """ Email a file using yagmail.
    Args:
      config ..... A ConfigParser whose [email] section provides the options
                   'from', 'pass' and 'to'.
      filename ... Path of the file to send; it is also passed as the second
                   argument of yagmail's send call (presumably the subject
                   line -- confirm against the yagmail API).
    Returns:
      None. Prints 'email sent.' after the send call returns.
    """
    # The module-level yagmail import is commented out, so import it here;
    # otherwise this function fails with NameError.
    import yagmail

    from_addr = config.get('email', 'from')
    passwrd = config.get('email', 'pass')
    to_addr = config.get('email', 'to')

    # NOTE(review): yagmail is expected to attach entries of `contents` that
    # are paths to existing files -- confirm.
    contents = ['See attached.', filename]

    yag = yagmail.SMTP(from_addr, passwrd)
    yag.send(to_addr, filename, contents)
    print('email sent.')

def get_datestring():
    """ Format today's local date as 'YYYY-MM-DD'.
    Returns:
      A string such as '2017-02-03', with month and day zero-padded to two digits.
    """
    return datetime.today().strftime("%Y-%m-%d")

def get_trending_topics(filename):
    """ Write the trending topics of every known location to a tab-separated file.
    Relies on the module-level names `twitter`, `place_ids` and `places`.
    Args:
      filename ... Path of the output file. It is overwritten with a header
                   row, then one row per (location, trend) is appended.
    Returns:
      None.
    """
    with open(filename, 'w') as tsv_file:
        tsv_file.write('Location Name\tWOE ID\tName\tURL\tEvents\tPromoted?\tQuery\n')

    # Look names up by WOEID once, instead of scanning `places` per location.
    place_names = {p['woeid']: p['name'] for p in places}

    # iterate through all twitter locations
    # store trending topics for each location
    for pid in place_ids:
        try:
            trends = find_trends(twitter, pid)
        except OSError as exc:
            # OSError covers the built-in ConnectionError and ssl.SSLError.
            # NOTE(review): requests' Timeout/ConnectionError are expected to
            # derive from IOError (an alias of OSError); urllib3's
            # ReadTimeoutError may not be covered -- confirm against the
            # installed versions.
            print("error: %s" % exc)
            time.sleep(60 * 5)
            continue

        # An unknown WOEID gets an empty location name rather than reusing the
        # previous location's name.
        name = place_names.get(pid, '')
        with open(filename, 'a') as tsv_file:
            for topic in trends:
                tsv_file.write(name + '\t' + topic)
            
def get_top_topics(filename):
    """ Write the topics that occur more than once to a 'top-' prefixed file.
    Each row of the input is annotated with the number of rows sharing its
    topic name; rows are sorted by that count and then by name, both
    descending, and rows with a count above 1 are written out.
    Args:
      filename ... Path of the tab-separated all-topics file. The output goes
                   to the same directory, with 'top-' prepended to the file name.
    Returns:
      None.
    """
    import os

    topic_counter = count_topics(filename)
    top_topics = []
    with open(filename, 'r') as topics:
        for line in topics:
            row = line.split('\t')
            # Column 3 (URL) is deliberately left out of the summary.
            loc, woe, name, events, promoted = row[0], row[1], row[2], row[4], row[5]
            count = topic_counter[name]
            top_topics.append((loc, woe, name, events, promoted, count))
    sorted_topics = sorted(top_topics, key=lambda x: (x[5], x[2]), reverse=True)

    # Prefix only the file name, so a path such as 'data/x.csv' becomes
    # 'data/top-x.csv' rather than 'top-data/x.csv'.
    directory, base = os.path.split(filename)
    top_filename = os.path.join(directory, "top-" + base)
    with open(top_filename, 'w') as tsv_file:
        tsv_file.write('Location Name\tWOE ID\tName\tEvents\tPromoted?\tCount\n')

        for topic in sorted_topics:
            if topic[5] > 1:
                row = "%s\t%s\t%s\t%s\t%s\t%s\n" %(topic[0], topic[1], topic[2], topic[3], topic[4], topic[5])
                tsv_file.write(row)



In [3]:
# Load the run settings: the [files] section of settings.cfg supplies the
# keyword to filter on and the prefix used for output file names.
config = configparser.ConfigParser()
config.read('settings.cfg')

filter_term = config.get('files', 'filter_term')
prefix = config.get('files', 'prefix')
twitter = get_twitter('settings.cfg')

# Both helpers request 'trends/available', so the same resource is fetched twice.
place_ids = find_place_ids(twitter)
places = find_places(twitter)

datestring = get_datestring()

# Output names: <prefix>-<date>.csv and <prefix>-<filter_term>-<date>.csv
all_topics = prefix + '-' + datestring + '.csv'
filtered_topics = prefix + '-' + filter_term + '-' + datestring + '.csv'

In [19]:
# Count how often each topic name occurs in the all-topics file.
# The file must already exist; count_topics opens it for reading.
topic_counter = count_topics(all_topics)
#print(topic_counter.most_common(100))

FileNotFoundError: [Errno 2] No such file or directory: 'trending-topics-2017-02-08.csv'

In [5]:
# Number of distinct keys held in topic_counter.
len(topic_counter)

2985

In [7]:
# Annotate every row of one day's file with the count of its topic name.
# The file name is hard-coded to a specific date.
filename = 'trending-topics-2017-02-08.csv'
top_topics = []
with open(filename, 'r') as topics:
    for line in topics:
        row = line.split('\t')
        # Column 3 is skipped; only columns 0, 1, 2, 4 and 5 are kept.
        loc, woe, name, events, promoted = row[0], row[1], row[2], row[4], row[5]
        count = topic_counter[name]
        #print(count)
        top_topics.append((loc, woe, name, events, promoted, count))

In [16]:
# Sort by count (index 5), then by topic name (index 2), both descending.
sorted_topics = sorted(top_topics, key=lambda x: (x[5], x[2]), reverse=True)
#print(sorted_topics)

In [18]:
# Write the rows whose topic count exceeds 1 to a hard-coded, dated file.
top_filename = 'top-trending-topics-2017-02-08.csv'

with open(top_filename, 'w') as tsv_file:
    tsv_file.write('Location Name\tWOE ID\tName\tEvents\tPromoted?\tCount\n')
    
    for topic in sorted_topics:
        # topic[5] is the count; rows with a count of 1 are skipped.
        if topic[5] > 1:
            row = "%s\t%s\t%s\t%s\t%s\t%s\n" %(topic[0], topic[1], topic[2], topic[3], topic[4], topic[5])
            tsv_file.write(row)
    

In [None]:
# Full pipeline: collect all trends, filter them by keyword, email the result.
get_trending_topics(all_topics)
extract_topics(all_topics, filtered_topics, filter_term)
# email_file uses yagmail, so that package must be importable for this step.
email_file(config, filtered_topics)

In [3]:
# Build the TwitterAPI client from the credentials in settings.cfg.
twitter = get_twitter('settings.cfg')
print('Established Twitter connection.')

Established Twitter connection.


In [4]:
# Read the output file names and the filter keyword from the [files] section.
config = configparser.ConfigParser()
config.read('settings.cfg')
all_topics = config.get('files', 'all_topics')
# NOTE(review): this reads the 'all_topics' option again, so filtered_topics
# gets the same value as all_topics -- presumably a separate option was
# intended; confirm against settings.cfg.
filtered_topics = config.get('files', 'all_topics')
filter_term = config.get('files', 'filter_term')

# Both helpers request 'trends/available', so the same resource is fetched twice.
place_ids = find_place_ids(twitter)
places = find_places(twitter)




In [7]:
# Start a fresh output file containing only the header row.
with open(all_topics, 'w') as tsv_file:
    tsv_file.write('Location Name\tWOE ID\tName\tURL\tEvents\tPromoted?\tQuery\n')

# Fetch the trends of every location and append them to the same file.
for pid in place_ids:
    try:
        trends = find_trends(twitter, pid)
    except OSError as exc:
        # OSError covers the built-in ConnectionError and ssl.SSLError.
        # NOTE(review): requests' Timeout/ConnectionError are expected to derive
        # from IOError (an alias of OSError); urllib3's ReadTimeoutError may not
        # be covered -- confirm against the installed versions.
        print("error: %s" % exc)
        time.sleep(60*5)
    else:
        for p in places:
            if p['woeid'] == pid:
                name = p['name']
        # Append to the file named by the all_topics variable (the one that
        # received the header), not to a file literally called 'all_topics'.
        with open(all_topics, 'a') as tsv_file:
            #print(trends[0])
            for topic in trends:
                tsv_file.write(name+'\t'+ topic)

Got error: {"errors":[{"message":"Rate limit exceeded","code":88}]} 
sleeping for 15 minutes.
Got error: {"errors":[{"message":"Rate limit exceeded","code":88}]} 
sleeping for 15 minutes.
Got error: {"errors":[{"message":"Rate limit exceeded","code":88}]} 
sleeping for 15 minutes.
Got error: {"errors":[{"message":"Rate limit exceeded","code":88}]} 
sleeping for 15 minutes.
Got error: {"errors":[{"message":"Rate limit exceeded","code":88}]} 
sleeping for 15 minutes.
Got error: {"errors":[{"message":"Rate limit exceeded","code":88}]} 
sleeping for 15 minutes.


Started at 8:31 PM. Failed, then restarted at 8:33 PM. Failed at 8:55 & restarted.

Started at 10:58. Reached rate limit at 11:00
Started at 12:49am. Failed at 3:10am (almost done)
Started at 4:24am

Started at 3PM 1/7. Done before 5PM (after 4:30 I think).

In [None]:
# Collect the rows whose Name column (third column) contains filter_term.
filtered = []
with open(filtered_topics, 'r') as tsv_file:
    for line in tsv_file:
        # Split on tabs only: names may contain spaces, so a whitespace split
        # would shift the columns.
        row = line.rstrip('\n').split('\t')
        # Lines with fewer than three columns (e.g. blank lines) are skipped.
        if len(row) > 2 and filter_term in row[2]:
            filtered.append(row)
        

In [None]:
def extract_topics(infile, outfile, keyword):
    """ Return the rows of a tab-separated file whose topic name contains a keyword.
    Args:
      infile .... Path of a tab-separated file whose third column is the topic name.
      outfile ... Unused by this version; kept so existing calls keep working.
      keyword ... Substring to look for in the third column.
    Returns:
      A list of rows, each a list of column strings, excluding the header row
      and any line with fewer than three columns.
    """
    topics = []
    with open(infile, 'r') as tsv_file:
        for line in tsv_file:
            # Split on tabs only: names may contain spaces, so a whitespace
            # split would shift the columns.
            row = line.rstrip('\n').split('\t')
            if len(row) < 3:
                continue
            if row[:3] == ['Location Name', 'WOE ID', 'Name']:
                # Header row, not a topic.
                continue
            if keyword in row[2]:
                topics.append(row)
    return topics
                

In [None]:
# NOTE(review): topics_17 is not assigned in any visible cell -- presumably the
# result of an extract_topics(...) call; confirm.
len(topics_17)

In [None]:
# Write the first seven columns of each row in topics_17 as tab-separated lines.
# NOTE(review): if `filtered` is still the list built in the earlier filtering
# cell, open() will raise TypeError -- presumably a file name was intended;
# confirm. topics_17 is likewise not assigned in any visible cell.
with open(filtered, 'w') as tsv_file:
    for topic in topics_17:
        row = "%s\t%s\t%s\t%s\t%s\t%s\t%s\n" %(topic[0], topic[1], topic[2], topic[3], topic[4], topic[5], topic[6])
        tsv_file.write(row)

In [16]:
from datetime import datetime

# Current local date and time; the bare name displays it in the notebook.
today = datetime.today()
today

datetime.datetime(2017, 2, 3, 11, 31, 14, 575534)

In [17]:
# Pull the calendar fields out of `today` as integers.
year, month, day = today.year, today.month, today.day

In [18]:
# Display the extracted year.
year

2017

In [19]:
# Display the extracted month.
month

2

In [20]:
# Display the extracted day of the month.
day

3

In [21]:
# Turn month and day into two-character strings, zero-padding single digits.
month = "%02d" % (month)
day = "%02d" % (day)

In [23]:
# Assemble 'YYYY-MM-DD' from the integer year and the padded month/day strings.
datestring = "%d-%s-%s" %(year, month, day)
datestring

'2017-02-03'

In [5]:
# Read the recipient setting from the [email] section of settings.cfg.
config = configparser.ConfigParser()
config.read('settings.cfg')

email_addresses = config.get('email', 'to')
email_addresses

'marc.smith.email@gmail.com'

In [6]:
# Split the 'to' setting on commas and print each entry.
# Entries are not stripped, so whitespace around a comma stays in the address.
email_list = email_addresses.split(',')

for address in email_list:
    print(address)

marc.smith.email@gmail.com
