# In [1]:
import logging
import time
import urlparse
from collections import namedtuple
from pprint import pprint as pp

import twitter
from IPython.core.debugger import Tracer

from IO_csv import IO_csv
from IO_json import IO_json
from IO_mongo import IO_mongo

class TwitterAPI(object):
    """
    Connect to Twitter via OAuth and harvest tweets from the search endpoint.

    Every page of tweets that is collected is persisted three ways: to a
    JSON file, to a CSV file and to a MongoDB collection, through the
    project's ``IO_json``, ``IO_csv`` and ``IO_mongo`` helpers.

    The credentials are obtained by registering an application with Twitter.
    """

    # Hard ceiling applied to the ``max_res`` argument of searchTwitter.
    MAX_RESULTS_CAP = 1000
    # Largest page size requested through the ``count`` search parameter
    # (the search endpoint documents 100 as its per-page maximum).
    MAX_PAGE_SIZE = 100
    # Number of follow-up pages requested, at most, after the first page.
    MAX_EXTRA_PAGES = 10
    # Back-off (in seconds) beyond which retrying is abandoned.
    MAX_WAIT_PERIOD = 3600
    # Pause (in seconds) after an HTTP 429: 15 minutes plus a small margin.
    RATE_LIMIT_PAUSE = 60 * 15 + 5

    def __init__(self, consumer_key='', consumer_secret='', access_token='',
                 access_secret='',
                 data_path='/home/carl/spark/examples/carl_Spark/data'):
        """
        Authenticate against Twitter and set up logging and the savers.

        All arguments are optional and default to the values that used to be
        hard-coded, so ``TwitterAPI()`` behaves as before.

        :param consumer_key: OAuth consumer key of the registered app.
        :param consumer_secret: OAuth consumer secret of the registered app.
        :param access_token: OAuth access token.
        :param access_secret: OAuth access token secret.
        :param data_path: directory receiving the log, JSON and CSV files.
        """
        self.consumer_key = consumer_key
        self.consumer_secret = consumer_secret
        self.access_token = access_token
        self.access_secret = access_secret

        self.retries = 3

        # authenticate credentials with Twitter using OAuth
        self.auth = twitter.oauth.OAuth(access_token, access_secret,
                                        consumer_key, consumer_secret)
        # creates registered Twitter API
        self.api = twitter.Twitter(auth=self.auth)

        # logger initialisation
        appName = 'twt150530'
        self.logger = logging.getLogger(appName)
        # logging.getLogger returns the same logger object for the same name,
        # so only attach the file handler once; otherwise every new instance
        # (e.g. re-running a notebook cell) duplicates each log line.
        # NOTE: a later instance with a different data_path keeps logging to
        # the file chosen by the first instance.
        if not self.logger.handlers:
            fileHandler = logging.FileHandler(
                "{0}/{1}.log".format(data_path, appName))
            formatter = logging.Formatter(
                ' %(asctime)s - %(name)s - %(levelname)s - %(message)s')
            fileHandler.setFormatter(formatter)
            self.logger.addHandler(fileHandler)
        self.logger.setLevel(logging.DEBUG)

        # save to JSON file initialisation
        self.jsonSaver = IO_json(data_path, 'twtr15053001')

        # save to csv file initialisation
        self.csvSaver = IO_csv(data_path, 'twtr15053001')

        # save to MongoDB initialisation
        self.mongoSaver = IO_mongo(db='twtr01_db', coll='twtr01_coll')

    @staticmethod
    def _nextPageArgs(next_results):
        """
        Turn a ``next_results`` query string ('?max_id=...&q=...') into the
        keyword arguments of the follow-up search call.
        """
        # Imported locally so the helper works on Python 3 as well as on the
        # Python 2 ``urlparse`` module this file was written against.
        try:
            from urllib.parse import parse_qsl
        except ImportError:
            from urlparse import parse_qsl
        # [1:] drops the leading '?'
        return dict(parse_qsl(next_results[1:]))

    def _storePage(self, page, statuses):
        """Append one page of tweets to ``statuses`` and persist that page."""
        if page:
            statuses.extend(page)
            self.saveTweets(page)

    def searchTwitter(self, q, max_res=10, **kwargs):
        """
        Search Twitter for ``q`` and save every page of results.

        :param q: search query.
        :param max_res: number of tweets wanted, capped at MAX_RESULTS_CAP.
            Paging stops as soon as at least this many tweets have been
            collected, so the result may exceed it by up to one page.
        :param kwargs: extra parameters forwarded to the first search call.
            ``count`` (page size) defaults to ``min(100, max_res)``.
        :returns: list of all status dicts collected (first page included);
            an empty list when ``max_res`` is not positive.
        """
        max_results = min(self.MAX_RESULTS_CAP, max_res)
        if max_results <= 0:
            return []

        # setdefault (rather than a hard-coded count=) lets callers pass their
        # own page size without a duplicate-keyword TypeError.
        kwargs.setdefault('count', min(self.MAX_PAGE_SIZE, max_results))
        search_results = self.api.search.tweets(q=q, **kwargs)

        # Accumulate into a fresh list so the API response is not mutated,
        # and save the first page too (it used to be returned but never saved).
        statuses = []
        self._storePage(search_results['statuses'], statuses)

        for _ in range(self.MAX_EXTRA_PAGES):
            # Check before fetching: no extra request once the target is met.
            if len(statuses) >= max_results:
                self.logger.info(
                    'info in searchTwitter - got %i tweets - max: %i',
                    len(statuses), max_results)
                break

            try:
                next_results = search_results['search_metadata']['next_results']
            except KeyError as e:
                # No 'next_results' entry: there is no further page to fetch.
                self.logger.error('error in searchTwitter: %s', e)
                break

            search_results = self.api.search.tweets(
                **self._nextPageArgs(next_results))
            self._storePage(search_results['statuses'], statuses)

        return statuses

    def saveTweets(self, statuses):
        """
        Persist a batch of tweets to the JSON file, the CSV file and MongoDB.

        :param statuses: list of status dicts as returned by the search call.
        """
        # saving to JSON file
        self.jsonSaver.save(statuses)

        # saving to csv file: one row per URL entity (see parseTweets);
        # the record name and field names are handed to the CSV saver.
        fields01 = ['id', 'created_at', 'user_id', 'user_name', 'tweet_text', 'url']
        self.csvSaver.save(self.parseTweets(statuses), 'Tweet01', fields01)

        # saving to MongoDB, one document per tweet
        for s in statuses:
            self.mongoSaver.save(s)

    def _handleHttpError(self, e, wait_period=2, sleep_when_rate_limited=True):
        """
        Decide how to react to a ``TwitterHTTPError``.

        :param e: the caught error; the HTTP status is read from ``e.e.code``.
        :param wait_period: current back-off in seconds.
        :param sleep_when_rate_limited: sleep through a 429 instead of raising.
        :returns: ``None`` when the call must be abandoned (401, 404),
            otherwise the back-off to use for the next attempt.
        :raises: re-raises ``e`` when the back-off exceeded MAX_WAIT_PERIOD,
            on a 429 when sleeping is disabled, or on any other status code.
        """
        if wait_period > self.MAX_WAIT_PERIOD:  # Seconds
            self.logger.error('Too many retries in getTweets: %s', e)
            raise e

        code = e.e.code
        if code == 401:
            self.logger.error('error 401 * Not Authorised * in getTweets: %s', e)
            return None
        if code == 404:
            self.logger.error('error 404 * Not Found * in getTweets: %s', e)
            return None
        if code == 429:
            self.logger.error('error 429 * API Rate Limit Exceeded * in getTweets: %s', e)
            if not sleep_when_rate_limited:
                raise e  # Caller must handle the rate limiting issue
            self.logger.error('error 429 * Retrying in 15 minutes * in getTweets: %s', e)
            time.sleep(self.RATE_LIMIT_PAUSE)
            self.logger.info('error 429 * Retrying now * in getTweets: %s', e)
            return 2  # restart the back-off from its initial value
        if code in (500, 502, 503, 504):
            self.logger.info('Encountered %i Error. Retrying in %i seconds',
                             code, wait_period)
            time.sleep(wait_period)
            return wait_period * 1.5

        self.logger.error('Exit - aborting - %s', e)
        raise e

    def getTweets(self, q, max_res=10):
        """
        Make a Twitter API call whilst managing rate limit and errors.

        Runs ``searchTwitter(q, max_res)`` and retries it when Twitter answers
        with a rate-limit (429) or server (5xx) error. A retry restarts the
        search from its first page, so pages saved before the failure are
        saved again.

        :param q: search query.
        :param max_res: number of tweets wanted (see searchTwitter).
        :returns: the list of statuses collected, or ``None`` when the request
            was abandoned on a 401 or 404 error.
        :raises twitter.api.TwitterHTTPError: on other HTTP errors, or once
            the retry back-off grows past MAX_WAIT_PERIOD.
        """
        wait_period = 2
        while True:
            try:
                return self.searchTwitter(q, max_res=max_res)
            except twitter.api.TwitterHTTPError as e:
                wait_period = self._handleHttpError(e, wait_period)
                if wait_period is None:
                    return None

    def parseTweets(self, statuses):
        """
        Flatten statuses into tuples ready for the CSV saver.

        :param statuses: list of status dicts.
        :returns: list of ``(id, created_at, user id, user name,
            UTF-8 encoded text, expanded url)`` tuples, one per URL entity of
            each status; a status without URL entities yields no tuple.
        """
        return [(status['id'],
                 status['created_at'],
                 status['user']['id'],
                 status['user']['name'],
                 status['text'].encode('utf-8'),
                 url['expanded_url'])
                for status in statuses
                for url in status['entities']['urls']]
