In [None]:
# coding:utf-8
from requests_oauthlib import OAuth1Session
import json
import datetime, time, sys
from abc import ABCMeta, abstractmethod
import pandas as pd
from pandas import Series, DataFrame
from dateutil.parser import parse


# Enter your Twitter API credentials below.
# All four values are passed to OAuth1Session in TweetsGetter.__init__.
CONSUMER_KEY = ""
CONSUMER_SECRET = ""
ACCESS_TOKEN = ""
ACCESS_TOKEN_SECRET = ""


# TweetsGetter
class TweetsGetter(object, metaclass=ABCMeta):
    """Abstract base for paging through a Twitter REST API v1.1 endpoint.

    Subclasses supply the endpoint URL/parameters, the way tweets are pulled
    out of a decoded response, and where the endpoint's rate-limit counters
    live in the ``rate_limit_status`` payload. Use the ``bySearch`` /
    ``byUser`` factories to obtain a concrete getter.
    """

    # collect()/checkLimit() give up once more than this many consecutive
    # HTTP 503 responses have already been retried.
    _MAX_503_RETRIES = 10
    # Seconds from "now" handed to waitUntilReset() after a 503.
    _RETRY_DELAY_SEC = 30

    def __init__(self):
        self.session = OAuth1Session(CONSUMER_KEY, CONSUMER_SECRET, ACCESS_TOKEN, ACCESS_TOKEN_SECRET)

    @abstractmethod
    def specifyUrlAndParams(self, keyword=None):
        """Return ``(url, params)`` for the first request.

        ``collect`` calls this with no arguments and mutates the returned
        ``params`` dict. ``keyword`` is unused and kept only so the historical
        signature stays accepted.
        """

    @abstractmethod
    def pickupTweet(self, res_text, includeRetweet=None):
        """Return the list of tweet dicts contained in a decoded response.

        ``collect`` passes only ``res_text`` (the JSON-decoded body).
        ``includeRetweet`` is unused and kept only so the historical
        signature stays accepted.
        """

    @abstractmethod
    def getLimitContext(self, res_text):
        """Return ``(remaining, reset)`` from a decoded rate-limit payload.

        ``remaining`` is compared against 0 and ``reset`` is handed to
        ``waitUntilReset`` as an epoch timestamp in seconds.
        """

    def _getWithRetry(self, url, params=None):
        """GET ``url``, retrying on HTTP 503; return the 200 response.

        Raises:
            Exception: on any status other than 200, or when 503 keeps
                coming back after the allowed number of retries.
        """
        unavailableCnt = 0
        while True:
            res = self.session.get(url, params=params)
            if res.status_code == 503:
                if unavailableCnt > self._MAX_503_RETRIES:
                    raise Exception('Twitter API error %d' % res.status_code)

                unavailableCnt += 1
                print('Service Unavailable 503')
                self.waitUntilReset(time.time() + self._RETRY_DELAY_SEC)
                continue

            if res.status_code != 200:
                raise Exception('Twitter API error %d' % res.status_code)

            return res

    def collect(self, total = -1, onlyText = False, includeRetweet = False):
        """Yield tweets page by page until the endpoint returns none.

        Args:
            total: stop after yielding this many tweets; values <= 0 mean
                no limit.
            onlyText: when truthy, yield ``tweet['text']`` instead of the
                whole tweet dict.
            includeRetweet: when falsy, tweets carrying a
                ``retweeted_status`` key are skipped (and not counted).

        Raises:
            Exception: when the API answers with a non-200 status (503 is
                retried first).
        """
        self.checkLimit()

        url, params = self.specifyUrlAndParams()
        params['include_rts'] = 'true' if includeRetweet else 'false'

        cnt = 0
        while True:
            res = self._getWithRetry(url, params)

            tweets = self.pickupTweet(json.loads(res.text))
            if not tweets:
                break

            for tweet in tweets:
                if 'retweeted_status' in tweet and not includeRetweet:
                    continue

                yield tweet['text'] if onlyText else tweet

                cnt += 1
                if cnt % 100 == 0:
                    print('%d件 ' % cnt)

                if total > 0 and cnt >= total:
                    return

            # Next page: only tweets older than the last one on this page.
            params['max_id'] = tweets[-1]['id'] - 1

            if ('X-Rate-Limit-Remaining' in res.headers and 'X-Rate-Limit-Reset' in res.headers):
                if int(res.headers['X-Rate-Limit-Remaining']) == 0:
                    self.waitUntilReset(int(res.headers['X-Rate-Limit-Reset']))
                    self.checkLimit()
            else:
                print('not found  -  X-Rate-Limit-Remaining or X-Rate-Limit-Reset')
                self.checkLimit()

    def checkLimit(self):
        """Block until the endpoint reports a non-zero remaining quota.

        Queries ``application/rate_limit_status`` and, while the subclass's
        ``getLimitContext`` reports ``remaining == 0``, waits until the
        reported reset time and asks again.

        Raises:
            Exception: when the status request fails (see ``_getWithRetry``).
        """
        url = "https://api.twitter.com/1.1/application/rate_limit_status.json"
        while True:
            res = self._getWithRetry(url)
            remaining, reset = self.getLimitContext(json.loads(res.text))
            if remaining != 0:
                break
            self.waitUntilReset(reset)

    def waitUntilReset(self, reset):
        """Sleep until the epoch timestamp ``reset`` plus a 10 second margin.

        A ``reset`` already in the past still sleeps for the 10 second margin.
        """
        seconds = max(reset - time.time(), 0)
        print('\n     =====================')
        print('     == waiting %d sec ==' % seconds)
        print('     =====================')
        sys.stdout.flush()
        time.sleep(seconds + 10)

    @staticmethod
    def bySearch(keyword):
        """Return a getter that collects tweets matching ``keyword``."""
        return TweetsGetterBySearch(keyword)

    @staticmethod
    def byUser(screen_name):
        """Return a getter that collects the timeline of ``screen_name``."""
        return TweetsGetterByUser(screen_name)


# TweetsGetterBySearch
class TweetsGetterBySearch(TweetsGetter):
    """Getter backed by the v1.1 ``search/tweets`` endpoint."""

    def __init__(self, keyword):
        super(TweetsGetterBySearch, self).__init__()
        # Search query sent as the ``q`` parameter.
        self.keyword = keyword

    def specifyUrlAndParams(self):
        """Return the search endpoint URL and its initial query parameters."""
        endpoint = 'https://api.twitter.com/1.1/search/tweets.json?'
        query = {'q': self.keyword, 'count': 100}
        return endpoint, query

    def pickupTweet(self, res_text):
        """Return the entries of ``res_text['statuses']`` as a new list."""
        return list(res_text['statuses'])

    def getLimitContext(self, res_text):
        """Return ``(remaining, reset)`` for ``/search/tweets`` as ints."""
        limits = res_text['resources']['search']['/search/tweets']
        return int(limits['remaining']), int(limits['reset'])

# TweetsGetterByUser
class TweetsGetterByUser(TweetsGetter):
    """Getter backed by the v1.1 ``statuses/user_timeline`` endpoint."""

    def __init__(self, screen_name):
        super(TweetsGetterByUser, self).__init__()
        # Account whose timeline is requested.
        self.screen_name = screen_name

    def specifyUrlAndParams(self):
        """Return the timeline endpoint URL and its initial query parameters."""
        endpoint = 'https://api.twitter.com/1.1/statuses/user_timeline.json'
        query = {'screen_name': self.screen_name, 'count': 200}
        return endpoint, query

    def pickupTweet(self, res_text):
        """Extract the tweets from ``res_text`` and return them as a list."""
        return list(res_text)

    def getLimitContext(self, res_text):
        """Return ``(remaining, reset)`` for ``/statuses/user_timeline`` as ints.

        Used to read the rate-limit state (checked at start-up).
        """
        limits = res_text['resources']['statuses']['/statuses/user_timeline']
        return int(limits['remaining']), int(limits['reset'])


# DataFrame
# Columns written to the CSV, in order. Switch to the second list to export
# user information only; the row-building code below adapts automatically.
columns = ['created_at', 'text', 'screen_name', 'username', 'description', 'follower', 'following']
# columns = ['screen_name', 'username', 'description', 'follower', 'following']


# Fetch by keyword: put the search query here.
getter = TweetsGetter.bySearch('#####')


# Alternatively, fetch one user's timeline (by screen_name).
# getter = TweetsGetter.byUser('#####')

# Accumulate plain dicts and build the DataFrame once at the end.
# DataFrame.append copied the whole frame on every call (quadratic in the
# number of tweets) and no longer exists in pandas >= 2.0.
rows = []
for tweet in getter.collect(total = 1000000):
    user = tweet['user']
    rows.append({
        'created_at': tweet['created_at'],
        'text': tweet['text'],
        'screen_name': user['screen_name'],
        'username': user['name'],
        'description': user['description'],
        'follower': user['followers_count'],
        'following': user['friends_count'],
    })

# ``columns`` both orders the output and drops any keys not listed in it.
df = pd.DataFrame(rows, columns=columns)


# Export the CSV: set the output file name here (extension must be .csv).
# utf-8-sig writes a BOM at the start of the file.
filename = '#####.csv'
df.to_csv(filename, encoding = 'utf-8-sig')

In [None]:
# Download the CSV to the local machine; enter the same file name that was
# set above (extension must be .csv).
from google.colab import files
files.download('#####.csv')