# Twitter scraping using Tweepy

Will scrape using a registered application I have - I guess there is a limit but we don't need too many Tweets.

In [2]:
import datetime
import pandas as pd
import pickle
import tweepy

def scrape_tweets(search_term, language, consumer_key, consumer_secret, access_token, access_secret, max_tweets = 10000, output_directory = '', csv_name = None, pickle_name = None, proxy = None):
    '''
    Wrapper for scraping the search results for some key term on Twitter.
    - search_term is what you want to search
    - language is the search language, e.g. "es" or "en"
    - consumer key, secret, access token and secret are the API keys for the search
    - max_tweets is the maximum number of Tweets to be returned - defaulting to 10 000 (NOTE: if cannot get enough, will just return the number it found)
    - output directory is the directory in which you wish to save - note that can just override this by declaring it in the csv name
    - csv_name is the name of the CSV of the data that will be saved at the end - if None then will save a timestamp like file
    - pickle_name is the name of the pickle of search results that is saved
    - proxy is the proxy you need to connect through
    '''
    # First will override the output directory to get it out of the way
    if output_directory:
        if '/' in csv_name:
            output_directory = ''
    
    # First have to set up authentication
    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_secret)
    
    # Then have to declare the API with that authentification as well as proxy if we have it
    if proxy:
        api = tweepy.API(auth, proxy = proxy)
    else:
        api = tweepy.API(auth)
    
    # We declare a cursor to be able to retreive a large number of Tweets
    tweepy_cursor = tweepy.Cursor(api.search, q = search_term).items(max_tweets)
    
    # Then we have to step through to get the results
    search_results = []
    i = 0
    for result in tweepy_cursor:
        if i % max_tweets // 5 == 0:
            print('Have processed {} of {} tweets'.format(i, max_tweets))
        
        # Try to connect - if we are rejected will sleep for 10 seconds
        try:
            search_results.append(result)
            time.sleep(random.uniform(0, 1))
        except tweepy.TweepError:
            time.sleep(10)

        if i == 1:
            print('Have successfully got the first Tweet! Continuing...')
        
        i += 1

    print('Have retreived {} tweets'.format(len(search_results)))
    
    # Save these results to a pickle
    if pickle_name:
        print('Saving pickle...')
        with open(output_directory + pickle_name, 'wb') as pickle_file:
            pickle.dump(search_results, pickle_file)
        
    # Then we build a dataframe of useful stuff - NOTE that this section can easily be modified to add other Tweet objects like the number of replies etc.
    # https://developer.twitter.com/en/docs/tweets/data-dictionary/overview/tweet-object
    tweet_frame = pd.DataFrame([{'text' : tweet.text, 'user' : tweet.user.screen_name, 'is_retweet' : hasattr(tweet, 'retweeted_status'), 'creation_date' : tweet.created_at} for tweet in search_results])
    
    # Drop the duplicates before save - just for security
    tweet_frame.drop_duplicates(inplace = True)
    print('After dropping duplicates now have {} Tweets'.format(tweet_frame.shape[0]))
    
    # And finally save to CSV
    print('Saving CSV...')
    if csv_name:
        tweet_frame.to_csv(output_directory + csv_name, index = False, encoding = 'utf-8')
    else:
        csv_name = datetime.datetime.today().strftime('%Y%m%d') + '_tweets.csv'
        tweet_frame.to_csv(output_directory + csv_name, index = False, encoding = 'utf-8')
    
    return tweet_frame

In [3]:
results = scrape_tweets('Caixabank', 'es', consumer_key, consumer_secret, access_token, access_secret, 500, salida_dir, output_tweets, pickle_name, proxy = proxy)

Have processed 0 of 500 tweets
Have successfully got the first Tweet! Continuing...
Have retreived 500 tweets
After dropping duplicates now have 500 Tweets


In [4]:
results.head()

Unnamed: 0,creation_date,is_retweet,text,user
0,2018-04-03 13:19:33,True,RT @PAH_Sevilla: Hoy nuestro hermano #EduSeQue...,RicardoDubcek
1,2018-04-03 13:19:26,True,RT @7daigua: O REPÚBLICA O AFECTEM LA ECONOMIA...,lmmartim
2,2018-04-03 13:19:00,True,RT @7daigua: Voleu fer caure el Ibex35?\n\nDei...,7daigua
3,2018-04-03 13:18:11,True,"RT @SoydeDERECHAS: @caixabank Así de roñosas ,...",nimur2
4,2018-04-03 13:18:09,True,"RT @SoydeDERECHAS: @caixabank Así de roñosas ,...",mac_canovas
