# Get tweets with given hashtag from Twitter

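For a user-specified hashtag, this notebook grabs tweets from the Twitter search API and dumps them into a data folder as timestamped CSV files, one file per batch of requests (four batches per run). It also unfolds the `user` and `entities` parts of the returned data into their own columns, to give you nice clean data columns.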

This code runs for a little over an hour at a time; after that you'll need to rerun the last cell to get more data. A better way to do this is to save the code as a .py file and run it from a cron job.

In [1]:
# Set up the link to Twitter
import twitter
import pandas as pd
import time
import datetime
import re


# Read the Twitter credentials from the first line of the secrets file.
# That line is expected to hold four comma-separated values, in this order:
# access token, access token secret, consumer key, consumer secret.
# The context manager closes the file handle as soon as the line is read.
with open('../../../../../../twittersecrets.txt', 'r') as fsecret:
    secrets = fsecret.readline()

# Strip stray whitespace around each field so "a, b, c, d" works as well as "a,b,c,d".
# Unpacking raises ValueError if the line does not contain exactly four fields.
ACCESS_TOKEN, ACCESS_TOKEN_SECRET, CONSUMER_KEY, CONSUMER_SECRET = [
    field.strip() for field in secrets.strip().split(',')
]

# Build the authenticated API client used by the cells below.
auth = twitter.oauth.OAuth(ACCESS_TOKEN, ACCESS_TOKEN_SECRET, CONSUMER_KEY, CONSUMER_SECRET)
ta = twitter.Twitter(auth=auth)

In [2]:
# Fetch the first page of search results for the hashtag and flatten it.
hashtag = '#qanon'
tweetjson = ta.search.tweets(q=hashtag)
dftweet = pd.DataFrame(tweetjson['statuses'])

# The 'user' and 'entities' columns hold one dict per tweet; turn each dict into
# its own set of prefixed columns and attach them alongside the raw columns.
dftweet = pd.concat(
    [dftweet]
    + [dftweet[column].apply(pd.Series).add_prefix(prefix)
       for column, prefix in (('user', 'user_'), ('entities', 'entity_'))],
    axis=1,
)

# Paging cursor: one less than the id of the last row on this page, so that the
# next search (which passes this value as max_id) starts strictly below it.
# The last row is presumably the oldest tweet - the page appears to be ordered
# newest first.
maxid = dftweet['id'].iloc[-1] - 1
dftweet

Unnamed: 0,contributors,coordinates,created_at,entities,favorite_count,favorited,geo,id,id_str,in_reply_to_screen_name,...,user_statuses_count,user_time_zone,user_translator_type,user_url,user_utc_offset,user_verified,entity_hashtags,entity_symbols,entity_urls,entity_user_mentions
0,,,Fri Mar 30 04:54:02 +0000 2018,"{'hashtags': [{'text': 'TrustSessions', 'indic...",0,False,,979582501432774658,979582501432774658,,...,8127,,none,,,False,"[{'text': 'TrustSessions', 'indices': [64, 78]...",[],[],"[{'screen_name': '2runtherace', 'name': 'Runni..."
1,,,Fri Mar 30 04:54:02 +0000 2018,"{'hashtags': [{'text': 'HRCVideo', 'indices': ...",0,False,,979582499193020416,979582499193020416,,...,138167,Eastern Time (US & Canada),none,http://t.co/JYto7WCcQ8,-14400.0,False,"[{'text': 'HRCVideo', 'indices': [72, 81]}, {'...",[],[],"[{'screen_name': 'MichelleRineh12', 'name': 'M..."
2,,,Fri Mar 30 04:54:02 +0000 2018,"{'hashtags': [], 'symbols': [], 'user_mentions...",0,False,,979582498823917569,979582498823917569,,...,21284,Central Time (US & Canada),none,,-18000.0,False,[],[],[],"[{'screen_name': 'SBelle1950', 'name': 'Southe..."
3,,,Fri Mar 30 04:54:02 +0000 2018,"{'hashtags': [], 'symbols': [], 'user_mentions...",0,False,,979582498354204672,979582498354204672,,...,50291,America/Detroit,none,,-14400.0,False,[],[],[],"[{'screen_name': 'IWillRedPillU', 'name': 'Red..."
4,,,Fri Mar 30 04:54:01 +0000 2018,"{'hashtags': [], 'symbols': [], 'user_mentions...",0,False,,979582494956838912,979582494956838912,,...,45509,Eastern Time (US & Canada),none,,-14400.0,False,[],[],[],"[{'screen_name': 'prayingmedic', 'name': 'Pray..."
5,,,Fri Mar 30 04:53:55 +0000 2018,"{'hashtags': [{'text': 'TrustSessions', 'indic...",0,False,,979582470428360706,979582470428360706,,...,3857,Alaska,none,,-28800.0,False,"[{'text': 'TrustSessions', 'indices': [64, 78]...",[],[],"[{'screen_name': '2runtherace', 'name': 'Runni..."
6,,,Fri Mar 30 04:53:55 +0000 2018,"{'hashtags': [{'text': 'JeffSessions', 'indice...",0,False,,979582469304287232,979582469304287232,,...,188108,,none,,,False,"[{'text': 'JeffSessions', 'indices': [69, 82]}]",[],[],"[{'screen_name': 'bocavista2016', 'name': 'Boc..."
7,,,Fri Mar 30 04:53:52 +0000 2018,"{'hashtags': [], 'symbols': [], 'user_mentions...",0,False,,979582456356417536,979582456356417536,,...,62359,,none,,,False,[],[],[],"[{'screen_name': 'TNOutlaw2020', 'name': '✝⛪ O..."
8,,,Fri Mar 30 04:53:51 +0000 2018,"{'hashtags': [{'text': 'Qanon', 'indices': [86...",0,False,,979582453957386241,979582453957386241,,...,55170,,none,,,False,"[{'text': 'Qanon', 'indices': [86, 92]}]",[],[],"[{'screen_name': 'jackiec57', 'name': 'jaci c'..."
9,,,Fri Mar 30 04:53:46 +0000 2018,"{'hashtags': [], 'symbols': [], 'user_mentions...",0,False,,979582435070566402,979582435070566402,,...,3785,,none,,,False,[],[],[],"[{'screen_name': 'adjunctprofessr', 'name': 'J..."


In [9]:
# Now work backwards in time, grabbing more data each time.
# Each batch makes up to REQUESTS_PER_BATCH search calls, dumps whatever it
# collected to a timestamped CSV file, then naps before the next batch.
# The whole cell takes a bit over an hour to run.
NUM_BATCHES = 4
REQUESTS_PER_BATCH = 180   # presumably sized to the search API rate-limit window - confirm
NAP_SECONDS = 15 * 60

for j in range(NUM_BATCHES):
    # Start from whatever is already in dftweet (the first page on the first
    # batch, nothing after that). Pages are collected in a list and joined once,
    # rather than re-copying the growing frame on every request.
    pages = [dftweet] if len(dftweet) else []
    exhausted = False

    for i in range(REQUESTS_PER_BATCH):
        try:
            print('{}: grabbing tweets backwards from {}'.format(i, maxid))
            tweetjson = ta.search.tweets(q=hashtag, max_id=maxid)
            statuses = tweetjson['statuses']
        except Exception as err:
            # Catch Exception rather than using a bare except, so that Ctrl-C or
            # a kernel interrupt still stops the run; report what went wrong.
            print('Dammit, twitter timed us out')
            print('  error was: {!r}'.format(err))
            break

        if not statuses:
            # An empty page means there is nothing older to fetch; asking again
            # with the same max_id would only burn requests.
            print('No more tweets returned below id {}'.format(maxid))
            exhausted = True
            break

        pages.append(pd.DataFrame(statuses))
        # Move the cursor just below the lowest id seen on this page.
        maxid = min(status['id'] for status in statuses) - 1
        time.sleep(1)

    if pages:
        dftweet = pd.concat(pages, ignore_index=True)

        # Unfold the nested 'user' and 'entities' dicts into prefixed columns.
        expanded = pd.concat([dftweet['user'].apply(pd.Series).add_prefix('user_'),
                              dftweet['entities'].apply(pd.Series).add_prefix('entity_')], axis=1)
        # The first page may already carry expanded columns from an earlier cell;
        # drop those stale copies so the output has no duplicate column names.
        stale = [col for col in expanded.columns if col in dftweet.columns]
        dftweet = pd.concat([dftweet.drop(columns=stale), expanded], axis=1)

        # Send this batch of tweets to a CSV file named after the current time
        # (every run of non-digits in the timestamp becomes an underscore).
        outfile = '../data/qanon_tweets_{}.csv'.format(re.sub(r'[^\d]+', '_', str(datetime.datetime.now())))
        dftweet.to_csv(outfile, index=False)
        print('{} rows sent to file {}'.format(len(dftweet), outfile))
        print(dftweet[['id', 'created_at']].head())
    else:
        print('No tweets collected in this batch, nothing written')

    # Start the next batch (or the next rerun of this cell) with an empty frame.
    dftweet = pd.DataFrame([])

    if exhausted:
        break

    print('Got tweets, dumped file, napping for 15 mins')
    time.sleep(NAP_SECONDS)

0: grabbing tweets backwards from 979581628086284287
1: grabbing tweets backwards from 979581547346059263
2: grabbing tweets backwards from 979581474725814273
3: grabbing tweets backwards from 979581409051361279
4: grabbing tweets backwards from 979581356832206847
5: grabbing tweets backwards from 979581287496343551
6: grabbing tweets backwards from 979581226435710975
7: grabbing tweets backwards from 979581142109179904
8: grabbing tweets backwards from 979581087025418239
9: grabbing tweets backwards from 979581007111352319
10: grabbing tweets backwards from 979580935497814016
11: grabbing tweets backwards from 979580863070588927
12: grabbing tweets backwards from 979580783932334080
13: grabbing tweets backwards from 979580699865853951
14: grabbing tweets backwards from 979580638528462848
15: grabbing tweets backwards from 979580595331387391
16: grabbing tweets backwards from 979580546618707968
17: grabbing tweets backwards from 979580486908633087
18: grabbing tweets backwards from 979

151: grabbing tweets backwards from 979572567508946944
152: grabbing tweets backwards from 979572529084944385
153: grabbing tweets backwards from 979572475137806335
154: grabbing tweets backwards from 979572403805327359
155: grabbing tweets backwards from 979572333097639936
156: grabbing tweets backwards from 979572289690787839
157: grabbing tweets backwards from 979572253275734015
158: grabbing tweets backwards from 979572213161517056
159: grabbing tweets backwards from 979572153170341887
160: grabbing tweets backwards from 979572104520638463
161: grabbing tweets backwards from 979572052213489663
162: grabbing tweets backwards from 979571995368095748
163: grabbing tweets backwards from 979571944939929599
164: grabbing tweets backwards from 979571891177275391
165: grabbing tweets backwards from 979571807408750591
166: grabbing tweets backwards from 979571752241053695
167: grabbing tweets backwards from 979571692619030527
168: grabbing tweets backwards from 979571654677458943
169: grabb

KeyboardInterrupt: 