In [9]:
# You can store secrets in a file or in the form of environment variables during production.
# NEVER store keys directly on notebook.

with open('secrets.txt', 'r') as file:
    data = file.read().split('\n')

import tweepy 
import pandas as pd

df = []

consumer_key = data[0]
consumer_secret = data[1]
access_key = data[2]
access_secret = data[3]

In [12]:
def get_query_tweets(query, num=0):
    #num = 3000 if num > 3000 else num
    max_num_per_call = 100

    #authorize twitter, initialize tweepy
    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_key, access_secret)
    api = tweepy.API(auth)
    
    #initialize a list to hold all the tweepy Tweets
    alltweets = []    
    
    #make initial request for most recent tweets (200 is the maximum allowed count)
    curr_count = max_num_per_call if num > max_num_per_call else num
    num -= curr_count

    new_tweets = api.search(q=query, count=curr_count)
    
    #save most recent tweets
    alltweets.extend(new_tweets)
    
    #save the id of the oldest tweet less one
    oldest = alltweets[-1].id - 1
    
    print(f"{len(alltweets)} tweets downloaded so far")

    #keep grabbing tweets until there are no tweets left to grab
    while num > 0:
        print(f"Getting tweets before {oldest}")
        
        curr_count = max_num_per_call if num > max_num_per_call else num

        #all subsiquent requests use the max_id param to prevent duplicates
        new_tweets = api.search(q=query, count=curr_count, max_id=oldest)
        num -= curr_count
        
        #save most recent tweets
        alltweets.extend(new_tweets)
        
        #update the id of the oldest tweet less one
        oldest = alltweets[-1].id - 1
        
        print(f"{len(alltweets)} tweets downloaded so far")
    
    #transform the tweepy tweets into a 2D array that will populate the csv    
    outtweets = [[tweet.id_str, tweet.created_at, tweet.user.verified, tweet.user.screen_name,\
                  tweet.text.encode("utf-8"), tweet.user.location, tweet.user.listed_count,\
                  tweet.user.followers_count, tweet.retweet_count] for tweet in alltweets]
    df = pd.DataFrame(outtweets, columns=["id", "created_at", 'verified', 'username', "text",\
                                          "location", "listed_count", "follower_count",\
                                          "retweet_count"])

    df.to_csv(f"query_{query}.csv", index=False)
    
    return df

In [13]:
# pass in the search query
df_new = get_query_tweets("Coronavirus", 5000)
df_new.head()

100 tweets downloaded so far
Getting tweets before 1226604368281075713
200 tweets downloaded so far
Getting tweets before 1226604327143428096
300 tweets downloaded so far
Getting tweets before 1226604283904217087
400 tweets downloaded so far
Getting tweets before 1226604239436337153
500 tweets downloaded so far
Getting tweets before 1226604194288852993
600 tweets downloaded so far
Getting tweets before 1226604162009436160
700 tweets downloaded so far
Getting tweets before 1226604121387606022
800 tweets downloaded so far
Getting tweets before 1226604075980070912
900 tweets downloaded so far
Getting tweets before 1226604030815854591
1000 tweets downloaded so far
Getting tweets before 1226603984879849472
1100 tweets downloaded so far
Getting tweets before 1226603940243963904
1200 tweets downloaded so far
Getting tweets before 1226603894849052671
1300 tweets downloaded so far
Getting tweets before 1226603849638699019
1400 tweets downloaded so far
Getting tweets before 1226603802209312767
1

Unnamed: 0,id,created_at,verified,username,text,location,listed_count,follower_count,retweet_count
0,1226604413164408832,2020-02-09 20:30:55,False,mochihitsugi,b'RT @IGN: The Animal Crossing-themed Nintendo...,hell,0,68,150
1,1226604412426166272,2020-02-09 20:30:55,False,RocktagonBoss,b'RT @EM_KA_17: \xf0\x9f\x92\xa5 SUCCESS again...,"Las Vegas, NV",217,2813,141
2,1226604410786193420,2020-02-09 20:30:55,False,DavidLenlag,b'RT @jenniferatntd: Called a friend in #China...,,0,81,728
3,1226604410765271041,2020-02-09 20:30:55,False,ericammbenitez,b'RT @hanalfabeto: Jimena Bar\xc3\xb3n ya est\...,,6,2145,317
4,1226604410589020161,2020-02-09 20:30:55,False,GailWil87987475,b'RT @marklevinshow: Our new Middle East frien...,,0,218,440


In [15]:
df_full = df_new


In [16]:
df_full.drop_duplicates().to_csv('coronavirus_tweets.csv')