In [14]:
# You can store secrets in a file or in the form of environment variables during production.
# NEVER store keys directly on notebook.

with open('secrets.txt', 'r') as file:
    data = file.read().split('\n')

import tweepy 
import pandas as pd

df = []

consumer_key = data[0]
consumer_secret = data[1]
access_key = data[2]
access_secret = data[3]

In [15]:
def get_user_tweets(screen_name, num=0):
    #Twitter only allows access to a users most recent 3000 tweets with this method
    num = 3000 if num > 3000 else num
    max_num_per_call = 200

    #authorize twitter, initialize tweepy
    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_key, access_secret)
    api = tweepy.API(auth)
    
    #initialize a list to hold all the tweepy Tweets
    alltweets = []    
    
    #make initial request for most recent tweets (200 is the maximum allowed count)
    curr_count = max_num_per_call if num > max_num_per_call else num
    num -= curr_count

    new_tweets = api.user_timeline(screen_name=screen_name, count=curr_count)
    
    tweet = new_tweets[0]
    print(f"Location of username {screen_name} is: {tweet.user.location}\n")
    
    #save most recent tweets
    alltweets.extend(new_tweets)
    
    #save the id of the oldest tweet less one
    oldest = alltweets[-1].id - 1

    print(f"{len(alltweets)} tweets downloaded so far")
    
    #keep grabbing tweets until there are no tweets left to grab
    while num > 0:
        print(f"Getting tweets before {oldest}")
        
        curr_count = max_num_per_call if num > max_num_per_call else num

        #all subsiquent requests use the max_id param to prevent duplicates
        new_tweets = api.user_timeline(screen_name = screen_name, count=curr_count, max_id=oldest)
        num -= curr_count
        
        #save most recent tweets
        alltweets.extend(new_tweets)
        
        #update the id of the oldest tweet less one
        oldest = alltweets[-1].id - 1
        
        print(f"{len(alltweets)} tweets downloaded so far")
    
    #transform the tweepy tweets into a 2D array that will populate the csv    
    outtweets = [[tweet.id_str, tweet.created_at, tweet.text.encode("utf-8")] for tweet in alltweets]
    df = pd.DataFrame(outtweets, columns=["id","created_at","text"])
    df.to_csv(f"user_{screen_name}.csv", index=False)
    print(df.head())

In [16]:
states = {
    'AL': 'Alabama',
    'AK': 'Alaska',
    'AZ': 'Arizona',
    'AR': 'Arkansas',
    'CA': 'California',
    'CO': 'Colorado',
    'CT': 'Connecticut',
    'DE': 'Delaware',
    'DC': 'District of Columbia',
    'FL': 'Florida',
    'GA': 'Georgia',
    'HI': 'Hawaii',
    'ID': 'Idaho',
    'IL': 'Illinois',
    'IN': 'Indiana',
    'IA': 'Iowa',
    'KS': 'Kansas',
    'KY': 'Kentucky',
    'LA': 'Louisiana',
    'ME': 'Maine',
    'MD': 'Maryland',
    'MA': 'Massachusetts',
    'MI': 'Michigan',
    'MN': 'Minnesota',
    'MS': 'Mississippi',
    'MO': 'Missouri',
    'MT': 'Montana',
    'NE': 'Nebraska',
    'NV': 'Nevada',
    'NH': 'New Hampshire',
    'NJ': 'New Jersey',
    'NM': 'New Mexico',
    'NY': 'New York',
    'NC': 'North Carolina',
    'ND': 'North Dakota',
    'OH': 'Ohio',
    'OK': 'Oklahoma',
    'OR': 'Oregon',
    'PA': 'Pennsylvania',
    'RI': 'Rhode Island',
    'SC': 'South Carolina',
    'SD': 'South Dakota',
    'TN': 'Tennessee',
    'TX': 'Texas',
    'UT': 'Utah',
    'VT': 'Vermont',
    'VA': 'Virginia',
    'WA': 'Washington',
    'WV': 'West Virginia',
    'WI': 'Wisconsin',
    'WY': 'Wyoming'
}

In [17]:
def get_query_tweets(query, num=0):
    #num = 3000 if num > 3000 else num
    max_num_per_call = 100

    #authorize twitter, initialize tweepy
    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_key, access_secret)
    api = tweepy.API(auth)
    
    #initialize a list to hold all the tweepy Tweets
    alltweets = []    
    
    #make initial request for most recent tweets (200 is the maximum allowed count)
    curr_count = max_num_per_call if num > max_num_per_call else num
    num -= curr_count

    new_tweets = api.search(q=query, count=curr_count)
    
    #save most recent tweets
    alltweets.extend(new_tweets)
    
    #save the id of the oldest tweet less one
    oldest = alltweets[-1].id - 1
    
    print(f"{len(alltweets)} tweets downloaded so far")

    #keep grabbing tweets until there are no tweets left to grab
    while num > 0:
        print(f"Getting tweets before {oldest}")
        
        curr_count = max_num_per_call if num > max_num_per_call else num

        #all subsiquent requests use the max_id param to prevent duplicates
        new_tweets = api.search(q=query, count=curr_count, max_id=oldest)
        num -= curr_count
        
        #save most recent tweets
        alltweets.extend(new_tweets)
        
        #update the id of the oldest tweet less one
        oldest = alltweets[-1].id - 1
        
        print(f"{len(alltweets)} tweets downloaded so far")
    
    #transform the tweepy tweets into a 2D array that will populate the csv    
    outtweets = [[tweet.id_str, tweet.created_at, tweet.user.verified, tweet.user.screen_name, tweet.text.encode("utf-8"),\
                  tweet.user.location] for tweet in alltweets]
    df = pd.DataFrame(outtweets, columns=["id", "created_at", 'verified', 'username', "text",\
                                          "location"])
    df.to_csv(f"query_{query}.csv", index=False)
    
    return df

In [18]:
# pass in the search query
df_new = get_query_tweets("Super Bowl", 5000)
df_new.head()

100 tweets downloaded so far
Getting tweets before 1224042709477781503
200 tweets downloaded so far
Getting tweets before 1224042684999925760
300 tweets downloaded so far
Getting tweets before 1224042660366835714
400 tweets downloaded so far
Getting tweets before 1224042635603648511
500 tweets downloaded so far
Getting tweets before 1224042610320445441
600 tweets downloaded so far
Getting tweets before 1224042587943624703
700 tweets downloaded so far
Getting tweets before 1224042564883382272
800 tweets downloaded so far
Getting tweets before 1224042544213962759
900 tweets downloaded so far
Getting tweets before 1224042522630139904
1000 tweets downloaded so far
Getting tweets before 1224042500911980543
1100 tweets downloaded so far
Getting tweets before 1224042476366942210
1200 tweets downloaded so far
Getting tweets before 1224042455562981376
1300 tweets downloaded so far
Getting tweets before 1224042432569974788
1400 tweets downloaded so far
Getting tweets before 1224042412022079487
1

Unnamed: 0,id,created_at,verified,username,text,location
0,1224042733859409921,2020-02-02 18:51:44,False,pnix49,b'RT @maxcredits: Remember when the Super Bowl...,TN
1,1224042733712629760,2020-02-02 18:51:43,False,kevjones5,b'RT @finalfourcast: Head on over to our Insta...,"morgantown,wv"
2,1224042733658046465,2020-02-02 18:51:43,False,kewannadabrat,b'RT @kend0llass: this the first year i could ...,
3,1224042733326622721,2020-02-02 18:51:43,False,tinowilliams23,b'High five \xf0\x9f\x99\x8b\xf0\x9f\x8f\xbd\x...,
4,1224042733314224129,2020-02-02 18:51:43,False,ThisIsAnch,b'I can\xe2\x80\x99t wait until this day next ...,"Jacksonville, FL"


In [19]:
import re


In [21]:
#df_full = df.append(df_new, ignore_index = True)
df_full = df_new


In [23]:
df_full.drop_duplicates().to_csv('super_bowl_tweets.csv')