In [1]:
pip install twitterscraper

Note: you may need to restart the kernel to use updated packages.


In [3]:
#Import core function modules
import sys
import pandas as pd
import numpy as np
import datetime
import os #Library module for .csv file check
from twitterscraper import query_tweets

#---------------------------------------------------------------------

#This is a sample list of terms
# Search terms that are run, one query each, for every configured location.
# NOTE(review): '#quarantine'/'#Quarantine' and 'n95'/'N95' differ only in case; if the
# search backend is case-insensitive they would return duplicate tweets -- confirm.
query_list = [
    # Plain keywords for the disease itself
    "COVID",
    "COVID-19",
    "Corona",
    "Coronavirus",
    "Quarantine",
    # Hashtags
    "#COVID",
    "#COVID-19",
    "#quarantine",
    "#Quarantine",
    "#covid19",
    "#socialdistancingnyc",
    "#socialdistancing",
    # Related vocabulary
    "asymptomatic",
    "contaminated",
    "spread",
    "essential",
    "medical",
    "mask",
    "n95",
    "N95",
]
#--------------------------------------------------------------------

#This is a sample filled-out custom parameter dictionary.
# Each top-level key names one search area. The inner dictionary supplies every argument
# that get_query_dataframe_cp forwards to get_tweets_geoloc:
#   city                 -- label written to the 'city' column of each scraped tweet
#   lat / long           -- centre of the search circle, decimal degrees
#   radius               -- search radius including its unit, e.g. '10mi'
#   queries              -- list of search terms to run for this area
#   start_year/month/day -- earliest tweet date to request
custom_params = {
    'Brooklyn': {
        'city': 'Brooklyn',
        'lat': 40.650002,
        'long': -73.949997,
        'radius': '10mi',
        'queries': query_list,
        'start_year': 2020,
        'start_month': 2,
        'start_day': 1,
    },
    'Manhattan': {
        'city': 'Manhattan',
        'lat': 40.758896,
        'long': -73.985130,
        'radius': '10mi',
        'queries': query_list,
        'start_year': 2020,
        'start_month': 2,
        'start_day': 1,
    },
    'Queens': {
        'city': 'Queens',
        'lat': 40.742054,
        'long': -73.769417,
        'radius': '10mi',
        'queries': query_list,
        'start_year': 2020,
        'start_month': 2,
        'start_day': 1,
    },
    'Bronx': {
        'city': 'Bronx',
        'lat': 40.829643,
        'long': -73.926175,
        'radius': '10mi',
        'queries': query_list,
        'start_year': 2020,
        'start_month': 2,
        'start_day': 1,
    },
    'Hoboken': {
        'city': 'Hoboken',
        'lat': 40.7440,
        'long': -74.0324,
        'radius': '10mi',
        'queries': query_list,
        'start_year': 2020,
        'start_month': 2,
        'start_day': 1,
    },
    'Jersey City': {
        'city': 'Jersey City',
        'lat': 40.7178,
        'long': -74.0431,
        'radius': '10mi',
        'queries': query_list,
        'start_year': 2020,
        'start_month': 2,
        'start_day': 1,
    },
    'Westchester': {
        'city': 'Westchester Co.',
        'lat': 41.1220,
        'long': -73.7949,
        'radius': '20mi',
        'queries': query_list,
        'start_year': 2020,
        'start_month': 2,
        'start_day': 1,
    },
    'East Hampton': {
        'city': 'East Hampton',
        'lat': 40.9634,
        'long': -72.1848,
        'radius': '20mi',
        'queries': query_list,
        'start_year': 2020,
        'start_month': 2,
        'start_day': 1,
    },
    'NYC': {
        'city': 'NYC',
        # Previously this entry reused the East Hampton coordinates (40.9634, -72.1848),
        # so the "NYC" search was centred on East Hampton. These are the coordinates
        # of New York City itself.
        'lat': 40.7128,
        'long': -74.0060,
        'radius': '35mi',
        'queries': query_list,
        'start_year': 2020,
        'start_month': 1,  # NOTE(review): every other area starts in month 2 -- confirm this is intended
        'start_day': 1,
    },
}

#Credit to Danielle Medellin, DSI11-NYC for assisting in implementation of custom parameter dictionary support
#--------------------------------------------------------------------

#Get tweets without geolocation
def get_tweets(query, year, month, day):
    """Scrape tweets matching ``query`` from a start date onward, without geolocation.

    Parameters
    ----------
    query : str
        Search text handed unchanged to ``query_tweets``.
    year, month, day : int
        Components of the ``begindate`` passed to ``query_tweets``.

    Returns
    -------
    dict
        Maps a running integer index (0, 1, 2, ...) to one dictionary per tweet with
        the keys ``tweet_id``, ``username``, ``text``, ``tweet_date``, ``search_term``,
        ``city``, ``lat``, ``long``, ``radius`` and ``query_start``. The integer keys
        become the row index when the result is turned into a DataFrame.

    Raises
    ------
    ValueError
        If ``year``/``month``/``day`` do not form a valid calendar date.
    """
    begin_date = datetime.date(year, month, day)
    # The start date is the same for every tweet of this query, so build it once.
    query_start = pd.to_datetime(begin_date)

    tweets = {}
    for index, tweet in enumerate(query_tweets(query, begindate=begin_date)):
        tweets[index] = {
            'tweet_id': tweet.tweet_id,
            'username': tweet.username,
            'text': tweet.text,
            'tweet_date': tweet.timestamp,
            'search_term': query,
            # Location fields are NaN (not empty strings) so that they are flagged as
            # missing values during later cleaning. `np.nan` is used because the
            # `np.NaN` alias no longer exists in NumPy 2.0 and later.
            'city': np.nan,
            'lat': np.nan,
            'long': np.nan,
            'radius': np.nan,
            'query_start': query_start,
        }
    return tweets
#--------------------------------------------------------------------

#Get tweets with geolocation
def get_tweets_geoloc(query, city, lat, long, radius, year, month, day):
    """Scrape tweets matching ``query`` from a start date onward, restricted by geocode.

    The search text sent to ``query_tweets`` is ``query`` followed by
    ``, geocode:<lat>,<long>,<radius>``.

    Parameters
    ----------
    query : str
        Search term; also stored in each record's ``search_term`` field.
    city : str
        Label stored in each record's ``city`` field.
    lat, long : float
        Coordinates placed in the geocode filter and stored in each record.
    radius : str
        Radius with its unit (e.g. ``'10mi'``), placed in the geocode filter and stored.
    year, month, day : int
        Components of the ``begindate`` passed to ``query_tweets``.

    Returns
    -------
    dict
        Maps a running integer index (0, 1, 2, ...) to one dictionary per scraped tweet.
    """
    search_text = f"{query}, geocode:{lat},{long},{radius}"
    scraped = query_tweets(search_text, begindate=datetime.date(year, month, day))
    # enumerate() supplies the running index used as the dictionary key.
    return {
        position: {
            'tweet_id': tweet.tweet_id,
            'username': tweet.username,
            'text': tweet.text,
            'tweet_date': tweet.timestamp,
            'search_term': query,
            'city': city,
            'lat': lat,
            'long': long,
            'radius': radius,
            'query_start': pd.to_datetime(f"{year}/{month}/{day}"),  # start date as a datetime object
        }
        for position, tweet in enumerate(scraped)
    }
#--------------------------------------------------------------------

#Generate dataframe from "tweets" dictionary generated after each query
def make_dataframe(dictionary):
    """Turn a ``{row_index: {column: value}}`` dictionary into a DataFrame.

    Each outer key becomes a row label and each inner dictionary supplies that
    row's column values. The result is a temporary frame meant to be concatenated
    with the frames of other queries.
    """
    return pd.DataFrame.from_dict(dictionary, orient='index')
#--------------------------------------------------------------------

#Query function using custom parameters
def get_query_dataframe_cp(custom_params):
    """Run every configured query for every configured area and combine the results.

    Parameters
    ----------
    custom_params : dict
        Maps an area name to a dictionary with the keys ``city``, ``lat``, ``long``,
        ``radius``, ``queries``, ``start_year``, ``start_month`` and ``start_day``.
        One geolocated scrape is run per area and per entry of ``queries``.

    Returns
    -------
    pandas.DataFrame
        All scraped tweets with a fresh 0..n-1 index, or an empty DataFrame when no
        query returned anything.
    """
    frames = []
    for params in custom_params.values():
        for query in params['queries']:
            tweets = get_tweets_geoloc(
                query,
                params['city'],
                params['lat'],
                params['long'],
                params['radius'],
                params['start_year'],
                params['start_month'],
                params['start_day'],
            )
            frame = make_dataframe(tweets)
            # Skip empty results: pandas deprecates letting empty frames take part in
            # concat dtype inference, and they contribute no rows anyway.
            if not frame.empty:
                frames.append(frame)

    if not frames:
        return pd.DataFrame()
    # Concatenate once at the end; concatenating inside the loop re-copies every
    # previously collected row on each iteration.
    return pd.concat(frames, ignore_index=True)
#Credit Danielle Medellin for the majority of this code block section
#---------------------------------------------------------------

#Query function with geolocation but no custom parameters
def get_query_dataframe_geo(list_of_queries, city, lat, long, radius, year, month, day):
    """Run each query with one shared geolocation and start date and combine the results.

    Parameters
    ----------
    list_of_queries : iterable of str
        Search terms; one geolocated scrape is run per term.
    city, lat, long, radius, year, month, day
        Forwarded unchanged to ``get_tweets_geoloc`` for every term.

    Returns
    -------
    pandas.DataFrame
        All scraped tweets with a fresh 0..n-1 index, or an empty DataFrame when no
        query returned anything.
    """
    frames = []
    for query in list_of_queries:
        tweets = get_tweets_geoloc(query, city, lat, long, radius, year, month, day)
        frame = make_dataframe(tweets)
        # Skip empty results: pandas deprecates letting empty frames take part in
        # concat dtype inference, and they contribute no rows anyway.
        if not frame.empty:
            frames.append(frame)

    if not frames:
        return pd.DataFrame()
    # Concatenate once at the end; concatenating inside the loop re-copies every
    # previously collected row on each iteration.
    return pd.concat(frames, ignore_index=True)
#-------------------------------------------------------------------

#Query function with no custom anything
def get_query_dataframe(list_of_queries, year, month, day):
    """Run each query without geolocation from one start date and combine the results.

    Parameters
    ----------
    list_of_queries : iterable of str
        Search terms; one scrape is run per term.
    year, month, day : int
        Start date components forwarded unchanged to ``get_tweets``.

    Returns
    -------
    pandas.DataFrame
        All scraped tweets with a fresh 0..n-1 index, or an empty DataFrame when no
        query returned anything.
    """
    frames = []
    for query in list_of_queries:
        tweets = get_tweets(query, year, month, day)
        frame = make_dataframe(tweets)
        # Skip empty results: pandas deprecates letting empty frames take part in
        # concat dtype inference, and they contribute no rows anyway.
        if not frame.empty:
            frames.append(frame)

    if not frames:
        return pd.DataFrame()
    # Concatenate once at the end; concatenating inside the loop re-copies every
    # previously collected row on each iteration.
    return pd.concat(frames, ignore_index=True)
#------------------------------------------------------------------

#Master function
def _ask_yes_no(prompt):
    """Show ``prompt`` and return True only for a 'y' reply (case and surrounding spaces ignored)."""
    return input(prompt).strip().lower() == 'y'


def _prompt_csv_name(folder):
    """Ask for an export file name until it is unused in ``folder`` or the user accepts overwriting."""
    name = input("Input CSV export file name:")
    # Re-check after every new name: a replacement name may itself belong to an
    # existing file, which must not be overwritten without asking.
    while os.path.exists(os.path.join(folder, f"{name}.csv")):
        if _ask_yes_no("File already exists--do you want to overwrite? (y/n)"):
            break
        name = input("Input new output file name:")
    return name


def get_dataset():
    """Interactively configure a scrape, run it, and export or return the result.

    All prompts are answered before any scraping starts. The user chooses between:

    * the module-level ``custom_params`` dictionary (one geolocated scrape per area
      and per query), or
    * the module-level ``query_list``, optionally with a single geolocation and
      optionally with a custom start date (default start date: 2019-12-01).

    Returns
    -------
    pandas.DataFrame or None
        The scraped dataset when the user declines the CSV export. When the export is
        chosen, the dataset is written to ``datasets/<name>.csv``, a summary line is
        printed, and ``None`` is returned.

    Raises
    ------
    ValueError
        If a latitude, longitude or date component cannot be converted to a number.
    """
    export_folder = 'datasets'

    # Main function switches
    use_custom_params = _ask_yes_no("Are you using a custom parameter dictionary? (y/n)")
    use_geo = False
    if not use_custom_params:
        # Geolocation parameter configuration
        use_geo = _ask_yes_no("Are you using geolocation? (y/n)")
        if use_geo:
            lat = float(input("Input Latitude:"))
            long = float(input("Input Longitude:"))
            city = input("Input city or neighborhood corresponding to coordinates:")
            radius = input("Input radius and unit:")

        # Date parameter configuration
        if _ask_yes_no("Using custom query start? (y/n)"):
            year = int(input("Input start year YYYY:"))
            month = int(input("Input start month MM:"))
            day = int(input("Input start day DD:"))
        else:
            # Default date parameters. Users can change defaults here
            year, month, day = 2019, 12, 1

    # CSV export parameter configuration
    export_csv = _ask_yes_no("Do you want to export the final dataframe to csv? (y/n)")
    if export_csv:
        csv_name = _prompt_csv_name(export_folder)

    # Query parameter switch block
    if use_custom_params:
        dataset = get_query_dataframe_cp(custom_params)
    elif use_geo:
        dataset = get_query_dataframe_geo(query_list, city, lat, long, radius, year, month, day)
    else:
        dataset = get_query_dataframe(query_list, year, month, day)

    # CSV export switch block
    if export_csv:
        os.makedirs(export_folder, exist_ok=True)  # no error if the folder already exists
        dataset.to_csv(os.path.join(export_folder, f"{csv_name}.csv"), index=False)
        print(f"Export complete, scraped {len(dataset.index)} tweets")
        return None
    return dataset

In [5]:
get_dataset()

INFO: queries: ['COVID, geocode:40.650002,-73.949997,10mi since:2020-02-01 until:2020-02-06', 'COVID, geocode:40.650002,-73.949997,10mi since:2020-02-06 until:2020-02-11', 'COVID, geocode:40.650002,-73.949997,10mi since:2020-02-11 until:2020-02-16', 'COVID, geocode:40.650002,-73.949997,10mi since:2020-02-16 until:2020-02-21', 'COVID, geocode:40.650002,-73.949997,10mi since:2020-02-21 until:2020-02-26', 'COVID, geocode:40.650002,-73.949997,10mi since:2020-02-26 until:2020-03-02', 'COVID, geocode:40.650002,-73.949997,10mi since:2020-03-02 until:2020-03-07', 'COVID, geocode:40.650002,-73.949997,10mi since:2020-03-07 until:2020-03-12', 'COVID, geocode:40.650002,-73.949997,10mi since:2020-03-12 until:2020-03-17', 'COVID, geocode:40.650002,-73.949997,10mi since:2020-03-17 until:2020-03-22', 'COVID, geocode:40.650002,-73.949997,10mi since:2020-03-22 until:2020-03-27', 'COVID, geocode:40.650002,-73.949997,10mi since:2020-03-27 until:2020-04-01', 'COVID, geocode:40.650002,-73.949997,10mi since: