In [1]:
import os
import pandas as pd
import snscrape.modules.twitter as sntwitter
import numpy as np
import datetime

Social Media Data Source Ingestion
	To pull Twitter posts, this project used the snscrape Python library to pass a query to Twitter's API and execute a search (Git repository: https://github.com/JustAnotherArchivist/snscrape). The Twitter API object, returned as JSON, includes metadata about the tweet such as the date and time of the post, its language, and how many times the post has been liked, retweeted, or commented on. snscrape conditions and flattens the JSON into a Python-readable dictionary. The Twitter API also returns a Twitter geo object, but only when a user opts in to manually geotagging a post or has enabled GPS on the device or browser when posting, so that exact GPS coordinates can be attached (https://developer.twitter.com/en/docs/twitter-api/v1/data-dictionary/object-model/geo). A few practical problems arise with the use of the Twitter geo object, which make its locational information of lesser interest. A user could post about events that occur in a location far from where their device is located, rendering the GPS coordinates irrelevant even if enabled. Twitter passes tagged places as a bounding box, which would require identifying the center point of the polygon to convert it to a point representation. Previous studies have found user geotagging rates of 0.85% (https://doi.org/10.5153/sro.3001) to 2.31% (Huang, Binxuan & Carley, Kathleen M. (2019). A Large-Scale Empirical Study of Geotagging Behavior on Twitter. In Proceedings of the 2019 IEEE/ACM International Conference on Advances in Social Networks Analysis and Mining, [pdf]). As such, this study will not use the Twitter geo object to determine the relevant location but will instead leverage a geoparser to identify text-based locational information.


In [3]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Mar  1 10:40:16 2021

@author: Arbo
"""

def twittsearch(text_query,since_date,until_date,tweetcount):
    """Scrape tweets matching a query and return the filtered results.

    The search string is built as
    ``<text_query> since:<since_date> until:<until_date> filter:has_engagement``
    and handed to ``sntwitter.TwitterSearchScraper``. Progress counts are
    printed after each filtering step.

    Parameters
    ----------
    text_query : str
        Search terms, inserted verbatim at the start of the query.
    since_date, until_date : str
        Values placed after the ``since:`` and ``until:`` operators.
    tweetcount : int
        Maximum number of tweets to collect from the scraper.

    Returns
    -------
    pandas.DataFrame
        Columns ``Datetime``, ``TweetId``, ``Text``, ``Language``, ``RTFrom``
        and ``QTFrom``; de-duplicated on ``TweetId`` and restricted to rows
        whose ``Language`` is ``'en'`` and whose ``RTFrom`` and ``QTFrom``
        are both missing.
    """
    query = '{0} since:{1} until:{2} filter:has_engagement'.format(text_query, since_date, until_date)
    print(query)

    # Using TwitterSearchScraper to scrape data and append tweets to list
    tweets_list = []
    for i, tweet in enumerate(sntwitter.TwitterSearchScraper(query).get_items()):
        # `i` is zero-based, so `>=` is required to stop at exactly
        # `tweetcount` tweets (`>` would let one extra tweet through).
        if i >= tweetcount:
            break
        tweets_list.append([tweet.date, tweet.id, tweet.content,
                            tweet.lang, tweet.retweetedTweet, tweet.quotedTweet])

    # Creating a dataframe from the tweets list above
    tweets_df = pd.DataFrame(tweets_list,
                             columns=['Datetime', 'TweetId', 'Text',
                                      "Language", "RTFrom", "QTFrom"])

    print("found {} tweets ranging from {} to {}".format(len(tweets_df),
                                                            tweets_df.Datetime.min(),tweets_df.Datetime.max()))

    print("dropping duplicates")
    tweets_df = tweets_df.drop_duplicates(subset=['TweetId'])
    print("total of tweets now: {}".format(len(tweets_df)))

    print("english only")
    tweets_df = tweets_df[tweets_df["Language"]=='en']
    print("total of tweets now: {}".format(len(tweets_df)))

    # Keep only rows with no retweeted tweet attached.
    tweets_df = tweets_df[tweets_df["RTFrom"].isna()]
    print("total of tweets now: {}".format(len(tweets_df)))

    # Keep only rows with no quoted tweet attached.
    tweets_df = tweets_df[tweets_df["QTFrom"].isna()]
    print("total of tweets now: {} ranging from {} to {}".format(len(tweets_df),
                                                                tweets_df.Datetime.min(),tweets_df.Datetime.max()))

    return tweets_df





In [6]:
# Search terms and scrape window for this run.
text_query = '("forest fire") OR wildfire OR bushfire OR \
(extreme heat) OR (record heat) OR heatwave OR ("heat wave") OR typhoon OR cyclone OR hurricane OR \
tornado OR ("storm surge") OR  blizzard OR snow OR ("ice storm") OR sleet OR thunderstorm OR \
hail OR flood OR flooding OR freeze OR frost OR (extreme cold) OR landslide OR tsunami OR ("tidal wave") OR \
earthquake OR eruption OR volcano OR lava OR lahar OR avalanche OR mudslide OR sinkhole'

since_date = '2021-06-07'
until_date = '2021-06-08'
tweetcount = 150000

# Resolve the output location BEFORE the (slow) scrape: a missing directory
# or an undefined name would otherwise only surface after scraping has
# finished, discarding the results.
try:
    tweet_dir
except NameError:
    # NOTE(review): tweet_dir is not defined in any visible cell; falling back
    # to ./tweets so the notebook survives Restart & Run All. Confirm the
    # intended location.
    tweet_dir = os.path.join(os.getcwd(), 'tweets')
os.makedirs(tweet_dir, exist_ok=True)

tweets_csv_path = os.path.join(
    tweet_dir,
    'tweets_df_{0}_{1}.csv'.format(since_date.replace('-',''), until_date.replace('-','')),
)

scrapestart =  datetime.datetime.now() 
print("Scraping commenced at {}".format(scrapestart))
# =============================================================================
scraped_tweets_df = twittsearch(text_query, since_date, until_date, tweetcount)
scraped_tweets_df.to_csv(tweets_csv_path, index=False)
# =============================================================================
scrapend =  datetime.datetime.now()
print("Scraping ended at {}".format(scrapend))
print("Scraping time {}".format(scrapend-scrapestart))

Scraping commenced at 2021-07-14 23:02:03.098863
("forest fire") OR wildfire OR bushfire OR (extreme heat) OR (record heat) OR heatwave OR ("heat wave") OR typhoon OR cyclone OR hurricane OR tornado OR ("storm surge") OR  blizzard OR snow OR ("ice storm") OR sleet OR thunderstorm OR hail OR flood OR flooding OR freeze OR frost OR (extreme cold) OR landslide OR tsunami OR ("tidal wave") OR earthquake OR eruption OR volcano OR lava OR lahar OR avalanche OR mudslide OR sinkhole since:2021-07-04 until:2021-07-05 filter:has_engagement
found 64326 tweets ranging from 2021-07-04 00:00:00+00:00 to 2021-07-04 23:59:59+00:00
dropping duplicates
total of tweets now: 64326
english only
total of tweets now: 34134
total of tweets now: 34134
total of tweets now: 31630 ranging from 2021-07-04 00:00:00+00:00 to 2021-07-04 23:59:59+00:00
Scraping ended at 2021-07-14 23:13:06.883567
Scraping time 0:11:03.784704
