In [2]:
# Library installation on Colab
# !pip install GetOldTweets3
# !pip install unshortenit

Collecting GetOldTweets3
  Downloading https://files.pythonhosted.org/packages/ed/f4/a00c2a7c90801abc875325bb5416ce9090ac86d06a00cc887131bd73ba45/GetOldTweets3-0.0.11-py3-none-any.whl
Collecting pyquery>=1.2.10
  Downloading https://files.pythonhosted.org/packages/78/43/95d42e386c61cb639d1a0b94f0c0b9f0b7d6b981ad3c043a836c8b5bc68b/pyquery-1.4.1-py2.py3-none-any.whl
Collecting cssselect>0.7.9
  Downloading https://files.pythonhosted.org/packages/3b/d4/3b5c17f00cce85b9a1e6f91096e1cc8e8ede2e1be8e96b87ce1ed09e92c5/cssselect-1.1.0-py2.py3-none-any.whl
Installing collected packages: cssselect, pyquery, GetOldTweets3
Successfully installed GetOldTweets3-0.0.11 cssselect-1.1.0 pyquery-1.4.1
Collecting unshortenit
  Downloading https://files.pythonhosted.org/packages/2c/e5/07c5281552f49167229e8c0c21b34353c68235843e53874f09e81a23d269/unshortenit-0.4.0.tar.gz
Building wheels for collected packages: unshortenit
  Building wheel for unshortenit (setup.py) ... [?25l[?25hdone
  Created wheel for uns

In [3]:
import GetOldTweets3 as got
import pandas as pd
import time
import re
import unshortenit

# **Helper Functions**

### Define function that returns a dataframe of top tweets of a keyword over a specified time period

> **Self_nested function in the *except* statement**





In [4]:
def retrieveTweets(keyword, year, startDate, endDate, maxTweets):
  querySearch = keyword
  y = str(year)
  start_date = y+'-'+startDate
  end_date = y+'-'+endDate
  max_tweets = maxTweets
  
  tic = time.perf_counter()
  tweetCriteria = got.manager.TweetCriteria().setQuerySearch(querySearch)\
                                            .setSince(start_date)\
                                            .setUntil(end_date)\
                                            .setTopTweets(True)\
                                            .setMaxTweets(max_tweets)

  tweets_df = pd.DataFrame()
  try:
    tweets = got.manager.TweetManager.getTweets(tweetCriteria)
    toc = time.perf_counter()
    print(tweets[0].text, tweets[0].favorites, tweets[0].date)
    print('retrieving top tweets on {} took {:0.4f} seconds'.format(startDate, toc-tic))
    tweets_content = [[tweet.date, tweet.favorites, tweet.text, tweet.retweets] for tweet in tweets]
    tweets_df = pd.DataFrame(tweets_content, columns = ['Datetime', 'Favorites', 'Text', 'Retweets'])
  except:
    print('+'*30+'\nFAILED TO RETRIEVE TWEETS ON {}, wait 10 minutes. \n'.format(startDate)+'+'*30)
    time.sleep(600)
    tweets_df = retrieveTweets(keyword, year, startDate, endDate, maxTweets)

  return tweets_df

### Define function that returns the url(s) from a tweet




In [5]:
def findUrl(string): 
  regex = r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"

  result = re.findall(regex,string)
  urls = []
  for i in result:
    if i[0].endswith(','):
      urls.append(i[0][:-1])
    else:
      urls.append(i[0])
           
  return urls

### Define function that returns the list of urls extracted from an array of tweets

In [6]:
def extractUrl(tweets, url_summary):
  from unshortenit import UnshortenIt
  # initialze the unshortener
  unshortener = UnshortenIt(default_timeout=30)
  url_list = []
  for i in range(len(tweets)):
      
    if(i%2000 == 0):
      print('This is the {}th link'.format(i))

    text = (tweets.loc[i,'Text'])
    urls = findUrl(text)

    if(urls != []):
      for l in urls:
        # unshorten all the links
        try:
          l = unshortener.unshorten(l, force=True, unshorten_nested=True)
        except:
          continue
      
        datetime = tweets.loc[i,'Datetime']
        favs = tweets.loc[i,'Favorites']
        retweets = tweets.loc[i, 'Retweets'] 
        default = {'Datetime': datetime, 'Favs': favs, 'Retweets': retweets, 'Count': 1}
            
        if(l in url_summary):
          url_summary[l]['Favs'] += favs
          url_summary[l]['Retweets'] += retweets
          url_summary[l]['Count'] += 1
        else:
          url_summary[l] = default          
          url_list.append(l)
  return url_list

### Define function that returns the list of urls that have been filtered

In [20]:
def filterUrl(url_summary, url_list):
  for i in range(len(url_list)-1, -1, -1):
    
    if(i%2000 == 0):
      print('This is the {}th link'.format(i))

    link = url_list[i]
    if('pic.twitter.com' in link or 'www.instagram.com' in link or 'www.facebook.com' in link 
        or 'SoundCloud.com' in link or 'twitter.com' in link or 'www.amazon.com' in link 
        or 'rddt.co' in link or 'youtu' in link or 'witch.tv' in link or "tweet.photo" in link 
        or 'vimeo.com' in link):
      url_list.remove(link)
      url_summary.pop(link)
      # print('removed link: ' + link)
  return url_list

# **Execution**

### Retrieve tweets

In [8]:
dates = ['05-25','05-26','05-27','05-28','05-29','05-30','05-31','06-01','06-02','06-03','06-04','06-05','06-06','06-07',
         '06-08','06-09','06-10','06-11','06-12','06-13','06-14','06-15','06-16','06-17','06-18','06-19','06-20','06-21',
         '06-22','06-23','06-24','06-25','06-26','06-27','06-28','06-29','06-30','07-01','07-02','07-03','07-04','07-05',
         '07-06','07-07','07-08','07-09','07-10','07-11','07-12','07-13','07-14','07-15','07-16','07-17','07-18','07-19',
         '07-20','07-21','07-22','07-23','07-24','07-25','07-26','07-27']
         
keyword = '#blacklivesmatter'
year = 2020
maxTweets = 10000

tweets_by_date = dict()

for i in range(len(dates)-1):
  tweets = retrieveTweets(keyword, year, dates[i], dates[i+1], maxTweets)
  if(not tweets.empty):
    tweets_by_date[dates[i]] = tweets
  time.sleep(120)
  
  

A terribly sad day to be Amy Cooper Head of Insurance Investment Solutions at Franklin Templeton (@FTI_US) in NYC. Franklin Templeton Investments (212) 632-3000 280 Park Ave, New York, NY 10017 Do what you gotta do. #BlackLivesMatter #WhiteLies #WhiteTears 7883 2020-05-25 22:57:52+00:00
retrieving top tweets on 05-25 took 2.0491 seconds
George Floyd. Say his name. Remember his name. Remember his face. Share this image courtesy of @andresitoguzma. #justiceforfloyd #blacklivesmatter  192 2020-05-26 23:51:52+00:00
retrieving top tweets on 05-26 took 9.7667 seconds
and for all the people that are pulling the “I’m not interested in politics” card are just so heartless...black people deserve to be PROTECTED & HEARD #BlackLivesMatter 39 2020-05-27 23:59:57+00:00
retrieving top tweets on 05-27 took 62.4860 seconds
How do we get rid of Abusive Police Officers? It’s obvious they don’t care about cameras #BlackLivesMatter #NAACP 71 2020-05-28 23:59:55+00:00
retrieving top tweets on 05-28 took 306

### Combine all tweets together

In [9]:
all_tweets = pd.DataFrame(columns=['Datetime', 'Favorites', 'Text', 'Retweets'])    ## initialize the tweets dataframe for all the retrieved tweets

for i in dates:
  all_tweets = all_tweets.append(tweets_by_date.get(i))

In [10]:
# Verify raw data
print(len(all_tweets))
# all_tweets
# print([tweets_by_date[x].shape for x in tweets_by_date])

101407


In [11]:
# # Save all_tweets if a BACKUP is needed
# all_tweets.to_csv('blacklivesmatter_05-25_07-26_allTweets_v2.csv')
# all_tweets = pd.read_csv('/content/blacklivesmatter_05-25_07-26_allTweets_v2.csv')

### Process tweets

In [16]:
## There is no need to reverse the order of all_tweets, unlike the first verison,
## because it is already in the desired order (from earliest to latest)

# Initialize variables for processing raw data
all_url_list = [] # ordered from earliest to lastest
url_summary = dict() # format --> 'link': {info}   | {info} format --> {'Datetime', 'Favorites', 'Retweets', 'Count'}


In [17]:
# Processing
all_url_list = extractUrl(all_tweets, url_summary) # extract all urls without before filtering

This is the 0th link
This is the 2000th link
This is the 4000th link
This is the 6000th link
This is the 8000th link
This is the 10000th link
This is the 12000th link
This is the 14000th link
This is the 16000th link
This is the 18000th link
This is the 20000th link
This is the 22000th link
This is the 24000th link
This is the 26000th link
This is the 28000th link
This is the 30000th link
This is the 32000th link
This is the 34000th link
This is the 36000th link
This is the 38000th link
This is the 40000th link
This is the 42000th link
This is the 44000th link
This is the 46000th link
This is the 48000th link
This is the 50000th link
This is the 52000th link
This is the 54000th link
This is the 56000th link
This is the 58000th link
This is the 60000th link
This is the 62000th link
This is the 64000th link
This is the 66000th link
This is the 68000th link
This is the 70000th link
This is the 72000th link
This is the 74000th link
This is the 76000th link
This is the 78000th link
This is 

In [21]:
# Processing
all_url_list = filterUrl(url_summary, all_url_list) # remove irrelevant urls

This is the 4000th link
This is the 2000th link
This is the 0th link


In [22]:
print(len(all_url_list))

5344


### Generate data file

In [23]:
summary_list = []

for i in all_url_list:
    link = i
    datetime = url_summary[i]['Datetime']
    favs = url_summary[i]['Favs']
    retweets = url_summary[i]['Retweets']
    count = url_summary[i]['Count']

    summary_list.append([link, datetime, favs, retweets, count])

summary_df = pd.DataFrame(summary_list, columns=['Link', 'Datetime', 'Favs', 'Retweets', 'Count'])

# check
summary_df.head()
print(len(summary_df))

5344


In [24]:
summary_df.to_csv('blacklivesmatter_05-25_07-26_topTweets_urlSummary.csv')

# Tests and experiments

Set up searching criteria

In [None]:
querySearch = '#blacklivesmatter'
start_date = '2020-05-25'
max_tweets = 10000
file_counter = 1

Test - get 10000 tweets by query search

--

1000 - ~35 seconds 

10000 - ~10.25 minutes

In [None]:
import time
# time the excecution time for 1000 tweets
tic = time.perf_counter()
tweetCriteria = got.manager.TweetCriteria().setQuerySearch(querySearch)\
                                            .setSince('2020-06-03')\
                                            .setUntil('2020-06-04')\
                                            .setTopTweets(True)\
                                            .setMaxTweets(max_tweets)
tweets = got.manager.TweetManager.getTweets(tweetCriteria)
toc = time.perf_counter()
print(tweets[0].text, tweets[0].favorites, tweets[0].date)
print('retrieving top tweets took {:0.4f} seconds'.format(toc-tic))

Right now outside @NYCMayor’s mansion. The silence is truly deafening. #BlackLivesMatter  474 2020-06-03 23:59:59+00:00
retrieving top tweets took 181.0698 seconds


In [None]:
print(len(tweets))
print(tweets[0].date)
print(tweets[-1].date)

5397
2020-06-03 23:59:59+00:00
2020-06-03 00:00:00+00:00


In [None]:
tic = time.perf_counter()
print(tic)
tweetCriteria = got.manager.TweetCriteria().setQuerySearch(querySearch)\
                                            .setSince('2020-06-02')\
                                            .setUntil('2020-06-03')\
                                            .setMaxTweets(1000000)
tweets = got.manager.TweetManager.getTweets(tweetCriteria, receiveBuffer=storeBuffer, bufferLength=1000)
toc = time.perf_counter()
print(tweets[0].text, tweets[0].favorites, tweets[0].date)
print('retrieving all tweets on 6/2/2020 took {:0.4f} seconds'.format(toc-tic))

6713.098831614
6743.314410058
6783.17699547
6822.823443564
6863.342974963
6903.844389318
6944.985780068
6986.113742241
7025.961669107
7066.451637094
7107.471227207
7147.728528007
7188.85939416
An error occured during an HTTP request: HTTP Error 429: Too Many Requests
Try to open in browser: https://twitter.com/search?q=%23blacklivesmatter%20since%3A2020-06-02%20until%3A2020-06-03&src=typd


SystemExit: ignored

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [None]:
print('number of tweets: ' + str(len(tweets)))
print(tweets[0].date)
print(tweets[9999].date)

number of tweets: 10000
2020-06-02 23:59:59+00:00
2020-06-02 23:35:08+00:00


In [None]:
def storeBuffer(resultsAux):
  # Creating list of chosen tweet data
  text_tweets = [[tweet.date, tweet.favorites, tweet.text] for tweet in resultsAux]

  # Creation of dataframe from tweets    
  tweets_df = pd.DataFrame(text_tweets, columns = ['Datetime', 'Favorites', 'Text'])

  # Converting tweets dataframe to csv file
  global file_counter
  tweets_df.to_csv('blacklivesmatter_06-02_{}k-tweets.csv'.format(file_counter, sep=','))
  file_counter += 1

  print(time.perf_counter())
  time.sleep(10)

verify results

In [None]:
querySearch = '#blacklivesmatter'
start_date = '2020-06-12'
end_date = '2020-06-13'
max_tweets = 10000
  
tic = time.perf_counter()
tweetCriteria = got.manager.TweetCriteria().setQuerySearch(querySearch)\
                                            .setSince(start_date)\
                                            .setUntil(end_date)\
                                            .setTopTweets(True)\
                                            .setMaxTweets(max_tweets)
tweets = got.manager.TweetManager.getTweets(tweetCriteria)
toc = time.perf_counter()
print(tweets[0].text, tweets[0].favorites, tweets[0].date)
print('retrieving top tweets on {} took {:0.4f} seconds'.format(dates[i], toc-tic))

Who in the USA is concerned about climate change? 49% of Whites 57% of Blacks 70% of Latinx Imagine what would be possible if POC didn’t have to deal with racism and could devote that energy to #ClimateSolutions... @ClimatePower #BlackLivesMatter  1120 2020-06-12 23:57:39+00:00
retrieving top tweets on 06-12 took 17.5493 seconds


In [None]:
print(len(tweets))
print(tweets[-1].text)

381
We knew in Philly that @Starbucks doesn't care about #BLM and they have truly shown it with this stand against people wearing anything that supports #blacklivesmatter #BoycottStarbucks


In [None]:
import datetime
x = datetime.datetime.today()
x

datetime.datetime(2020, 6, 21, 2, 7, 0, 974210)