In [30]:
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
import time
from pprint import pprint
import pandas as pd
import numpy as np


# Scraping Data from Twitter
The following block uses Selenium to scrape tweets from Twitter.

In [31]:
def infinity_scroll(driver,filename,max_scrolls=1000,pause_seconds=2):
    """Scroll a loaded page to the bottom repeatedly, appending each snapshot to a file.

    On every pass the current ``driver.page_source`` is parsed with
    BeautifulSoup, its string form is appended to ``filename``, the window is
    scrolled to the bottom of the document, and the function sleeps for
    ``pause_seconds``.

    The file is opened in append mode, so delete any previous output before
    re-running; otherwise the new snapshots are added after the old ones.

    Args:
        driver: Object exposing ``page_source`` and ``execute_script``.
        filename: Path of the file the snapshots are appended to.
        max_scrolls: Number of snapshot/scroll passes to perform.
        pause_seconds: Seconds to sleep after each scroll.

    Returns:
        None. Scraping is best-effort: an interrupt or an error ends the loop
        early, is reported on stdout, and whatever was already written is kept.
    """
    try:
        for _ in range(max_scrolls):
            snapshot = str(BeautifulSoup(driver.page_source,'html.parser'))

            # Re-open on every pass so each snapshot is written out and the
            # file is closed before the next (slow) scroll starts.
            # NOTE(review): the file uses the platform default encoding;
            # confirm that it can represent every character on the page.
            with open(filename,'a') as f:
                f.write(snapshot)

            driver.execute_script("window.scrollTo(0,document.body.scrollHeight)")
            time.sleep(pause_seconds)

    except KeyboardInterrupt:
        # Interrupting the kernel is treated as "stop scrolling now", not as a failure.
        print("infinity_scroll: interrupted, keeping what was scraped so far")
    except Exception as exc:
        # Stay best-effort, but say why the loop stopped instead of hiding it.
        print(f"infinity_scroll: stopped early: {exc!r}")


def scrape_tweets(twitter_handle,from_date,to_date,filename,page_load_wait=10):
    """Open a Twitter live-search page in Firefox and dump its scrolled contents to a file.

    The search URL asks for tweets from ``twitter_handle`` between ``from_date``
    and ``to_date`` with the ``-filter:links`` and ``-filter:replies`` operators
    applied. After the page has had ``page_load_wait`` seconds to load,
    ``infinity_scroll`` appends the page snapshots to ``filename``.

    Args:
        twitter_handle: Account name inserted into the ``from:`` operator.
        from_date: Value for the ``since:`` operator (callers in this
            notebook pass 'YYYY-MM-DD' strings).
        to_date: Value for the ``until:`` operator, same format.
        filename: Path of the file the snapshots are appended to.
        page_load_wait: Seconds to wait after loading the URL before scrolling.

    Returns:
        None. The result is the content appended to ``filename``.
    """
    url = f'https://twitter.com/search?q=(from%3A{twitter_handle})%20until%3A{to_date}%20since%3A{from_date}%20-filter%3Alinks%20-filter%3Areplies&src=typed_query&f=live'

    driver = webdriver.Firefox()
    try:
        driver.get(url)
        time.sleep(page_load_wait)

        # scrolls down to the bottom of the page
        infinity_scroll(driver,filename)
    finally:
        # Always close the browser, even if loading or scrolling raised.
        driver.quit()

In [5]:
# Alternative handles (uncomment one to scrape it instead):
# twitter_handle = 'gsk'
# twitter_handle = 'sanofi'
# twitter_handle = 'AstraZeneca'
# twitter_handle = 'elonmusk'
twitter_handle = 'realdonaldtrump'

# Date window handed to the search query, formatted YYYY-MM-DD.
from_date, to_date = '2020-01-01', '2020-10-01'

scrape_tweets(
    twitter_handle,
    from_date,
    to_date,
    filename='trump_tweets_Jan_2020_Sept_30_2020.xml',
)

# Processing Tweet Data
The following code processes the page snapshots saved in the XML file (HTML markup, parsed with BeautifulSoup) to create a listing of all tweets.

In [13]:
def get_tweets2(filename):
    with open(filename,'r') as f:
        soupObj = BeautifulSoup(f,'html.parser')
    
    total_tweets = len(soupObj.findAll(class_="css-1dbjc4n r-1iusvr4 r-16y2uox r-1777fci r-1mi0q7o"))
    tweet_listing = []
    time_listing = []
    
    tweet_block = soupObj.find(class_="css-1dbjc4n r-1iusvr4 r-16y2uox r-1777fci r-1mi0q7o")
    
    for i in range(total_tweets):
        try:
            
            tweet = tweet_block.find('div',"css-901oao r-jwli3a r-1qd0xha r-a023e6 r-16dba41 r-ad9z0x r-bcqeeo r-bnwqim r-qvutc0")
            tweet_listing.append(tweet.text)
        
        except AttributeError:
            tweet_listing.append(np.NaN)
        
        try:
            timestamp = tweet_block.find('time')
            time_listing.append((timestamp.find_next('time').attrs)['datetime'])
        
        except AttributeError:
            time_listing.append(np.NaN)

        tweet_block = tweet_block.find_next(class_="css-1dbjc4n r-1iusvr4 r-16y2uox r-1777fci r-1mi0q7o")

    return pd.DataFrame({'Time':time_listing,'Tweet':tweet_listing})

In [19]:
# filepath = 'astrazeneca_tweets_Oct_2017_Sept_24_2020.xml'
# filepath = 'gsk_tweets_Oct_2017_Sept_24_2020.xml'
# filepath = 'pfizer_tweets_Oct_2017_Sept_24_2020.xml'
# filepath = 'elon_tweets_Sept_2019_Sept_2020.xml'
filepath = 'trump_tweets_Jan_2020_Sept_30_2020.xml'

df = get_tweets2(filepath)

In [25]:
df_copy = df.copy()

In [26]:
# MANUAL ADJUSTMENT
## Not sure why but all the tweets scraped are offset by 1 with the first date value missing.
## In the interest of time, I'm shifting the cells and manually adding it back:

df_copy['Time'] = df_copy[['Time']].shift()
df_copy.loc[[0,0],'Time'] = "2020-09-30T23:48:24.000Z"

In [28]:
# Preview the first five rows (timestamp next to tweet text).
df_copy.head(5)

Unnamed: 0,Time,Tweet
0,2020-09-30T23:48:24.000Z,Leaving Minneapolis for a quick stop in Duluth...
1,2020-09-30T21:50:51.000Z,Just landed in Minnesota. Hasn’t been won by a...
2,2020-09-30T21:16:54.000Z,HIGHEST CABLE TELEVISION RATINGS OF ALL TIME. ...
3,2020-09-30T20:27:35.000Z,So weird to watch @FoxNews interviewing only f...
4,2020-09-30T18:25:56.000Z,Radical Left Democrats are going CRAZY!


In [29]:
# Exports used for the other handles, kept for reference:
# df.to_csv('gsk_tweets_Oct_2017_Sept_24_2020.csv')
# df.to_csv('pfizer_tweets_Oct_2017_Sept_24_2020.csv')
# df.to_csv('astrazeneca_tweets_Oct_2017_Sept_24_2020.csv')
# df.to_csv('elon_tweets_Sept_2019_Sept_2020.csv')

# Write the tweet table (index column included) to CSV.
df_copy.to_csv(path_or_buf='trump_tweets_Jan_2020_Sep_30_2020.csv')

-------------
## Notes

In [205]:
# GetOldTweets3 no longer works since Twitter changed the endpoint. It has only been broken for 4 days, so it may still be fixed.
# Keeping this code in case the developers fix it.
# import GetOldTweets3 as got
# tweetCriteria = got.manager.TweetCriteria().setQuerySearch("realDonaldTrump").setMaxTweets(2)
# tweet = got.manager.TweetManager.getTweets(tweetCriteria)


In [None]:
## Code used in testing
# variables for testing scrape_tweets()
# 'https://twitter.com/search?q=(from%3Arealdonaldtrump)%20until%3A2020-07-01%20since%3A2020-01-01%20-filter%3Alinks%20-filter%3Areplies&src=typed_query&f=live'
# twitter_handle = 'realdonaldtrump'
# from_date = '2020-01-01'
# to_date = '2020-07-01'

# other scrapers:
# snscrape's jsonl
# https://github.com/twintproject/twint/issues/918#issuecomment-696448934