In [1]:
import csv
import datetime
import json
import os
import sys
import time

from tweepy import API
from tweepy import Cursor
from tweepy import OAuthHandler
from tweepy import TweepError

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import StaleElementReferenceException
from selenium.webdriver.common.keys import Keys

In [2]:
def get_twitter_auth():
    """Build the OAuth handler used to authenticate against Twitter.

    Reads the module-level names ``consumer_key``, ``consumer_secret``,
    ``access_token`` and ``access_secret``; they must be defined in the
    notebook namespace before this function is called.

    Return: tweepy.OAuthHandler object
    """
    handler = OAuthHandler(consumer_key, consumer_secret)
    handler.set_access_token(access_token, access_secret)
    return handler
    
def get_twitter_client():
    """Create the Twitter API client from the configured credentials.

    The client is built on top of ``get_twitter_auth()`` with rate-limit
    waiting, rate-limit notification and compression switched on.

    Return: tweepy.API object
    """
    return API(
        get_twitter_auth(),
        wait_on_rate_limit=True,
        wait_on_rate_limit_notify=True,
        compression=True,
    )

In [3]:
def makedir(screen_name):
    """Create subdirectory 'users/screen_name' to store mined data.

    A directory that already exists is not an error (``exist_ok=True``), so
    calling this repeatedly for the same user is safe. Any other failure is
    reported on stdout and ends execution with exit code 1.

    Params
    -------
    screen_name : str

    Raises
    ------
    SystemExit
        With code 1 when the directory cannot be created.
    """
    dirname = 'users/{}'.format(screen_name)

    try:
        os.makedirs(dirname, mode=0o755, exist_ok=True)
    except Exception as e:
        # exist_ok=True already covers the "directory exists" case, so whatever
        # reaches this handler (permission denied, a path component that is a
        # regular file, ...) is a genuine failure and must not be reported as
        # "already exists" and silently ignored.
        print('Error while creating directory {}'.format(dirname))
        print(e)
        sys.exit(1)

def get_user_tweets(screen_name, no_rt=True):
    """Download a user's timeline and append it to a jsonl file.

    Requests up to 16 pages of 200 tweets (3,200 tweets) through the
    module-level ``client`` and appends each kept tweet, as one JSON document
    per line, to 'users/<screen_name>/usr_timeline_<screen_name>.jsonl'.
    The target directory is created first if needed.

    Params
    -------
    screen_name : str
    no_rt : bool
        When true, tweets flagged as retweeted or whose text contains 'RT @'
        are skipped.
    """
    # Make dir structure
    makedir(screen_name)

    out_path = 'users/{0}/usr_timeline_{0}.jsonl'.format(screen_name)
    with open(out_path, 'a') as out:
        timeline = Cursor(client.user_timeline, screen_name=screen_name, count=200)
        for page in timeline.pages(16):
            for status in page:
                # Guard clause: drop retweets only when the caller asked for it.
                if no_rt and (status.retweeted or 'RT @' in status.text):
                    continue
                out.write(json.dumps(status._json)+'\n')

In [4]:
def twitter_url(screen_name, no_rt, start, end):
    """Form url to access tweets via Twitter's search page.

    The query has the shape ``from:<screen_name> since:<start> until:<end>``
    with both dates rendered as YYYY-MM-DD; ``include:retweets`` is added
    when ``no_rt`` is false.

    Params
    -------
    screen_name : str
    no_rt : bool
    start : datetime-obj
    end : datetime-obj

    Return: string
    """
    date_fmt = '%Y-%m-%d'
    tail = '%20&src=typd' if no_rt else '%20include%3Aretweets&src=typd'
    pieces = [
        'https://twitter.com/search?f=tweets&q=from%3A',
        screen_name,
        '%20since%3A',
        start.strftime(date_fmt),
        '%20until%3A',
        end.strftime(date_fmt),
        tail,
    ]
    return ''.join(pieces)
    
def increment_day(date, i):
    """Return a new datetime shifted by i days from the given one.

    Params
    -------
    date : datetime-obj
    i : int
        Number of days to add.

    Return: datetime object
    """
    offset = datetime.timedelta(days=i)
    shifted = date + offset
    return shifted

In [26]:
def get_all_user_tweets(screen_name, start, end, no_rt=True):
    """Collect a user's tweet ids by scraping Twitter's search page day by day.

    For every day from ``start`` to ``end`` (inclusive) a Selenium-driven
    Chrome window loads the search page for that single day, keeps scrolling
    while more results load, and appends the ids found that day as one JSON
    list per line to 'users/<screen_name>/usr_tweetids_<screen_name>.jsonl'.
    A day without results still produces a line containing an empty list.

    Params
    ------
    screen_name : str
    start : datetime-obj
    end : datetime-obj
    no_rt : bool
        Passed on to ``twitter_url``; when false retweets are included.
    """
    # Special parameters
    fname_tweet_ids = 'users/{0}/usr_tweetids_{0}.jsonl'.format(screen_name)

    # Make dir structure
    makedir(screen_name)

    # Selenium params
    delay = 1  # time to wait on each page load before reading the page
    tweet_selector = 'li.js-stream-item'
    id_selector = '.time a.tweet-timestamp'

    driver = webdriver.Chrome()
    ids_total = 0
    try:
        for day in range((end - start).days + 1):
            # One search per calendar day: since <day>, until <day + 1>.
            # Derived from the loop index so the `start` argument is not mutated.
            startDate = increment_day(start, day)
            endDate = increment_day(start, day + 1)
            url = twitter_url(screen_name, no_rt, startDate, endDate)

            driver.get(url)
            time.sleep(delay)

            found_tweets = driver.find_elements_by_css_selector(tweet_selector)
            increment = 10

            # Scroll through the Twitter search page: keep going as long as
            # each scroll brings the item count up to the next multiple of 10.
            while len(found_tweets) >= increment:
                print('scrolling down to load more tweets')
                driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
                time.sleep(delay)
                found_tweets = driver.find_elements_by_css_selector(tweet_selector)
                increment += 10

            # Get the IDs for all Tweets; a single bad element must not cost
            # us the rest of the day's results.
            ids = []
            for tweet in found_tweets:
                try:
                    tweet_id = tweet.find_element_by_css_selector(
                                id_selector).get_attribute('href').split('/')[-1]
                except StaleElementReferenceException:
                    print('lost element reference', tweet)
                    continue
                except NoSuchElementException:
                    print('no tweet id found in element', tweet)
                    continue
                ids.append(tweet_id)
                ids_total += 1

            # Save ids to file, de-duplicated but in the order they were seen
            data_to_write = list(dict.fromkeys(ids))
            with open(fname_tweet_ids, 'a') as fout:
                fout.write(json.dumps(data_to_write)+'\n')
            print('{} tweets found, {} total'.format(len(found_tweets), ids_total))
    finally:
        # Always shut the browser session down, even when a page load or a
        # file write fails part-way through; quit() ends the whole session,
        # whereas close() only closes the current window.
        driver.quit()

In [15]:
# Dump the non-retweet part of one account's timeline to a local jsonl file
# and echo the text of every skipped retweet.
# NOTE(review): relies on `client`, which is created in a cell that appears
# further down the notebook; that cell must have been run first.
user = 'AP'
fname = 'usr_timeline_{}.jsonl'.format(user)
with open(fname, 'w') as f:
    # 16 pages x 200 tweets = limit of 3200 for user
    for page in Cursor(client.user_timeline, screen_name=user, count=200).pages(16):
        for tweet in page:
            if tweet.retweeted or 'RT @' in tweet.text:
                print(tweet.text)
                continue
            f.write(json.dumps(tweet._json)+'\n')

RT @APCentralRegion: "I was just doing what anyone should have done for another human being," says man who intervened in bar shooting. http…
RT @AP_Oddities: 'Worst skier alive' won't take a dive: Venezuelan who had only trained on wheels in the sun teeters on. https://t.co/Ket5v…
RT @AP_Oddities: "I might be dead, but I still don't want your germs:' Obituary celebrates a blunt 91-year-old Ohio woman. https://t.co/Fr6…
RT @APBusiness: .@jcpenney to close anywhere from 130 to 140 stores and 2 distribution centers. @ADInnocenzio reports https://t.co/Lv01ZUJk…
RT @AP_Oddities: Bingo: A rare baby bongo makes its debut at the Los Angeles Zoo. https://t.co/VFgCOlOoY4 #odd
RT @AP_Politics: #APFACTCHECK: Trump administration overstated elevation of women in the White House:
https://t.co/BBBcpGQD4O
RT @AP_Politics: A trio of generals have formed a stabilizing alliance in Trump's tumultuous administration. https://t.co/neh8iZFiJq
RT @AP_Politics: White House official says chief of staff Priebus 

In [6]:
# Credentials live in a local `config` module that is kept out of the notebook.
# Import exactly the four names get_twitter_auth() reads instead of using a
# wildcard, so it is visible where they come from and nothing else is pulled
# into the notebook namespace.
from config import consumer_key, consumer_secret, access_token, access_secret

client = get_twitter_client()

In [25]:
# Run configuration for the mining step.
screen_names = ['AP', 'FoxNews']
small_batch = False  # True: API timeline only; False: scrape the search page by date
start = datetime.datetime(2017, 1, 15)
end = datetime.datetime(2017, 1, 16)
fname_tweet_ids = 'all_ids.json'

# Mine every account with the strategy selected above.
for screen_name in screen_names:
    if small_batch:
        get_user_tweets(screen_name)
    else:
        get_all_user_tweets(screen_name, start, end, no_rt=True)

scrolling down to load more tweets
scrolling down to load more tweets
scrolling down to load more tweets
34 tweets found, 34 total
scrolling down to load more tweets
scrolling down to load more tweets
scrolling down to load more tweets
scrolling down to load more tweets
scrolling down to load more tweets
50 tweets found, 84 total
scrolling down to load more tweets
scrolling down to load more tweets
scrolling down to load more tweets
scrolling down to load more tweets
scrolling down to load more tweets
scrolling down to load more tweets
scrolling down to load more tweets
scrolling down to load more tweets
scrolling down to load more tweets
scrolling down to load more tweets
scrolling down to load more tweets
scrolling down to load more tweets
scrolling down to load more tweets
scrolling down to load more tweets
147 tweets found, 147 total
scrolling down to load more tweets
scrolling down to load more tweets
scrolling down to load more tweets
scrolling down to load more tweets
scrolling 

In [28]:
# Build a single CSV (text, id, user_id) from the data mined for every account.
f_authorship = 'users/authorship.csv'

# newline='' hands line-ending control to the csv module; utf-8 so that any
# tweet text can be written regardless of the platform's default encoding.
with open(f_authorship, 'w', newline='', encoding='utf-8') as fout:
    # csv.writer quotes fields as needed: tweet text regularly contains
    # commas, quotes and line breaks that would break hand-formatted rows.
    writer = csv.writer(fout, lineterminator='\n')

    # Header
    writer.writerow(['text', 'id', 'user_id'])

    for screen_name in screen_names:
        if small_batch:
            # Timeline dump: one full tweet JSON document per line.
            fin = 'users/{0}/usr_timeline_{0}.jsonl'.format(screen_name)
            with open(fin, 'r') as f:
                for line in f:
                    tweet = json.loads(line)
                    writer.writerow(
                        [tweet['text'], tweet['id'], tweet['user']['id']])

        else:
            # Scraped ids: one JSON list of tweet ids per line; each tweet
            # has to be fetched through the API.
            fin = 'users/{0}/usr_tweetids_{0}.jsonl'.format(screen_name)
            with open(fin, 'r') as f:
                for line in f:
                    ids = json.loads(line)

                    for tweetId in ids:
                        try:
                            tweet = client.get_status(tweetId)
                        except TweepError as e:
                            # A single unavailable tweet should not abort
                            # the whole export.
                            print('could not fetch tweet {}: {}'.format(tweetId, e))
                            continue

                        writer.writerow([tweet.text, tweet.id, tweet.user.id])