# TWITTER SCRAPER

A script that scrapes Twitter data, using the Python package requests to retrieve the content and BeautifulSoup4 to parse it.
Search queries are constructed using Twitter's advanced search: https://twitter.com/search-advanced

In [10]:
#Required Library

import requests
import random
from bs4 import BeautifulSoup
import logging
import pandas as pd
import datetime as dt
import time
import numpy as np
from multiprocessing import Pool
from functools import partial

In [11]:
# USER_AGENT_LIST comes from the project-local `const` module
# (presumably a list of User-Agent strings -- confirm against const.py).
from const import USER_AGENT_LIST

# One entry is picked at random when this cell runs; the resulting header
# dict is then reused unchanged for every request made with HEADER.
HEADER = {'User-Agent': random.choice(USER_AGENT_LIST)}

In [12]:
# Search URL template: {q} is the already URL-escaped query, {pos} the
# paging position returned by the previous page, {lang} the language code.
URL = "https://twitter.com/search?f=tweets&vertical=default&q={q}&max_position={pos}&l={lang}"
# Query template with three positional slots (first slot, then the values
# for the since/until operators). Not referenced by the code visible here.
QUERY = "{} since%3A{} until%3A{}&src=typd"

In [13]:
def get_tweet_detail(tweet):
    """Collect the scraped fields of a single parsed tweet element.

    :param tweet: Parsed stream item supporting ``find(name, css_class)``
        and ``[attribute]`` access.
    :return: dict with the keys user_id, user_name, text, replies, retweet,
        like, timestamp (naive UTC datetime), tweet_id and url.

    No missing-element handling is done here: any exception raised by the
    lookups propagates to the caller.
    """
    def stat_count(wrapper_class):
        # Each counter sits in an inner span below its per-action wrapper.
        wrapper = tweet.find('span', wrapper_class)
        counter = wrapper.find('span', 'ProfileTweet-actionCount')
        return counter['data-tweet-stat-count'] or '0'

    # Filled in the same order as the fields are looked up, so the first
    # failing lookup (and the resulting key order) stays the same.
    detail = {}
    detail['user_id'] = tweet.find('span', 'username').text or ''
    detail['user_name'] = tweet.find('strong', 'fullname').text or ''
    detail['text'] = tweet.find('p', 'TweetTextSize').text or ''
    detail['replies'] = stat_count('ProfileTweet-action--reply u-hiddenVisually')
    detail['retweet'] = stat_count('ProfileTweet-action--retweet u-hiddenVisually')
    detail['like'] = stat_count('ProfileTweet-action--favorite u-hiddenVisually')

    # data-time holds seconds since the epoch; it is converted as UTC.
    epoch_seconds = int(tweet.find('span', '_timestamp')['data-time'])
    detail['timestamp'] = dt.datetime.utcfromtimestamp(epoch_seconds)

    detail['tweet_id'] = tweet['data-item-id'] or ""
    detail['url'] = tweet.find('div', 'tweet')['data-permalink-path'] or ""
    return detail

In [14]:
def get_tweets(html):
    """Yield one detail dict per tweet found in a search-result page.

    :param html: HTML text of a search page.
    :return: Generator of the dicts produced by ``get_tweet_detail``.

    Stream items that lack an expected element or attribute are skipped
    (and reported at DEBUG level) instead of aborting the whole page.
    """
    # Name the parser explicitly so the result does not depend on which
    # optional parser libraries happen to be installed.
    soup = BeautifulSoup(html, 'html.parser')
    for tweet in soup.find_all('li', 'js-stream-item'):
        try:
            yield get_tweet_detail(tweet)
        except (AttributeError, TypeError, KeyError, ValueError) as exc:
            # AttributeError/TypeError: an element lookup came back empty;
            # KeyError: a tag lacks the attribute; ValueError: bad data-time.
            logging.getLogger().debug('Skipping malformed stream item: %r', exc)

In [15]:
def query_single_page(url, retry=5, timeout=30):
    """
    Returns tweets from the given URL.
    :param url: The URL to get the tweets from
    :param retry: Number of retries if something goes wrong.
    :param timeout: Seconds to wait for the server, passed to requests.get.
    :return: The list of tweets, the pos argument for getting the next page.
        ``([], None)`` is returned once all attempts are used up.
    """
    attempts_left = retry
    while True:
        try:
            # Without a timeout a stalled connection would block forever and
            # the Timeout handler below could never run.
            response = requests.get(url, headers=HEADER, timeout=timeout)
            # Turn 4xx/5xx answers into HTTPError so they are logged as such.
            response.raise_for_status()
            tweets = list(get_tweets(response.text or ''))
        except requests.exceptions.HTTPError as e:
            logger.exception('HTTPError {} while requesting "{}"'.format(
                e, url))
        except requests.exceptions.ConnectionError as e:
            logger.exception('ConnectionError {} while requesting "{}"'.format(
                e, url))
        except requests.exceptions.Timeout as e:
            logger.exception('TimeOut {} while requesting "{}"'.format(
                e, url))
        else:
            if tweets:
                # Position token is built from the last and first tweet ids
                # of this page.
                return tweets, "TWEET-{}-{}".format(
                    tweets[-1]['tweet_id'], tweets[0]['tweet_id'])
            if attempts_left <= 0:
                return [], None
            logger.info('No new tweet, retrying... (Attempts left: {}) \n{}'.format(
                attempts_left, url))
            attempts_left -= 1
            continue

        # Only reached after a request error.
        if attempts_left <= 0:
            logger.error('STOP LOADING.')
            return [], None
        logger.info('Retrying... (Attempts left: {})'.format(attempts_left))
        attempts_left -= 1

#a,b = query_single_page("https://twitter.com/search?q="自宅"%20AND%20"仕事"%20since%3A2006-03-21%20until%3A2018-08-17&l=ja&f=tweets&vertical=default")

In [16]:
def query_all_once(query, limit=None, lang=None):
    """Page through the search results of one query in this process.

    :param query: Search expression; spaces, '#' and ':' are URL-escaped here.
    :param limit: Stop once at least this many tweets are collected
        (None or 0 means no limit).
    :param lang: Language code for the search; None sends an empty value.
    :return: List of tweet dicts gathered so far, also when interrupted
        with Ctrl-C.
    """
    raw_query = query
    logger.info("Querying {}. Limit number of tweet: {}".format(query, limit))
    query = query.replace(' ','%20').replace('#','%23').replace(':','%3A')
    pos = None
    tweets = []

    try:
        while True:
            # Send empty parameters rather than the literal text "None" for
            # the first page and for an unset language.
            page_url = URL.format(q=query, pos=pos or '', lang=lang or '')
            new_tweet, pos = query_single_page(page_url)
            logger.info("{} - {} new tweet. Total {}".format(raw_query, len(new_tweet),len(tweets)))
            if not new_tweet:
                break

            tweets.extend(new_tweet)

            if limit and len(tweets) >= limit:
                break

    except KeyboardInterrupt:
        logger.info("{} - Program interrupted. Returning tweets gathered "
                     "so far...".format(raw_query))
        return tweets

    logger.info("{} - Total: {}".format(raw_query,len(tweets)))
    return tweets

In [17]:
#def wrapper(args):
    #return query_all_once(*args)

In [18]:
def query_pool(key_word, since=dt.date(2006,3,21), until=None, limit=None, lang=None, poolsize=20):
    """Scrape a date range in parallel by splitting it into sub-ranges.

    :param key_word: Search expression, without since/until operators.
    :param since: First day of the range (datetime.date).
    :param until: Last day of the range (datetime.date); None means today,
        evaluated at call time.
    :param limit: Overall number of tweets wanted; split evenly across the
        workers. None means no limit.
    :param lang: Language code handed to every worker.
    :param poolsize: Number of worker processes, capped at the number of days
        in the range and never below 1.
    :return: List of all tweet dicts collected by the workers.
    :raises ValueError: If ``until`` lies before ``since``.
    """
    # A date.today() default in the signature would be frozen when the
    # function is defined, so the default is resolved here instead.
    if until is None:
        until = dt.date.today()

    start = time.time()
    no_of_days = (until - since).days
    if no_of_days < 0:
        raise ValueError("until ({}) must not be before since ({})".format(until, since))

    # At least one worker, even for a same-day range; otherwise the limit
    # split below divides by zero and Pool() rejects the size.
    poolsize = max(1, min(poolsize, no_of_days))

    if limit:
        limit_per_pool = (limit // poolsize) + 1
    else:
        limit_per_pool = None

    # Integer day offsets 0..no_of_days in poolsize steps; neighbouring
    # blocks share a boundary date.
    boundaries = [since + dt.timedelta(days=(no_of_days * step) // poolsize)
                  for step in range(poolsize + 1)]
    # Distinct loop names: under Python 2 comprehension variables leak and
    # would overwrite the since/until parameters.
    queries = ["{} since:{} until:{}".format(key_word, block_start, block_end)
               for block_start, block_end in zip(boundaries[:-1], boundaries[1:])]

    all_tweets = []
    # Created outside the try so the cleanup below never sees an unbound name.
    pool = Pool(poolsize)
    try:
        worker = partial(query_all_once, limit=limit_per_pool, lang=lang)
        for new_tweets in pool.imap_unordered(worker, queries):
            all_tweets.extend(new_tweets)
            logger.info("----NEW TWEETS: {} ALL TWEETS: {}----".format(len(new_tweets),len(all_tweets)))
    except KeyboardInterrupt:
        logger.info('Program interrupted by user. Returning all tweets gathered so far.')
    finally:
        pool.close()
        pool.join()

    end = time.time()
    last_time = (end-start)/3600
    logger.info("Scrape Time: {0:.3g} hour".format(last_time))
    return all_tweets

In [19]:
def to_csv(tweets, filename='test_output.csv'):
    """Write the scraped tweets to a comma-separated UTF-8 file.

    :param tweets: Records accepted by ``pd.DataFrame`` (e.g. a list of dicts).
    :param filename: Path of the CSV file to write.
    """
    # The row index is left out of the file.
    pd.DataFrame(tweets).to_csv(
        filename,
        sep=',',
        encoding='utf-8',
        index=False,
    )

In [25]:
# Single-process run: the since/until operators are written directly into the
# search string, and no tweet limit is applied.
key_word = '遊園地 since:2012-01-01 until:2013-01-01'
results = query_all_once(query=key_word,lang = 'ja',limit=None)

In [23]:
len(results)
#to_csv(results, filename='dataset/自宅AND警備_{}.csv'.format(dt.date.today()))

0

# Usage:
 - Function search_twitter(key_word, since, until, filename, limit, lang)
 - Input Parameter:
     - key_word: the keyword (or search expression) you want to search for on Twitter
     - since: start date, default 2006-03-21. Format yyyy-MM-dd
     - until: end date, default today. Format yyyy-MM-dd
     - filename: name of the CSV output file. If not provided, the output is saved to twitter_output.csv by default
     - limit: number of tweets you want to get. If not provided, as many as possible are collected
     - lang: set this if you want to query tweets in a specific language (for example, ja for Japanese)
     
     
 - The script scrapes the following information: 
     + Username and Full Name 
     + Tweet-id 
     + Tweet text 
     + Tweet timestamp 
     + No. of likes 
     + No. of replies 
     + No. of retweets

In [21]:
import os

# Root logger at INFO level, writing to one log file per day.
logger = logging.getLogger()
logger.setLevel(logging.INFO)

log_path = os.path.abspath('log/scraping.log.{}'.format(dt.date.today()))
log_dir = os.path.dirname(log_path)
# FileHandler does not create missing directories.
if not os.path.isdir(log_dir):
    os.makedirs(log_dir)

# Reuse a handler already attached for this file: re-running the cell would
# otherwise add another one and write every record several times.
fh = next((h for h in logger.handlers
           if getattr(h, 'baseFilename', None) == log_path), None)
if fh is None:
    fh = logging.FileHandler(log_path)
    logger.addHandler(fh)
fh.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))

In [12]:
import os

# Root logger at INFO level, writing to one log file per day.
logger = logging.getLogger()
logger.setLevel(logging.INFO)

log_path = os.path.abspath('log/scraping.log.{}'.format(dt.date.today()))
log_dir = os.path.dirname(log_path)
# FileHandler does not create missing directories.
if not os.path.isdir(log_dir):
    os.makedirs(log_dir)

# Reuse a file handler that is already attached for this path: running the
# setup more than once would otherwise duplicate every log line.
fh = next((h for h in logger.handlers
           if getattr(h, 'baseFilename', None) == log_path), None)
if fh is None:
    # create file handler
    fh = logging.FileHandler(log_path)
    # add the handler to logger
    logger.addHandler(fh)
#fh.setLevel(logging.DEBUG)
# create console handler with a higher log level
#ch = logging.StreamHandler()
#ch.setLevel(logging.ERROR)
# create formatter and add it to the handlers
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
#ch.setFormatter(formatter)
fh.setFormatter(formatter)
#logger.addHandler(ch)

In [None]:
# Sample run: search Japanese-language tweets for "自宅" AND "仕事" with no
# tweet limit, passing 'test.csv' as the output file name.
# NOTE(review): search_twitter is not defined in the cells shown here --
# confirm it is defined or imported before running this cell.

key_word = '"自宅" AND "仕事"'
#since = '2018-06-15'
#until = '2018-07-16'
filename = 'test.csv'
lang = "ja"
limit = None

search_twitter(key_word=key_word, filename=filename, limit=limit, lang=lang)

Start time: 2018-08-15 05:15:37,328
End Time: 2018-08-15 14:12:26,523

In [None]:
# Parse a "YYYY-MM" string into a datetime (the day defaults to 1).
# The format has to be passed as a quoted string; "2018-08" is only a sample
# value, since an empty input string cannot be parsed.
dt_oj = dt.datetime.strptime("2018-08", "%Y-%m")

In [19]:
# Read each scraped CSV and report how many rows it contains.
files = ["twitter_scrape_自宅AND仕事.csv","twitter_scrape_自宅AND勉強.csv","twitter_scrape_自宅AND警備.csv"]
for f in files:
    df = pd.read_csv(f)
    # Function-call form of print works on both Python 2 and Python 3.
    print(len(df))

393180
230723
323016


In [7]:
df = pd.read_csv("dataset/自宅AND警備_2018-08-21.csv")

In [8]:
len(df)

361121

In [26]:
# Derive the calendar year of every tweet; the timestamp column is read back
# from CSV as text in "YYYY-MM-DD HH:MM:SS" form and parsed row by row.
df['year'] = df["timestamp"].apply(lambda x: dt.datetime.strptime(x,"%Y-%m-%d %H:%M:%S").year)

In [28]:
df['year'].value_counts().sort_index()

2007      301
2008      973
2009     9612
2010    25011
2011    50037
2012    36034
2013    39007
2014    50036
2015    25020
2016    50037
2017    45571
2018    29482
Name: year, dtype: int64

In [10]:
# Load the scrape result file and count the tweets per calendar year.
df = pd.read_csv("twitter_scrape_自宅AND警備.csv")
# Timestamps are text in "YYYY-MM-DD HH:MM:SS" form; parse each to get its year.
df['year'] = df["timestamp"].apply(lambda x: dt.datetime.strptime(x,"%Y-%m-%d %H:%M:%S").year)
# value_counts() orders by frequency (most common year first).
df['year'].value_counts()

2018    221805
2017    101211
Name: year, dtype: int64

In [11]:
# Does the first tweet text contain both keywords? Each membership test has
# to be written out in full; `'a' and 'b' in text` only tests 'b'.
# NOTE(review): the second keyword was missing in the original line; '警備'
# is assumed from the data file name and the cell below -- confirm.
'自宅' in df['text'][0] and '警備' in df['text'][0]

True

In [19]:
# Print every tweet text that does not contain both keywords.
# `not '自宅'` on its own is always False (a non-empty string is truthy), so
# the membership tests are spelled out and negated as a whole.
# NOTE(review): assumes the intent was "not (both keywords present)" -- confirm.
for t in df['text']:
    if not ('自宅' in t and '警備' in t):
        print(t)