In [1]:
from twittercrawler.crawlers import RecursiveCrawler
from twittercrawler.search import get_time_termination
from twittercrawler.utils import load_json_result

# 1. Setup TwitterCrawler

### Initialize TwitterCrawler

In [2]:
# Create the crawler object; the following cells attach an output file to it
# and authenticate it.
tcs = RecursiveCrawler()

### Connect to a file

   * Tweets will be written to this file
   * If the file exists then new content will be appended to the file

In [3]:
# Export collected tweets and retweets to 'sample.txt'.
# If the file already exists, new content is appended to it.
tcs.connect_to_file("sample.txt")

### Authenticate TwitterCrawler

In [4]:
# Authenticate with your API keys, stored in a JSON file
# (see the format sample in the main README).
tcs.authenticate("api_key.json")

Authentication was successful!


# 2. Search for events

## i.) Set search parameters

In [5]:
# Search query: match tweets that mention any of these accounts or carry the
# #BREAKING hashtag (terms are combined with the OR operator).
query = " OR ".join((
    "@CNN",
    "@BBC",
    "@guardian",
    "@nytimes",
    "#BREAKING",
))

In [6]:
# Arguments for the search request: the query string built in the previous
# cell, result type 'recent', and a count of 100.
search_params = dict(q=query, result_type="recent", count=100)

In [7]:
# Register the search parameters on the crawler. The call is the last
# expression of the cell, so the notebook displays its return value.
tcs.set_search_arguments(search_args=search_params)

{'result_type': 'recent', 'q': '@CNN OR @BBC OR @guardian OR @nytimes OR #BREAKING', 'count': 100}


## ii.) Implement filter function based on time

my_created_at="Tue Feb 18 00:00:00 +0000 2020"
time_terminator =  get_time_termination(my_created_at)

### termination (collect tweets from the last 5 minutes)

In [8]:
import datetime

In [9]:
# Build the lower time bound for the search: "now minus 5 minutes", rendered in
# the same layout as the tweets' `created_at` field.
# The format string hard-codes the "+0000" (UTC) offset, so the clock has to be
# read in UTC too. A naive local `datetime.now()` would shift the bound by the
# machine's UTC offset on any host that is not configured for UTC.
now = datetime.datetime.now(datetime.timezone.utc)
lower_bound = now - datetime.timedelta(seconds=300)  # 300 s = 5 minutes
time_str = lower_bound.strftime("%a %b %d %H:%M:%S +0000 %Y")
print(time_str)
# Termination function handed to `tcs.search(term_func=...)` further down.
time_terminator = get_time_termination(time_str)

Tue Feb 18 09:11:18 +0000 2020


## iii.) Recursive search

   * Here your search starts at a specific time. It is the current time if you do not set the **current_max_id** parameter
   * Then the search tries to explore past events that match your search parameters
   * The search terminates when any of the following happens:
      * the **term_func** you set is matched: events older than the first event that matches this termination function won't be returned. For example, you can set a time lower bound for your search.
      * all events matching your search parameters have been returned
      * you **interrupt the execution**

In [10]:
# Run the recursive search; `term_func=time_terminator` applies the time-based
# termination function created in the earlier cell.
# NOTE(review): `wait_for` and `feedback_time` look like intervals in seconds
# (pause between requests / progress-report period) — confirm against the
# twittercrawler documentation.
# The call is the last expression of the cell, so its return value is displayed.
tcs.search(wait_for=3, term_func=time_terminator, feedback_time=60)

No former request were made!


(1229694847096934400, 1229696134957846528, 307)

In [11]:
# Close the crawler's connection now that the search is done
# (a confirmation message is printed on success).
tcs.close()

Connection was closed successfully!


# 3. Load exported messages

In [12]:
# Read back the records exported to 'sample.txt' and report how many there are.
# New content is appended to an existing file, so the count can include
# tweets collected by earlier runs.
messages = load_json_result("sample.txt")
print("Hits: {}".format(len(messages)))

Hits: 1807


In [13]:
# Show the timestamp, the text and the user record of the first loaded
# message, with a blank line between each of them.
print(
    messages[0]["created_at"],
    messages[0]["text"],
    messages[0]["user"],
    sep="\n\n"
)

Tue Feb 18 09:13:31 +0000 2020

RT @clearticulation: @MeEf3112 @CraigGernhardt @al3zzaw @UNHumanRights @BBCWorld @guardian @CNN @Marwanhailan @Doranimated @azhardhia_80 @D…

{'screen_name': 'Iraq_for_Iraqis', 'profile_sidebar_fill_color': 'DDEEF6', 'default_profile': True, 'followers_count': 560, 'entities': {'description': {'urls': []}}, 'is_translation_enabled': False, 'following': False, 'profile_image_url': 'http://pbs.twimg.com/profile_images/1200040058121834498/7FAhJxIN_normal.jpg', 'id': 1200039659205775360, 'description': 'العراق للعراقيين ✌🏻🇮🇶\nIraq for Iraqis ✌🏻🇮🇶', 'lang': None, 'contributors_enabled': False, 'profile_text_color': '333333', 'statuses_count': 2808, 'protected': False, 'friends_count': 1982, 'has_extended_profile': False, 'notifications': False, 'created_at': 'Thu Nov 28 13:12:24 +0000 2019', 'follow_request_sent': False, 'favourites_count': 5041, 'is_translator': False, 'profile_use_background_image': True, 'profile_background_image_url': None, 'profile_link_