External packages needed:<br>
`pip install oauth2` (see https://pypi.org/project/oauth2/)<br>
`pip install gender-guesser` (see https://pypi.org/project/gender-guesser/)

In [1]:
# Number of qualifying tweets to collect; passed to fetchsamples() in the last cell.
NSAMPLES = 3000
# Output directory prefix. It is concatenated directly with FILENAME
# (DATAPATH+FILENAME), so the trailing slash is required.
DATAPATH = '../data/'

In [2]:
import oauth2 as oauth
import gender_guesser.detector as gender
from urllib import request
import json
from tqdm import tqdm
from datetime import datetime

In [3]:
# Output file name stamped with the current local time at minute resolution
# (YYYYMMDDHHMM), so runs started in different minutes get different files.
FILENAME = 'tweet_sample_' + datetime.now().strftime('%Y%m%d%H%M') + '.json'
FILENAME

'tweet_sample_201907061227.json'

In [4]:
# api_key, api_secret, access_token_key, and access_token_secret
# obtained from https://developer.twitter.com/
# Imported by name rather than with `import *` so that it is explicit which
# names the `keys` module must define and nothing else leaks into the namespace.
from keys import api_key, api_secret, access_token_key, access_token_secret

In [5]:
# OAuth credentials used to sign every request: the user's access token and
# the application's consumer key pair (both come from the `keys` module).
oauth_token    = oauth.Token(key=access_token_key, secret=access_token_secret)
oauth_consumer = oauth.Consumer(key=api_key, secret=api_secret)

# Signature method handed to req.sign_request() inside twitterreq().
signature_method_hmac_sha1 = oauth.SignatureMethod_HMAC_SHA1()

# NOTE(review): this module-level value is not read by any code visible in this
# notebook -- twitterreq() takes its own http_method argument and
# fetchsamples() passes "GET" literally. Confirm before removing.
http_method = "GET"

In [6]:
# Debug level forwarded to both urllib handlers; 0 keeps them silent.
# A positive value should make the handlers print HTTP traffic -- useful
# when diagnosing authentication problems.
_debug = 0
# Handlers installed on the opener that twitterreq() builds for each call.
http_handler  = request.HTTPHandler(debuglevel=_debug)
https_handler = request.HTTPSHandler(debuglevel=_debug)

In [7]:
def twitterreq(url, http_method, parameters):
    """Open an OAuth-signed request against `url` and return the response.

    The request is built from the module-level `oauth_consumer` /
    `oauth_token` pair and signed with `signature_method_hmac_sha1`.

    Args:
        url: Endpoint to call.
        http_method: "POST" sends the signed parameters as the request body;
            any other value sends them in the query string of the URL.
        parameters: Extra request parameters, forwarded unchanged to
            `oauth.Request.from_consumer_and_token`.

    Returns:
        Whatever `OpenerDirector.open` returns for the signed request. The
        opener only has the plain HTTP/HTTPS handlers installed (no error
        processor), so callers should not rely on an exception being raised
        for non-success status codes.
    """
    req = oauth.Request.from_consumer_and_token(oauth_consumer,
                                                token=oauth_token,
                                                http_method=http_method,
                                                http_url=url,
                                                parameters=parameters)

    req.sign_request(signature_method_hmac_sha1, oauth_consumer, oauth_token)

    if http_method == "POST":
        encoded_post_data = req.to_postdata()
        # urllib on Python 3 rejects a str request body with TypeError,
        # so the form-encoded payload has to be sent as bytes.
        if isinstance(encoded_post_data, str):
            encoded_post_data = encoded_post_data.encode('utf-8')
    else:
        encoded_post_data = None
        # For non-POST requests the signature travels in the query string.
        url = req.to_url()

    opener = request.OpenerDirector()
    opener.add_handler(http_handler)
    opener.add_handler(https_handler)

    return opener.open(url, encoded_post_data)

In [8]:
# Shared gender_guesser detector, created once and used by is_male() / is_female().
d = gender.Detector()

def is_male(name):
    """Return True when the first word of `name` is classified as 'male'.

    Only the first whitespace-separated token is looked up with the shared
    detector `d`. Names that are empty, whitespace-only, or not strings
    (e.g. None from a JSON null) have no token to look up and yield False
    instead of raising.
    """
    # str.split() with no arguments drops all whitespace, so blank names
    # produce an empty list rather than a list containing ''.
    tokens = name.split() if isinstance(name, str) else []
    if not tokens:
        return False
    return d.get_gender(tokens[0]) == 'male'

def is_female(name):
    """Return True when the first word of `name` is classified as 'female'.

    Only the first whitespace-separated token is looked up with the shared
    detector `d`. Names that are empty, whitespace-only, or not strings
    (e.g. None from a JSON null) have no token to look up and yield False
    instead of raising.
    """
    # str.split() with no arguments drops all whitespace, so blank names
    # produce an empty list rather than a list containing ''.
    tokens = name.split() if isinstance(name, str) else []
    if not tokens:
        return False
    return d.get_gender(tokens[0]) == 'female'

In [9]:
def contains_required_info(r):
    """Decide whether a decoded stream record is a tweet worth sampling.

    A record qualifies only if it is an original (not deleted, not a
    retweet) English tweet whose author's display name is classified as
    male or female by is_male() / is_female().

    Returns:
        True if the record should be kept, False otherwise.
    """
    # Deletion notices and retweets are never sampled.
    if 'delete' in r or 'retweeted_status' in r:
        return False

    # A usable tweet record carries all three of these fields.
    if not ('lang' in r and 'user' in r and 'text' in r):
        return False

    # Ignore non-English tweets.
    if r['lang'] != 'en':
        return False

    # The author record must expose both a display name and an id.
    author = r['user']
    if not ('name' in author and 'id' in author):
        return False

    # Keep the tweet only when the gender guess is conclusive.
    display_name = author['name']
    return bool(is_male(display_name) or is_female(display_name))

In [10]:
def _next_matching_line(response):
    """Return the next stream line, decoded and stripped, that passes the filter.

    Blank lines, lines that are not valid UTF-8 JSON, and JSON values that are
    not objects are skipped. StopIteration from the stream is allowed to
    propagate so the caller can detect the end of the stream.
    """
    while True:
        raw = next(response).strip()
        if not raw:
            # Streaming endpoints may emit blank keep-alive lines; there is
            # nothing to parse in them.
            continue
        try:
            text = raw.decode('utf-8')
            record = json.loads(text)
        except ValueError:
            # Covers both UnicodeDecodeError and json.JSONDecodeError, e.g. a
            # record truncated by a dropped connection. One bad line should
            # not abort a long-running collection.
            continue
        if isinstance(record, dict) and contains_required_info(record):
            return text


def fetchsamples(n):
    """Stream the Twitter sample endpoint and save up to `n` matching tweets.

    Each tweet accepted by contains_required_info() is written as one raw
    JSON line to DATAPATH+FILENAME. The file is written as UTF-8 regardless
    of the platform default, because tweets routinely contain characters
    (emoji, non-Latin scripts) that other default encodings cannot represent.

    Args:
        n: Number of matching tweets to collect.

    If the stream ends before `n` tweets were collected, a message with the
    number collected so far is printed and the function returns normally.
    """
    url = "https://stream.twitter.com/1.1/statuses/sample.json"
    parameters = []
    with open(DATAPATH + FILENAME, 'w', encoding='utf-8') as fi:
        response = twitterreq(url, "GET", parameters)
        try:
            for i in tqdm(range(n)):
                try:
                    text = _next_matching_line(response)
                except StopIteration:
                    print('Stopped after {} lines'.format(i))
                    break
                fi.write("%s\n" % text)
        finally:
            # Always release the streaming connection, even on error or
            # after the requested number of samples has been reached.
            response.close()

In [11]:
# Long-running cell: blocks until NSAMPLES matching tweets have been written
# to DATAPATH+FILENAME or the stream ends.
fetchsamples(NSAMPLES)

100%|██████████| 3000/3000 [33:55<00:00,  1.97it/s] 
