In [1]:
from collections import Counter
import matplotlib.pyplot as plt
import networkx as nx
import sys
import time
import json
import configparser
import requests
import pickle
from TwitterAPI import TwitterAPI

In [2]:
# Fetch male/female names from Census.
def _fetch_census_frequencies(url):
    """Download one census name file and map each lower-cased name to its frequency.

    The first whitespace-separated field of a line is taken as the name and
    the second as a float. Lines with fewer than two fields (blank or
    whitespace-only lines) are skipped.
    """
    response = requests.get(url, timeout=60)
    # Fail on an HTTP error status instead of trying to parse an error page.
    response.raise_for_status()
    frequencies = {}
    for line in response.text.split('\n'):
        fields = line.split()
        if len(fields) >= 2:
            frequencies[fields[0].lower()] = float(fields[1])
    return frequencies


def get_census_names():
    """ Fetch a list of common male/female names from the census.
    For ambiguous names, we select the more frequent gender.

    A name that appears in both files is kept only for the gender where its
    frequency is strictly higher; on an exact tie it is in neither set.

    Returns:
      A tuple (male_names, female_names) of sets of lower-cased names.
    Raises:
      requests.HTTPError if either download returns an error status.
    """
    males_pct = _fetch_census_frequencies(
        'http://www2.census.gov/topics/genealogy/1990surnames/dist.male.first')
    females_pct = _fetch_census_frequencies(
        'http://www2.census.gov/topics/genealogy/1990surnames/dist.female.first')
    male_names = {m for m in males_pct
                  if m not in females_pct or males_pct[m] > females_pct[m]}
    female_names = {f for f in females_pct
                    if f not in males_pct or females_pct[f] > males_pct[f]}
    return male_names, female_names

# Build the two name sets once; a later cell passes them to get_realtime_tweets.
male_names, female_names = get_census_names()
print('found %d female and %d male names' % (len(female_names), len(male_names)))
# The name collections are sets, so the five names shown are an arbitrary sample.
print('male name sample:', list(male_names)[:5])
print('female name sample:', list(female_names)[:5])

found 4014 female and 1146 male names
male name sample: ['emory', 'preston', 'napoleon', 'loren', 'dexter']
female name sample: ['shea', 'clorinda', 'kirstin', 'junie', 'jacelyn']


In [3]:
def get_twitter(config_file):
    """ Build a TwitterAPI client from the credentials stored in config_file.
    Args:
      config_file ... Path of a ConfigParser-format file whose [twitter] section
                      holds consumer_key, consumer_secret, access_token and
                      access_token_secret.
    Returns:
      An instance of TwitterAPI.
    """
    parser = configparser.ConfigParser()
    parser.read(config_file)
    # The four options are read in the positional order TwitterAPI is called with.
    option_names = ('consumer_key', 'consumer_secret',
                    'access_token', 'access_token_secret')
    credentials = [parser.get('twitter', option) for option in option_names]
    return TwitterAPI(*credentials)

# Credentials are read from a config file addressed relative to the working directory.
twitter = get_twitter('../twitter.cfg')
# NOTE(review): get_twitter only constructs the client object; this message does
# not by itself show that the credentials were accepted - confirm with a request.
print('Established Twitter connection.')

Established Twitter connection.


In [4]:
# Sample U.S. tweets with names from Census. 
def get_first_name(tweet):
    """
    Extract the author's first name from a tweet object.

    Params:
        tweet....The Twitter object whose tweet['user']['name'] is inspected.
    Returns:
        The first whitespace-separated token of the user name, lower-cased,
        or None when the tweet has no user/name entry or the name is blank.
    """
    if 'user' not in tweet or 'name' not in tweet['user']:
        return None
    tokens = tweet['user']['name'].split()
    return tokens[0].lower() if tokens else None

def get_realtime_tweets(twitter, limit, words, male_names, female_names, filename):
    """Collect real-time tweets whose author has a census first name, then pickle them.

    Requests the 'statuses/filter' stream with the given track words, a U.S.
    bounding box and language 'en'. A tweet is kept only when the first name
    of its author (see get_first_name) is in male_names or female_names.
    If the stream ends or raises an error it is requested again, until
    `limit` tweets have been collected. The collected list is then written
    to `filename` with pickle.

    NOTE(review): Twitter's statuses/filter documentation describes `track`
    and `locations` as alternative matches rather than a conjunction - confirm
    that the stream really is restricted to U.S. tweets matching the words.

    Params:
        twitter........The TwitterAPI object.
        limit..........The number of tweets we want to retrieve.
        words..........A list of strings, defining all the words that a tweet can match.
        male_names.....Collection of all male names retrieved from the census.
        female_names...Collection of all female names retrieved from the census.
        filename.......Name of the file for storing the real time tweets picked.
    Returns:
        A list of dicts, one per tweet, containing all the tweet information
        (empty when limit <= 0).
    """
    tweets = []
    # Loop on the count rather than `while True` so the function always falls
    # through to the pickle.dump below instead of returning from inside the loop.
    while len(tweets) < limit:
        try:
            # Restrict to U.S.
            stream = twitter.request('statuses/filter',
                        {'track': words, 'locations':'-124.637,24.548,-66.993,48.9974', 'language': 'en'})
            for response in stream:
                # Skip stream messages that carry no user description.
                if 'user' not in response:
                    continue
                name = get_first_name(response)
                # Keep the tweet only if the first name is a known census name.
                if name in male_names or name in female_names:
                    tweets.append(response)
                    if len(tweets) % 100 == 0:
                        print('found %d tweets' % len(tweets))
                    if len(tweets) >= limit:
                        break
        except Exception as error:
            # Catch Exception, not everything: a bare except would also swallow
            # KeyboardInterrupt and make the cell impossible to interrupt.
            print("Unexpected error:", type(error), error)
    with open(filename, 'wb') as out_file:
        pickle.dump(tweets, out_file)
    return tweets

In [5]:
# Blockchain-related keywords, passed to the stream request as its 'track' filter.
words = ['ethereum', 'eth', 'bitcoin', 'btc', 'blockchain', 'cryptocurrencies', 'cryptocurrency', 'crypto', 'token', 'tokens',
        'solidity', 'litecoin', 'hyperledger','eos','dapp', 'dapps', 'smart contract', 'smart contracts', 'neo', 'miner', 'mining',
        'sidechain','pos','pow', 'dlt', 'polkadot']
# Pickle path handed to get_realtime_tweets as its `filename` argument.
filename = '../data/collect/real-time-tweets-test-dataset.pkl'
# Runs until 500 tweets from authors with a census first name have been collected.
tweets = get_realtime_tweets(twitter, 500, words, male_names, female_names, filename)

found 100 tweets
found 200 tweets
found 300 tweets
found 400 tweets
found 500 tweets


In [5]:
# 1 - Read the screen names we want to build the network from.
def read_screen_names(filename):
    """
    Read a text file containing Twitter screen_names, one per line.

    Surrounding whitespace is stripped from every line and blank lines are
    ignored, so a trailing empty line does not produce an empty screen name.

    Params:
        filename....Name of the file to read.
    Returns:
        A list of strings, one per screen_name, sorted with plain string
        comparison (upper-case letters sort before lower-case ones).

    >>> read_screen_names('candidates.txt')
    ['DrJillStein', 'GovGaryJohnson', 'HillaryClinton', 'realDonaldTrump']
    """
    # The with-block closes the file even if reading fails part-way.
    with open(filename) as names_file:
        stripped = (line.strip() for line in names_file)
        return sorted(name for name in stripped if name)

# Seed accounts for the network, read from a one-name-per-line text file.
screen_names = read_screen_names('../data/collect/ethereum-accounts.txt')
print('Read screen names: %s' % screen_names)

Read screen names: ['ConsenSysAcad', 'VitalikButerin', 'binance', 'coinbase', 'ethereum', 'gavofyork', 'trufflesuite']


In [6]:
def robust_request(twitter, resource, params, max_tries=5):
    """ If a Twitter request fails, sleep for 15 minutes and try again.
    Do this at most max_tries times before quitting.

    Status codes 200 and 404 are both treated as final answers and returned
    to the caller; any other status counts as a failure. No sleep happens
    after the last failed attempt, because there is nothing left to wait for.

    Args:
      twitter .... A TwitterAPI object.
      resource ... A resource string to request
      params ..... A parameter dict for the request, e.g., to specify
                   parameters like screen_name or count.
      max_tries .. The maximum number of tries to attempt.
    Returns:
      A TwitterResponse object, or None if failed.
    """
    for attempt in range(max_tries):
        request = twitter.request(resource, params)
        if request.status_code in (200, 404):
            return request
        is_last_attempt = attempt == max_tries - 1
        if is_last_attempt:
            print('Got error %s \n with status code %s\ngiving up after %d tries.'
                  % (request.text, request.status_code, max_tries), file=sys.stderr)
            sys.stderr.flush()
        else:
            print('Got error %s \n with status code %s\nsleeping for 15 minutes.'
                  % (request.text, request.status_code), file=sys.stderr)
            sys.stderr.flush()
            # 61 * 15 seconds: slightly more than 15 minutes - presumably a
            # safety margin over the rate-limit window (TODO confirm).
            time.sleep(61 * 15)
    return None

In [7]:
def robust_request_iterate(twitter, resource, params, max_pages=5):
    """ Function for managing pagination of results
    It sequentially requests pages using the cursor provided by Twitter,
    starting at cursor -1, and stops when the reported next_cursor is 0 or
    missing, when a request does not succeed, or after max_pages pages.

    The caller's params dict is not modified; the cursor is added to a copy.

    Args:
      twitter .... A TwitterAPI object.
      resource ... A resource string to request
      params ..... A parameter dict for the request, e.g., to specify
                   parameters like screen_name or count.
      max_pages .. The maximum number of pages to ask for.
    Returns:
      A list with the items obtained by iterating over every page's response,
      in page order. The list holds whatever was collected before a failure
      (it is empty if the first request fails).
    """
    page_params = dict(params)
    page_params['cursor'] = -1
    results = []
    for _ in range(max_pages):
        response = robust_request(twitter, resource, page_params)
        # robust_request returns None after repeated failures and may hand
        # back a 404 response; neither carries a page of results.
        if response is None or response.status_code != 200:
            break
        results.extend(response)
        payload = response.json()
        next_cursor = payload.get('next_cursor', 0) if isinstance(payload, dict) else 0
        if not next_cursor:
            break
        page_params['cursor'] = next_cursor
    return results

In [8]:
# 2 - Get the user's twitter accounts
def get_users_by_screen_name(twitter, screen_names):
    """Retrieve the Twitter user objects for each screen_name.

    One "users/lookup" request is made per screen name. A screen name whose
    request fails (robust_request returns None) or does not come back with
    status 200 is reported and skipped, so the result only holds the items
    of successful responses.

    Params:
        twitter........The TwitterAPI object.
        screen_names...A list of strings, one per screen_name
    Returns:
        A list of dicts, one per user, containing all the user information
        (e.g., screen_name, id, location, etc), in request order.

    Example (needs network access and credentials):
    >>> users = get_users_by_screen_name(twitter, ['twitterapi', 'twitter'])
    >>> [u['id'] for u in users]
    [6253282, 783214]
    """
    results = []
    for s in screen_names:
        response = robust_request(twitter, "users/lookup", {'screen_name': s})
        if response is None or response.status_code != 200:
            print('Could not retrieve user for screen_name %s' % s)
            continue
        results.extend(response)
    return results

In [9]:
# Seed user dicts ordered by screen_name using plain string comparison
# (upper-case names sort before lower-case ones).
users = sorted(get_users_by_screen_name(twitter, screen_names), key=lambda x: x['screen_name'])
print('found %d users with screen_names %s' %
    (len(users), str([u['screen_name'] for u in users])))
# print(users)

found 7 users with screen_names ['ConsenSysAcad', 'VitalikButerin', 'binance', 'coinbase', 'ethereum', 'gavofyork', 'trufflesuite']


In [None]:
# 3 - Get the accounts that each of these users follows (their "friends")
def get_friends(twitter, screen_name):
    """ Return a list of Twitter IDs for users that this person follows, up to 5000.
    See https://dev.twitter.com/rest/reference/get/friends/ids

    Args:
        twitter.......The TwitterAPI object
        screen_name... a string of a Twitter screen name
    Returns:
        A list of ints, one per friend ID, sorted in ascending order.
        The string "404" when the request comes back with status 404
        (kept for callers that compare against it).
        An empty list when robust_request gives up and returns None.

    Note: If a user follows more than 5000 accounts, we will limit ourselves to
    the first 5000 accounts returned.

    Example (needs network access and credentials):
    >>> get_friends(twitter, 'aronwc')[:5]
    [695023, 1697081, 8381682, 10204352, 11669522]
    """
    print("Inside: Requesting friends for screen_name %s" % screen_name)
    request = robust_request(twitter, "friends/ids",
                              {'screen_name': screen_name, 'count':5000})
    if request is None:
        # All retries failed; there is no response object to inspect.
        return []
    if request.status_code == 404:
        return str(request.status_code)
    return sorted(request.json()['ids'])

#print("Looking for friends of @%s" % users[2]['screen_name'])
#binance_friends = get_friends(twitter, users[2]['screen_name'])
#print('Complete response')
#print(binance_friends)

In [None]:
# Obtain all friends
def add_all_friends(twitter, users):
    """ Get the list of accounts each user follows.

    Store the result in each user's dict using a new key called 'friends'.
    Users flagged as protected are not requested and get an empty list, as
    do users for which get_friends does not return a list (e.g. its "404"
    marker). A user dict without a 'protected' key is treated as not
    protected.

    Args:
        twitter...The TwitterAPI object.
        users.....The list of user dicts; each is modified in place.
    Returns:
        Nothing

    Example (needs network access and credentials):
    >>> users = [{'screen_name': 'aronwc'}]
    >>> add_all_friends(twitter, users)
    >>> users[0]['friends'][:5]
    [695023, 1697081, 8381682, 10204352, 11669522]
    """
    print("Requesting friends for a total of %s users" % len(users))
    for u in users:
        print("Outside: Requesting friends for screen_name %s" % u['screen_name'])
        # .get() so that minimal dicts such as {'screen_name': ...} work too.
        if u.get('protected'):
            u['friends'] = []
            continue
        response = get_friends(twitter, u['screen_name'])
        u['friends'] = response if isinstance(response, list) else []
        
# Modifies the dicts in `users` in place by adding a 'friends' key to each.
add_all_friends(twitter, users)

In [None]:
def print_num_friends(users):
    """Print one "<screen_name> <number of friends>" line per user.

    Lines appear in the order of the given list; everything is emitted with
    a single print call.

    Args:
        users....The list of user dicts, each with 'screen_name' and 'friends'.
    Returns:
        Nothing
    """
    lines = []
    for user in users:
        lines.append('%s %d' % (user['screen_name'], len(user['friends'])))
    print('\n'.join(lines))
        
# Requires the 'friends' key that add_all_friends stores on every user dict.
print('Friends per candidate:')
print_num_friends(users)

In [None]:
def count_friends(users):
    """ Tally how many of the given users follow each friend id.
    Args:
        users: a list of user dicts, each holding a list under 'friends'
    Returns:
        a Counter object mapping each friend to the number of candidates who follow them.
        Counter documentation: https://docs.python.org/dev/library/collections.html#collections.Counter

    In this example, friend '2' is followed by three different users.
    >>> c = count_friends([{'friends': [1,2]}, {'friends': [2,3]}, {'friends': [2,3]}])
    >>> c.most_common()
    [(2, 3), (3, 2), (1, 1)]
    """
    # Flatten every user's friend list into one stream and count it in one go.
    return Counter(friend_id for user in users for friend_id in user['friends'])

# Counter keyed by friend id; the value is how many of the seed users follow that id.
friend_counts = count_friends(users)
print('Most common friends:\n%s' % str(friend_counts.most_common(5)))

In [None]:
# Who follows this person?
# Pick the first 5,000 people that follow this person.
# 3 (cont.) - Get the accounts that follow each of these users (their "followers")
def get_followers(twitter, screen_name):
    """ Return a list of Twitter IDs for users that follow this person, up to 5000.

    The request goes to the "followers/ids" resource with count 5000.

    Args:
        twitter.......The TwitterAPI object
        screen_name... a string of a Twitter screen name
    Returns:
        A list of ints, one per follower ID, sorted in ascending order.
        The string "404" when the request comes back with status 404
        (kept for callers that compare against it).
        An empty list when robust_request gives up and returns None.

    Note: If a user has more than 5000 followers, we will limit ourselves to
    the first 5000 accounts returned.

    Example (needs network access and credentials):
    >>> followers = get_followers(twitter, 'aronwc')
    >>> followers == sorted(followers)
    True
    """
    print("Inside: Requesting followers for screen_name %s" % screen_name)
    request = robust_request(twitter, "followers/ids",
                              {'screen_name': screen_name, 'count':5000})
    if request is None:
        # All retries failed; there is no response object to inspect.
        return []
    if request.status_code == 404:
        return str(request.status_code)
    return sorted(request.json()['ids'])

In [None]:
# Obtain all followers for each user
def add_all_followers(twitter, users):
    """ Get the followers of each user.

    Store the result in each user's dict using a new key called 'followers'.
    Users flagged as protected are not requested and get an empty list, as
    do users for which get_followers does not return a list (e.g. its "404"
    marker). A user dict without a 'protected' key is treated as not
    protected.

    Args:
        twitter...The TwitterAPI object.
        users.....The list of user dicts; each is modified in place.
    Returns:
        Nothing

    Example (needs network access and credentials):
    >>> users = [{'screen_name': 'aronwc'}]
    >>> add_all_followers(twitter, users)
    >>> 'followers' in users[0]
    True
    """
    for u in users:
        print("Outside: Requesting followers for screen_name %s" % u['screen_name'])
        # .get() so that minimal dicts such as {'screen_name': ...} work too.
        if u.get('protected'):
            u['followers'] = []
            continue
        response = get_followers(twitter, u['screen_name'])
        u['followers'] = response if isinstance(response, list) else []
        
# Modifies the dicts in `users` in place by adding a 'followers' key to each.
add_all_followers(twitter, users)

In [None]:
def print_num_followers(users):
    """Print one "<screen_name> <number of followers>" line per user.

    Lines appear in the order of the given list; everything is emitted with
    a single print call.

    Args:
        users....The list of user dicts, each with 'screen_name' and 'followers'.
    Returns:
        Nothing
    """
    lines = []
    for user in users:
        lines.append('%s %d' % (user['screen_name'], len(user['followers'])))
    print('\n'.join(lines))
        
# Requires the 'followers' key that add_all_followers stores on every user dict.
print('Followers per candidate:')
print_num_followers(users)

In [None]:
def count_friends_and_followers(users):
    """ Tally how often each account id appears among the users' friends and followers.
    Args:
        users: a list of user dicts, each holding lists under 'friends' and 'followers'
    Returns:
        a Counter object mapping each account id to the number of times it
        occurs across all 'friends' and 'followers' lists.
        Counter documentation: https://docs.python.org/dev/library/collections.html#collections.Counter

    In this example, account 2 is a friend of both users and a follower of the first.
    >>> c = count_friends_and_followers([{'friends': [1,2], 'followers': [2]}, {'friends': [2], 'followers': []}])
    >>> c.most_common()
    [(2, 3), (1, 1)]
    """
    # For each user, friends are counted before followers, as one flat stream.
    return Counter(
        account_id
        for user in users
        for relation in ('friends', 'followers')
        for account_id in user[relation]
    )

# Counter over every id found in the users' 'friends' and 'followers' lists.
friend_and_followers_counts = count_friends_and_followers(users)
# Computed once and reused below, so the printed list is the stored one.
most_common_10 = friend_and_followers_counts.most_common(10)
print('Most common friends and followers:\n%s' % str(most_common_10))

In [None]:
# Each entry of most_common_10 is an (id, count) pair; user[0] picks the id.
# Store in a list the ids of the 10 most common accounts.
most_common_10_ids = [user[0] for user in most_common_10]
print(most_common_10_ids)

In [None]:
# 4 - For each of the 10 most common users, obtain the user object (these are combined with the seed users' dicts in a later cell).
def get_users_by_ids(twitter, ids):
    """Retrieve the Twitter user objects for each id.

    The ids are sent to "users/lookup" as a comma-separated `user_id` string,
    in batches of at most 100 ids per request. A batch whose request fails
    (robust_request returns None) or does not come back with status 200 is
    skipped.

    Params:
        twitter........The TwitterAPI object.
        ids............An iterable of ids (ints or strings); a single id or an
                       already comma-separated string is accepted too.
    Returns:
        A list of dicts, one per user, containing all the user information
        (e.g., screen_name, id, location, etc), sorted by screen_name.
        Empty when no ids are given or no batch succeeds.

    Example (needs network access and credentials):
    >>> users = get_users_by_ids(twitter, [6253282, 783214])
    >>> sorted(u['id'] for u in users)
    [783214, 6253282]
    """
    if isinstance(ids, (str, int)):
        ids = [ids]
    id_strings = [str(user_id) for user_id in ids]
    batch_size = 100  # documented per-request maximum of users/lookup
    found = []
    for start in range(0, len(id_strings), batch_size):
        batch = ','.join(id_strings[start:start + batch_size])
        response = robust_request(twitter, "users/lookup", {'user_id': batch})
        if response is None or response.status_code != 200:
            continue
        found.extend(response.json())
    return sorted(found, key=lambda x: x['screen_name'])

# User dicts for the ids selected from the friend/follower counts.
new_users = get_users_by_ids(twitter, most_common_10_ids)
print('found %d users with screen_names %s' %
    (len(new_users), str([u['screen_name'] for u in new_users])))

In [None]:
# Give the newly fetched users the same 'friends' and 'followers' keys as the seed users.
add_all_friends(twitter, new_users)
add_all_followers(twitter, new_users)

In [None]:
# NOTE(review): this bare expression is not the last line of the cell, so its
# value is discarded and nothing is displayed.
type(new_users)
# NOTE(review): `a` holds the same concatenation as users_total and is not
# referenced again in the visible cells - candidate for removal.
a = users + new_users
# Seed users followed by the newly fetched users, as one list.
users_total = users + new_users

In [None]:
def store_users(users, filename):
    """Pickle the list of user dicts to `filename`.

    Args:
        users......The object to store (here: the list of user dicts).
        filename...Path of the file to write; an existing file is overwritten.
    Returns:
        Nothing
    """
    # The with-block closes the file so the data is flushed before returning.
    with open(filename, 'wb') as out_file:
        pickle.dump(users, out_file)
    
# Persist the combined user list so it can be reloaded without new requests.
store_users(users_total, '../data/collect/users.pkl')

In [None]:
def read_users(filename):
    """Load and return the object pickled in `filename`.

    Args:
        filename...Path of a pickle file, e.g. one written by store_users.
    Returns:
        The unpickled object.
    """
    # SECURITY: unpickling can execute arbitrary code; only load files that
    # were produced by this notebook or another trusted source.
    with open(filename, 'rb') as in_file:
        return pickle.load(in_file)

# Reload the file written by store_users to check the round trip.
users_read = read_users('../data/collect/users.pkl')

In [None]:
# Quick sanity check of what was loaded: its type and number of entries.
print(type(users_read))
print(len(users_read))

In [None]:
# Last expression of the cell, so the notebook displays whether the reloaded
# data equals the in-memory list.
users_read == users_total