In [2]:
# Guessing gender
# Collect 1500 tweets matching words related to Blockchain
import configparser
import sys
import pickle
from collections import Counter
from TwitterAPI import TwitterAPI

### 1) Read census_names and tweets

In [3]:
def read_census_names():
    """
    Read the census names collected in the collect python script.

    Both pickle files are read from fixed paths relative to the current
    working directory. Each file is opened in a ``with`` block so that its
    handle is closed as soon as the data has been loaded, even when
    unpickling raises.

    Returns:
        A tuple ``(male_names, female_names)`` holding the unpickled contents
        of ``male_names.pkl`` and ``female_names.pkl`` respectively.
    Raises:
        OSError (e.g. FileNotFoundError) if one of the files cannot be opened.
    """
    # NOTE: pickle.load can execute arbitrary code while loading; only use
    # this on files produced by the trusted collect step.
    with open('../data/collect/male_names.pkl', 'rb') as male_file:
        male_names = pickle.load(male_file)
    with open('../data/collect/female_names.pkl', 'rb') as female_file:
        female_names = pickle.load(female_file)
    return male_names, female_names

# Step 0 - Read the male and female first names picked from the U.S. census
# (pickled by the collect step) and report how many of each were loaded.
# No Twitter connection is made in this cell.
male_names, female_names = read_census_names()
print('found %d female and %d male names' % (len(female_names), len(male_names)))

found 4014 female and 1146 male names


In [4]:
def get_twitter(config_file):
    """ Build a TwitterAPI client from the credentials stored in a config file.

    The file is parsed with ConfigParser and the four credentials are looked
    up in its ``[twitter]`` section, then handed to TwitterAPI in the order
    consumer_key, consumer_secret, access_token, access_token_secret.

    Args:
      config_file ... Path to a ConfigParser-format file with a ``[twitter]`` section.
    Returns:
      An instance of TwitterAPI.
    """
    parser = configparser.ConfigParser()
    parser.read(config_file)
    # Order matters: these are passed positionally to TwitterAPI.
    credential_keys = ('consumer_key',
                       'consumer_secret',
                       'access_token',
                       'access_token_secret')
    credentials = [parser.get('twitter', key) for key in credential_keys]
    return TwitterAPI(*credentials)

# Build the TwitterAPI client from the credentials stored in ../twitter.cfg
# (path is relative to the notebook's working directory).
# NOTE(review): the message below is printed as soon as the client object is
# constructed; this cell does not itself issue a request to verify the credentials.
twitter = get_twitter('../twitter.cfg')
print('Established Twitter connection.')

Established Twitter connection.


In [5]:
def read_real_time_tweets(filename):
    """Read real time tweets retrieved during collect phase

    The file is opened in a ``with`` block so that its handle is closed as
    soon as the data has been loaded, even when unpickling raises.

    Params:
        filename.....The pickle file where the tweets are stored.
    Returns:
        The unpickled content of the file (the real time tweets).
    Raises:
        OSError (e.g. FileNotFoundError) if the file cannot be opened.
    """
    # NOTE: pickle.load can execute arbitrary code while loading; only use
    # this on files produced by the trusted collect step.
    with open(filename, 'rb') as tweets_file:
        return pickle.load(tweets_file)

In [6]:
def get_first_name(tweet):
    """
    Get the first name from a twitter object.

    The first name is the first whitespace-separated word of
    ``tweet['user']['name']``.

    Params:
        tweet....The Twitter object from where to pick the user name.
    Returns:
        The user first name in lower letters, or None when the tweet has no
        'user' entry, the user has no 'name' entry, or the name is missing,
        not a string, empty or only whitespace.
    """
    # Guard each level so a null 'user' or 'name' yields None instead of
    # raising TypeError / AttributeError.
    user = tweet['user'] if tweet and 'user' in tweet else None
    name = user['name'] if user and 'name' in user else None
    if not isinstance(name, str):
        return None
    parts = name.split()
    if not parts:
        return None
    return parts[0].lower()

In [7]:
# Load the tweets pickled during the collect phase and show how many were read.
# The path is relative to the notebook's working directory.
filename = '../data/collect/real-time-tweets.pkl'
tweets = read_real_time_tweets(filename)
print(len(tweets))

5000


In [8]:
# Report the sample size and the ten most frequent first names in the sample.
# get_first_name returns None when a tweet has no usable user name, so None
# is counted like any other key and could show up in this ranking.
print('sampled %d tweets' % len(tweets))
print('top names:', Counter(get_first_name(t) for t in tweets).most_common(10))

sampled 5000 tweets
top names: [('john', 74), ('michael', 60), ('chris', 52), ('mike', 47), ('kevin', 47), ('james', 46), ('ryan', 42), ('jeff', 41), ('david', 38), ('brian', 36)]


In [9]:
# Pick the second tweet of the sample as a running example and print the
# fields used later on: the user's screen name, display name and description,
# plus the tweet text.
test_tweet = tweets[1]
print('test tweet:\n\tscreen_name=%s\n\tname=%s\n\tdescr=%s\n\ttext=%s' %
      (test_tweet['user']['screen_name'],
       test_tweet['user']['name'],
       test_tweet['user']['description'],
       test_tweet['text']))
# Count tweets per value of their 'lang' field and show the four most common.
print('top languages:', Counter(t['lang'] for t in tweets).most_common(4))

test tweet:
	screen_name=MekaPye100
	name=Tomeka  Dorsey
	descr=I AM BY ALL MEANS A TENACIOUS INDIVIDUAL!
	text=@JTthepodcaster It is, my mother made me watch it 1001 times...if they ever remake it you should try out for the part
top languages: [('en', 5000)]


### 2) Tokenize tweets

In [16]:
import re

def tokenize(string, lowercase, keep_punctuation, prefix, collapse_urls, collapse_mentions):
    """
    Split a string into whitespace-separated tokens.

    If keep_punctuation is True, the string is split on whitespace only, so
    punctuation stays attached to the tokens (e.g. "isn't" is kept whole).
    If keep_punctuation is False, every run of non-word characters (anything
    other than letters, digits and underscore) is treated as a separator, so
    "isn't" is split into the "isn" and "t" tokens.

    Lowercasing happens before URLs and mentions are collapsed, so the
    THIS_IS_A_URL / THIS_IS_A_MENTION placeholders always stay uppercase.

    Params:
        string................The string that needs to be tokenized.
        lowercase.............Boolean indicating if we want the text to be converted to lowercase.
        keep_punctuation......Boolean indicating if we want to keep punctuation.
        prefix................Prefix to add to each obtained token (used for identifying what part
                              is being tokenized, e.g. prefix d= for description). No prefix is
                              added when it is None or empty.
        collapse_urls.........Boolean indicating if we want to replace every run of non-whitespace
                              characters starting with "http" by THIS_IS_A_URL.
        collapse_mentions.....Boolean indicating if we want to replace every "@" followed by
                              non-whitespace characters (e.g. @something) by THIS_IS_A_MENTION.
    Returns:
        A list containing the tokens; an empty list when string is None or empty.
    """
    if not string:
        return []
    if lowercase:
        string = string.lower()
    # Raw string literals keep the regex escapes from being read as (invalid)
    # Python string escapes.
    if collapse_urls:
        string = re.sub(r'http\S+', 'THIS_IS_A_URL', string)
    if collapse_mentions:
        string = re.sub(r'@\S+', 'THIS_IS_A_MENTION', string)
    if keep_punctuation:
        tokens = string.split()
    else:
        # Underscore is a word character, so the placeholders survive this step.
        tokens = re.sub(r'\W+', ' ', string).split()
    if prefix:
        tokens = ['%s%s' % (prefix, t) for t in tokens]
    return tokens

In [17]:
# Sanity check: tokenize the example tweet's user description, lowercased,
# split on non-word characters, with every token prefixed by 'd='.
tokenize(test_tweet['user']['description'], lowercase=True,
         keep_punctuation=False, prefix='d=',
         collapse_urls=True, collapse_mentions=True)

['d=i',
 'd=am',
 'd=by',
 'd=all',
 'd=means',
 'd=a',
 'd=tenacious',
 'd=individual']

In [None]:
def tweet2tokens(tweet, use_descr=True, lowercase=True, keep_punctuation=True, descr_prefix='d=', collapse_urls=True, collapse_mentions=True):
    """
    Turn a tweet into one flat list of tokens: the tokens of the tweet text
    first, optionally followed by the tokens of the user description.

    Params:
        tweet.................The tweet to tokenize; tweet['text'] is always read and
                              tweet['user']['description'] is read only when use_descr is True.
        use_descr.............Boolean indicating if the user description is tokenized too.
        lowercase.............Boolean indicating if the text is converted to lowercase.
        keep_punctuation......Boolean indicating if punctuation is kept.
        descr_prefix..........Prefix added to every token coming from the description.
        collapse_urls.........Boolean indicating if urls are collapsed.
        collapse_mentions.....Boolean indicating if mentions are collapsed.
    Returns:
        The list of tokens.
    """
    # Tokens from the tweet text carry no prefix.
    text_tokens = tokenize(tweet['text'], lowercase, keep_punctuation,
                           None, collapse_urls, collapse_mentions)
    if not use_descr:
        return text_tokens
    # Description tokens are marked with descr_prefix to tell them apart.
    description = tweet['user']['description']
    descr_tokens = tokenize(description, lowercase, keep_punctuation,
                            descr_prefix, collapse_urls, collapse_mentions)
    return text_tokens + descr_tokens