In [2]:
# Guessing gender
# Collect 1500 tweets matching words related to Blockchain
import configparser
import sys
import pickle
from collections import Counter
from TwitterAPI import TwitterAPI

### 1) Read census_names and tweets

In [3]:
def read_census_names():
    """
    Read census names collected in the collect python script.

    The two pickle files are opened with context managers so the file
    handles are closed as soon as the data has been loaded.

    Returns:
        Two lists of male_names and female_names
    """
    # NOTE: pickle.load can execute arbitrary code; only load files that were
    # produced by our own collect script.
    with open('../data/collect/male_names.pkl', 'rb') as male_file:
        male_names = pickle.load(male_file)
    with open('../data/collect/female_names.pkl', 'rb') as female_file:
        female_names = pickle.load(female_file)
    return male_names, female_names

# Read all the names picked from the U.S. census (pickled by the collect
# script) and report how many female and male names were found.
male_names, female_names = read_census_names()
print('found %d female and %d male names' % (len(female_names), len(male_names)))

found 4014 female and 1146 male names


In [4]:
def get_twitter(config_file):
    """ Build a TwitterAPI client from the credentials stored in config_file.
    Args:
      config_file ... Path to a ConfigParser-format file whose [twitter] section
                      holds consumer_key, consumer_secret, access_token and
                      access_token_secret.
    Returns:
      An instance of TwitterAPI.
    """
    parser = configparser.ConfigParser()
    parser.read(config_file)
    # The options are read in the positional order TwitterAPI is called with.
    credential_keys = ('consumer_key', 'consumer_secret',
                       'access_token', 'access_token_secret')
    credentials = [parser.get('twitter', key) for key in credential_keys]
    return TwitterAPI(*credentials)

# Build the TwitterAPI client from the credentials stored in ../twitter.cfg.
# NOTE(review): `twitter` is not referenced again in the visible part of this
# notebook - confirm whether this cell is still needed.
twitter = get_twitter('../twitter.cfg')
print('Established Twitter connection.')

Established Twitter connection.


In [5]:
def read_real_time_tweets(filename):
    """Read real time tweets retrieved during collect phase

    The file is opened with a context manager so the handle is closed as soon
    as the tweets have been loaded.

    Params:
        filename.....The file where the tweets are stored.
    Returns:
        The list of real time tweets
    """
    # NOTE: pickle.load can execute arbitrary code; only load files that were
    # produced by our own collect script.
    with open(filename, 'rb') as tweets_file:
        return pickle.load(tweets_file)

In [6]:
def get_first_name(tweet):
    """
    Get the first name from a twitter object.

    Params:
        tweet....The Twitter object from where to pick the user name.
    Returns:
        The user first name in lower letters, or None when the tweet has no
        user, the user has no name, or the name is empty / only whitespace.
    """
    if 'user' not in tweet:
        return None
    user = tweet['user']
    # A null user object is treated the same as a missing one.
    if not user or 'name' not in user:
        return None
    name = user['name']
    # A null or empty name has no first name to extract.
    if not name:
        return None
    parts = name.split()
    return parts[0].lower() if parts else None

In [7]:
# Load the tweets pickled during the collect phase and show how many there are.
filename = '../data/collect/real-time-tweets.pkl'
tweets = read_real_time_tweets(filename)
print(len(tweets))

5000


In [8]:
# Sanity check: sample size and the 10 most frequent first names.
# get_first_name() gives None for a tweet without a usable name, so None can
# appear as a key in this Counter.
print('sampled %d tweets' % len(tweets))
print('top names:', Counter(get_first_name(t) for t in tweets).most_common(10))

sampled 5000 tweets
top names: [('john', 74), ('michael', 60), ('chris', 52), ('mike', 47), ('kevin', 47), ('james', 46), ('ryan', 42), ('jeff', 41), ('david', 38), ('brian', 36)]


In [9]:
# Pick one tweet as a running example and show the fields used further below.
test_tweet = tweets[1]
print('test tweet:\n\tscreen_name=%s\n\tname=%s\n\tdescr=%s\n\ttext=%s' %
      (test_tweet['user']['screen_name'],
       test_tweet['user']['name'],
       test_tweet['user']['description'],
       test_tweet['text']))
# The 4 most common values of the tweets' 'lang' field.
print('top languages:', Counter(t['lang'] for t in tweets).most_common(4))

test tweet:
	screen_name=MekaPye100
	name=Tomeka  Dorsey
	descr=I AM BY ALL MEANS A TENACIOUS INDIVIDUAL!
	text=@JTthepodcaster It is, my mother made me watch it 1001 times...if they ever remake it you should try out for the part
top languages: [('en', 5000)]


### 2) Tokenize tweets

In [16]:
import re

def tokenize(string, lowercase, keep_punctuation, prefix, collapse_urls, collapse_mentions):
    """
    Split a string into tokens.
    If keep_punctuation is True, the string is split on whitespace only, so
    punctuation stays attached to the tokens (e.g. "isn't" and "is," are kept
    exactly as they are).
    If keep_punctuation is False, every run of non-word characters is treated
    as a separator, so only the alphanumerics (letters, numbers and underscore)
    remain; e.g. "isn't" is split into the "isn" and "t" tokens.

    Params:
        string................The string that needs to be tokenized.
        lowercase.............Boolean indicating if we want the text to be converted to lowercase.
        keep_punctuation......Boolean indicating if we want to keep punctuation
        prefix................Prefix to add to each obtained token. (will use for identifying what part is being tokenized, e.g. prefix d= for description)
        collapse_urls.........Boolean indicating if we want to replace every url (anything starting with http) by the THIS_IS_A_URL placeholder.
        collapse_mentions.....Boolean indicating if we want to replace every mention (e.g. @something) by the THIS_IS_A_MENTION placeholder.
    Returns:
        A list containing the tokens; an empty list when string is empty or None.
    """
    if not string:
        return []
    if lowercase:
        string = string.lower()
    # The placeholders are substituted after lowercasing, so they stay
    # upper-case; they only contain word characters, so they also survive the
    # punctuation stripping below.
    if collapse_urls:
        string = re.sub(r'http\S+', 'THIS_IS_A_URL', string)
    if collapse_mentions:
        string = re.sub(r'@\S+', 'THIS_IS_A_MENTION', string)
    if keep_punctuation:
        tokens = string.split()
    else:
        tokens = re.sub(r'\W+', ' ', string).split()
    if prefix:
        tokens = ['%s%s' % (prefix, t) for t in tokens]
    return tokens

In [17]:
# Example: tokenize the test tweet's profile description, lowercased and
# without punctuation, with every token prefixed by 'd='.
tokenize(test_tweet['user']['description'], lowercase=True,
         keep_punctuation=False, prefix='d=',
         collapse_urls=True, collapse_mentions=True)

['d=i',
 'd=am',
 'd=by',
 'd=all',
 'd=means',
 'd=a',
 'd=tenacious',
 'd=individual']

In [18]:
def tweet2tokens(tweet, use_descr=True, lowercase=True, keep_punctuation=True, descr_prefix='d=', collapse_urls=True, collapse_mentions=True):
    """
    Turn a tweet into one flat list of tokens: the tokens of the tweet text,
    optionally followed by the tokens of the author's profile description.

    Params:
        tweet.................The tweet that needs to be tokenized.
        use_descr.............Boolean to indicate if we want to tokenize the user description too.
        lowercase.............Boolean indicating if we want the text to be converted to lowercase.
        keep_punctuation......Boolean indicating if we want to keep punctuation
        descr_prefix..........Prefix to add to every token of the description.
        collapse_urls.........Boolean indicating if we want to collapse the urls in the text.
        collapse_mentions.....Boolean indicating if we want to collapse the mentions in the text. (e.g. @something)
    Returns:
        The list of tokens (text tokens first, then description tokens).
    """
    # Tokens coming from the tweet text never get a prefix.
    text_tokens = tokenize(string=tweet['text'],
                           lowercase=lowercase,
                           keep_punctuation=keep_punctuation,
                           prefix=None,
                           collapse_urls=collapse_urls,
                           collapse_mentions=collapse_mentions)
    if not use_descr:
        return text_tokens
    descr_tokens = tokenize(string=tweet['user']['description'],
                            lowercase=lowercase,
                            keep_punctuation=keep_punctuation,
                            prefix=descr_prefix,
                            collapse_urls=collapse_urls,
                            collapse_mentions=collapse_mentions)
    text_tokens.extend(descr_tokens)
    return text_tokens

In [19]:
# Tokenize the test tweet with the default options: tweet text followed by the
# 'd='-prefixed description tokens, punctuation kept.
tweet2tokens(test_tweet)

['THIS_IS_A_MENTION',
 'it',
 'is,',
 'my',
 'mother',
 'made',
 'me',
 'watch',
 'it',
 '1001',
 'times...if',
 'they',
 'ever',
 'remake',
 'it',
 'you',
 'should',
 'try',
 'out',
 'for',
 'the',
 'part',
 'd=i',
 'd=am',
 'd=by',
 'd=all',
 'd=means',
 'd=a',
 'd=tenacious',
 'd=individual!']

In [20]:
# for enumerating all possible arguments of tweet2tokens
# https://docs.python.org/3/library/itertools.html#itertools.product
from itertools import product

# Candidate values for every tweet2tokens option.
use_descr_opts = [True, False]
lowercase_opts = [True, False]
keep_punctuation_opts = [True, False]
descr_prefix_opts = ['d=', '']
url_opts = [True, False]
mention_opts = [True, False]

# Short display names, listed in the same order as the tweet2tokens keyword
# parameters, so they can be zipped with each option tuple.
argnames = ['use_descr', 'lower', 'punct', 'prefix', 'url', 'mention']
option_iter = product(use_descr_opts, lowercase_opts,
                       keep_punctuation_opts,
                       descr_prefix_opts, url_opts,
                       mention_opts)
# Show the tokens of the test tweet for every combination of options.
for options in option_iter:
    print('  '.join('%s=%s' % (name, opt)
                    for name, opt in zip(argnames, options)))
    # Blank line between the option summary and the tokens. (A bare `print`
    # is a no-op expression in Python 3; it has to be called.)
    print()
    print('  '.join(tweet2tokens(test_tweet, *options)), '\n----\n')

use_descr=True  lower=True  punct=True  prefix=d=  url=True  mention=True
THIS_IS_A_MENTION  it  is,  my  mother  made  me  watch  it  1001  times...if  they  ever  remake  it  you  should  try  out  for  the  part  d=i  d=am  d=by  d=all  d=means  d=a  d=tenacious  d=individual! 
----

use_descr=True  lower=True  punct=True  prefix=d=  url=True  mention=False
@jtthepodcaster  it  is,  my  mother  made  me  watch  it  1001  times...if  they  ever  remake  it  you  should  try  out  for  the  part  d=i  d=am  d=by  d=all  d=means  d=a  d=tenacious  d=individual! 
----

use_descr=True  lower=True  punct=True  prefix=d=  url=False  mention=True
THIS_IS_A_MENTION  it  is,  my  mother  made  me  watch  it  1001  times...if  they  ever  remake  it  you  should  try  out  for  the  part  d=i  d=am  d=by  d=all  d=means  d=a  d=tenacious  d=individual! 
----

use_descr=True  lower=True  punct=True  prefix=d=  url=False  mention=False
@jtthepodcaster  it  is,  my  mother  made  me  watch  it  1

In [22]:
# Let's tokenize all tweets (lowercased, punctuation dropped, description
# tokens prefixed with 'd=', urls and mentions collapsed).
# tokens_list[i] holds the tokens of tweets[i]; display the entry of the test
# tweet (index 1).
tokens_list = [tweet2tokens(t, use_descr=True, lowercase=True,
                            keep_punctuation=False, descr_prefix='d=',
                            collapse_urls=True, collapse_mentions=True)
              for t in tweets]
tokens_list[1]

['THIS_IS_A_MENTION',
 'it',
 'is',
 'my',
 'mother',
 'made',
 'me',
 'watch',
 'it',
 '1001',
 'times',
 'if',
 'they',
 'ever',
 'remake',
 'it',
 'you',
 'should',
 'try',
 'out',
 'for',
 'the',
 'part',
 'd=i',
 'd=am',
 'd=by',
 'd=all',
 'd=means',
 'd=a',
 'd=tenacious',
 'd=individual']

In [23]:
# Store these in a sparse matrix.

#1) Create a vocabulary (dict from term->index)

# https://docs.python.org/3/library/collections.html#collections.defaultdict
from collections import defaultdict

def make_vocabulary(tokens_list):
    """
    Build the term->index vocabulary of a tokenized corpus.

    Indices are handed out in order of first appearance, starting at 0. The
    number of unique terms is printed as a side effect.

    Params:
        tokens_list....An iterable of token lists (one list per tweet).
    Returns:
        A defaultdict mapping each term to its index. Because it is a
        defaultdict, looking up an unseen term later on silently assigns it
        the next free index.
    """
    # The factory reads the dict's current size, i.e. the next unused index.
    term_to_index = defaultdict(lambda: len(term_to_index))
    for term in (tok for doc in tokens_list for tok in doc):
        term_to_index[term]  # the lookup alone registers a new term
    print('%d unique terms in vocabulary' % len(term_to_index))
    return term_to_index

In [24]:
# Build the vocabulary from all tokenized tweets and peek at its first 10
# entries.
vocabulary = make_vocabulary(tokens_list)
# term->index
list(vocabulary.items())[:10]

20426 unique terms in vocabulary


[('told', 0),
 ('myself', 1),
 ('i', 2),
 ('was', 3),
 ('gonna', 4),
 ('get', 5),
 ('healthy', 6),
 ('but', 7),
 ('m', 8),
 ('at', 9)]

In [25]:
# How big is vocabulary if we keep punctuation?
# Note that this rebinds both `tokens_list` and `vocabulary`, so the cells
# below work with the punctuation-preserving tokens.
tokens_list = [tweet2tokens(t, use_descr=True, lowercase=True,
                            keep_punctuation=True, descr_prefix='d=',
                            collapse_urls=True, collapse_mentions=True)
              for t in tweets]

vocabulary = make_vocabulary(tokens_list)

30498 unique terms in vocabulary


#### 2.1) Convert tokenized tweets into CSR matrix

In [26]:
# Convert features to a sparse matrix X.
# X[i,j] is the frequency of term j in tweet i
# 
from scipy.sparse import lil_matrix

def make_feature_matrix(tokens_list, vocabulary):
    """
    Build the sparse term-frequency matrix of a tokenized corpus.

    Params:
        tokens_list....A list of token lists (one list per tweet).
        vocabulary.....A dict from term to column index. Every token in
                       tokens_list is expected to be in it already: the matrix
                       width is fixed up front from len(vocabulary).
    Returns:
        A CSR matrix X with one row per entry of tokens_list, where X[i, j] is
        the number of times term j occurs in tokens_list[i].
    """
    # Size the matrix from the argument, not from a notebook-level variable,
    # so the function works for any corpus it is given.
    X = lil_matrix((len(tokens_list), len(vocabulary)))
    for i, tokens in enumerate(tokens_list):
        for token in tokens:
            j = vocabulary[token]
            X[i, j] += 1
    # LIL is convenient for incremental construction; CSR is the format used
    # for row slicing and arithmetic afterwards.
    return X.tocsr()

In [27]:
# Build the tweet-by-term count matrix from the current tokens and vocabulary.
X = make_feature_matrix(tokens_list, vocabulary)
print('shape of X:', X.shape)

shape of X: (5000, 30498)


In [32]:
# How is a tweet stored? Row 1 of X is the sparse count vector of the test
# tweet (tweets[1]).
X[1]

<1x30498 sparse matrix of type '<class 'numpy.float64'>'
	with 28 stored elements in Compressed Sparse Row format>

In [33]:
# non-zero indices of terms used in tweet 1.
# nonzero() returns (row_indices, col_indices); element [1] keeps the columns.
X[1].nonzero()[1]  # col_ind

array([13, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
       36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46], dtype=int32)

In [34]:
# Display the term->index mapping.
# NOTE(review): this dumps the entire vocabulary into the cell output;
# consider displaying only a slice of it.
vocabulary

defaultdict(<function __main__.make_vocabulary.<locals>.<lambda>()>,
            {'told': 0,
             'myself': 1,
             'i': 2,
             'was': 3,
             'gonna': 4,
             'get': 5,
             'healthy': 6,
             'but': 7,
             'i’m': 8,
             'at': 9,
             'whataburger': 10,
             'rn': 11,
             '😌': 12,
             'd=i': 13,
             'd=prefer': 14,
             'd=to': 15,
             'd=get': 16,
             'd=high': 17,
             'd=on': 18,
             'd=life': 19,
             'THIS_IS_A_MENTION': 20,
             'it': 21,
             'is,': 22,
             'my': 23,
             'mother': 24,
             'made': 25,
             'me': 26,
             'watch': 27,
             '1001': 28,
             'times...if': 29,
             'they': 30,
             'ever': 31,
             'remake': 32,
             'you': 33,
             'should': 34,
             'try': 35,
             'out

In [35]:
# term counts for tweet 1: the stored (non-zero) values of row 1.
X[1].data  # "val"

array([1., 1., 3., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

In [46]:
# What word does each term index correspond to?
# Convert term->index dict into index->term dict
index2term = {i: t for t, i in vocabulary.items()}
print(index2term[2])
print(X[0, 2])
# So, X[0, 2] is the number of times the term index2term[2] appears in tweet 0.

i
1.0


In [48]:
# Look at the raw CSR arrays: X.indptr[i]:X.indptr[i+1] delimits the slice of
# X.indices (column numbers) and X.data (counts) that belongs to row i.
print('tweet 1 starts at col_ind=', X.indptr[1])
print('tweet 2 starts at col_ind=', X.indptr[2])
print('so, the columns that are non-zero for tweet 1 are:')
print(X.indices[X.indptr[1]:X.indptr[2]])
print('and the data associated with those cells are:')
print(X.data[X.indptr[1]:X.indptr[2]])

tweet 1 starts at col_ind= 20
tweet 2 starts at col_ind= 48
so, the columns that are non-zero for tweet 1 are:
[13 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42
 43 44 45 46]
and the data associated with those cells are:
[1. 1. 3. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1.]


In [49]:
# Print the first three rows; each one is a 1-row matrix, so the row
# coordinate shown for every entry is 0.
print('tweet 0:\n', X[0], '\n')
print('tweet 1:\n', X[1], '\n')
print('tweet 2:\n', X[2])

tweet 0:
   (0, 0)	1.0
  (0, 1)	1.0
  (0, 2)	1.0
  (0, 3)	1.0
  (0, 4)	1.0
  (0, 5)	1.0
  (0, 6)	1.0
  (0, 7)	1.0
  (0, 8)	1.0
  (0, 9)	1.0
  (0, 10)	1.0
  (0, 11)	1.0
  (0, 12)	1.0
  (0, 13)	1.0
  (0, 14)	1.0
  (0, 15)	1.0
  (0, 16)	1.0
  (0, 17)	1.0
  (0, 18)	1.0
  (0, 19)	1.0 

tweet 1:
   (0, 13)	1.0
  (0, 20)	1.0
  (0, 21)	3.0
  (0, 22)	1.0
  (0, 23)	1.0
  (0, 24)	1.0
  (0, 25)	1.0
  (0, 26)	1.0
  (0, 27)	1.0
  (0, 28)	1.0
  (0, 29)	1.0
  (0, 30)	1.0
  (0, 31)	1.0
  (0, 32)	1.0
  (0, 33)	1.0
  (0, 34)	1.0
  (0, 35)	1.0
  (0, 36)	1.0
  (0, 37)	1.0
  (0, 38)	1.0
  (0, 39)	1.0
  (0, 40)	1.0
  (0, 41)	1.0
  (0, 42)	1.0
  (0, 43)	1.0
  (0, 44)	1.0
  (0, 45)	1.0
  (0, 46)	1.0 

tweet 2:
   (0, 20)	1.0
  (0, 37)	1.0
  (0, 47)	1.0
  (0, 48)	1.0
  (0, 49)	1.0
  (0, 50)	1.0
  (0, 51)	1.0
  (0, 52)	1.0
  (0, 53)	1.0


In [50]:
# Compute z = X * \beta, where X is a CSR matrix.
# The product is written out by hand over the CSR arrays to show how the
# format is traversed: for each row i, walk its slice of X.indices / X.data.
import numpy as np
beta = np.ones(len(vocabulary))  # assume Beta = vector of 1s
z = np.zeros(len(tweets))
for i in range(len(tweets)):  # for each row.
    for j in range(X.indptr[i], X.indptr[i+1]): # for each col.
        colidx = X.indices[j]  # column (term index) of the j-th stored value
        z[i] += beta[colidx] * X.data[j]
print('X * beta for tweet 1=', z[1])
print('which is the same as the sum %.1f, since beta=[1...1]' %
      X[1].sum())

X * beta for tweet 1= 30.0
which is the same as the sum 30.0, since beta=[1...1]


### 3) Create a list of gender labels

In [51]:
# y is a 1d numpy array of gender labels.
# Let 1=Female, 0=Male.
import numpy as np

def get_gender(tweet, male_names, female_names):
    """
    Label a tweet with the gender suggested by its author's first name.

    Params:
        tweet...........The Twitter object whose user name is inspected.
        male_names......Collection of male first names.
        female_names....Collection of female first names.
    Returns:
        1 if the first name is in female_names, 0 if it is in male_names,
        -1 otherwise.
    """
    first_name = get_first_name(tweet)
    # The female list is consulted first, so a name present in both
    # collections is labelled 1.
    if first_name in female_names:
        return 1
    return 0 if first_name in male_names else -1
    
# Label every tweet; y[i] is the label of tweets[i] and -1 marks a first name
# found in neither list.
# NOTE(review): rows labelled -1 are not filtered out before training -
# confirm the collect step guarantees every name is in one of the two lists.
y = np.array([get_gender(t, male_names, female_names) for t in tweets])
print('gender labels:', Counter(y))

gender labels: Counter({0: 2963, 1: 2037})


### 4) Fit a Logistic Regression classifier to predict gender from profile/tweet

In [53]:
# Do 5-fold cross-validation
# http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix

def do_cross_val(X, y, nfolds):
    """ Compute average cross-validation accuracy.

    A fresh LogisticRegression is fitted on every training split of a shuffled
    KFold (random_state=42) and scored on the matching test split. The
    standard deviation of the fold accuracies and the list of fold accuracies
    are printed as a side effect.

    Params:
        X.........Feature matrix, one row per sample.
        y.........Array of labels, aligned with the rows of X.
        nfolds....Number of folds.
    Returns:
        The mean of the fold accuracies.
    """
    splitter = KFold(n_splits=nfolds, random_state=42, shuffle=True)
    accuracies = []
    for train_idx, test_idx in splitter.split(X):
        model = LogisticRegression()
        model.fit(X[train_idx], y[train_idx])
        fold_predictions = model.predict(X[test_idx])
        accuracies.append(accuracy_score(y[test_idx], fold_predictions))
    mean_accuracy = np.mean(accuracies)
    print(np.std(accuracies))
    print(accuracies)
    return mean_accuracy

In [54]:
# Baseline: 5-fold cross-validation accuracy on the current feature matrix X.
print('avg accuracy', do_cross_val(X, y, 5))



0.011822013364905331
[0.719, 0.73, 0.753, 0.745, 0.734]
avg accuracy 0.7362


In [55]:
# How does tokenization affect accuracy?
# Collapse urls and mentions; ignore description prefix.
def run_all(tweets, use_descr=True, lowercase=True,
            keep_punctuation=True, descr_prefix=None,
            collapse_urls=True, collapse_mentions=True):
    """
    Run the whole pipeline for one tokenization setting: tokenize the tweets,
    build the vocabulary and the feature matrix, then cross-validate.

    The labels are read from the notebook-level array `y`, so `tweets` has to
    be the same collection `y` was computed from. The accuracy is printed as a
    side effect (in addition to what the helpers print).

    Params:
        tweets................The tweets to tokenize.
        use_descr.............Boolean to indicate if the user description is tokenized too.
        lowercase.............Boolean indicating if the text is converted to lowercase.
        keep_punctuation......Boolean indicating if punctuation is kept.
        descr_prefix..........Prefix added to the description tokens.
        collapse_urls.........Boolean indicating if urls are collapsed.
        collapse_mentions.....Boolean indicating if mentions are collapsed.
    Returns:
        The average 5-fold cross-validation accuracy.
    """
    tokenized = []
    for tweet in tweets:
        tokenized.append(tweet2tokens(tweet,
                                      use_descr=use_descr,
                                      lowercase=lowercase,
                                      keep_punctuation=keep_punctuation,
                                      descr_prefix=descr_prefix,
                                      collapse_urls=collapse_urls,
                                      collapse_mentions=collapse_mentions))
    vocab = make_vocabulary(tokenized)
    features = make_feature_matrix(tokenized, vocab)
    accuracy = do_cross_val(features, y, 5)
    print('acc=', accuracy)
    return accuracy

In [56]:
# Short display names, listed in the same order as the run_all keyword
# parameters, so they can be zipped with each option tuple.
argnames = ['use_descr', 'lower', 'punct', 'prefix', 'url', 'mention']
option_iter = product(use_descr_opts, lowercase_opts,
                       keep_punctuation_opts,
                       descr_prefix_opts, url_opts,
                       mention_opts)
# Evaluate every combination of tokenization options and collect
# (accuracy, options) pairs for the ranking below.
results = []
for options in option_iter:
    option_str = '\t'.join('%s=%s' % (name, opt) for name, opt
                           in zip(argnames, options))
    print(option_str)
    acc = run_all(tweets, *options)
    results.append((acc, options))
    # Blank line between two runs. (A bare `print` is a no-op expression in
    # Python 3; it has to be called.)
    print()

use_descr=True	lower=True	punct=True	prefix=d=	url=True	mention=True
30498 unique terms in vocabulary
0.011822013364905331
[0.719, 0.73, 0.753, 0.745, 0.734]
acc= 0.7362
use_descr=True	lower=True	punct=True	prefix=d=	url=True	mention=False
35285 unique terms in vocabulary
0.01149956520917205
[0.725, 0.742, 0.757, 0.748, 0.731]
acc= 0.7406
use_descr=True	lower=True	punct=True	prefix=d=	url=False	mention=True
32522 unique terms in vocabulary
0.011391224692718523
[0.721, 0.732, 0.754, 0.746, 0.736]
acc= 0.7378
use_descr=True	lower=True	punct=True	prefix=d=	url=False	mention=False
37309 unique terms in vocabulary
0.011855800268223155
[0.722, 0.744, 0.754, 0.751, 0.733]
acc= 0.7407999999999999
use_descr=True	lower=True	punct=True	prefix=	url=True	mention=True
26767 unique terms in vocabulary
0.012611106216347569
[0.721, 0.721, 0.755, 0.729, 0.736]
acc= 0.7323999999999999
use_descr=True	lower=True	punct=True	prefix=	url=True	mention=False
31493 unique terms in vocabulary
0.012354756169184411

14793 unique terms in vocabulary
0.012815615474880646
[0.61, 0.623, 0.645, 0.624, 0.61]
acc= 0.6224000000000001
use_descr=False	lower=False	punct=True	prefix=d=	url=True	mention=True
15976 unique terms in vocabulary
0.00917387595294378
[0.617, 0.615, 0.639, 0.633, 0.625]
acc= 0.6258
use_descr=False	lower=False	punct=True	prefix=d=	url=True	mention=False
19579 unique terms in vocabulary
0.0037094473981982845
[0.628, 0.628, 0.638, 0.632, 0.63]
acc= 0.6312
use_descr=False	lower=False	punct=True	prefix=d=	url=False	mention=True
17792 unique terms in vocabulary
0.009579144011862446
[0.613, 0.617, 0.636, 0.636, 0.629]
acc= 0.6262000000000001
use_descr=False	lower=False	punct=True	prefix=d=	url=False	mention=False
21395 unique terms in vocabulary
0.007110555533852477
[0.623, 0.623, 0.641, 0.635, 0.634]
acc= 0.6312
use_descr=False	lower=False	punct=True	prefix=	url=True	mention=True
15976 unique terms in vocabulary
0.00917387595294378
[0.617, 0.615, 0.639, 0.633, 0.625]
acc= 0.6258
use_descr=F

In [58]:
# Rank the settings from best to worst accuracy. The (accuracy, options)
# tuples are sorted as a whole, so ties on accuracy are ordered by the option
# values.
for r in sorted(results, reverse=True):
    print('%.4f' % r[0], '  '.join('%s=%s' % (name, opt) for name, opt in zip(argnames, r[1])))

0.7482 use_descr=True  lower=True  punct=False  prefix=d=  url=True  mention=False
0.7478 use_descr=True  lower=True  punct=False  prefix=d=  url=False  mention=False
0.7472 use_descr=True  lower=True  punct=False  prefix=d=  url=True  mention=True
0.7472 use_descr=True  lower=True  punct=False  prefix=d=  url=False  mention=True
0.7458 use_descr=True  lower=False  punct=False  prefix=d=  url=True  mention=True
0.7450 use_descr=True  lower=False  punct=False  prefix=d=  url=False  mention=False
0.7448 use_descr=True  lower=False  punct=False  prefix=d=  url=True  mention=False
0.7428 use_descr=True  lower=False  punct=False  prefix=d=  url=False  mention=True
0.7428 use_descr=True  lower=True  punct=False  prefix=  url=False  mention=False
0.7414 use_descr=True  lower=True  punct=False  prefix=  url=True  mention=False
0.7408 use_descr=True  lower=True  punct=True  prefix=d=  url=False  mention=False
0.7406 use_descr=True  lower=True  punct=True  prefix=d=  url=True  mention=False
0.74

In [59]:
# Invert the term->index vocabulary into an index->term dict (same mapping as
# `index2term` built earlier).
idx2word = dict((v,k) for k,v in vocabulary.items())

In [60]:
# Fit model on all data and print top coef.
model = LogisticRegression()
model.fit(X,y)
# Get the learned coefficients for the Positive class.
# NOTE(review): assumes y holds only the labels 0 (male) and 1 (female), so a
# positive weight pushes towards female - confirm no -1 labels are present.
coef = model.coef_[0]
# Sort them in descending order and keep the 20 largest.
top_coef_ind = np.argsort(coef)[::-1][:20]
# Get the names of those features.
top_coef_terms = [idx2word[i] for i in top_coef_ind]
# Get the weights of those features
top_coef = coef[top_coef_ind]
# Print the top 20.
print('top weighted terms for female class:')
print('\n'.join(str(x) for x in zip(top_coef_terms, top_coef)))

# repeat for males: the 20 most negative coefficients, most negative first.
top_coef_ind = np.argsort(coef)[:20]
top_coef_terms = [idx2word[i] for i in top_coef_ind]
top_coef = coef[top_coef_ind]
print('\ntop weighted terms for male class:')
print('\n'.join(str(x) for x in zip(top_coef_terms, top_coef)))

top weighted terms for female class:
('d=mom', 1.7476924957812148)
('d=mom,', 1.5884129128188123)
('d=she/her', 1.5181028804416337)
('d=alumna', 1.4089658690648104)
('d=mother,', 1.2720610498952378)
('d=✨', 1.2122067776386518)
('d=our', 1.1858025175306444)
('d=mom.', 1.1252789076472065)
('app', 1.1037215889944685)
('d=mother', 1.0590583603889652)
('d=loving', 1.0241080750841511)
('d=she/her.', 1.017781124891941)
('today!', 1.009349802768622)
('told', 1.0050504694343796)
('d=woman', 0.9628902712622512)
('d=:', 0.9560020033356396)
('d=girl', 0.949903951512294)
('d=trump', 0.9178808364570152)
('job', 0.9171617350606579)
('—', 0.9153969401724443)

top weighted terms for male class:
('d=father', -1.8877512892583566)
('d=husband,', -1.3502937912948538)
('d=dad', -1.3249415655807995)
('d=husband', -1.26194956817943)
('d=l', -1.1340953170408035)
('d=sports', -1.0956303657023583)
('birthday', -1.0859370404484419)
('d=man', -1.0840524463674461)
('d=former', -1.0689713118452118)
('d=guy', -1.0446