In [4]:
# File name and directory of the saved weights file; the two are joined and
# passed to model.load_weights in a later cell.
MODEL_FILE = 'TwitterInferredGender.h5'
MODEL_DIR = '.'
# Width of the sentence embedding; used by USEEmbeddingLayer.compute_output_shape.
EMBEDDING_DIM = 512
DOWNLOAD_MODEL = False  # set to False if re-running without a runtime restart
# NOTE(review): INSTALL_TF is not referenced in the visible cells - presumably it
# gates an install step elsewhere; confirm before removing.
INSTALL_TF = False

In [5]:
import tensorflow as tf
import pandas as pd
import numpy as np
import os
import tensorflow_hub as hub
import tensorflow.keras.backend as K
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Layer
import tensorflow.keras.layers as layers
from tensorflow.keras.optimizers import Adam, Adagrad

In [6]:
class USEEmbeddingLayer(Layer):
    """Keras layer that embeds strings with a TF-Hub sentence-encoder module.

    The layer loads `universal-sentence-encoder-large/3` from TF-Hub when it is
    built and reports an output shape of `(batch, EMBEDDING_DIM)`.

    Args:
        trainable: whether the module variables collected in `build` are
            registered as trainable or as non-trainable weights of this layer.
        **kwargs: forwarded unchanged to the Keras `Layer` constructor.
    """

    def __init__(self, trainable=True, **kwargs):
        self.dimensions = EMBEDDING_DIM
        super(USEEmbeddingLayer, self).__init__(**kwargs)
        # Assigned after the base constructor, which is not given `trainable`.
        self.trainable = trainable

    def build(self, input_shape):
        """Instantiate the hub module and attach its variables to this layer."""
        # The module itself is always created with trainable=True, independent
        # of self.trainable - presumably so that its variables are listed by
        # tf.trainable_variables() below (TODO confirm).
        self.use = hub.Module(
            'https://tfhub.dev/google/universal-sentence-encoder-large/3',
            trainable=True,
            name="{}_module".format(self.name),
        )

        module_vars = tf.trainable_variables(
            scope=".*{}_module/.*".format(self.name)
        )
        # Variables whose names mention these two markers are left out.
        # NOTE(review): they look like task-specific heads of the encoder - confirm.
        kept_vars = []
        for var in module_vars:
            if "SNLI" in var.name or "SHARED_RANK_ANSWER" in var.name:
                continue
            kept_vars.append(var)

        if not kept_vars:
            print('WARNING: No weights in Embedding Layer')
        elif self.trainable:
            self._trainable_weights += kept_vars
        else:
            self._non_trainable_weights += kept_vars

        super(USEEmbeddingLayer, self).build(input_shape)

    def call(self, x, mask=None):
        """Cast the input to strings, drop axis 1, and run the hub module on it."""
        sentences = tf.squeeze(tf.cast(x, tf.string), axis=1)
        return self.use(sentences)

    def compute_mask(self, inputs, mask=None):
        """Return an element-wise mask that is False where the input equals '--PAD--'."""
        return tf.not_equal(inputs, '--PAD--')

    def compute_output_shape(self, input_shape):
        """Return `(batch, self.dimensions)` for the given input shape."""
        batch_size = input_shape[0]
        return (batch_size, self.dimensions)

In [7]:
def build_model():
    """Assemble the classifier: string input -> USE embedding -> Dense(512) -> sigmoid.

    Returns:
        An uncompiled Keras `Model` taking one string per example (shape
        `(1,)`) and producing a single sigmoid output per example.
    """
    text_in = layers.Input(shape=(1,), dtype="string")
    sentence_vec = USEEmbeddingLayer(trainable=False)(text_in)
    hidden = layers.Dense(512, activation='relu')(sentence_vec)

    # Dropout with rate 0 is kept so the layer stack stays as it was;
    # NOTE(review): presumably this matches the topology of the saved weights - confirm.
    hidden = layers.Dropout(0)(hidden)
    probability = layers.Dense(1, activation='sigmoid')(hidden)

    return Model(inputs=[text_in], outputs=probability)

In [8]:
# Show the installed TensorFlow version; the cells in this notebook use
# TF 1.x-style APIs (tf.Session, hub.Module, tf.trainable_variables).
tf.__version__

'1.13.1'

In [9]:
%%time
# Build the Keras model graph; its weights are loaded from disk in a later cell.
model = build_model()

Instructions for updating:
Colocations handled automatically by placer.


W0204 23:03:12.715416 16400 deprecation.py:323] From C:\Users\Ryloid\Anaconda3\envs\U4-S3-DNN\lib\site-packages\tensorflow\python\ops\control_flow_ops.py:3632: colocate_with (from tensorflow.python.framework.ops) is deprecated and will be removed in a future version.
Instructions for updating:
Colocations handled automatically by placer.


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


I0204 23:03:13.731699 16400 saver.py:1483] Saver not created because there are no variables in the graph to restore


Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


W0204 23:03:13.844398 16400 deprecation.py:506] From C:\Users\Ryloid\Anaconda3\envs\U4-S3-DNN\lib\site-packages\tensorflow\python\keras\layers\core.py:143: calling dropout (from tensorflow.python.ops.nn_ops) with keep_prob is deprecated and will be removed in a future version.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


Wall time: 2.02 s


In [10]:
# Create the initializer ops for all global variables and for lookup tables.
init = tf.global_variables_initializer()
table_init = tf.tables_initializer()
# Run both initializers in a new session and make it the session Keras uses.
sess = tf.Session()
sess.run([init, table_init])
K.set_session(sess)

# Load the saved weights after initialization has run, so the initializer
# does not overwrite the loaded values.
model_path = os.path.join(MODEL_DIR, MODEL_FILE)
model.load_weights(model_path)

In [11]:
def sex(s):
    """Calculate the estimated probability that the tweeter is male.

    Args:
        s: a single piece of text.

    Returns:
        The first element of the first prediction row returned by the
        notebook-global `model` for `s`.
    """
    # Wrap the single text in a one-row, one-column array before predicting.
    batch = np.array([s]).reshape(-1, 1)
    return model.predict(batch)[0][0]

In [None]:
def get_scores(sentences):
    """Return a signed score for each sentence, based on `sex(sentence)`.

    Every sentence is scored exactly once; the previous version called
    `sex` (a full model prediction) two or three times per sentence.

    The mapping from the raw score to the stored value is:
      * score > 0.5  -> the score is stored unchanged
      * score < 0.5  -> the negated score is stored
      * score == 0.5 -> nothing is stored, so the result can be shorter
        than the input

    Args:
        sentences: iterable of sentence strings.

    Returns:
        list of signed scores, in input order.
    """
    results = []
    for sentence in sentences:
        score = sex(sentence)  # one prediction per sentence
        if score > 0.5:
            results.append(score)
        elif score < 0.5:
            results.append(-score)
        # A score of exactly 0.5 is intentionally left out (original behavior).
    return results

def get_sum(scores):
    """Return the sum of `scores`, rounded to three decimal places."""
    return round(sum(scores), 3)

In [2]:
# NOTE(review): pandas is already imported in the imports cell, and this cell's
# execution count (In [2]) is lower than the cells above it, so it was run out
# of order - confirm the notebook survives Restart & Run All.
import pandas as pd 
# Load the records to score; the preview below shows the file already has
# `sex` and `sex_sum` columns, which later cells overwrite.
df = pd.read_json('10000_predictions_extra.json')

In [3]:
# Preview the first rows to check the loaded columns.
df.head()

Unnamed: 0,id,by,text,sentences,scores,scores_sum,sex,sex_sum
0,3635676,JoeAltmaier,"So now I do the legwork for articles, instead ...","[So now I do the legwork for articles, instead...","[-1.0, 0.89, -1.0, -0.9560000000000001, 0.788,...",0.735,"[0.6119707227, 0.5237882137000001, 0.617572486...",4.425
1,7235858,pessimizer,Even more clarification: never talk to police....,[Even more clarification: never talk to police...,"[0.993, 0.925, 0.676]",2.594,"[-0.49952897430000004, 0.5629577637000001, 0.6...",0.687
10,6375636,pessimizer,SSRIs are also like placebos in that they hav...,[SSRIs are also like placebos in that they ha...,[0.581],0.581,[0.5725314021],0.573
100,322984,ivankirigin,He's saying it has been done.,[He's saying it has been done.],[0.997],0.997,[0.6127984524],0.613
1000,2258802,__david__,It looks like you would have to implement USB...,[It looks like you would have to implement USB...,"[-1.0, -0.973]",-1.973,"[0.7784828544, 0.5308563709]",1.309


In [14]:
%% time # about 30 minutes 

df['sex'] = df['sentences'].apply(get_scores)

In [15]:
# Collapse each row's list of per-sentence values into one total via get_sum
# (sum rounded to 3 decimals).
df['sex_sum'] = df.sex.apply(get_sum)

In [9]:
# Per-id mean of the two numeric summary columns, showing the 10 ids with the
# lowest `scores_sum`. The columns are selected explicitly because
# groupby().mean() raises on the text/list columns in recent pandas versions
# instead of silently dropping them; the resulting table is the same.
(
    df.groupby('id')[['scores_sum', 'sex_sum']]
    .mean()
    .sort_values(by='scores_sum', ascending=True)
    .head(10)
)

Unnamed: 0_level_0,scores_sum,sex_sum
id,Unnamed: 1_level_1,Unnamed: 2_level_1
3022411,-10.046,19.309
330212,-9.069,11.014
49709,-8.787,16.762
7579487,-7.98,7.26
4376220,-7.968,18.154
1781266,-7.813,5.637
2060677,-7.618,4.547
5305464,-7.577,10.648
1166606,-7.511,-0.191
6157105,-7.005,6.732


In [13]:
# Label-based lookup: show the full text of the row whose index label is 1058.
df.loc[1058].text

'Some entertaining bits from the article: "How many Mac applications still support OS X 10.2? 10.3? 10.4?" "Then why should we be so conservative when it comes to C++, the very core of what we work with?" "C++11 is a much more productive language than C++98. We as an industry can, however, only reap that productivity gain if we stop throwing C++ productivity out of the window by the bucket-load in the hopeless pursuit of compatibility with non-standard implementations." I\'m as much for iconoclasm as the next person, but this whole C++ madness is starting to make me worry. While the rest of us have been off in the trenches of maintenance and production, some cabal of academics and chuckleheads has been cooking up this monstrosity of a language, and now it is finally coming to a head. Folks, conservatism in software design is appreciated--in language design, essential. Python 3 is still not support by many useful libraries. Perl 5, over ten years later, is still in production and use. E