In [1]:
import os
import io
import json

import tensorflow as tf
import numpy as np
import pandas as pd

from sklearn import model_selection
from sklearn.preprocessing import LabelEncoder 
from sklearn.model_selection import train_test_split

In [3]:
strategy = tf.distribute.MirroredStrategy()
num_replicas = strategy.num_replicas_in_sync
print('Number of devices: {}'.format(num_replicas))

INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0',)
Number of devices: 1


# Hyperparameters

In [4]:
output_dir = f"{os.getcwd()}/model_output/caption_to_category"

num_epochs=5
batch_size_per_replica=512
batch_size = batch_size_per_replica * num_replicas

embedding_dim = 512 
max_features=5000 #we set maximum number of words to 5000
maxlen=300 #we set maximum sequence length to 300

# Load Data from Disk

In [5]:
#Load the parquet data frame... nothing that special here just filename hackery
file_suffix = []
for i in range(14):
    file_suffix.append(str((i+1)*2000))
   
output_array = []
for f in file_suffix:
    filepath = os.path.abspath(os.path.join(os.getcwd(), 
            "..", 
            "data/url_to_category/downloads/preprocessed/master_6-11_"+f+".parquet"))

    file_df = pd.read_parquet(filepath)
    output_array.append(file_df)

raw_df = pd.concat(output_array)
    
#append to dataframe
raw_df.head()

Unnamed: 0_level_0,url,urlkey,text,iab
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,youtube.com,"com,youtube)/","[youtube, skip, navigation, sign, search, home...","{'IAB1': 0.9334222620816729, 'IAB1-1': 0, 'IAB..."
1,youtube.com,"com,youtube)/","[new, york, city, digs, mass, graves, potter, ...","{'IAB1': 0.9334222620816729, 'IAB1-1': 0, 'IAB..."
2,youtube.com,"com,youtube)/","[ago, vs, pork, chop, dinner, pro, chef, home,...","{'IAB1': 0.9334222620816729, 'IAB1-1': 0, 'IAB..."
3,youtube.com,"com,youtube)/1027kiisfm","[hit, music, station, listen, live, http, skip...","{'IAB1': 0.9334222620816729, 'IAB1-1': 0, 'IAB..."
4,youtube.com,"com,youtube)/1027kiisfm","[iheartawards, nomination, tattoos, duration, ...","{'IAB1': 0.9334222620816729, 'IAB1-1': 0, 'IAB..."


# Setup Labels (IAB Categories, Y)

In [6]:
y = [list(v.values()) for k,v in raw_df.iab.iteritems()]
print(y[0]) # our total list of labels
print(len(y)) # the total number of labeled sites
num_categories = len(y[0]) # the number of labels/IAB categories

[0.9334222620816729, 0, 0, 0, 0, 0, 0, 0.9314065434229816, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

# Setup "Word Bags" (X)

In [7]:
tok = tf.keras.preprocessing.text.Tokenizer(num_words=max_features) 

In [8]:
tok.fit_on_texts(list(raw_df.text)) 

In [9]:
print(len(tok.word_index))
with io.open(output_dir+'/tokenizer.json','w',encoding='utf-8') as f:
    f.write(json.dumps(tok.to_json(),ensure_ascii=False))
vocab_size = len(tok.word_index) + 1 
#this represents the number of words that we tokenize different from max_features but necessary for
#the definition of the dimension of the embedding space

314667


In [10]:
train_df = tok.texts_to_sequences(list(raw_df.text)) #this is how we create sequences
train_df = tf.keras.preprocessing.sequence.pad_sequences(train_df, maxlen=maxlen) #let's execute pad step
train_df[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,  608,  364,
        390,   25,   17,    6,  834,    6,  834,  246,    3,  608,  335,
          3,  608,  178,   11,  608,  210,  151,  631,  562,  178,  564,
          9,   72,  442,  222, 3066,   34,  474, 1040,   25,   35,  100,
        835,  326,   25, 1033, 2438,  328,  845,   11,   99,   56,  435,
        328,  156,  416,  133,  416,  155,  155, 1142,  195,   34,  640,
       1033, 1193,  155, 2989, 1961, 1385, 1024, 1224, 2153, 3766, 1134,
        343, 1344,   57,   33, 2153,   35,   33, 2114,  343,   77,   57,
         33, 2982, 2965, 1886, 2153,   35,   33, 21

In [35]:
X_train, X_test, y_train, y_test = train_test_split(train_df, y, test_size=0.2, random_state=42)
train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train)).batch(batch_size)
test_dataset = tf.data.Dataset.from_tensor_slices((X_test, y_test)).batch(batch_size)

# Model and Train

In [36]:
with strategy.scope():
    model = tf.keras.models.Sequential([
      tf.keras.layers.Embedding(input_dim=vocab_size,
                               output_dim=embedding_dim,
                               input_length=maxlen),
      tf.keras.layers.Flatten(),
      tf.keras.layers.Dense(num_categories, activation=tf.nn.softmax)

    ])
    
    model.compile(optimizer='nadam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

In [37]:
modelcheckpoint = tf.keras.callbacks.ModelCheckpoint(filepath=output_dir+
                                  "/weights.{epoch:02d}.hdf5")

In [38]:
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

In [39]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 300, 512)          161110016 
_________________________________________________________________
flatten_1 (Flatten)          (None, 153600)            0         
_________________________________________________________________
dense_1 (Dense)              (None, 403)               61901203  
Total params: 223,011,219
Trainable params: 223,011,219
Non-trainable params: 0
_________________________________________________________________


In [41]:
model.fit(train_dataset,
          validation_data=(test_dataset),
          epochs=20,
          callbacks=[modelcheckpoint])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7f6e806ee8d0>

In [42]:
model.load_weights(output_dir+"/weights.12.hdf5") # NOT zero-indexed

In [43]:
score = model.evaluate(test_dataset) 

print("Test Score:", score[0])
print("Test Accuracy:", score[1])

Test Score: 0.005531344562768936
Test Accuracy: 0.7150166034698486


# Test with Handmade Text

In [44]:
new_text = ["""
American Physiology Reviews, January 2020
Easily Growing Additional Fingers
The Essential Scientific Guide to Contemporary Finger Generation and Re-generation
"""]

In [45]:
test_text = tok.texts_to_sequences(new_text) #this is how we create sequences
print(test_text)
test_text = tf.keras.preprocessing.sequence.pad_sequences(test_text, maxlen=maxlen) #let's execute pad step

[[366, 129, 764, 758, 1181, 646, 1044, 2492, 135, 3836, 1652, 1652]]


In [46]:
filepath = os.path.abspath(os.path.join(os.getcwd(), 
            "..", 
            "data/url_to_category/downloads/IAB/iab.json"))
with open(filepath) as f:
    full_iab_dict = json.load(f)

In [47]:
np.set_printoptions(suppress=True)
predictions = model.predict(test_text)

results = []

i = 0
for k,v in full_iab_dict.items():
    results.append({
        'IAB':k,
        'Description':v,
        'Prediction':predictions[0][i]
        })
    i+=1
    
pred_df = pd.DataFrame(data=results)

print(pred_df.sort_values(by='Prediction', ascending=False))

          IAB             Description  Prediction
203     IAB13        Personal Finance    0.239017
296     IAB19  Technology & Computing    0.060205
245     IAB17                  Sports    0.042721
83       IAB7        Health & Fitness    0.041792
149      IAB9     Hobbies & Interests    0.041486
..        ...                     ...         ...
28    IAB2-20                   Sedan    0.000026
320  IAB19-24       Net for Beginners    0.000026
383   IAB23-9       Latter-Day Saints    0.000026
255  IAB17-10          Figure Skating    0.000026
39     IAB3-7              Government    0.000026

[403 rows x 3 columns]


# Load/Save Model

In [49]:
model.save(output_dir+'/model_v002')

INFO:tensorflow:Assets written to: /clarus-ai/train/model_output/caption_to_category/model_v002/assets
