In [1]:
import os
import io
import json

import tensorflow as tf
import numpy as np
import pandas as pd

from sklearn import model_selection
from sklearn.preprocessing import LabelEncoder 
from sklearn.model_selection import train_test_split

# Hyperparameters

In [None]:
# Directory that receives the checkpoints, tokenizer export, per-label report
# and the saved model.
output_dir = 'model_output/fandom'

# Model / preprocessing settings.
embedding_dim = 64      # width of each learned word vector (Embedding output_dim)
max_features = 10000    # passed to the tokenizer as num_words
maxlen = 300            # length every token sequence is padded to

In [None]:
# Create the output directory (and any missing parents). exist_ok=True makes
# the cell safe to re-run and avoids the check-then-create race of a separate
# os.path.exists() test.
os.makedirs(output_dir, exist_ok=True)

# Load Data from Disk

In [None]:
# Read the parquet snapshot that lives one directory above the current
# working directory, under data/fandom_categorizer/downloads/scraped.
data_root = os.path.join(os.getcwd(), "..")
filepath = os.path.abspath(os.path.join(
    data_root,
    "data/fandom_categorizer/downloads/scraped/master_8-26_expanddata_3468.parquet",
))

raw_df = pd.read_parquet(filepath)

# Preview the first rows of the loaded frame.
raw_df.head()

# Setup Labels (IAB Categories, Y)

In [None]:
# Build the label matrix: one vector per row, taken from the values of that
# row's `tags` mapping (presumably 0/1 flags, one per IAB category -- TODO confirm).
# Series.iteritems() was removed in pandas 2.0; iterating the Series directly
# yields the same values on every pandas version.
y = [list(tag_map.values()) for tag_map in raw_df.tags]
print(y[0])  # label vector of the first site
print(len(y))  # the total number of labeled sites
num_categories = len(y[0])  # the number of labels/IAB categories

# Set Up Padded Token Sequences (X)

In [None]:
# Keras text tokenizer; num_words limits encoding to the most frequent words.
tok = tf.keras.preprocessing.text.Tokenizer(
    num_words=max_features,
)

In [None]:
# Learn the vocabulary from the `text` column of every row.
corpus = list(raw_df["text"])
tok.fit_on_texts(corpus)

In [None]:
# Number of distinct words the tokenizer recorded while fitting.
word_count = len(tok.word_index)
print(word_count)

# Persist the fitted tokenizer next to the other model artefacts.
# tok.to_json() is passed through json.dumps, so the file holds a JSON-encoded string.
tokenizer_path = output_dir + '/tokenizer.json'
with io.open(tokenizer_path, 'w', encoding='utf-8') as tokenizer_file:
    tokenizer_file.write(json.dumps(tok.to_json(), ensure_ascii=False))

# Size of the embedding's input dimension. This is derived from the full word
# index (plus one), not from max_features, so the two numbers can differ.
vocab_size = word_count + 1

In [None]:
# Encode each document as a list of word indices, then pad every list to
# `maxlen` entries so the result is a rectangular array.
# NOTE: despite its name, train_df is the padded array, not a DataFrame.
token_sequences = tok.texts_to_sequences(list(raw_df.text))
train_df = tf.keras.preprocessing.sequence.pad_sequences(
    token_sequences,
    maxlen=maxlen,
)
train_df[0]

In [None]:
# Hold out 10% of the samples; the fixed random_state keeps the split
# identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    train_df,
    y,
    test_size=0.1,
    random_state=42,
)

# Model and Train

In [None]:
# Text classifier: learned word embeddings are flattened and passed through
# three fully connected ReLU layers, ending in one sigmoid unit per category.
model = tf.keras.models.Sequential([
  tf.keras.layers.Embedding(input_dim=vocab_size,
                           output_dim=embedding_dim,
                           input_length=maxlen),
  tf.keras.layers.Flatten(),
  tf.keras.layers.Dense(2000,activation='relu'),
  tf.keras.layers.Dense(500,activation='relu'),
  tf.keras.layers.Dense(100,activation='relu'),
  tf.keras.layers.Dense(num_categories, activation=tf.nn.sigmoid)
])

# Every sigmoid output is an independent per-label probability (the per-label
# analysis later thresholds each one separately), so the matching loss is
# binary cross-entropy. categorical_crossentropy assumes a single distribution
# over classes and does not fit sigmoid outputs.
# NOTE(review): this assumes a site can carry several tags at once -- confirm.
# With this loss the 'accuracy' metric is computed per label rather than by argmax.
model.compile(optimizer='nadam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Checkpoint callback; the epoch number is formatted into the file name.
modelcheckpoint = tf.keras.callbacks.ModelCheckpoint(filepath=output_dir+
                                  "/weights.{epoch:02d}.hdf5")

model.summary()

In [None]:
# Convert the splits to arrays once, then train for 20 epochs, validating on
# the held-out split and checkpointing through `modelcheckpoint`.
train_inputs, train_labels = np.array(X_train), np.array(y_train)
held_out = (np.array(X_test), np.array(y_test))

# batch_size is not passed (a 128 override was left commented out).
model.fit(
    train_inputs,
    train_labels,
    validation_data=held_out,
    epochs=20,
    callbacks=[modelcheckpoint],
)

In [None]:
# Evaluate on the held-out split; `score` holds the loss followed by the
# compiled metric.
test_inputs = np.array(X_test)
test_labels = np.array(y_test)
score = model.evaluate(test_inputs, test_labels)

for caption, value in zip(("Test Score:", "Test Accuracy:"), score):
    print(caption, value)

# Accuracy by Label

In [None]:
# Restore the weights saved for epoch 10 (checkpoint file names are not zero-indexed).
checkpoint_path = output_dir + "/weights.10.hdf5"
model.load_weights(checkpoint_path)

In [None]:
# Per-label tally of how the thresholded predictions on the training split
# compare with the true labels; the table is printed and written to CSV.
filepath = os.path.abspath(os.path.join(os.getcwd(),
            "..",
            "data/fandom_categorizer/fandom_tags.json"))
with open(filepath, encoding="utf-8") as f:
    label_dict = json.load(f)

# A label counts as predicted only when its probability exceeds this value.
tag_threshold = 0.90

# One counter record per label name.
results = {
    label: {
        'Correct': 0,
        'Wrong-OverTagged': 0,
        'Wrong-UnderTagged': 0,
        'Error': 0,
        'Count': 0,
    }
    for label in label_dict
}

predictions = model.predict(X_train)

# The predictions come from X_train, so they must be compared with y_train.
# Indexing the unsplit `y` would pair each prediction with a different sample,
# because train_test_split shuffles the rows.
# NOTE(review): assumes the key order of label_dict matches the column order
# of the label vectors -- confirm against how `tags` was built.
for predicted_row, actual_row in zip(predictions, y_train):
    for j, label in enumerate(label_dict):
        predicted = 1 if predicted_row[j] > tag_threshold else 0
        actual = actual_row[j]
        if predicted == actual:
            outcome = 'Correct'
        elif predicted == 1 and actual == 0:
            outcome = 'Wrong-OverTagged'
        elif predicted == 0 and actual == 1:
            outcome = 'Wrong-UnderTagged'
        else:
            # The true label was neither 0 nor 1.
            outcome = 'Error'
        results[label][outcome] += 1
        results[label]['Count'] += 1

pred_df = pd.DataFrame(data=results)
print(pred_df)
pred_df.to_csv(output_dir+"/label-accuracy.csv")

# Save Model

In [None]:
# Write the model to the output directory.
model_export_path = output_dir + '/model_v003'
model.save(model_export_path)