In [1]:
import os
import io
import json

import tensorflow as tf
import numpy as np
import pandas as pd

from sklearn import model_selection
from sklearn.preprocessing import LabelEncoder 
from sklearn.model_selection import train_test_split

# Hyperparameters

In [2]:
output_dir = 'model_output/fandom'

embedding_dim = 64
max_features=10000
maxlen=300 

In [3]:
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Load Data from Disk

In [4]:
#Load the parquet data frame... nothing that special here

filepath = os.path.abspath(os.path.join(os.getcwd(), 
        "..", 
        "data/fandom_categorizer/downloads/scraped/master_8-26_expanddata_3468.parquet"))

raw_df = pd.read_parquet(filepath)
    
#append to dataframe
raw_df.head()

Unnamed: 0,url,text,tags
0,elfen-lied.fandom.com,"[elfen, lied, wikiwelcome, currently, maintain...","{'3rdpersonshooter': 0, 'abc': 0, 'action': 1,..."
1,elfen-lied.fandom.com,"[including, kurama, secretary, read, featured,...","{'3rdpersonshooter': 0, 'abc': 0, 'action': 1,..."
2,warhammer40kfanon.fandom.com,"[warhammer, wikidisclaimer, adeptus, astartes,...","{'3rdpersonshooter': 0, 'abc': 0, 'action': 0,..."
3,ayakashi-ghost-guild.fandom.com,"[ayakashi, ghost, guild, onmyouroku, wikilates...","{'3rdpersonshooter': 0, 'abc': 0, 'action': 0,..."
4,utaite.fandom.com,"[utaite, wikiplease, read, start, editing, wel...","{'3rdpersonshooter': 0, 'abc': 0, 'action': 0,..."


# Setup Labels (IAB Categories, Y)

In [5]:
y = [list(v.values()) for k,v in raw_df.tags.iteritems()]
print(y[0]) # our total list of labels
print(len(y)) # the total number of labeled sites
num_categories = len(y[0]) # the number of labels/IAB categories

[0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
3965


# Setup "Word Bags" (X)

In [6]:
tok = tf.keras.preprocessing.text.Tokenizer(num_words=max_features) 

In [7]:
tok.fit_on_texts(list(raw_df.text)) 

In [8]:
print(len(tok.word_index))
with io.open(output_dir+'/tokenizer.json','w',encoding='utf-8') as f:
    f.write(json.dumps(tok.to_json(),ensure_ascii=False))
vocab_size = len(tok.word_index) + 1 
#this represents the number of words that we tokenize different from max_features but necessary for
#the definition of the dimension of the embedding space

78446


In [9]:
train_df = tok.texts_to_sequences(list(raw_df.text)) #this is how we create sequences
train_df = tf.keras.preprocessing.sequence.pad_sequences(train_df, maxlen=maxlen) #let's execute pad step
train_df[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0, 4524, 4013,  267,   60, 1510,   12,
        199,  103,  212,   11,    1,   91,  155,  236,  139, 1278,   76,
        139,  247, 5336,   43, 4524, 4013,    9, 4624,  338,  146,   58,
         40,  115,   11, 3070,   15,  222,  392,  131,  602, 3848,    6,
        262,    6, 3441,  199,  108,   11,   11,  336, 2299, 3940,   14,
       4524, 4013,    1,    6,   13,    2,    3, 4524, 4013,  703, 7906,
       7036, 9059, 7037, 4525,  810,  536,  536,   65,  944, 4526,  196,
       8271,  150,  933,  114,  980,  129,   93, 1068,  322, 2037, 3941,
       1873, 3547,    5,  155,  302,  189, 1269,   74, 1343,   74,  223,
         74,   45,   74,  139,   74,   82,    3,   74,   17,    3,   74,
       1060,  236,   76,  715,  443,  336,  140,    1,  299,   11,    6,
       2367,    6,  733, 1812,  262,    6,   19,   

In [10]:
X_train, X_test, y_train, y_test = train_test_split(train_df, y, test_size=0.1, random_state=42)

# Model and Train

In [12]:
model = tf.keras.models.Sequential([
  tf.keras.layers.Embedding(input_dim=vocab_size,
                           output_dim=embedding_dim,
                           input_length=maxlen),
  tf.keras.layers.Flatten(),
  tf.keras.layers.Dense(2000,activation='relu'),
  tf.keras.layers.Dense(500,activation='relu'),
  tf.keras.layers.Dense(100,activation='relu'),
  tf.keras.layers.Dense(num_categories, activation=tf.nn.sigmoid)
])

model.compile(optimizer='nadam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

modelcheckpoint = tf.keras.callbacks.ModelCheckpoint(filepath=output_dir+
                                  "/weights.{epoch:02d}.hdf5")

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 300, 64)           5020608   
_________________________________________________________________
flatten (Flatten)            (None, 19200)             0         
_________________________________________________________________
dense (Dense)                (None, 2000)              38402000  
_________________________________________________________________
dense_1 (Dense)              (None, 500)               1000500   
_________________________________________________________________
dense_2 (Dense)              (None, 100)               50100     
_________________________________________________________________
dense_3 (Dense)              (None, 61)                6161      
Total params: 44,479,369
Trainable params: 44,479,369
Non-trainable params: 0
____________________________________________

In [None]:
model.fit(np.array(X_train), np.array(y_train),
          #batch_size=128,
          validation_data=(np.array(X_test),np.array(y_test)),
          epochs=20,
          callbacks=[modelcheckpoint])

In [None]:
score = model.evaluate(np.array(X_test), np.array(y_test)) 

print("Test Score:", score[0])
print("Test Accuracy:", score[1])

# Accuracy by Label

In [13]:
model.load_weights(output_dir+"/weights.10.hdf5") #NOT ZERO INDEXED

In [24]:
filepath = os.path.abspath(os.path.join(os.getcwd(), 
            "..",
            "data/fandom_categorizer/fandom_tags.json"))
with open(filepath) as f:
    label_dict = json.load(f)

results = {}
    
for k,v in label_dict.items():
    results[k]={
        'Correct':0,
        'Wrong-OverTagged':0,
        'Wrong-UnderTagged':0,
        'Error':0,
        'Count':0,
        }

predictions = model.predict(X_train)

for i in range(len(y_train)):
    j=0
    for k,v in label_dict.items():
        p = 1 if predictions[i][j] > 0.90 else 0
        a = y[i][j]
        if p >= .90 and a == 1 or p == 0 and a == 0:
            results[k]['Correct'] += 1
        elif p >= .90 and a == 0:
            results[k]['Wrong-OverTagged'] += 1
        elif p < .90 and a == 1:
            results[k]['Wrong-UnderTagged'] += 1
        else:
            results[k]['Error'] += 1
        results[k]['Count'] += 1    
        j+=1

pred_df = pd.DataFrame(data=results)
print(pred_df)
pred_df.to_csv(output_dir+"/label-accuracy.csv")

                   3rdpersonshooter   abc  action  adventure    ae  amazon  \
Correct                        2640  3392    2132       1867  3203    3448   
Wrong-OverTagged                459    77    1430        976   186      45   
Wrong-UnderTagged               469    99       6        725   179      75   
Error                             0     0       0          0     0       0   
Count                          3568  3568    3568       3568  3568    3568   

                    amc  anime  bbcamerica  books  ...  showtime   sim  \
Correct            3459   1806        3477   2098  ...      3492  2914   
Wrong-OverTagged     48    877          38    667  ...        35   291   
Wrong-UnderTagged    61    885          53    803  ...        41   363   
Error                 0      0           0      0  ...         0     0   
Count              3568   3568        3568   3568  ...      3568  3568   

                   sports  strategy  syfy  thriller    tv   war   web  western  
Corre

# Load/Save Model

In [None]:
model.save(output_dir+'/model_v003')