In [720]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns 
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from sklearn.preprocessing import MultiLabelBinarizer
from ast import literal_eval

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))



In [721]:
data_anime = pd.read_csv('./input/anime.csv')

data_anime = data_anime.filter(['genres', 'synopsis'])

data_anime = shuffle(data_anime, random_state = 23)

data_anime.head()

Unnamed: 0,genres,synopsis
401,"['Comedy', 'Ecchi', 'Harem', 'Otaku Culture', ...","Tomoya Aki, an otaku, has been obsessed with c..."
18048,['Kids'],"An educational anime part of the ""Save our mot..."
5632,"['Adventure', 'Fantasy']","The story takes place in Lingshan, where a gro..."
20971,['Military'],Monkeys battle polar bears in air combat. Sho...
10303,"['Action', 'Adventure', 'Sci-Fi']",A classic sci-fi TV anime. The adventures of s...


In [722]:
genres = [literal_eval(genre) for genre in data_anime['genres'].values]

genre_encoder = MultiLabelBinarizer()
genres_encoded = genre_encoder.fit_transform(genres)
num_genres = len(genres_encoded[0])
print(genre_encoder.classes_)
print(genres_encoded[0])




['Action' 'Adult Cast' 'Adventure' 'Anthropomorphic' 'Avant Garde'
 'Award Winning' 'Boys Love' 'CGDCT' 'Childcare' 'Combat Sports' 'Comedy'
 'Crossdressing' 'Delinquents' 'Detective' 'Drama' 'Ecchi' 'Educational'
 'Erotica' 'Fantasy' 'Gag Humor' 'Girls Love' 'Gore' 'Gourmet' 'Harem'
 'Hentai' 'High Stakes Game' 'Historical' 'Horror' 'Idols (Female)'
 'Idols (Male)' 'Isekai' 'Iyashikei' 'Josei' 'Kids' 'Love Polygon'
 'Magical Sex Shift' 'Mahou Shoujo' 'Martial Arts' 'Mecha' 'Medical'
 'Military' 'Music' 'Mystery' 'Mythology' 'Organized Crime'
 'Otaku Culture' 'Parody' 'Performing Arts' 'Pets' 'Psychological'
 'Racing' 'Reincarnation' 'Reverse Harem' 'Romance' 'Romantic Subtext'
 'Samurai' 'School' 'Sci-Fi' 'Seinen' 'Shoujo' 'Shounen' 'Showbiz'
 'Slice of Life' 'Space' 'Sports' 'Strategy Game' 'Super Power'
 'Supernatural' 'Survival' 'Suspense' 'Team Sports' 'Time Travel'
 'Vampire' 'Video Game' 'Visual Arts' 'Workplace']
[0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 

In [723]:
# Split our data into train and test sets
train_size = int(len(data_anime) * .8)
print ("Train size: %d" % train_size)
print ("Test size: %d" % (len(data_anime) - train_size))

Train size: 19209
Test size: 4803


In [724]:
# Split our labels into train and test sets
train_genres = genres_encoded[:train_size]
test_genres = genres_encoded[train_size:]

print(tf.version)

<module 'tensorflow._api.v2.version' from '/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/tensorflow/_api/v2/version/__init__.py'>


In [725]:
%%writefile preprocess.py

from tensorflow.keras.preprocessing import text

class TextPreprocessor(object):
  def __init__(self, vocab_size):
    self._vocab_size = vocab_size
    self._tokenizer = None
  
  def create_tokenizer(self, text_list):
    tokenizer = text.Tokenizer(num_words=self._vocab_size)
    tokenizer.fit_on_texts(text_list)
    self._tokenizer = tokenizer

  def transform_text(self, text_list):
    text_matrix = self._tokenizer.texts_to_matrix(text_list)
    return text_matrix

Overwriting preprocess.py


In [726]:
def remove_phrase(strings, phrase):
    modified_strings = []
    for string in strings:
        modified_string = string.replace(phrase, "")
        modified_strings.append(modified_string)
    return modified_strings

In [727]:
# Create vocab from training corpus
from preprocess import TextPreprocessor

VOCAB_SIZE=900 # This is a hyperparameter, try out different values for your dataset

train_qs = data_anime['synopsis'].values[:train_size]

test_qs = data_anime['synopsis'].values[train_size:]

train_qs = [str(element) for element in train_qs]
test_qs = [str(element) for element in test_qs]

train_qs = remove_phrase(train_qs, "[Written by MAL Rewrite]")
test_qs = remove_phrase(test_qs, "[Written by MAL Rewrite]")

processor = TextPreprocessor(VOCAB_SIZE)
processor.create_tokenizer(train_qs)

body_train = processor.transform_text(train_qs)
body_test = processor.transform_text(test_qs)

In [728]:
# Preview the first input from our training data
print(len(body_train[0]))
print(body_train[0])

900
[0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 0. 1. 1. 1. 1. 0. 1. 1. 1. 0. 0. 0.
 0. 0. 1. 1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 1. 0. 0.
 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 1. 0. 1. 0. 0. 0. 0. 1. 0. 0. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0.
 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 1. 1. 0. 1. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.
 0. 1. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 1. 0

Building and Training the Model

In [729]:
# Save the processor state of the tokenizer
import pickle

with open('./processor_state.pkl', 'wb') as f:
  pickle.dump(processor, f)

In [730]:
def create_model(vocab_size, num_genres):
  
  model = tf.keras.models.Sequential()
  model.add(tf.keras.layers.Dense(units=50, input_shape=(VOCAB_SIZE,), activation='relu'))
  model.add(tf.keras.layers.Dense(units=25, activation='relu'))
  model.add(tf.keras.layers.Dense(units=num_genres, activation='sigmoid'))

  model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
  return model

In [735]:
model = create_model(VOCAB_SIZE, num_genres)
model.summary()
print(len(body_train))
# Train and evaluate the model
model.fit(body_train, train_genres, epochs=10, batch_size=32, validation_split=0.1)
print('Eval loss/accuracy:{}'.format(
  model.evaluate(body_test, test_genres, batch_size=32)))

# Export the model to a file
model.save('keras_saved_model.h5')

Model: "sequential_63"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_187 (Dense)           (None, 50)                45050     
                                                                 
 dense_188 (Dense)           (None, 25)                1275      
                                                                 
 dense_189 (Dense)           (None, 76)                1976      
                                                                 
Total params: 48,301
Trainable params: 48,301
Non-trainable params: 0
_________________________________________________________________
19209
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Eval loss/accuracy:[0.09703369438648224, 0.3449927270412445]


In [732]:
%%writefile model_prediction.py
import pickle
import os
import numpy as np

class CustomModelPrediction(object):

  def __init__(self, model, processor):
    self._model = model
    self._processor = processor
  
  def predict(self, instances, **kwargs):
    preprocessed_data = self._processor.transform_text(instances)
    predictions = self._model.predict(preprocessed_data)
    return predictions.tolist()

  @classmethod
  def from_path(cls, model_dir):
    import tensorflow.keras as keras
    model = keras.models.load_model(
      os.path.join(model_dir,'keras_saved_model.h5'))
    with open(os.path.join(model_dir, 'processor_state.pkl'), 'rb') as f:
      processor = pickle.load(f)

    return cls(model, processor)

Overwriting model_prediction.py


In [737]:
test_requests = ["Despite being kind and considerate, Keitarou Itsuki has a menacing look in his eyes that scares away others. When Keitarou attempts to confess to Aoi Tokujira, she flatly rejects him, leading him to believe that his intimidating eyes are to blame. Confiding his worries to his older sister, she gives him a hand by offering a complete makeover. But much to Keitarou's surprise, she turns him into a girl! Forced to head home in his new appearance, he unexpectedly comes across Aoi being harassed by a group of boys from their school, and rushes to her rescue. Surprisingly, she doesn't recognize him, and reveals a secret that she has held for a long time: she has an intense fear of men. Believing him to be a tomboyish girl, she asks Keitarou in his female guise to assist her in overcoming her fear.",
                 "Kusunoki used to believe he was destined for great things. Ostracized as a child, he held on to a belief that a good life was waiting for him in the years ahead. Now approaching the age of twenty, he's a completely mediocre college student with no motivation, no dreams, and no money. After learning he can sell his remaining years—and just how little they're worth—he chooses to divest himself of all but his last three months. Has Kusunoki truly destroyed his last chance to find happiness...or has he somehow found it?",
                 "Helpless and struggling for cash, 20-year-old Kusunoki sells the last of his possessions to buy food. Noticing his poverty, an old shop owner directs him to a store that supposedly purchases lifespan, time, and health. While not completely believing the man's words, Kusunoki nevertheless finds himself at the address out of desperation and curiosity. Kusunoki is crushed when he finds out the true monetary value of his lifespan—totaling a meager three hundred thousand yen. Deciding to sell the next 30 years of his life for ten thousand yen per year, Kusunoki is left with only three months to live. After heading home with the money, he is greeted by an unexpected visitor: the same store clerk he sold his lifespan to. She introduces herself as Miyagi, the one tasked with the job of observing him until the last three days of his life. Jumyou wo Kaitotte Moratta. Ichinen ni Tsuki, Ichimanen de. follows the remaining three months of Kusunoki's life as he confronts lingering regrets from the past and discovers what truly gives life value.",
                 "Shion and Rui are the dream team when it comes to hitting on women. Tonight was going to be another night of hooking up with girls for Shion, but he ended up taking a strange drug. When he woke up... he'd turned into a girl?! Rui came looking for Shion, but didn't recognize him, and started hitting on him..."]

In [738]:
from model_prediction import CustomModelPrediction

classifier = CustomModelPrediction.from_path('.')
results = classifier.predict(test_requests)
print(results)

for i in range(len(results)):
  print('Predicted genres:')
  for idx,val in enumerate(results[i]):
    if val > 0.7:
      print(genre_encoder.classes_[idx])
  print('\n')

[[0.028682438656687737, 0.004946242086589336, 0.004648186266422272, 0.001957339234650135, 0.0009801577543839812, 0.00024222316278610379, 0.03139537572860718, 0.0015035879332572222, 0.005666667129844427, 8.708362292964011e-05, 0.18622522056102753, 0.008967697620391846, 0.004541065078228712, 0.0030598859302699566, 0.1774674952030182, 0.05387292429804802, 0.0003513977280817926, 0.025929395109415054, 0.029037175700068474, 0.017344282940030098, 0.0445873998105526, 0.001290512620471418, 0.008692910894751549, 0.052342288196086884, 0.8583765625953674, 0.0022367341443896294, 0.004768757149577141, 0.02067689411342144, 0.0005320966592989862, 8.626159251434729e-05, 0.0007426352822221816, 0.0017808370757848024, 0.027229048311710358, 0.010400562547147274, 0.010620174929499626, 0.0005447675357572734, 0.005568406078964472, 0.0005362792289815843, 0.0007668640464544296, 0.00024601133191026747, 0.003402964910492301, 0.008836920373141766, 0.030724642798304558, 0.008686971850693226, 0.003107145195826888, 0