In [1]:
from animeGPT.tokenizer import Tokenizer
import json
import os
import numpy as np
import re

In [2]:
# Resolve the data file relative to the directory the notebook is run from.
dir_path = os.path.abspath(os.getcwd())
details_path = os.path.join(dir_path, 'data/details.json')

In [3]:
# Load the anime records and split them into three parallel lists:
# synopses[i], titles[i] and genres[i] all describe the same anime.
synopses = []
titles = []
genres = []

# JSON is UTF-8 by specification; pass the encoding explicitly so decoding
# does not depend on the platform's locale default.
with open(details_path, 'r', encoding='utf-8') as file:
    anime_details = json.load(file)

# The file is already closed here; only the parsed data is needed below.
for anime in anime_details['anime']:
    synopses.append(anime['synopsis'])
    titles.append(anime['title'])
    try:
        genres.append([genre['name'] for genre in anime['genres']])
    except KeyError:
        # No 'genres' key (or a genre entry without 'name'): store an empty
        # list so the three lists stay index-aligned.
        genres.append([])


In [4]:
def preprocess_synopsis(synopsis):
    """Normalize whitespace in a synopsis.

    Every run of whitespace (spaces, tabs, line breaks) is collapsed to a
    single space, then leading and trailing whitespace is removed.

    Args:
        synopsis: The raw synopsis string.

    Returns:
        The synopsis as a single line with single-space separation.
    """
    collapsed = re.sub(r'\s+', ' ', synopsis)
    return collapsed.strip()

def prepare_text_file(samples, filename):
    """Write the samples to a UTF-8 text file, one sample per line.

    The samples are joined with '\\n'; no newline is written after the last
    sample. An existing file at `filename` is overwritten.

    Args:
        samples: Iterable of strings to write.
        filename: Path of the output file.
    """
    with open(filename, 'w', encoding='utf-8') as out:
        out.write('\n'.join(samples))
        

def contains_non_english(text):
    """Report whether the text contains any non-ASCII character.

    "Non-English" is approximated as "outside the 7-bit ASCII range
    (0x00-0x7F)", so accented Latin letters and typographic punctuation
    count as non-English too.

    Args:
        text: The string to inspect.

    Returns:
        True if at least one character falls outside ASCII, else False.
    """
    return bool(re.search(r'[^\x00-\x7F]', text))

In [6]:
# Normalize whitespace first, then drop every anime whose synopsis contains
# non-ASCII characters.
synopses = list(map(preprocess_synopsis, synopses))

# Walk the positions from the back so the resulting list is already in
# descending order: deleting a high index never shifts a lower one that is
# still waiting to be removed.
non_english_indexes = [
    position
    for position in range(len(synopses) - 1, -1, -1)
    if contains_non_english(synopses[position])
]

# Remove the same position from all three lists to keep them aligned.
for position in non_english_indexes:
    del synopses[position]
    del titles[position]
    del genres[position]

In [7]:
# Flatten each anime's genre list into one comma-separated string.
# NOTE: this rebinds `genres` from list-of-lists to list-of-strings, so
# running the cell a second time would join the individual characters of
# each string instead.
genres = list(map(', '.join, genres))

# Tokenizer training corpus: all titles, then all genre strings, then all
# synopses, glued into a single comma-separated string.
train_text = ', '.join(titles + genres + synopses)

In [8]:
# Fit a fresh tokenizer on the combined title/genre/synopsis corpus.
# NOTE(review): 5000 is presumably the target vocabulary size -- confirm
# against animeGPT.tokenizer.Tokenizer.train.
animeTokenizer = Tokenizer()
animeTokenizer.train(train_text, 5000)

In [11]:
# Field delimiters: <t>..</t> wraps a title, <g>..</g> a genre string and
# <s>..</s> a synopsis. They get consecutive ids starting at 5000, the same
# number that was passed to train() -- presumably so they sit directly after
# the trained vocabulary (TODO confirm).
special_tokens = {
    token: 5000 + offset
    for offset, token in enumerate(['<t>', '</t>', '<g>', '</g>', '<s>', '</s>'])
}
animeTokenizer.register_special_tokens(special_tokens)

# Persist the tokenizer under the 'animeTokenizer' prefix; the following cell
# reloads it from 'animeTokenizer.model'.
animeTokenizer.save('animeTokenizer')

In [12]:
# Reload the tokenizer from the file written by the save() call in the previous cell.
# NOTE(review): presumably a round-trip check that the saved model loads -- confirm.
animeTokenizer.load('animeTokenizer.model')

In [14]:
# Token count of the first synopsis; 'all' is passed so that special tokens
# are handled by encode() (NOTE(review): exact meaning of 'all' is defined by
# the project Tokenizer -- confirm).
first_synopsis_ids = animeTokenizer.encode(synopses[0], 'all')
len(first_synopsis_ids)

295

In [15]:
# Character length of every synopsis, and the 10th percentile of those
# lengths (used by the next cell as the cutoff for "too short").
lengths = list(map(len, synopses))
np.percentile(lengths, 10)

48.0

In [48]:
# Drop every anime whose synopsis is at or below the 10th-percentile length.
#
# The cutoff is computed once from `lengths` (built in the previous cell)
# rather than on every loop iteration.
length_cutoff = np.percentile(lengths, 10)

# Collect the positions first instead of deleting while iterating: deleting
# from a list during enumerate() shifts the remaining items left, so the
# element right after each deletion is skipped and never checked.
short_indexes = [
    idx for idx, synopsis in enumerate(synopses)
    if len(synopsis) <= length_cutoff
]

# Delete from the back so earlier indexes stay valid, and remove the same
# position from all three lists to keep them aligned.
for idx in reversed(short_indexes):
    del synopses[idx]
    del titles[idx]
    del genres[idx]

In [49]:
# Wrap every field in its delimiter pair (<t> title, <g> genres, <s> synopsis)
# using the special tokens registered on the tokenizer.
# NOTE: the lists are rebound in place, so re-running this cell wraps the
# already-wrapped strings a second time.
titles = [f'<t>{name}</t>' for name in titles]
genres = [f'<g>{genre_text}</g>' for genre_text in genres]
synopses = [f'<s>{text}</s>' for text in synopses]

In [50]:
synopses[0]

'<s>Centuries ago, mankind was slaughtered to near extinction by monstrous humanoid creatures called Titans, forcing humans to hide in fear behind enormous concentric walls. What makes these giants truly terrifying is that their taste for human flesh is not born out of hunger but what appears to be out of pleasure. To ensure their survival, the remnants of humanity began living within defensive barriers, resulting in one hundred years without a single titan encounter. However, that fragile calm is soon shattered when a colossal Titan manages to breach the supposedly impregnable outer wall, reigniting the fight for survival against the man-eating abominations. After witnessing a horrific personal loss at the hands of the invading creatures, Eren Yeager dedicates his life to their eradication by enlisting into the Survey Corps, an elite military unit that combats the merciless humanoids outside the protection of the walls. Eren, his adopted sister Mikasa Ackerman, and his childhood frien

In [63]:
# Interleave the three aligned lists into one flat sequence:
# title, genres, synopsis, title, genres, synopsis, ...
# so anime i occupies positions 3*i, 3*i + 1 and 3*i + 2.
input_data = [
    field
    for synopsis, title, genre in zip(synopses, titles, genres)
    for field in (title, genre, synopsis)
]

In [65]:
# Show one complete record: positions 99-101 are the title, genres and
# synopsis of the anime at index 33 (99 == 3 * 33).
for position in (99, 100, 101):
    print(input_data[position])

<t>Ansatsu Kyoushitsu 2nd Season</t>
<g>Action, Comedy, School, Shounen</g>
<s>Returning from their summer vacation, the students of Class 3-E at the prestigious Kunugigaoka Middle School find themselves up against unbeatable odds. Faced with the possibility of world annihilation, the students must come up with increasingly elaborate and creative ways to kill their teacher, the cunning yet optimistic and helpful Koro-sensei. However, eliminating Koro-sensei is not the only objective the students need to worry about. Gakuhou Asano, the academy's merciless and cruel principal, seeks to prevent Class 3-E's success by brainwashing his other hard-working pupils into ruthlessly competitive studying machines. Hostility begins to linger in the air as traitors and killers alike attempt to claim the bounty on Koro-sensei's head for themselves. Nagisa Shiota, one of Class 3-E's most skilled assassins, finds himself in the middle of the conflict. While he works to maintain his academic standing an

In [67]:
# Sanity check: titles[33] and input_data[99] are the same wrapped title
# (input_data interleaves three fields per anime), so both encodings should
# be identical.
# NOTE(review): the output recorded below starts/ends with ids 2000/2001,
# while this notebook registers <t>/</t> as 5000/5001 -- the saved output
# appears to come from an earlier run; re-run the notebook to refresh it.
title_ids = animeTokenizer.encode(titles[33], 'all')
print(title_ids)
interleaved_ids = animeTokenizer.encode(input_data[99], 'all')
print(interleaved_ids)

[2000, 65, 110, 115, 989, 1225, 409, 104, 1543, 32, 50, 1519, 1390, 2001]
[2000, 65, 110, 115, 989, 1225, 409, 104, 1543, 32, 50, 1519, 1390, 2001]


In [69]:
# Write the interleaved title/genres/synopsis entries to 'input.txt'
# (relative to the current working directory), joined with newlines by
# prepare_text_file. An existing file is overwritten.
prepare_text_file(input_data, 'input.txt')