In [1]:

import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns 
import kagglehub
from sklearn.model_selection import train_test_split
import requests
import os
import concurrent.futures



In [2]:
# Kitsu endpoint listing anime filtered by the "adventure" category.
# NOTE(review): not referenced by the functions visible in this notebook section.
base_url = 'https://kitsu.io/api/edge/anime?filter[categories]=adventure'
# search_categories = f'/anime?filter[categories]={category}'
# First page (limit 20, offset 0) of the Kitsu categories listing.
# NOTE(review): get_all_categories builds its own URLs and does not read this name.
categories_url = 'https://kitsu.io/api/edge/categories?page[limit]=20&page[offset]=0'

def get_api(url, timeout=30):
    """Fetch a JSON document over HTTP and return its top-level 'data' member.

    Parameters
    ----------
    url : str
        Address to request with HTTP GET.
    timeout : float, optional
        Seconds passed to ``requests.get`` as its timeout, so a stalled
        connection cannot block the notebook forever. Defaults to 30.

    Returns
    -------
    The value stored under the 'data' key of the decoded JSON body.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status code.
    KeyError
        If the decoded body has no 'data' key.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of tripping over a missing 'data' key later.
    response.raise_for_status()
    return response.json()['data']

def get_all_categories(timeout=30):
    """Download every category from the Kitsu API, 20 entries per request.

    Pages are requested at increasing offsets until a page comes back
    empty. Each page is requested exactly once.

    Parameters
    ----------
    timeout : float, optional
        Seconds passed to ``requests.get`` as its timeout. Defaults to 30.

    Returns
    -------
    list of dict
        One ``{'slug': ..., 'title': ...}`` dict per category, in the order
        the API returned them.

    Raises
    ------
    requests.HTTPError
        If any page request answers with an error status code.
    """
    page_size = 20
    categories = []
    offset = 0

    while True:
        r = requests.get(
            f'https://kitsu.io/api/edge/categories?page[limit]={page_size}&page[offset]={offset}',
            timeout=timeout,
        )
        r.raise_for_status()
        page = r.json()['data']
        if not page:
            # An empty page means the previous one was the last.
            break

        categories.extend(
            {'slug': category['attributes']['slug'],
             'title': category['attributes']['title']}
            for category in page
        )
        offset += page_size

    return categories

# Runs at cell execution: calls get_all_categories(), which issues HTTP requests,
# and keeps the resulting list in the notebook-level name `all_categories`.
all_categories = get_all_categories()




In [3]:
def get_all_animes_per_categorie(category_slug, timeout=30):
    """Download every anime record of one Kitsu category, page by page.

    Pages of 20 records are requested at increasing offsets until a page
    comes back empty. Records whose 'id' was already collected are skipped,
    so each id appears at most once in the result. The number of collected
    records is printed before returning.

    Parameters
    ----------
    category_slug : str
        Category slug inserted into the ``filter[categories]`` query parameter.
    timeout : float, optional
        Seconds passed to ``requests.get`` as its timeout. Defaults to 30.

    Returns
    -------
    list of dict
        The raw anime records as returned under the response's 'data' key.

    Raises
    ------
    requests.HTTPError
        If any page request answers with an error status code.
    """
    page_size = 20
    animes = []
    seen_ids = set()
    offset = 0

    while True:
        # Exactly one '?' starts the query string; every further parameter is
        # joined with '&' so the paging arguments are not read as part of the
        # category filter value.
        url = (
            'https://kitsu.io/api/edge/anime'
            f'?filter[categories]={category_slug}'
            f'&page[limit]={page_size}&page[offset]={offset}'
        )
        r = requests.get(url, timeout=timeout)
        r.raise_for_status()
        page = r.json()['data']
        if not page:
            # An empty page means the previous one was the last.
            break

        for anime in page:
            anime_id = anime['id']
            if anime_id not in seen_ids:
                animes.append(anime)
                seen_ids.add(anime_id)

        offset += page_size

    print('Found the amount of animes', len(animes))
    return animes

In [4]:

def create_df(all_animes):
    """Turn raw anime records into a DataFrame of model features.

    Each record must provide an 'id' and an 'attributes' mapping containing
    every name listed in ``attribute_names``; a missing key raises KeyError.
    The 'ratingFrequencies' and 'canonicalTitle' columns are removed, and
    afterwards every row that still holds a missing value is dropped. The
    original row positions are kept as the index (it is not reset).

    Parameters
    ----------
    all_animes : iterable of dict
        Records shaped like ``{'id': ..., 'attributes': {...}}``.

    Returns
    -------
    pandas.DataFrame
        Columns: slug, synopsis, description, nsfw, averageRating,
        popularityRank, ratingRank, subtype, ageRating, id.
    """
    # Attributes copied from each record, in column order.
    attribute_names = (
        "canonicalTitle",     # removed below
        "slug",
        "synopsis",
        "description",
        "nsfw",
        "averageRating",
        "popularityRank",
        "ratingRank",
        "subtype",
        "ageRating",
        "ratingFrequencies",  # removed below
    )
    discarded_columns = ["ratingFrequencies", "canonicalTitle"]

    # One row per record: the attribute values followed by the record id.
    rows = [
        [record['attributes'][name] for name in attribute_names] + [record['id']]
        for record in all_animes
    ]

    frame = pd.DataFrame(rows, columns=[*attribute_names, "id"])
    frame = frame.drop(columns=discarded_columns)

    # Missing values are checked only after the two columns above are gone,
    # so a null title or rating-frequency entry does not discard a row.
    return frame.dropna(how="any")



In [2]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder
from sklearn.neighbors import NearestNeighbors
from sklearn.feature_extraction.text import TfidfVectorizer

def knn_model(df, query_index=0, n_neighbors=20):
    """Find the anime most similar to one row of ``df`` by cosine distance.

    Features are built from one-hot encodings of 'subtype' and 'ageRating'
    plus TF-IDF vectors of 'slug', 'synopsis' and 'description'. A brute-force
    nearest-neighbour search is fitted on all rows and queried with the row
    at position ``query_index``. Because the query row is part of the fitted
    data, it is normally returned as its own nearest neighbour.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'subtype', 'ageRating', 'slug', 'synopsis',
        'description' and 'id', without missing values in the text columns.
    query_index : int, optional
        Positional index of the row to find neighbours for. Defaults to 0.
    n_neighbors : int, optional
        Maximum number of neighbours to return. Clamped to the number of
        rows in ``df`` so small frames do not raise. Defaults to 20.

    Returns
    -------
    list
        ``[found_animes, distances]``: the 'id' values of the neighbours and
        their cosine distances to the query row, both ordered as returned by
        the neighbour search. ``[[], []]`` when ``df`` has no rows.
    """
    if len(df) == 0:
        # Nothing to fit on; the vectorizers would fail on empty input.
        return [[], []]

    preprocessor = ColumnTransformer(transformers=[
        # A list selector hands the encoder a 2-D frame; a bare string hands
        # the vectorizer the 1-D series of documents it expects.
        ('subtype', OneHotEncoder(), ['subtype']),
        ('age_rating', OneHotEncoder(), ['ageRating']),
        ('slug_tfidf', TfidfVectorizer(max_features=500), 'slug'),
        ('synopsis_tfidf', TfidfVectorizer(max_features=10000), 'synopsis'),
        ('desc_tfidf', TfidfVectorizer(max_features=10000), 'description'),
    ])
    features = preprocessor.fit_transform(df)

    recommender = NearestNeighbors(metric='cosine', algorithm='brute')
    recommender.fit(features)

    # Asking for more neighbours than fitted samples raises, so clamp.
    k = min(n_neighbors, features.shape[0])

    # List-based row selection yields a 2-D (1, n_features) query whether
    # the transformer produced a sparse matrix or a dense array.
    query = features[[query_index]]
    distances, indices = recommender.kneighbors(query, n_neighbors=k)

    found_animes = df['id'].iloc[indices[0]].tolist()
    return [found_animes, distances[0].tolist()]
