# Import libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load dataset

In [2]:
# Load the processed anime table. The path is relative, so it is resolved
# against the current working directory of the kernel.
processed_csv_path = "../data/processed/anime_processed.csv"
df = pd.read_csv(processed_csv_path)

In [3]:
df.head()

Unnamed: 0,Name,Image URL,Tags
0,Cowboy Bebop,https://cdn.myanimelist.net/images/anime/4/196...,"Crime is timeless. By the year 2071, humanity ..."
1,Cowboy Bebop: Tengoku no Tobira,https://cdn.myanimelist.net/images/anime/1439/...,"Another day, another bounty—such is the life o..."
2,Trigun,https://cdn.myanimelist.net/images/anime/7/203...,"Vash the Stampede is the man with a $$60,000,0..."
3,Witch Hunter Robin,https://cdn.myanimelist.net/images/anime/10/19...,Robin Sena is a powerful craft user drafted in...
4,Bouken Ou Beet,https://cdn.myanimelist.net/images/anime/7/215...,It is the dark century and the people are suff...


# Convert 'Tags' to lowercase

In [4]:
# Lower-case every description with the vectorised string accessor.
# Missing values are replaced by "" first: the previous per-element
# `x.lower()` call raised AttributeError on a NaN (float) entry, and a NaN
# left in 'Tags' would also break the later whitespace split and TF-IDF fit.
df["Tags"] = df["Tags"].fillna("").astype(str).str.lower()

In [5]:
df.head()

Unnamed: 0,Name,Image URL,Tags
0,Cowboy Bebop,https://cdn.myanimelist.net/images/anime/4/196...,"crime is timeless. by the year 2071, humanity ..."
1,Cowboy Bebop: Tengoku no Tobira,https://cdn.myanimelist.net/images/anime/1439/...,"another day, another bounty—such is the life o..."
2,Trigun,https://cdn.myanimelist.net/images/anime/7/203...,"vash the stampede is the man with a $$60,000,0..."
3,Witch Hunter Robin,https://cdn.myanimelist.net/images/anime/10/19...,robin sena is a powerful craft user drafted in...
4,Bouken Ou Beet,https://cdn.myanimelist.net/images/anime/7/215...,it is the dark century and the people are suff...


# Vectorization

## Using TF-IDF Vectorizer

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [7]:
# TF-IDF configuration for the 'Tags' text (fitted later in the notebook).
tfidf = TfidfVectorizer(
    stop_words="english",  # drop scikit-learn's built-in English stop words
    ngram_range=(1, 2),    # unigrams and bigrams
    max_features=5000,     # keep at most the 5000 most frequent terms
    min_df=2,              # ignore terms found in fewer than 2 documents
    max_df=0.85,           # ignore terms found in more than 85% of documents
    sublinear_tf=True,     # scale term frequency as 1 + log(tf)
)

## Lemmatize

In [8]:
import nltk
from nltk.stem import WordNetLemmatizer
# Make sure the WordNet corpus is available locally before the lemmatizer is used.
nltk.download("wordnet")
# Single shared instance, used by the `lemmatize` helper defined in the next cell.
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\aryan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [9]:
def lemmatize(text):
    """Return ``text`` with each whitespace-separated token lemmatized.

    Tokens are passed one at a time to the module-level ``lemmatizer`` with no
    part-of-speech argument, then re-joined with single spaces, so any run of
    whitespace in the input collapses to one space.
    """
    tokens = text.split()
    return " ".join(map(lemmatizer.lemmatize, tokens))

In [10]:
# Replace each description with its lemmatized form (element-wise).
df["Tags"] = df["Tags"].map(lemmatize)

## Transform 'Tags' into TF-IDF vectors (up to 5000 features)

In [11]:
# Learn the vocabulary / IDF weights from the descriptions and encode them in
# one pass; row i of `vectors` corresponds to row i (by position) of `df`.
tag_corpus = df["Tags"]
vectors = tfidf.fit_transform(tag_corpus)

In [12]:
tfidf.get_feature_names_out()

array(['000', '10', '10 year', ..., 'zombie', 'zone', 'zoo'],
      shape=(5000,), dtype=object)

# Finding closest anime

In [13]:
from sklearn.metrics.pairwise import cosine_similarity

In [14]:
def recommend(anime, top_n=5):
    """Print the names of the ``top_n`` anime most similar to ``anime``.

    Similarity is the cosine similarity between the query's row of the
    module-level ``vectors`` matrix and every row of that matrix; names are
    looked up in the module-level ``df``.

    Parameters
    ----------
    anime : str
        Exact value of the ``Name`` column to use as the query. If several
        rows share the name, the first one is used.
    top_n : int, default 5
        Maximum number of recommendations to print. Values <= 0 print nothing.

    Raises
    ------
    IndexError
        If no row of ``df`` has ``Name == anime``.
    """
    # Work with row *positions*, not index labels: `vectors` is indexed
    # positionally, so this stays correct even if `df` has a non-default index.
    matches = np.flatnonzero((df["Name"] == anime).to_numpy())
    if matches.size == 0:
        raise IndexError(f"anime {anime!r} not found in df['Name']")
    idx = int(matches[0])

    if top_n <= 0:
        return

    sim_scores = cosine_similarity(vectors[idx], vectors).flatten()

    # Rank best-first with a stable sort so ties are ordered deterministically,
    # then drop the query row explicitly. Slicing off "the last element" of an
    # ascending sort only removes the query when it is the unique maximum,
    # which fails for duplicated or all-zero description vectors.
    ranked = np.argsort(-sim_scores, kind="stable")
    top_indices = ranked[ranked != idx][:top_n]

    for i in top_indices:
        print(df.iloc[i]["Name"])

In [15]:
recommend("Naruto")

Naruto (2023)
Naruto: Shippuuden - Shippuu! "Konoha Gakuen" Den
Boruto: Naruto Next Generations
Naruto: Shippuuden
Boruto: Naruto the Movie


# Save final dataset and TF-IDF vectors

In [16]:
# Write the lower-cased, lemmatized table one directory up; index=False keeps
# the DataFrame index out of the file.
# NOTE(review): the "Unnamed: 0" column loaded from the source CSV is still an
# ordinary column and is written too — confirm downstream readers expect it.
final_csv_path = "../final_anime.csv"
df.to_csv(final_csv_path, index=False)

In [17]:
df.head()

Unnamed: 0,Name,Image URL,Tags
0,Cowboy Bebop,https://cdn.myanimelist.net/images/anime/4/196...,"crime is timeless. by the year 2071, humanity ..."
1,Cowboy Bebop: Tengoku no Tobira,https://cdn.myanimelist.net/images/anime/1439/...,"another day, another bounty—such is the life o..."
2,Trigun,https://cdn.myanimelist.net/images/anime/7/203...,"vash the stampede is the man with a $$60,000,0..."
3,Witch Hunter Robin,https://cdn.myanimelist.net/images/anime/10/19...,robin sena is a powerful craft user drafted in...
4,Bouken Ou Beet,https://cdn.myanimelist.net/images/anime/7/215...,it is the dark century and the people are suff...


In [18]:
import pickle

In [19]:
# Persist the TF-IDF matrix. The context manager guarantees the file is
# flushed and closed even if pickling raises; the bare open() call previously
# left the handle open until garbage collection.
with open("../vectors.pkl", "wb") as vectors_file:
    pickle.dump(vectors, vectors_file)