In [22]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import spacy
import matplotlib.pyplot as plt

In [23]:
nlp = spacy.load('en_core_web_sm')

In [5]:
df = pd.read_csv("anime-dataset-2023.csv")
df.head(1)

Unnamed: 0,anime_id,Name,English name,Other name,Score,Genres,Synopsis,Type,Episodes,Aired,...,Studios,Source,Duration,Rating,Rank,Popularity,Favorites,Scored By,Members,Image URL
0,1,Cowboy Bebop,Cowboy Bebop,カウボーイビバップ,8.75,"Action, Award Winning, Sci-Fi","Crime is timeless. By the year 2071, humanity ...",TV,26.0,"Apr 3, 1998 to Apr 24, 1999",...,Sunrise,Original,24 min per ep,R - 17+ (violence & profanity),41.0,43,78525,914193.0,1771505,https://cdn.myanimelist.net/images/anime/4/196...


In [7]:
df1 = df[["anime_id","Name","Score","Genres","Synopsis","Type","Episodes","Studios"]]

In [9]:
df1.head(1)

Unnamed: 0,anime_id,Name,Score,Genres,Synopsis,Type,Episodes,Studios
0,1,Cowboy Bebop,8.75,"Action, Award Winning, Sci-Fi","Crime is timeless. By the year 2071, humanity ...",TV,26.0,Sunrise


In [10]:
df1.isnull().sum()

anime_id    0
Name        0
Score       0
Genres      0
Synopsis    0
Type        0
Episodes    0
Studios     0
dtype: int64

In [40]:
df1.to_csv("anime_data.csv")

In [24]:
def preprocess(text):
    doc = nlp(text.lower())  # Converting text to lowercase
    tokens = [
        token.lemma_ for token in doc
        if not token.is_stop and token.is_alpha
    ]
    return ' '.join(tokens)

In [25]:

df1['combined_features'] = df1['Genres'] + " " + df1['Synopsis']+ " " + df1['Studios']

df1['processed_content'] = df1['combined_features'].apply(preprocess)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['combined_features'] = df1['Genres'] + " " + df1['Synopsis']+ " " + df1['Studios']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['processed_content'] = df1['combined_features'].apply(preprocess)


In [42]:
df1.to_csv("preprocessed_anime_dataset.csv", index=False)

In [26]:

tfidf = TfidfVectorizer()

tfidf_matrix = tfidf.fit_transform(df1['processed_content'])


In [27]:
# Finding the cosine similarity matrix
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [41]:
cosine_sim

array([[1.        , 0.28342319, 0.03822954, ..., 0.03881367, 0.        ,
        0.        ],
       [0.28342319, 1.        , 0.04794411, ..., 0.0628194 , 0.        ,
        0.        ],
       [0.03822954, 0.04794411, 1.        , ..., 0.05708258, 0.        ,
        0.        ],
       ...,
       [0.03881367, 0.0628194 , 0.05708258, ..., 1.        , 0.03928656,
        0.04169494],
       [0.        , 0.        , 0.        , ..., 0.03928656, 1.        ,
        0.3767577 ],
       [0.        , 0.        , 0.        , ..., 0.04169494, 0.3767577 ,
        1.        ]])

In [29]:
# Reset index to ensure it starts from 0
df1 = df1.reset_index(drop=True)

# Create a reverse mapping of indices and anime names
indices = pd.Series(df1.index, index=df1['Name']).drop_duplicates()


In [30]:
indices.head()

Name
Cowboy Bebop                       0
Cowboy Bebop: Tengoku no Tobira    1
Trigun                             2
Witch Hunter Robin                 3
Bouken Ou Beet                     4
dtype: int64

In [38]:
def get_recommendations(anime_name, cosine_sim=cosine_sim):

    try:
        # Get the index of the anime that matches the name
        idx = indices[anime_name]
    except KeyError:
        print(f"Anime '{anime_name}' not found in the database.")
        return pd.DataFrame()
    
    # Get the pairwise similarity scores of all anime with the given anime
    sim_scores = list(enumerate(cosine_sim[idx]))
    
    # Sort the anime based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # Get the scores of the top 6 most similar anime (excluding itself)
    sim_scores = sim_scores[1:5]
    
    # Get the anime indices
    anime_indices = [i[0] for i in sim_scores]
    
    # Return the top 5 most similar anime
    return df[['Name', 'Score']].iloc[anime_indices].reset_index(drop=True)


In [39]:

anime_input = 'Naruto' 
recommendations = get_recommendations(anime_input)

if not recommendations.empty:
    print(f"Top 5 recommendations for '{anime_input}':\n")
    print(recommendations)


Top 5 recommendations for 'Naruto':

                                         Name    Score
0                               Naruto (2023)  UNKNOWN
1                          Naruto: Shippuuden     8.26
2             Boruto: Naruto Next Generations     6.06
3  Naruto: Shippuuden Movie 6 - Road to Ninja     7.68
