In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# 1. Load the Dataset
anime_df = pd.read_csv("anime-filtered.csv")

# Drop any rows with missing values
anime_df.dropna(inplace=True)

In [3]:
# 2. Data Exploration
# Display the first few rows of the dataset
print("First few rows of the dataset:")
print(anime_df.head())

# Check for missing values
print("\nMissing values in the dataset:")
print(anime_df.isnull().sum())


First few rows of the dataset:
   anime_id                             Name  Score  \
0         1                     Cowboy Bebop   8.78   
1         5  Cowboy Bebop: Tengoku no Tobira   8.39   
2         6                           Trigun   8.24   
3         7               Witch Hunter Robin   7.27   
4         8                   Bouken Ou Beet   6.98   

                                              Genres            English name  \
0    Action, Adventure, Comedy, Drama, Sci-Fi, Space            Cowboy Bebop   
1              Action, Drama, Mystery, Sci-Fi, Space  Cowboy Bebop:The Movie   
2  Action, Sci-Fi, Adventure, Comedy, Drama, Shounen                  Trigun   
3  Action, Mystery, Police, Supernatural, Drama, ...      Witch Hunter Robin   
4          Adventure, Fantasy, Shounen, Supernatural  Beet the Vandel Buster   

                      Japanese name  \
0                         カウボーイビバップ   
1                    カウボーイビバップ 天国の扉   
2                             トライガン   
3

In [4]:
# 3. Data Preprocessing
# Preprocess the Genres column (remove whitespace and convert to lowercase)
anime_df["Genres"] = anime_df["Genres"].str.strip().str.lower()

# Display the first few rows after preprocessing
print("\nFirst few rows after preprocessing:")
print(anime_df.head())


First few rows after preprocessing:
   anime_id                             Name  Score  \
0         1                     Cowboy Bebop   8.78   
1         5  Cowboy Bebop: Tengoku no Tobira   8.39   
2         6                           Trigun   8.24   
3         7               Witch Hunter Robin   7.27   
4         8                   Bouken Ou Beet   6.98   

                                              Genres            English name  \
0    action, adventure, comedy, drama, sci-fi, space            Cowboy Bebop   
1              action, drama, mystery, sci-fi, space  Cowboy Bebop:The Movie   
2  action, sci-fi, adventure, comedy, drama, shounen                  Trigun   
3  action, mystery, police, supernatural, drama, ...      Witch Hunter Robin   
4          adventure, fantasy, shounen, supernatural  Beet the Vandel Buster   

                      Japanese name  \
0                         カウボーイビバップ   
1                    カウボーイビバップ 天国の扉   
2                             トライガ

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# 4. Similarity Assessment
# Select relevant features for similarity assessment
features = ["Genres", "Score", "Popularity"]

# Compute TF-IDF vectors for the Genres column
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(anime_df["Genres"])

# Compute cosine similarity between anime series
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# 5. Identify Top Similar Anime Series
# Function to get top similar anime series for a given query
def get_top_similar(query_idx, anime_df, cosine_sim):
    sim_scores = list(enumerate(cosine_sim[query_idx]))
    sim_scores.sort(key=lambda x: x[1], reverse=True)
    top_similar = sim_scores[1:11]  # Exclude the query itself
    top_similar_anime = [(anime_df.iloc[idx]["Name"], score) for idx, score in top_similar]
    return top_similar_anime

# Queries of interest
queries = ["Cowboy Bebop", "Naruto", "Monster"]

# Output top 10 similar anime series for each query
for query in queries:
    query_idx = anime_df[anime_df["Name"] == query].index[0]
    top_similar_anime = get_top_similar(query_idx, anime_df, cosine_sim)
    print(f"Query: {query}")
    print("Top 10 Similar Anime Series:")
    for anime, score in top_similar_anime:
        print(f"- {anime} (Similarity Score: {score:.2f})")
    print()


Query: Cowboy Bebop
Top 10 Similar Anime Series:
- Cowboy Bebop: Yose Atsume Blues (Similarity Score: 1.00)
- Odin: Koushi Hansen Starlight (Similarity Score: 0.97)
- Ginga Tetsudou Monogatari (Similarity Score: 0.97)
- Waga Seishun no Arcadia (Similarity Score: 0.97)
- Waga Seishun no Arcadia: Mugen Kidou SSX (Similarity Score: 0.97)
- Uchuu Kaizoku Captain Herlock: Arcadia-gou no Nazo (Similarity Score: 0.97)
- Ginga Tetsudou Monogatari: Eien e no Bunkiten (Similarity Score: 0.97)
- Seihou Bukyou Outlaw Star (Similarity Score: 0.93)
- Seihou Tenshi Angel Links (Similarity Score: 0.93)
- Sayonara Ginga Tetsudou 999: Andromeda Shuuchakueki (Similarity Score: 0.92)

Query: Naruto
Top 10 Similar Anime Series:
- Naruto: Shippuuden (Similarity Score: 1.00)
- Boruto: Jump Festa 2016 Special (Similarity Score: 1.00)
- Rekka no Honoo (Similarity Score: 0.98)
- Naruto: Honoo no Chuunin Shiken! Naruto vs. Konohamaru!! (Similarity Score: 0.98)
- Naruto: Shippuuden Movie 6 - Road to Ninja (Simila

In [8]:
# Function to get top similar anime series for a given query
def get_top_similar(query_idx, anime_df, cosine_sim):
    sim_scores = list(enumerate(cosine_sim[query_idx]))
    sim_scores.sort(key=lambda x: (x[1], anime_df.iloc[x[0]]["Score"]), reverse=True)
    
    top_similar = sim_scores[1:11]  # Exclude the query itself
    top_similar_anime = []
    anime_names = set()  # To track unique anime names
    
    for idx, score in top_similar:
        anime_name = anime_df.iloc[idx]["Name"]
        if anime_name not in anime_names:
            top_similar_anime.append((anime_name, anime_df.iloc[idx]["Score"], anime_df.iloc[idx]["Aired"]))
            anime_names.add(anime_name)
    
    return top_similar_anime

# Output top 10 similar anime series for each query
for query in queries:
    query_idx = anime_df[anime_df["Name"] == query].index[0]
    top_similar_anime = get_top_similar(query_idx, anime_df, cosine_sim)
    print(f"Query: {query}")
    print("Top 10 Similar Anime Series:")
    for anime, score, aired in top_similar_anime:
        print(f"- {anime} (Similarity Score: {score:.2f}, Aired: {aired})")
    print()


Query: Cowboy Bebop
Top 10 Similar Anime Series:
- Cowboy Bebop: Yose Atsume Blues (Similarity Score: 7.44, Aired: Jun 26, 1998)
- Waga Seishun no Arcadia (Similarity Score: 7.49, Aired: Jul 28, 1982)
- Waga Seishun no Arcadia: Mugen Kidou SSX (Similarity Score: 7.40, Aired: Oct 13, 1982 to Mar 30, 1983)
- Ginga Tetsudou Monogatari (Similarity Score: 7.16, Aired: Oct 4, 2003 to Apr 4, 2004)
- Uchuu Kaizoku Captain Herlock: Arcadia-gou no Nazo (Similarity Score: 6.96, Aired: Jul 22, 1978)
- Ginga Tetsudou Monogatari: Eien e no Bunkiten (Similarity Score: 6.86, Aired: Oct 5, 2006 to Mar 29, 2007)
- Odin: Koushi Hansen Starlight (Similarity Score: 5.24, Aired: Aug 10, 1985)
- Seihou Bukyou Outlaw Star (Similarity Score: 7.87, Aired: Jan 9, 1998 to Jun 26, 1998)
- Seihou Tenshi Angel Links (Similarity Score: 5.96, Aired: Apr 7, 1999 to Jun 30, 1999)
- Ginga Tetsudou 999 (Similarity Score: 7.81, Aired: Sep 14, 1978 to Apr 9, 1981)

Query: Naruto
Top 10 Similar Anime Series:
- Naruto (Simila

In [14]:
for query in queries:
    query_idx = anime_df[anime_df["Name"] == query].index[0]
    top_similar_anime = get_top_similar(query_idx, anime_df, cosine_sim)

    print(f"Query: {query}")
    print("{:<50} {:<20} {:<10}".format("Anime", "Similarity Score", "Aired"))
    for anime, score, aired in top_similar_anime:
        print("{:<50} {:<20.2f} {:<10}".format(anime, score, aired))
    print()

Query: Cowboy Bebop
Anime                                              Similarity Score     Aired     
Cowboy Bebop: Yose Atsume Blues                    7.44                 Jun 26, 1998
Waga Seishun no Arcadia                            7.49                 Jul 28, 1982
Waga Seishun no Arcadia: Mugen Kidou SSX           7.40                 Oct 13, 1982 to Mar 30, 1983
Ginga Tetsudou Monogatari                          7.16                 Oct 4, 2003 to Apr 4, 2004
Uchuu Kaizoku Captain Herlock: Arcadia-gou no Nazo 6.96                 Jul 22, 1978
Ginga Tetsudou Monogatari: Eien e no Bunkiten      6.86                 Oct 5, 2006 to Mar 29, 2007
Odin: Koushi Hansen Starlight                      5.24                 Aug 10, 1985
Seihou Bukyou Outlaw Star                          7.87                 Jan 9, 1998 to Jun 26, 1998
Seihou Tenshi Angel Links                          5.96                 Apr 7, 1999 to Jun 30, 1999
Ginga Tetsudou 999                                 7.81  