# **Anime** **Recommendation** **System**

> Muhammad Ammar Kazmi        (70139353)

> Shahzaib Khan               (70138852)

> Hafiz Muhammad Tariq        (70139976)  


***Section - E***

Loading Data

In [19]:
import pandas as pd
import numpy as np

# Load dataset
df = pd.read_csv("anime.csv")

# Display dataset information
print("Dataset Shape:", df.shape)
print(df.head())


Dataset Shape: (12294, 8)
   anime_id                              name  \
0     32281                    Kimi no Na wa.   
1      5114  Fullmetal Alchemist: Brotherhood   
2     28977                          GintamaÂ°   
3      9253                       Steins;Gate   
4      9969                     Gintama&#039;   

                                               genre   type episodes  rating  \
0               Drama, Romance, School, Supernatural  Movie        1    9.37   
1  Action, Adventure, Drama, Fantasy, Magic, Mili...     TV       64    9.26   
2  Action, Comedy, Historical, Parody, Samurai, S...     TV       51    9.25   
3                                   Sci-Fi, Thriller     TV       24    9.17   
4  Action, Comedy, Historical, Parody, Samurai, S...     TV       51    9.16   

   members                                        description  
0   200630  This anime belongs to the Drama, Romance, Scho...  
1   793665  This anime belongs to the Action, Adventure, D...  
2   1

Data Cleaning


In [20]:
# Remove duplicate names of anime
df = df.drop_duplicates(subset='name')

# Handle missing values
df['name'] = df['name'].fillna("")
df['genre'] = df['genre'].fillna("")
df['type'] = df['type'].fillna("Unknown")
df['description'] = df['description'].fillna("")

# Convert text to lowercase
df['name'] = df['name'].str.lower()
df['genre'] = df['genre'].str.lower()
df['description'] = df['description'].str.lower()
df['type'] = df['type'].str.lower()

# Clean genre formatting
df['genre'] = df['genre'].str.replace(" ", "", regex=False)

# Convert episodes to numeric
df['episodes'] = pd.to_numeric(df['episodes'], errors='coerce').fillna(0)

# Remove rows with empty anime names
df = df[df['name'] != ""].reset_index(drop=True)

print("\nAfter Cleaning Dataset Shape:", df.shape)



After Cleaning Dataset Shape: (12292, 8)


Model Training

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Combine only minimal text fields (less information = lower precision)
# Only use 'name' and 'genre'
df['combined_text_initial'] = df['name'] + " " + df['genre']

# TF-IDF Vectorization
tfidf_initial = TfidfVectorizer(stop_words='english', max_features=2000)
tfidf_matrix_initial = tfidf_initial.fit_transform(df['combined_text_initial'])

# Cosine Similarity
cosine_sim_initial = cosine_similarity(tfidf_matrix_initial)

# Mapping anime names to indices
anime_indices = pd.Series(df.index, index=df['name']).drop_duplicates()

# Recommendation function for baseline
def recommend_anime(user_input, top_n=5):
    user_input = user_input.lower()  # ensure lowercase

    if user_input not in anime_indices:
        return "Error: Please enter a valid Anime name from the database."

    idx = anime_indices[user_input]
    sim_scores = list(enumerate(cosine_sim_initial[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[1:top_n+1]

    anime_ids = [i[0] for i in sim_scores]
    return df['name'].iloc[anime_ids].tolist()


Running Model

In [22]:
# User enters an anime
user_favorite_anime = input("Enter an anime you like: ").strip().lower()

print("\n Recommendations:")
result = recommend_anime(user_favorite_anime)

if isinstance(result, str):
    print(result)
else:
    for i, anime in enumerate(result, 1):
        print(f"{i}. {anime}")


Enter an anime you like: 91 days

 Recommendations:
1. 91 days recap
2. wonderful days
3. days
4. school days: valentine days
5. nyanko days


In [23]:
# User enters an anime
user_favorite_anime = input("Enter an anime you like: ").strip().lower()

print("\n Recommendations:")
result = recommend_anime(user_favorite_anime)

if isinstance(result, str):
    print(result)
else:
    for i, anime in enumerate(result, 1):
        print(f"{i}. {anime}")


Enter an anime you like: Super man

 Recommendations:
Error: Please enter a valid Anime name from the database.


Precision Testing

In [24]:
def precision_at_k(anime_name, k=5):
    if anime_name not in anime_indices:
        return None

    idx = anime_indices[anime_name]
    target_genres = set(df.loc[idx, 'genre'].split(','))

    recommendations = recommend_anime(anime_name, k)
    if isinstance(recommendations, str):
        return None

    relevant = 0
    for rec in recommendations:
        rec_idx = anime_indices[rec]
        rec_genres = set(df.loc[rec_idx, 'genre'].split(','))

        # STRICT relevance: at least 2 common genres
        if len(target_genres.intersection(rec_genres)) >= 2:
            relevant += 1

    return relevant / k


# Calculate average Precision@5
precision_scores = []
for anime in df['name'].sample(100, random_state=42):
    p = precision_at_k(anime)
    if p is not None:
        precision_scores.append(p)

initial_precision = np.mean(precision_scores)

print(f"\nInitial Precision@5: {initial_precision:.2f}")



Initial Precision@5: 0.50


As Precision is too low so now improving the model

In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Combine basic text features
df['combined_text'] = df['name'] + " " + df['genre']

# TF-IDF Vectorization
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['combined_text'])

# Cosine Similarity
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Mapping anime names to indices
anime_indices = pd.Series(df.index, index=df['name']).drop_duplicates()

def recommend_anime(user_input, top_n=5):
    if user_input not in anime_indices:
        return "Error: Please enter a valid Anime name from the database."

    idx = anime_indices[user_input]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[1:top_n+1]

    anime_ids = [i[0] for i in sim_scores]
    return df['name'].iloc[anime_ids].tolist()


Again Testing Precision(New)

In [26]:
def precision_at_k(anime_name, k=5):
    if anime_name not in anime_indices:
        return None

    idx = anime_indices[anime_name]
    target_genres = set(df.loc[idx, 'genre'].split(','))

    recommendations = recommend_anime(anime_name, k)
    if isinstance(recommendations, str):
        return None

    relevant = 0
    for rec in recommendations:
        rec_idx = anime_indices[rec]
        rec_genres = set(df.loc[rec_idx, 'genre'].split(','))
        if len(target_genres.intersection(rec_genres)) > 0:
            relevant += 1

    return relevant / k

# Calculate mean Precision@5
precision_scores = []
for name in df['name'].sample(100, random_state=42):
    p = precision_at_k(name)
    if p is not None:
        precision_scores.append(p)

new_precision = np.mean(precision_scores)

print(f"\nAfter Improvement Precision@5: {new_precision:.2f}")




After Improvement Precision@5: 0.80


Running Model Again

In [31]:
# User enters an anime
user_favorite_anime = input("Enter an anime you like: ").strip().lower()

print("\n Recommendations:")
result = recommend_anime(user_favorite_anime)

if isinstance(result, str):
    print(result)
else:
    for i, anime in enumerate(result, 1):
        print(f"{i}. {anime}")


Enter an anime you like: wonderful days

 Recommendations:
1. days
2. wonderful rush
3. school days: valentine days
4. school days
5. school days ona
