1. Load data

In [23]:
import pandas as pd
import numpy as np
df_anime = pd.read_csv(r'G:\hoc\private\Anime\data\Recomended_Anime_data\raw\MyAnimeList-Database-master\data\anime.csv')
df_anime = df_anime.replace("Unknown", np.nan)
df_anime["Episodes"] = pd.to_numeric(df_anime["Episodes"], errors="coerce")
df_anime["Score"] = pd.to_numeric(df_anime["Score"], errors="coerce")


In [60]:
import re
def parse_duration(d):
    if pd.isna(d):   # skip NaN safely
        return np.nan
    
    # make sure it's a string
    d = str(d).lower()
    minutes = 0

    # extract hours (with or without dot, like "1 hr" or "1 hr.")
    hr_match = re.search(r"(\d+)\s*hr\.?", d)
    if hr_match:
        minutes += int(hr_match.group(1)) * 60
    
    # extract minutes (with or without dot, like "24 min" or "24 min.")
    min_match = re.search(r"(\d+)\s*min\.?", d)
    if min_match:
        minutes += int(min_match.group(1))
    
    return minutes if minutes > 0 else np.nan
# --- Similarity helpers ---
### Jaccard similarity for pipe-separated strings
#set_a & set_b → the intersection of the two sets (items they share).
#set_a | set_b → the union of the two sets (all unique items across both).
#len(set_a & set_b) → how many items they share.
#len(set_a | set_b) → how many unique items in total.
#The Jaccard similarity is the ratio of shared items to total unique items, giving a value between 0 and 1.
def jaccard_similarity(a, b):
    set_a = set(str(a).split(", "))
    set_b = set(str(b).split(", "))
    if not set_a or not set_b:
        return 0.0
    return len(set_a & set_b) / len(set_a | set_b)

## Scaled similarity for numeric values
#The formula 1 / (1 + |a - b|) ensures that as the difference |a - b| increases, the similarity score decreases.
#If either a or b is NaN (not a number), the function returns 0.0, indicating no similarity.
##abs(a - b) = the absolute difference between the two numbers.
## 1 + abs(a - b) = the bigger the difference, the bigger the denominator.
##Taking the reciprocal (1/x) means:
##Smaller differences → higher similarity (closer to 1).
##Larger differences → lower similarity (closer to 0

def scaled_similarity(a, b):
    if pd.isna(a) or pd.isna(b):
        return 0.0
    return 1 / (1 + abs(a - b))

def metadata_similarity(anime_a, anime_b, weights=None):
    ##the weights control how much each metadata feature contributes to the overall similarity.
    ##Genres (0.35) → Most important, since genre overlap usually drives what anime feels similar.
    ##Type (0.1) → TV vs Movie vs OVA matters, but less than genres.
    ##Episodes (0.1) → People often care about length (short vs long series).
    ##Studios (0.1) → Some studios have distinct styles (e.g. ufotable, Kyoto Animation).
    ##Source (0.1) → Fans of manga adaptations may prefer other manga adaptations.
    ##Duration (0.1) → Per-episode length matters a bit (5 min vs 24 min)
    ##Rating (0.05) → Lower impact, but prevents mismatches (e.g. “kids anime” vs “R+ gore”).
    ##Score (0.1) → Ensures recommended shows are close in community rating.

    if weights is None:
        weights = {
            "Genres": 0.35,
            "Type": 0.1,
            "Episodes": 0.1,
            "Studios": 0.1,
            "Source": 0.1,
            "Duration": 0.1,
            "Rating": 0.05,
            "Score": 0.1,
            "Producers": 0.05,
        }

    sims = {}

    sims["Genres"]   = jaccard_similarity(anime_a["Genres"], anime_b["Genres"]) ##similarity between two sets of genres
    sims["Type"]     = 1.0 if anime_a["Type"] == anime_b["Type"] else 0.0  ##exact match of type (TV, Movie, OVA, etc.)
    sims["Episodes"] = scaled_similarity(anime_a["Episodes"], anime_b["Episodes"]) ## closeness in number of episodes
    sims["Studios"]  = 1.0 if anime_a["Studios"] == anime_b["Studios"] else 0.0  ## exact match of studio
    sims["Source"]   = 1.0 if anime_a["Source"] == anime_b["Source"] else 0.0 ## exact match of source material (Manga, Light Novel, Original, etc.)
    sims["Duration"] = scaled_similarity(anime_a["Duration"], anime_b["Duration"]) ## closeness in episode duration (in minutes)
    sims["Rating"]   = 1.0 if anime_a["Rating"] == anime_b["Rating"] else 0.0 ## exact match of content rating (PG, R, etc.)
    sims["Score"]    = scaled_similarity(anime_a["Score"], anime_b["Score"]) ## closeness in community score (0-10 scale)
    sims["Producers"] = 1.0 if anime_a["Producers"] == anime_b["Producers"] else 0.0 ## match of producers
    total = sum(weights[k] * sims[k] for k in sims)
    return total, sims

def recommend_by_metadata(anime_df, liked_names, top_n=10):
    liked_anime = anime_df[anime_df["Name"].isin([liked_names])]
    if anime_df["Name"].isin([liked_names]).sum() == 0:
        liked_anime = anime_df[anime_df["English name"].isin([liked_names])]
    recs = []

    for idx, candidate in anime_df.iterrows():
        if candidate["Name"] in liked_names:
            continue  # skip already liked

        scores = []
        breakdowns = []

        for _, liked in liked_anime.iterrows():
            sim, sims_detail = metadata_similarity(liked, candidate)
            scores.append(sim)

            # Build explanation
            reasons = []
            if sims_detail["Genres"] > 0:
                reasons.append(f"shares Genres ({sims_detail['Genres']:.2f})")
            if sims_detail["Type"] == 1.0:
                reasons.append("same type")
            if sims_detail["Source"] == 1.0:
                reasons.append("same source material")
            if sims_detail["Rating"] == 1.0:
                reasons.append("same age rating")
            if sims_detail["Studios"] == 1.0:
                reasons.append("same studio")
            if sims_detail["Episodes"] > 0.8:
                reasons.append("similar episode count")
            if sims_detail["Duration"] > 0.8:
                reasons.append("similar episode duration")
            if sims_detail["Score"] > 0.8:
                reasons.append("similar user score")
            if sims_detail["Producers"] == 1.0:
                reasons.append("same producer")

            breakdowns.append(", ".join(reasons))

        if scores:
            avg_score = sum(scores) / len(scores)
            explanation = " | ".join(breakdowns[:2])  # show 2 strongest matches
            recs.append((candidate["Name"], avg_score, explanation))

    recs = sorted(recs, key=lambda x: x[1], reverse=True)

    return recs[:top_n]

def recommend_for_each_favorite(anime_df, favorites, top_n=5):
    results = {}
    for fav in favorites:
        if fav not in anime_df["English name"].values and fav not in anime_df["Name"].values:
            print(f"Dont have {fav} in the dataset")
            continue  # skip if anime not in dataset
        # Get recommendations for this one favorite
        
        recs = recommend_by_metadata(anime_df, fav, top_n=top_n)  # <-- your existing function
        results[fav] = recs
    
    return results


In [67]:
user_likes = ["KonoSuba:God's Blessing on This Wonderful World!", "Monster","Ijiranaide, Nagatoro-san"]
df_anime["Duration"] = df_anime["Duration"].apply(parse_duration)
recommendations = recommend_for_each_favorite(df_anime, user_likes, top_n=10)
for key,recommendation in recommendations.items():
    print(f"Recommendations based on your favorite: {key}")
    for Name, score, explanation in recommendation:
        print(f"{Name} (similarity: {score:.3f}) → {explanation}")

Recommendations based on your favorite: KonoSuba:God's Blessing on This Wonderful World!
Kono Subarashii Sekai ni Shukufuku wo! (similarity: 0.950) → shares Genres (1.00), same type, same source material, same age rating, same studio, similar episode count, similar user score, same producer
Kono Subarashii Sekai ni Shukufuku wo! 2 (similarity: 0.886) → shares Genres (1.00), same type, same source material, same age rating, same studio, similar episode count, similar user score
Kono Subarashii Sekai ni Shukufuku wo! 2: Kono Subarashii Geijutsu ni Shukufuku wo! (similarity: 0.644) → shares Genres (0.83), same source material, same age rating, same studio, similar user score
Kono Subarashii Sekai ni Shukufuku wo!: Kono Subarashii Choker ni Shukufuku wo! (similarity: 0.629) → shares Genres (0.83), same source material, same age rating, same studio
Reikenzan: Hoshikuzu-tachi no Utage (similarity: 0.607) → shares Genres (0.50), same type, same source material, same age rating, same studio
Re