In [1]:
import numpy as np
import pandas as pd
import faiss
from rapidfuzz import process
from sklearn.preprocessing import MinMaxScaler


In [2]:
# Load anime dataset
# NOTE(review): both paths are absolute and machine-specific; they must be edited
# before this cell can run anywhere else.
df = pd.read_csv(r"G:\hoc\private\Anime\data\Recomended_Anime_data\raw\MyAnimeList-Database-master\data\anime.csv")   # adjust path
df_sypnosis = pd.read_csv(r"G:\hoc\private\Anime\data\Recomended_Anime_data\raw\MyAnimeList-Database-master\data\anime_with_synopsis.csv")  # adjust path
# Load saved embeddings (cast to float32 before handing them to FAISS)
# NOTE(review): nothing in this cell shows which frame's row order the embeddings
# follow. Every later cell uses an embedding row number as a position in `df`, so
# the code presumes embeddings[i] describes df.iloc[i] — TODO confirm, e.g. with
# `assert len(embeddings) == len(df)`.
embeddings = np.load("anime_embeddings.npy").astype("float32")
# Coerce to numeric; entries that cannot be parsed become NaN (errors="coerce").
df["Episodes"] = pd.to_numeric(df["Episodes"], errors="coerce")
df["Score"] = pd.to_numeric(df["Score"], errors="coerce")
# Normalize embeddings for FAISS. This rewrites `embeddings` in place; with unit
# vectors the inner-product index below ranks by cosine similarity.
faiss.normalize_L2(embeddings)

# Build FAISS index (exact inner-product search over all rows)
index = faiss.IndexFlatIP(embeddings.shape[1])
index.add(embeddings)

# Title → index mapping (positional row number in `df`).
# If a name occurs more than once in df["Name"], the last occurrence wins.
Name_to_idx = {Name: i for i, Name in enumerate(df["Name"])}


In [3]:
import re
def parse_duration(d):
    """Convert a free-text duration into a total number of minutes.

    The text is lower-cased and searched for an hours part ("1 hr", "1 hr."),
    a minutes part ("24 min", "24 min.") and a seconds part ("30 sec",
    "30 sec."). Whatever is found is summed; anything else in the string
    (for example a trailing "per ep.") is ignored.

    Parameters
    ----------
    d : str or missing value
        Raw duration cell. Non-string values are converted with ``str()``.

    Returns
    -------
    int, float or numpy.nan
        Total minutes. The value is an ``int`` when only hours/minutes are
        present and a ``float`` when a seconds part contributes a fraction.
        ``numpy.nan`` is returned for missing input or when no positive
        duration could be extracted.
    """
    if pd.isna(d):   # skip NaN safely
        return np.nan

    # make sure it's a string
    text = str(d).lower()
    minutes = 0

    # extract hours (with or without dot, like "1 hr" or "1 hr.")
    hr_match = re.search(r"(\d+)\s*hr\.?", text)
    if hr_match:
        minutes += int(hr_match.group(1)) * 60

    # extract minutes (with or without dot, like "24 min" or "24 min.")
    min_match = re.search(r"(\d+)\s*min\.?", text)
    if min_match:
        minutes += int(min_match.group(1))

    # extract seconds; without this a seconds-only duration such as "30 sec."
    # would be reported as missing even though it is a valid (short) runtime
    sec_match = re.search(r"(\d+)\s*sec\.?", text)
    if sec_match:
        minutes += int(sec_match.group(1)) / 60

    return minutes if minutes > 0 else np.nan

In [4]:
def find_closest_title(query, titles, limit=1, score_cutoff=70):
    """Return the best fuzzy match for ``query`` among ``titles``.

    ``query``, ``titles``, ``limit`` and ``score_cutoff`` are forwarded
    unchanged to ``rapidfuzz.process.extract``. The first field of the first
    result is returned, or ``None`` when the result list is empty.
    """
    candidates = process.extract(
        query,
        titles,
        limit=limit,
        score_cutoff=score_cutoff,
    )
    return candidates[0][0] if candidates else None



In [5]:
def jaccard_similarity(set_a, set_b):
    """Return |A ∩ B| / |A ∪ B| for two sets.

    If either set is empty (or otherwise falsy) the result is 0.0, which
    also avoids dividing by an empty union.
    """
    if set_a and set_b:
        shared = set_a & set_b
        combined = set_a | set_b
        return len(shared) / len(combined)
    return 0.0

def scaled_similarity(a, b):
    """Map the absolute gap between two numbers to a similarity in (0, 1].

    Equal values give 1.0 and the result decays as ``1 / (1 + |a - b|)``.
    If either value is missing according to ``pd.isna`` the result is 0.0.
    """
    either_missing = pd.isna(a) or pd.isna(b)
    if either_missing:
        return 0.0
    gap = abs(a - b)
    return 1 / (1 + gap)

def metadata_similarity(idx_a, idx_b):
    """Average metadata similarity between two rows of the global ``df``.

    Nine per-feature similarities are computed and averaged with equal weight:

    * Genres, Studios  – Jaccard overlap of the ", "-separated tag lists
    * Type, Source, Rating, Producers – 1.0 on exact equality, else 0.0
    * Episodes, Score, Duration – ``scaled_similarity`` of the numeric values
      (Duration is first converted with ``parse_duration``)

    Parameters
    ----------
    idx_a, idx_b : int
        Positional row numbers into ``df`` (used with ``df.iloc``).

    Returns
    -------
    numpy.float64
        Mean of the nine similarities, in [0, 1].

    Notes
    -----
    A missing (NaN) Genres or Studios cell is treated as "no tags" and scores
    0.0 for that feature. Previously a NaN Genres cell raised AttributeError
    (floats have no ``.split``) and two NaN Studios cells compared as the
    identical tag ``"nan"``.
    """
    row_a, row_b = df.iloc[idx_a], df.iloc[idx_b]

    def tag_set(value):
        # NaN cells are floats; map them to an empty set instead of splitting.
        if pd.isna(value):
            return set()
        return set(str(value).split(", "))

    def same(column):
        return 1.0 if row_a[column] == row_b[column] else 0.0

    sims = [
        # Genres
        jaccard_similarity(tag_set(row_a["Genres"]), tag_set(row_b["Genres"])),
        # Type
        same("Type"),
        # Episodes
        scaled_similarity(row_a["Episodes"], row_b["Episodes"]),
        # Studios
        jaccard_similarity(tag_set(row_a["Studios"]), tag_set(row_b["Studios"])),
        # Source
        same("Source"),
        # Rating
        same("Rating"),
        # Score
        scaled_similarity(row_a["Score"], row_b["Score"]),
        # Duration
        scaled_similarity(parse_duration(row_a["Duration"]),
                          parse_duration(row_b["Duration"])),
        # Producers (whole-string equality, not tag overlap)
        same("Producers"),
    ]

    return np.mean(sims)  # simple average, can be weighted



In [6]:
def hybrid_recommend(query, k=10, alpha=0.5):
    """Recommend ``k`` titles similar to the one best matching ``query``.

    The query is fuzzy-matched against ``df["Name"]``; the matched title's
    embedding is used to fetch ``k + 20`` nearest neighbours from the FAISS
    index, and each neighbour is re-scored as

        hybrid_score = alpha * faiss_score + (1 - alpha) * metadata_similarity

    Parameters
    ----------
    query : str
        Free-text title to look up.
    k : int
        Number of recommendations to return.
    alpha : float
        Weight of the FAISS (semantic) score; ``1 - alpha`` is the weight of
        the metadata similarity.

    Returns
    -------
    (str, pandas.DataFrame)
        The matched title and a frame with columns Name, Score, Genres and
        hybrid_score, sorted by hybrid_score descending.

    Raises
    ------
    ValueError
        If ``find_closest_title`` finds no match for ``query``.
    """
    # Fuzzy match title
    best_match = find_closest_title(query, df["Name"].tolist())
    if not best_match:
        raise ValueError(f"No anime found for '{query}'")

    anime_idx = Name_to_idx[best_match]

    # FAISS search. Copy the row first: reshape() returns a view, and
    # normalize_L2 works in place, so without the copy it would write into
    # the shared `embeddings` array.
    qvec = embeddings[anime_idx].reshape(1, -1).copy()
    faiss.normalize_L2(qvec)
    scores, indices = index.search(qvec, k + 20)  # grab more, filter later

    results = []
    for idx, s in zip(indices[0], scores[0]):
        idx = int(idx)
        # FAISS pads with label -1 when it has fewer results than requested;
        # df.iloc[-1] would silently turn that into the last row.
        if idx < 0 or idx == anime_idx:  # skip padding and self
            continue
        meta_sim = metadata_similarity(anime_idx, idx)
        hybrid_score = alpha * s + (1 - alpha) * meta_sim
        results.append((idx, hybrid_score))

    # Sort by hybrid score
    top = sorted(results, key=lambda pair: pair[1], reverse=True)[:k]

    rec_df = df.iloc[[row for row, _ in top]][["Name", "Score", "Genres"]].copy()
    rec_df["hybrid_score"] = [score for _, score in top]

    return best_match, rec_df.reset_index(drop=True)


In [7]:
# Demo: recommend 5 titles for a loosely spelled query.
# alpha=0.6 gives the FAISS score 60% of the weight and metadata 40%.
query = "naruto shippuden"
match, recs = hybrid_recommend(query, k=5, alpha=0.6)

# `match` is the dataset title the fuzzy matcher resolved the query to.
print("You searched for:", query)
print("Best match:", match)
print(recs)

You searched for: naruto shippuden
Best match: Naruto: Shippuuden
                            Name  Score  \
0                   Okane ga Nai   6.26   
1              Catman Series III   6.36   
2                 Bouken Ou Beet   6.98   
3  Super Fishing Grander Musashi   6.87   
4                    Dennou Coil   8.09   

                                      Genres  hybrid_score  
0                       Drama, Romance, Yaoi      0.547491  
1             Comedy, Fantasy, Slice of Life      0.497109  
2  Adventure, Fantasy, Shounen, Supernatural      0.458866  
3                          Adventure, Sports      0.434771  
4  Adventure, Comedy, Drama, Mystery, Sci-Fi      0.426981  


In [8]:
def build_user_vector(titles):
    """Build one normalized "taste" vector from several liked titles.

    Each entry of ``titles`` is fuzzy-matched against ``df["Name"]``; entries
    without a match are dropped. The embeddings of the matched rows are
    averaged and L2-normalized.

    Returns
    -------
    (list, numpy.ndarray)
        The matched row indices (in input order) and the averaged vector,
        shaped ``(1, dim)``.

    Raises
    ------
    ValueError
        If none of the titles could be matched.
    """
    all_names = df["Name"].tolist()
    matched_names = (find_closest_title(title, all_names) for title in titles)
    liked_rows = [Name_to_idx[name] for name in matched_names if name]

    if len(liked_rows) == 0:
        raise ValueError("No valid titles found in user history")

    # Average embedding, then rescale to unit length for the inner-product index
    mean_vec = embeddings[liked_rows].mean(axis=0, keepdims=True)
    faiss.normalize_L2(mean_vec)

    return liked_rows, mean_vec


In [9]:
def hybrid_recommend_multi(Name, k=10, alpha=0.5):
    """Recommend ``k`` titles based on several titles the user likes.

    The liked titles are combined into one averaged embedding
    (``build_user_vector``), ``k + 50`` nearest neighbours are fetched from
    the FAISS index, and each neighbour is re-scored as

        hybrid_score = alpha * faiss_score
                       + (1 - alpha) * mean(metadata_similarity to each liked title)

    Parameters
    ----------
    Name : iterable of str
        Titles the user likes (free text, fuzzy-matched).
    k : int
        Number of recommendations to return.
    alpha : float
        Weight of the FAISS score; ``1 - alpha`` weights the metadata part.

    Returns
    -------
    (iterable of str, pandas.DataFrame)
        ``Name`` exactly as passed in, and a frame with columns Name, Score,
        Genres and hybrid_score sorted by hybrid_score descending.

    Raises
    ------
    ValueError
        Propagated from ``build_user_vector`` when no title can be matched.
    """
    liked_indices, user_vec = build_user_vector(Name)
    liked_set = set(liked_indices)  # O(1) "already liked" checks

    # Search in FAISS
    scores, indices = index.search(user_vec, k + 50)  # grab more, filter later

    results = []
    for idx, s in zip(indices[0], scores[0]):
        idx = int(idx)
        # FAISS pads with label -1 when it has fewer results than requested;
        # df.iloc[-1] would silently turn that into the last row.
        if idx < 0 or idx in liked_set:  # skip padding and already liked
            continue

        # Compare with ALL liked anime via metadata
        meta_sim = np.mean([metadata_similarity(idx, li) for li in liked_indices])

        # Hybrid score
        hybrid_score = alpha * s + (1 - alpha) * meta_sim
        results.append((idx, hybrid_score))

    # Sort by hybrid score
    top = sorted(results, key=lambda pair: pair[1], reverse=True)[:k]

    rec_df = df.iloc[[row for row, _ in top]][["Name", "Score", "Genres"]].copy()
    rec_df["hybrid_score"] = [score for _, score in top]

    return Name, rec_df.reset_index(drop=True)


In [10]:
# Demo: one combined recommendation list for several liked titles.
user_likes = ["Naruto", "Attack on Titan", "One Piece"]

# The first return value is the input list itself, not the fuzzy-matched
# dataset titles.
liked, recs = hybrid_recommend_multi(user_likes, k=7, alpha=0.6)

print("User likes:", liked)
print("Recommended:")
print(recs)


User likes: ['Naruto', 'Attack on Titan', 'One Piece']
Recommended:
                                          Name  Score  \
0                       iDOLM@STER Xenoglossia   6.52   
1                            Tsubasa Chronicle   7.55   
2                        Kidou Senkan Nadesico   7.52   
3                                   Weiß Kreuz   6.73   
4  InuYasha Movie 2: Kagami no Naka no Mugenjo   7.66   
5               Mobile Suit Gundam-san (Movie)   5.82   
6                         Aguu: Tensai Ningyou   5.48   

                                              Genres  hybrid_score  
0                      Action, Comedy, Mecha, Sci-Fi      0.538597  
1  Action, Adventure, Fantasy, Magic, Romance, Su...      0.532134  
2  Action, Comedy, Mecha, Military, Parody, Roman...      0.531687  
3                             Action, Drama, Shounen      0.516624  
4  Action, Adventure, Comedy, Historical, Demons,...      0.504437  
5                              Comedy, Parody, Mecha      0.5

In [11]:
def hybrid_recommend_by_favorite(Names, k=5, alpha=0.5):
    """For each favorite anime, recommend ``k`` new ones grouped in sections.

    Every entry of ``Names`` is fuzzy-matched against ``df["Name"]``; entries
    without a match are skipped silently. For each match, ``k + 30`` FAISS
    neighbours are fetched and re-scored as

        hybrid_score = alpha * faiss_score + (1 - alpha) * metadata_similarity

    Parameters
    ----------
    Names : iterable of str
        Favorite titles (free text).
    k : int
        Recommendations per favorite.
    alpha : float
        Weight of the FAISS score; ``1 - alpha`` weights the metadata part.

    Returns
    -------
    dict
        Maps each matched input string (as given, not the matched dataset
        title) to a frame with columns Name, Score, Genres and hybrid_score.
    """
    results_dict = {}
    all_names = df["Name"].tolist()  # loop-invariant, build once

    for fav in Names:
        # --- 1) Find index of favorite
        match = find_closest_title(fav, all_names)
        if not match:
            continue
        fav_idx = Name_to_idx[match]

        # --- 2) Build vector for this single anime. Copy first: reshape()
        # returns a view and normalize_L2 works in place, so without the copy
        # it would write into the shared `embeddings` array.
        fav_vec = embeddings[fav_idx].reshape(1, -1).copy()
        faiss.normalize_L2(fav_vec)

        # --- 3) Semantic neighbors
        scores, indices = index.search(fav_vec, k + 30)  # get extra for filtering

        section_results = []
        for idx, s in zip(indices[0], scores[0]):
            idx = int(idx)
            # FAISS pads with label -1 when it has fewer results than
            # requested; df.iloc[-1] would silently become the last row.
            if idx < 0 or idx == fav_idx:
                continue

            # --- 4) Metadata similarity against this favorite
            meta_sim = metadata_similarity(idx, fav_idx)

            # --- 5) Hybrid score
            hybrid_score = alpha * s + (1 - alpha) * meta_sim
            section_results.append((idx, hybrid_score))

        # --- 6) Sort & take top k
        top = sorted(section_results, key=lambda pair: pair[1], reverse=True)[:k]

        rec_df = df.iloc[[row for row, _ in top]][["Name", "Score", "Genres"]].copy()
        rec_df["hybrid_score"] = [score for _, score in top]

        results_dict[fav] = rec_df.reset_index(drop=True)

    return results_dict


In [12]:
# Demo: a separate recommendation section for each favorite.
# Rebinds `user_likes`, which later cells also read.
user_likes = ["Naruto", "Attack on Titan"]

sections = hybrid_recommend_by_favorite(user_likes, k=5, alpha=0.6)

# Keys of `sections` are the input strings; values are the per-favorite frames.
for fav, recs in sections.items():
    print(f"\nBecause you liked {fav}:")
    print(recs)



Because you liked Naruto:
                                      Name  Score  \
0                   Kurogane Communication   6.66   
1                                  Berserk   6.39   
2                    Kidou Senkan Nadesico   7.52   
3  Tennis no Ouji-sama: Zenkoku Taikai-hen   7.95   
4      Uta no☆Prince-sama♪ Maji Love 1000%   7.10   

                                              Genres  hybrid_score  
0                   Action, Adventure, Drama, Sci-Fi      0.618685  
1  Action, Adventure, Demons, Drama, Fantasy, Hor...      0.614661  
2  Action, Comedy, Mecha, Military, Parody, Roman...      0.604910  
3                    Action, Comedy, Sports, Shounen      0.601114  
4      Harem, Music, Comedy, Romance, School, Shoujo      0.586368  

Because you liked Attack on Titan:
                        Name  Score                                 Genres  \
0  Nurarihyon no Mago Recaps   6.37  Action, Supernatural, Demons, Shounen   
1      Ichigo 100% Special 2   6.88        Comed

In [13]:
def hybrid_recommend_by_each(titles, k=5, alpha=0.5):
    """
    Recommend anime separately for each liked anime title.
    Returns a dictionary: {liked_title: recommendations_df}

    Every entry of ``titles`` is fuzzy-matched against ``df["Name"]``;
    entries without a match are skipped silently. For each match, ``k + 50``
    FAISS neighbours are fetched and re-scored as

        hybrid_score = alpha * faiss_score + (1 - alpha) * metadata_similarity

    Parameters
    ----------
    titles : iterable of str
        Liked titles (free text).
    k : int
        Recommendations per liked title.
    alpha : float
        Weight of the FAISS score; ``1 - alpha`` weights the metadata part.

    Returns
    -------
    dict
        Keys are the input strings as given; each value is a frame with
        columns Name, Score, Genres and hybrid_score, best first.

    NOTE(review): a later cell in this notebook defines another function with
    this same name and a different signature; running that cell replaces
    this definition.
    """
    sections = {}
    all_names = df["Name"].tolist()  # loop-invariant, build once

    for liked_title in titles:
        # Match to dataset index
        match = find_closest_title(liked_title, all_names)
        if not match:
            continue
        liked_idx = Name_to_idx[match]

        # Get embedding for this anime. Copy first: the slice is a view and
        # normalize_L2 works in place, so without the copy it would write
        # into the shared `embeddings` array.
        anime_vec = embeddings[liked_idx:liked_idx + 1].copy()
        faiss.normalize_L2(anime_vec)

        # FAISS search
        scores, indices = index.search(anime_vec, k + 50)

        results = []
        for idx, s in zip(indices[0], scores[0]):
            idx = int(idx)
            # FAISS pads with label -1 when it has fewer results than
            # requested; df.iloc[-1] would silently become the last row.
            if idx < 0 or idx == liked_idx:  # skip padding and the anime itself
                continue

            # Metadata similarity (to just this anime)
            meta_sim = metadata_similarity(idx, liked_idx)

            # Hybrid score
            hybrid_score = alpha * s + (1 - alpha) * meta_sim
            results.append((idx, hybrid_score))

        # Rank top-k for this anime
        top = sorted(results, key=lambda pair: pair[1], reverse=True)[:k]

        rec_df = df.iloc[[row for row, _ in top]][["Name", "Score", "Genres"]].copy()
        rec_df["hybrid_score"] = [score for _, score in top]

        sections[liked_title] = rec_df.reset_index(drop=True)

    return sections


In [14]:
# Demo: per-title recommendation sections via hybrid_recommend_by_each.
# Rebinds `user_likes` and `sections` from the earlier demo cells.
user_likes = ["Naruto", "Attack on Titan", "One Piece"]

sections = hybrid_recommend_by_each(user_likes, k=5, alpha=0.6)

# Keys of `sections` are the input strings; values are the per-title frames.
for liked, recs in sections.items():
    print(f"\nBecause you liked {liked}:\n")
    print(recs)



Because you liked Naruto:

                                      Name  Score  \
0                   Kurogane Communication   6.66   
1                                  Berserk   6.39   
2                    Kidou Senkan Nadesico   7.52   
3                                 InuYasha   7.85   
4  Tennis no Ouji-sama: Zenkoku Taikai-hen   7.95   

                                              Genres  hybrid_score  
0                   Action, Adventure, Drama, Sci-Fi      0.618685  
1  Action, Adventure, Demons, Drama, Fantasy, Hor...      0.614661  
2  Action, Comedy, Mecha, Military, Parody, Roman...      0.604910  
3  Action, Adventure, Comedy, Historical, Demons,...      0.602093  
4                    Action, Comedy, Sports, Shounen      0.601114  

Because you liked Attack on Titan:

                        Name  Score                                 Genres  \
0  Nurarihyon no Mago Recaps   6.37  Action, Supernatural, Demons, Shounen   
1   Shikabane Hime: Puchitto   6.19           

In [None]:
def _metadata_similarity_detail(row_a, row_b):
    """Score two metadata rows feature by feature.

    Uses the same nine equally weighted features as the notebook's
    index-based ``metadata_similarity`` (Genres, Type, Episodes, Studios,
    Source, Rating, Score, Duration, Producers), but works on rows of
    whichever frame the caller supplies and keeps the per-feature values.

    Returns ``(mean_score, {feature_name: similarity})``.
    """
    def tag_set(value):
        # NaN cells are floats; treat them as "no tags" instead of splitting.
        return set() if pd.isna(value) else set(str(value).split(", "))

    def same(column):
        return 1.0 if row_a[column] == row_b[column] else 0.0

    detail = {
        "Genres": jaccard_similarity(tag_set(row_a["Genres"]), tag_set(row_b["Genres"])),
        "Type": same("Type"),
        "Episodes": scaled_similarity(row_a["Episodes"], row_b["Episodes"]),
        "Studios": jaccard_similarity(tag_set(row_a["Studios"]), tag_set(row_b["Studios"])),
        "Source": same("Source"),
        "Rating": same("Rating"),
        "Score": scaled_similarity(row_a["Score"], row_b["Score"]),
        "Duration": scaled_similarity(parse_duration(row_a["Duration"]),
                                      parse_duration(row_b["Duration"])),
        "Producers": same("Producers"),
    }
    return float(np.mean(list(detail.values()))), detail


def _resolve_liked_index(title, df_anime, name_lookup):
    """Return the positional row of ``title`` in ``df_anime``, or None.

    A title counts as present when it equals a value of the "Name" column or
    of the "English name" column (if that column exists). ``name_lookup`` is
    used when it knows the title; otherwise the first matching row is used.
    """
    in_name = title in df_anime["Name"].values
    in_english = (
        "English name" in df_anime.columns
        and title in df_anime["English name"].values
    )
    if not (in_name or in_english):
        return None
    if title in name_lookup:
        return int(name_lookup[title])
    column = "Name" if in_name else "English name"
    return int(np.flatnonzero((df_anime[column] == title).to_numpy())[0])


def _build_section(frame, results, k, score_column, explain):
    """Turn ``(row, score, detail)`` tuples into the top-``k`` result frame."""
    top = sorted(results, key=lambda item: item[1], reverse=True)[:k]
    section = frame.iloc[[item[0] for item in top]][["Name", "Score", "Genres"]].copy()
    section[score_column] = [item[1] for item in top]
    if explain:
        # Only features that contributed (> 0) are listed.
        section["explanation"] = [
            ", ".join(f"{feat}:{val:.2f}" for feat, val in item[2].items() if val > 0)
            for item in top
        ]
    return section.reset_index(drop=True)


def hybrid_recommend_by_each(
    Names,
    k=5,
    alpha=0.5,
    mode="hybrid",  # "faiss", "metadata", or "hybrid"
    df_anime_sypnosis=None,
    df_anime=None,
    embeddings=None,
    index=None,
    Name_to_idx=None
):
    """
    Flexible recommender:
      - mode="faiss"    → Synopsis-only
      - mode="metadata" → Metadata-only
      - mode="hybrid"   → Combine FAISS + metadata

    Parameters
    ----------
    Names : iterable of str
        Liked titles. Each must equal a value in ``df_anime["Name"]`` or
        ``df_anime["English name"]`` exactly; others are reported with a
        printed message and skipped.
    k : int
        Recommendations per liked title.
    alpha : float
        Hybrid mode only: weight of the FAISS score; ``1 - alpha`` weights
        the metadata score.
    mode : {"faiss", "metadata", "hybrid"}
    df_anime_sypnosis : pandas.DataFrame
        Frame the returned Name / Score / Genres columns are taken from.
    df_anime : pandas.DataFrame
        Frame used for title lookup and metadata comparison.
    embeddings, index :
        Embedding matrix and FAISS index; required unless mode="metadata".
    Name_to_idx : dict, optional
        Title → positional row mapping. Titles it does not contain are
        located directly in ``df_anime``.

    Returns
    -------
    dict
        ``{liked_title: DataFrame}``. The score column is ``similarity`` for
        the faiss and metadata modes and ``hybrid_score`` for hybrid; the
        metadata and hybrid modes add an ``explanation`` column listing the
        per-feature similarities that are greater than zero.

    Raises
    ------
    ValueError
        If ``mode`` is not one of the three supported values, or a frame /
        array needed by the chosen mode was not supplied.

    NOTE(review): the same positional row number is used for ``df_anime``,
    ``df_anime_sypnosis`` and ``embeddings``, so all three are presumed to
    share one row order — TODO confirm.
    NOTE(review): an earlier cell in this notebook defines a function with
    this same name and the signature ``(titles, k, alpha)``; running this
    cell replaces it.
    """
    if mode not in ("faiss", "metadata", "hybrid"):
        raise ValueError(
            f"Unknown mode '{mode}'; expected 'faiss', 'metadata' or 'hybrid'"
        )

    required = {"df_anime_sypnosis": df_anime_sypnosis, "df_anime": df_anime}
    if mode != "metadata":
        required["embeddings"] = embeddings
        required["index"] = index
    missing = [name for name, value in required.items() if value is None]
    if missing:
        raise ValueError(
            f"Missing required argument(s) for mode='{mode}': {', '.join(missing)}"
        )

    name_lookup = Name_to_idx if Name_to_idx is not None else {}
    sections = {}

    for liked_title in Names:
        # Find index
        liked_idx = _resolve_liked_index(liked_title, df_anime, name_lookup)
        if liked_idx is None:
            print(f"Dont have {liked_title} in the dataset")
            continue

        # -----------------------
        # MODE 1: Metadata only
        # -----------------------
        if mode == "metadata":
            liked_row = df_anime.iloc[liked_idx]
            results = []
            # Iterate positions, not index labels: everything below uses iloc.
            for idx in range(len(df_anime_sypnosis)):
                if idx == liked_idx:
                    continue
                meta_score, sims_detail = _metadata_similarity_detail(
                    liked_row, df_anime.iloc[idx]
                )
                results.append((idx, meta_score, sims_detail))

            sections[liked_title] = _build_section(
                df_anime_sypnosis, results, k, "similarity", explain=True
            )
            continue

        # Get FAISS neighbors (used in both faiss-only and hybrid).
        # np.array copies, so the in-place normalization below cannot write
        # into the caller's embedding matrix.
        anime_vec = np.array(embeddings[liked_idx:liked_idx + 1], dtype="float32")
        faiss.normalize_L2(anime_vec)
        scores, indices = index.search(anime_vec, k + 50)
        # FAISS pads with label -1 when it has fewer results than requested;
        # iloc[-1] would silently turn that into the last row.
        neighbors = [
            (int(idx), float(s))
            for idx, s in zip(indices[0], scores[0])
            if idx >= 0 and int(idx) != liked_idx
        ]

        # -----------------------
        # MODE 2: FAISS only
        # -----------------------
        if mode == "faiss":
            results = [(idx, s, None) for idx, s in neighbors]
            sections[liked_title] = _build_section(
                df_anime_sypnosis, results, k, "similarity", explain=False
            )
            continue

        # -----------------------
        # MODE 3: Hybrid
        # -----------------------
        liked_row = df_anime.iloc[liked_idx]
        results = []
        for idx, s in neighbors:
            meta_score, sims_detail = _metadata_similarity_detail(
                liked_row, df_anime.iloc[idx]
            )
            hybrid_score = alpha * s + (1 - alpha) * meta_score
            results.append((idx, hybrid_score, sims_detail))

        sections[liked_title] = _build_section(
            df_anime_sypnosis, results, k, "hybrid_score", explain=True
        )

    return sections

In [1]:
# The frames loaded by this notebook are named `df` (anime.csv) and
# `df_sypnosis` (anime_with_synopsis.csv); no cell assigns `df_anime` or
# `df_anime_sypnosis`, so those names are passed explicitly here.
hybrid_recommend_by_each(
    Names=user_likes,
    k=15,
    alpha=0.5,
    mode="hybrid",
    df_anime=df,
    df_anime_sypnosis=df_sypnosis,
    embeddings=embeddings,
    index=index,
    Name_to_idx=Name_to_idx,
)


NameError: name 'hybrid_recommend_by_each' is not defined