# Content Based Recommender system

In [1]:
# In this recommender system the content of each anime
# (here, its synopsis) is used to measure its similarity to the other animes.
# The animes that are most similar to a given one are then recommended.

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [3]:
# Load the cleaned anime catalogue into a DataFrame
animes = pd.read_csv("clean_data/animes.csv")

In [4]:
# Show the first five rows of the catalogue
animes.head(n=5)

Unnamed: 0,uid,title,synopsis,genre,aired,episodes,members,popularity,ranked,score,img_url,link,release_year,is_ongoing
0,28891,Haikyuu!! Second Season,Following their participation at the Inter-Hig...,"['Comedy', 'Sports', 'Drama', 'School', 'Shoun...","Oct 4, 2015 to Mar 27, 2016",25.0,489888,141,25.0,8.82,https://cdn.myanimelist.net/images/anime/9/766...,https://myanimelist.net/anime/28891/Haikyuu_Se...,2015.0,0
1,23273,Shigatsu wa Kimi no Uso,Music accompanies the path of the human metron...,"['Drama', 'Music', 'Romance', 'School', 'Shoun...","Oct 10, 2014 to Mar 20, 2015",22.0,995473,28,24.0,8.83,https://cdn.myanimelist.net/images/anime/3/671...,https://myanimelist.net/anime/23273/Shigatsu_w...,2014.0,0
2,34599,Made in Abyss,The Abyss—a gaping chasm stretching down into ...,"['Sci-Fi', 'Adventure', 'Mystery', 'Drama', 'F...","Jul 7, 2017 to Sep 29, 2017",13.0,581663,98,23.0,8.83,https://cdn.myanimelist.net/images/anime/6/867...,https://myanimelist.net/anime/34599/Made_in_Abyss,2017.0,0
3,5114,Fullmetal Alchemist: Brotherhood,"""In order for something to be obtained, someth...","['Action', 'Military', 'Adventure', 'Comedy', ...","Apr 5, 2009 to Jul 4, 2010",64.0,1615084,4,1.0,9.23,https://cdn.myanimelist.net/images/anime/1223/...,https://myanimelist.net/anime/5114/Fullmetal_A...,2009.0,0
4,31758,Kizumonogatari III: Reiketsu-hen,After helping revive the legendary vampire Kis...,"['Action', 'Mystery', 'Supernatural', 'Vampire']","Jan 6, 2017",1.0,214621,502,22.0,8.83,https://cdn.myanimelist.net/images/anime/3/815...,https://myanimelist.net/anime/31758/Kizumonoga...,2017.0,0


In [5]:
# Keep only the first row encountered for every title
animes = animes.drop_duplicates(subset='title', keep='first')

### Plot description based Recommender

In [6]:
# We will compute pairwise similarity scores for all animes based on their plot descriptions and 
# recommend animes based on that similarity score. 
# The plot description is given in the synopsis column of our dataset. Let's take a look at the data.

In [7]:
# Peek at the first few synopses
animes['synopsis'].head()

0    Following their participation at the Inter-Hig...
1    Music accompanies the path of the human metron...
2    The Abyss—a gaping chasm stretching down into ...
3    "In order for something to be obtained, someth...
4    After helping revive the legendary vampire Kis...
Name: synopsis, dtype: object

In [8]:
# We need to convert each synopsis into a word vector, so we will use the TF-IDF vectorizer.

In [9]:
# TF-IDF turns every synopsis into a weighted bag-of-words vector
from sklearn.feature_extraction.text import TfidfVectorizer

# Missing synopses become empty strings so the vectorizer can process them
animes['synopsis'] = animes['synopsis'].fillna('')

# Vectorizer that ignores common English stop words such as 'the' and 'a'
tfidf = TfidfVectorizer(stop_words='english')

# Learn the vocabulary and build the document-term matrix in a single pass
tfidf_matrix = tfidf.fit_transform(animes['synopsis'])

# (number of animes, vocabulary size)
tfidf_matrix.shape

(16214, 43912)

In [10]:
# Since the TF-IDF vectorizer L2-normalizes every row by default,
# calculating the dot product will directly give us the cosine similarity score.
# Therefore, we will use sklearn's linear_kernel() instead of cosine_similarity() since it is faster.

In [11]:
# linear_kernel computes plain dot products between the rows of its inputs
from sklearn.metrics.pairwise import linear_kernel

# Pairwise similarity of every synopsis vector with every other one;
# when Y is omitted, linear_kernel compares the matrix with itself
cosine_sim = linear_kernel(tfidf_matrix)

In [12]:
# We are going to define a function that takes an anime title as input and outputs
# a list of the 10 most similar animes. For this, we first
# need a reverse mapping from anime titles to DataFrame indices. In other words,
# we need a mechanism to identify the index of an anime in our metadata DataFrame, given its title.

In [13]:
# Reverse map: anime title -> row *position* (0 .. len(animes) - 1).
# The similarity matrix has one row per anime in DataFrame order and the
# lookup result is later used with .iloc, so positions are needed here.
# animes.index still carries the original labels after drop_duplicates(),
# which no longer match positions once rows have been removed.
# (The previous trailing Series.drop_duplicates() deduplicated the *values*,
# which are unique by construction, so it had no effect and is omitted.)
indices = pd.Series(range(len(animes)), index=animes['title'])

In [14]:
# Function that takes an anime title as input and outputs the most similar animes
def get_recommendations(name, cosine_sim=cosine_sim, n=10):
    """Return the titles of the ``n`` animes most similar to ``name``.

    Parameters
    ----------
    name : str
        Title to look up in ``indices``. ``indices`` is expected to map a
        title to the row position of that anime in ``cosine_sim``.
    cosine_sim : array-like of shape (n_animes, n_animes)
        Pairwise similarity matrix; row ``i`` holds the similarity of anime
        ``i`` to every anime.
    n : int, default 10
        Number of recommendations to return.

    Returns
    -------
    pandas.Series
        Titles of the recommended animes, most similar first.

    Raises
    ------
    KeyError
        If ``name`` is not a known title.
    """
    # Row position of the anime that matches the title
    idx = indices[name]

    # Similarity of every anime to the query (copied so it can be masked)
    scores = np.array(cosine_sim[idx], dtype=float).ravel()

    # Exclude the query itself explicitly instead of assuming it sorts first:
    # an anime with an empty synopsis has similarity 0 to everything,
    # including itself, and ties can also push it out of first place.
    scores[idx] = -np.inf

    # Highest score first; the stable sort keeps ties in catalogue order
    top_positions = np.argsort(-scores, kind="stable")[:n]

    # Return the titles of the most similar animes
    return animes['title'].iloc[top_positions]

In [15]:
# Example query: animes whose synopsis is closest to that of 'Made in Abyss'
get_recommendations(name='Made in Abyss')

905                   Pandora Hearts
383                    Tsumiki no Ie
3347            Mahoutsukai Precure!
13846                 Whistle! (ONA)
229         Jigoku Shoujo Futakomori
12985            Gundam Build Divers
6600     Gundam Build Divers Re:Rise
6247                       Plunderer
7770        Yasashii Fue, Tori, Ishi
9504                        Kokuhaku
Name: title, dtype: object

# Métricas

In [16]:
# Training interactions; the file has no header row, so column names are supplied here
df_train = pd.read_csv(
    "train",
    sep=",",
    header=None,
    names=["userid", "itemid", "rating"],
)

# Binarise the ratings: 1 when the rating is at least 5, otherwise 0
df_train["rating"] = (df_train["rating"] >= 5).astype("int64")

df_train.head()

Unnamed: 0,userid,itemid,rating
0,1635,6675.0,1
1,3340,7785.0,1
2,1015,578.0,1
3,87,18617.0,1
4,5582,8425.0,1


In [17]:
# Held-out interactions, read with the same column layout as the training file;
# the ratings are left on their original scale
df_test = pd.read_csv(
    "test",
    sep=",",
    header=None,
    names=["userid", "itemid", "rating"],
)

df_test.head()

Unnamed: 0,userid,itemid,rating
0,20408,8426.0,8
1,22049,5114.0,10
2,4728,4224.0,10
3,690,6746.0,9
4,254,481.0,6


In [18]:
# Map each anime uid to its row position (0 .. len(animes) - 1)
uid_to_idx = pd.Series(np.arange(len(animes), dtype="int64"), index=animes['uid'])

In [19]:
# Display the mapping as the cell's result (the notebook truncates long Series)
uid_to_idx

uid
28891        0
23273        1
34599        2
5114         3
31758        4
         ...  
10075    16209
35828    16210
10378    16211
33082    16212
16934    16213
Length: 16214, dtype: int64


In [20]:
# Group the training interactions by user: userid -> list of itemids, in file order
user_items = {}
for interaction in df_train.itertuples(index=False):
    user_items.setdefault(interaction.userid, []).append(interaction.itemid)

In [21]:
def recommend(user_id, n=10):
    """Recommend up to ``n`` anime uids for a user based on content similarity.

    Every anime the user has in ``user_items`` contributes its row of
    ``cosine_sim``; the rows are summed into one score per catalogue anime,
    the animes the user has already seen are masked out, and the
    best-scoring remaining animes are returned.

    Parameters
    ----------
    user_id : hashable
        Key into ``user_items``. A user without any training interaction
        gets an all-zero score vector, so the result is an arbitrary slice of
        the catalogue rather than a personalised list.
    n : int, default 10
        Maximum number of recommendations.

    Returns
    -------
    numpy.ndarray
        ``uid`` values of the recommended animes, best first. Animes the user
        has already seen are never returned, so fewer than ``n`` uids come
        back when fewer than ``n`` unseen animes exist.
    """
    seen_items = user_items.get(user_id, [])

    n_animes = cosine_sim.shape[0]
    scores = np.zeros(n_animes, dtype=float)
    seen_positions = []

    for item in seen_items:
        # Series.get returns None for an unknown uid; it does not raise
        # KeyError, so the miss has to be checked explicitly.
        pos = uid_to_idx.get(item)
        if pos is None:
            continue

        # A uid that occurs more than once in the catalogue yields several
        # positions; treat the scalar and the multi-row case uniformly.
        positions = np.atleast_1d(np.asarray(pos, dtype=int))

        rows = cosine_sim[positions]
        if hasattr(rows, "toarray"):
            # tolerate a sparse similarity matrix
            rows = rows.toarray()

        scores += np.asarray(rows, dtype=float).reshape(-1, n_animes).sum(axis=0)
        seen_positions.extend(positions.tolist())

    # Exclude items the user has already seen
    if seen_positions:
        scores[seen_positions] = -np.inf

    top_idxs = np.argsort(scores)[::-1][:n]
    # Drop masked entries that slipped in because n exceeds the unseen count
    top_idxs = top_idxs[np.isfinite(scores[top_idxs])]

    return animes["uid"].iloc[top_idxs].values

In [22]:
# get_metrics comes from the project-local `evaluate` module (not shown in this notebook)
from evaluate import get_metrics

# Evaluate the `recommend` function using the train and test interactions.
# NOTE(review): presumably k is the length of the recommendation list and alpha the
# disparity threshold (the report below prints "delta_threshold": 0.2) — confirm in evaluate.py
get_metrics(df_train, df_test, recommend, k=10, alpha=0.2)

Evaluando usuarios: 100%|██████████| 15089/15089 [00:07<00:00, 2066.32it/s]

--- Métricas Globales de Evaluación ---
{
  "mean_recall": 0.02412206888814092,
  "mean_precision": 0.0026141590934589927,
  "mean_ap (MAP)": 0.011307742925884967,
  "mean_ndcg": 0.014370915071319518,
  "mean_novelty": 7.910143623465038,
  "mean_diversity": 0.7774642903529548,
  "num_users_evaluated": 8913
}

--- Reporte de Fairness (Disparidad de Grupo) ---
{
  "delta_threshold": 0.2,
  "group_averages": {
    "Female": {
      "recall": 0.01638001638001638,
      "precision": 0.001801801801801802,
      "MAP": 0.008532314782314783,
      "nDCG": 0.010441031076626639,
      "novelty": 7.771655073351902,
      "diversity": 0.7852992808674627,
      "count": 2442
    },
    "Male": {
      "recall": 0.027043733580590326,
      "precision": 0.002920723226703755,
      "MAP": 0.012355122855818267,
      "nDCG": 0.015853958915399258,
      "novelty": 7.962405876497998,
      "diversity": 0.7745075530887875,
      "count": 6471
    }
  },
  "disparity_reports": [
    {
      "pair": [
     




In [23]:
# Same evaluation with use_age_group=True; the fairness report below is then
# broken down by age range ("25-34", "35-44", ...) instead of by gender
get_metrics(df_train, df_test, recommend, k=10, alpha=0.2, use_age_group=True)

Evaluando usuarios: 100%|██████████| 15089/15089 [00:05<00:00, 2523.93it/s]

--- Métricas Globales de Evaluación ---
{
  "mean_recall": 0.024471783899438354,
  "mean_precision": 0.002674511901577962,
  "mean_ap (MAP)": 0.011545533565124366,
  "mean_ndcg": 0.014645553776250774,
  "mean_novelty": 7.908137872494969,
  "mean_diversity": 0.7765089598556657,
  "num_users_evaluated": 7478
}

--- Reporte de Fairness (Disparidad de Grupo) ---
{
  "delta_threshold": 0.2,
  "group_averages": {
    "25-34": {
      "recall": 0.02039274924471299,
      "precision": 0.002209214501510574,
      "MAP": 0.009195881587781134,
      "nDCG": 0.01189964953074867,
      "novelty": 7.9044865179921375,
      "diversity": 0.773434288934509,
      "count": 5296
    },
    "35-44": {
      "recall": 0.0363262508567512,
      "precision": 0.003975325565455792,
      "MAP": 0.016945722771630926,
      "nDCG": 0.021497058937134304,
      "novelty": 7.946241968932145,
      "diversity": 0.7832744604374716,
      "count": 1459
    },
    "18-24": {
      "recall": 0.03303964757709251,
      "


