# Content Based Recommender system

In [43]:
# In this recommender system the content of each anime
# (synopsis, genre, etc.) is used to measure its similarity to other animes.
# The animes most similar to a given one are then recommended.

In [44]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [45]:
# Load the cleaned anime catalogue (path is relative to the notebook's working directory).
animes = pd.read_csv('clean_data/animes.csv')

In [46]:
# Preview the first rows to check the available columns (uid, title, synopsis, genre, ...).
animes.head()

Unnamed: 0,uid,title,synopsis,genre,aired,episodes,members,popularity,ranked,score,img_url,link
0,28891,Haikyuu!! Second Season,Following their participation at the Inter-Hig...,"['Comedy', 'Sports', 'Drama', 'School', 'Shoun...","Oct 4, 2015 to Mar 27, 2016",25.0,489888,141,25.0,8.82,https://cdn.myanimelist.net/images/anime/9/766...,https://myanimelist.net/anime/28891/Haikyuu_Se...
1,23273,Shigatsu wa Kimi no Uso,Music accompanies the path of the human metron...,"['Drama', 'Music', 'Romance', 'School', 'Shoun...","Oct 10, 2014 to Mar 20, 2015",22.0,995473,28,24.0,8.83,https://cdn.myanimelist.net/images/anime/3/671...,https://myanimelist.net/anime/23273/Shigatsu_w...
2,34599,Made in Abyss,The Abyss—a gaping chasm stretching down into ...,"['Sci-Fi', 'Adventure', 'Mystery', 'Drama', 'F...","Jul 7, 2017 to Sep 29, 2017",13.0,581663,98,23.0,8.83,https://cdn.myanimelist.net/images/anime/6/867...,https://myanimelist.net/anime/34599/Made_in_Abyss
3,5114,Fullmetal Alchemist: Brotherhood,"""In order for something to be obtained, someth...","['Action', 'Military', 'Adventure', 'Comedy', ...","Apr 5, 2009 to Jul 4, 2010",64.0,1615084,4,1.0,9.23,https://cdn.myanimelist.net/images/anime/1223/...,https://myanimelist.net/anime/5114/Fullmetal_A...
4,31758,Kizumonogatari III: Reiketsu-hen,After helping revive the legendary vampire Kis...,"['Action', 'Mystery', 'Supernatural', 'Vampire']","Jan 6, 2017",1.0,214621,502,22.0,8.83,https://cdn.myanimelist.net/images/anime/3/815...,https://myanimelist.net/anime/31758/Kizumonoga...


In [47]:
# Drop duplicate titles (the first occurrence is kept) and renumber the rows.
# Resetting the index keeps row labels equal to row positions; the TF-IDF and
# similarity matrices built from this frame are aligned with row positions, so
# leaving gaps in the labels makes label-based lookups point at the wrong row.
animes = animes.drop_duplicates(subset=['title']).reset_index(drop=True)

### Plot description based Recommender

In [48]:
# We will compute pairwise similarity scores for all animes based on their plot descriptions and
# recommend animes based on that similarity score.
# The plot description is stored in the `synopsis` column of our dataset. Let's take a look at the data.

In [49]:
# Peek at the raw synopsis text that will be vectorized.
animes.synopsis.head()

0    Following their participation at the Inter-Hig...
1    Music accompanies the path of the human metron...
2    The Abyss—a gaping chasm stretching down into ...
3    "In order for something to be obtained, someth...
4    After helping revive the legendary vampire Kis...
Name: synopsis, dtype: object

In [50]:
# We need to convert each description into a word vector, so we will use the TF-IDF vectorizer.

In [51]:
# TF-IDF turns every synopsis into a sparse, weighted bag-of-words vector
from sklearn.feature_extraction.text import TfidfVectorizer

# Missing synopses become empty strings so the vectorizer never receives NaN
animes['synopsis'] = animes['synopsis'].fillna('')

# Vectorizer that ignores English stop words such as 'the' and 'a'
tfidf = TfidfVectorizer(stop_words='english')

# Learn the vocabulary and build the TF-IDF matrix (one row per anime) in one pass
tfidf_matrix = tfidf.fit_transform(animes['synopsis'])

# (number of animes, number of distinct terms)
tfidf_matrix.shape

(16214, 43912)

In [52]:
# TfidfVectorizer L2-normalizes every row by default, so the dot product of two rows
# is already their cosine similarity.
# Therefore, we will use sklearn's linear_kernel() instead of cosine_similarity() since it is faster.

In [53]:
# linear_kernel computes plain dot products between rows
from sklearn.metrics.pairwise import linear_kernel

# Pairwise similarity of every synopsis with every other one; when the second
# argument is omitted, linear_kernel compares the matrix with itself.
# The result is a dense square array, so memory grows with the square of the
# number of animes (about 2 GB of float64 for the 16214 rows reported above).
cosine_sim = linear_kernel(tfidf_matrix)

In [54]:
# We are going to define a function that takes an anime title as input and outputs
# a list of the 10 most similar animes. For this,
# we first need a reverse mapping from anime titles to row positions. In other words,
# we need a way to find the row of an anime in the similarity matrix, given its title.

In [55]:
# Construct a reverse map from anime title to row position.
# `cosine_sim` rows and `.iloc` lookups are positional, while `animes.index` may
# have gaps after duplicate titles are dropped, so the map stores
# 0..len(animes)-1 instead of the DataFrame's index labels.
indices = pd.Series(range(len(animes)), index=animes['title'])
# Keep a single entry per title so that `indices[title]` is always a scalar.
indices = indices[~indices.index.duplicated(keep='first')]

In [56]:
# Function that takes in anime title as input and outputs most similar animes
def get_recommendations(name, cosine_sim=cosine_sim, top_n=10):
    """Return the titles of the animes most similar to ``name``.

    Args:
        name: Title to look up in the module-level ``indices`` map. ``indices``
            must map a title to its row position in ``cosine_sim``.
        cosine_sim: Pairwise similarity matrix; row ``i`` holds the similarity
            of anime ``i`` to every anime.
        top_n: How many recommendations to return (default 10).

    Returns:
        pandas.Series of titles taken from ``animes``, most similar first.

    Raises:
        KeyError: if ``name`` is not a known title.
    """
    # Row position of the query anime
    idx = indices[name]

    # Similarity of the query anime to every anime
    sim_scores = np.asarray(cosine_sim[idx], dtype=float).ravel()

    # Rank by descending similarity; the stable sort keeps ties in row order
    ranked = np.argsort(-sim_scores, kind='stable')

    # Remove the query anime by position. It is not guaranteed to be ranked
    # first (an empty synopsis gives an all-zero row, and identical synopses tie
    # at the top), so slicing off the first element is not reliable.
    ranked = ranked[ranked != idx][:top_n]

    # Titles of the most similar animes, best first
    return animes['title'].iloc[ranked]

In [57]:
# Example query: titles whose synopsis is closest to that of 'Made in Abyss'.
get_recommendations('Made in Abyss')

905                   Pandora Hearts
383                    Tsumiki no Ie
3347            Mahoutsukai Precure!
13846                 Whistle! (ONA)
229         Jigoku Shoujo Futakomori
12985            Gundam Build Divers
6600     Gundam Build Divers Re:Rise
6247                       Plunderer
7770        Yasashii Fue, Tori, Ishi
9504                        Kokuhaku
Name: title, dtype: object

# Metrics

In [58]:
# Training interactions, read as a headerless file with explicit column names
df_train = pd.read_csv(
    "train", header=None, names=["userid", "itemid", "rating"], sep=","
)

# Binarize the rating: 1 when the original score is at least 5, otherwise 0
df_train["rating"] = (df_train["rating"] >= 5).astype("int64")

df_train.head()

Unnamed: 0,userid,itemid,rating
0,14179,13601.0,1
1,37548,34300.0,1
2,796,2592.0,0
3,3041,949.0,1
4,2493,5114.0,1


In [59]:
# Held-out interactions, read as a headerless file with explicit column names.
# Ratings keep their original scale here (they are not binarized in this cell).
df_test = pd.read_csv(
    "test", header=None, names=["userid", "itemid", "rating"], sep=","
)

df_test.head()

Unnamed: 0,userid,itemid,rating
0,506,37517.0,10
1,16392,7311.0,9
2,553,12471.0,5
3,13348,8937.0,6
4,276,35997.0,3


In [60]:
# Map each anime uid to its row position in `animes` (0 .. number of rows - 1),
# which is also its row in the similarity matrix built from that frame.
uid_to_idx = pd.Series(range(animes.shape[0]), index=animes['uid'])

In [61]:
# Inspect the uid -> row-position map.
print(uid_to_idx)

uid
28891        0
23273        1
34599        2
5114         3
31758        4
         ...  
10075    16209
35828    16210
10378    16211
33082    16212
16934    16213
Length: 16214, dtype: int64


In [62]:
# Group the training interactions by user: userid -> list of itemids, in file order.
# Every interaction is kept regardless of its (binarized) rating.
user_items = {}
for user_id, item_id in zip(df_train["userid"], df_train["itemid"]):
    user_items.setdefault(user_id, []).append(item_id)

In [63]:
def recommend(user_id, n=10):
    """Recommend up to ``n`` unseen anime uids for a user by content similarity.

    The user's profile is the list of items stored for them in ``user_items``.
    Every anime is scored by the sum of its similarities (rows of
    ``cosine_sim``) to the profile items; profile items themselves are then
    excluded and the highest-scoring animes are returned.

    Args:
        user_id: Key into ``user_items``. Unknown users get an empty profile,
            in which case every score is zero and the returned order is
            arbitrary.
        n: Maximum number of recommendations to return.

    Returns:
        numpy.ndarray of ``uid`` values (not titles) from ``animes``, best
        first. It holds fewer than ``n`` entries only when fewer than ``n``
        unseen animes exist.
    """
    seen_items = user_items.get(user_id, [])

    n_animes = cosine_sim.shape[0]
    scores = np.zeros(n_animes, dtype=float)
    seen_positions = []

    for item in seen_items:
        # Series.get returns None for an unknown uid (it does not raise
        # KeyError), so the miss has to be checked explicitly.
        pos = uid_to_idx.get(item)
        if pos is None:
            continue

        # A uid that occurs more than once in `animes` yields several
        # positions; treat the scalar and the multi-row case uniformly.
        positions = np.atleast_1d(np.asarray(pos, dtype=np.intp))
        for p in positions:
            row = cosine_sim[p]
            if hasattr(row, "toarray"):  # row of a sparse similarity matrix
                row = row.toarray()
            scores += np.asarray(row, dtype=float).ravel()
        seen_positions.extend(positions.tolist())

    # Exclude items the user has already seen
    if seen_positions:
        scores[seen_positions] = -np.inf

    top_idxs = np.argsort(scores)[::-1][:n]
    # Never return an excluded item, even when n exceeds the number of unseen animes
    top_idxs = top_idxs[np.isfinite(scores[top_idxs])]

    return animes["uid"].iloc[top_idxs].values

In [64]:
# get_metrics comes from the project-local `evaluate` module (not shown in this notebook).
from evaluate import get_metrics

# Evaluate `recommend` using the train and test interactions.
# NOTE(review): k=10 is presumably the recommendation cutoff, and alpha=0.2 appears to
# match the "delta_threshold" printed in the fairness report — confirm in evaluate.py.
get_metrics(df_train, df_test, recommend, k=10, alpha=0.2)

Evaluando usuarios: 100%|██████████| 18591/18591 [00:08<00:00, 2067.51it/s]

--- Métricas Globales de Evaluación ---
{
  "mean_recall": 0.018764589692943076,
  "mean_precision": 0.002011133057999641,
  "mean_ap (MAP)": 0.008310727325586366,
  "mean_ndcg": 0.010824530317495812,
  "mean_novelty": 7.961710822273875,
  "mean_diversity": 0.7809621489942612,
  "num_users_evaluated": 11138
}

--- Reporte de Fairness (Disparidad de Grupo) ---
{
  "delta_threshold": 0.2,
  "group_averages": {
    "Male": {
      "recall": 0.02057256165571942,
      "precision": 0.0022059734787458173,
      "MAP": 0.009053584461007933,
      "nDCG": 0.011823883082522118,
      "novelty": 7.993309908435526,
      "diversity": 0.7782443567465461,
      "count": 8069
    },
    "Female": {
      "recall": 0.01401107852720756,
      "precision": 0.0014988595633756926,
      "MAP": 0.0063576109275034,
      "nDCG": 0.008197036846985131,
      "novelty": 7.878630657321656,
      "diversity": 0.7881077552656244,
      "count": 3069
    }
  },
  "disparity_reports": [
    {
      "pair": [
     


