In [1]:
#Data Pre-processing
import pandas as pd
df = pd.read_csv("anime.csv")
df

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266
...,...,...,...,...,...,...,...
12289,9316,Toushindai My Lover: Minami tai Mecha-Minami,Hentai,OVA,1,4.15,211
12290,5543,Under World,Hentai,OVA,1,4.28,183
12291,5621,Violence Gekiga David no Hoshi,Hentai,OVA,4,4.88,219
12292,6133,Violence Gekiga Shin David no Hoshi: Inma Dens...,Hentai,OVA,1,4.98,175


In [2]:
df = df.drop_duplicates()
df

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266
...,...,...,...,...,...,...,...
12289,9316,Toushindai My Lover: Minami tai Mecha-Minami,Hentai,OVA,1,4.15,211
12290,5543,Under World,Hentai,OVA,1,4.28,183
12291,5621,Violence Gekiga David no Hoshi,Hentai,OVA,4,4.88,219
12292,6133,Violence Gekiga Shin David no Hoshi: Inma Dens...,Hentai,OVA,1,4.98,175


In [3]:
df['name'] = (df['name'].str.replace('&amp;#039;', "'", regex=False).str.replace('&amp;amp;', '&', regex=False).str.strip())
df['name']

0                                           Kimi no Na wa.
1                         Fullmetal Alchemist: Brotherhood
2                                                 Gintama°
3                                              Steins;Gate
4                                            Gintama&#039;
                               ...                        
12289         Toushindai My Lover: Minami tai Mecha-Minami
12290                                          Under World
12291                       Violence Gekiga David no Hoshi
12292    Violence Gekiga Shin David no Hoshi: Inma Dens...
12293                     Yasuji no Pornorama: Yacchimae!!
Name: name, Length: 12294, dtype: object

In [4]:
# Handle missing values 
for col in ['genre', 'type', 'episodes']:
    df[col] = df[col].fillna('Unknown')
df[col]

0         1
1        64
2        51
3        24
4        51
         ..
12289     1
12290     1
12291     4
12292     1
12293     1
Name: episodes, Length: 12294, dtype: object

In [5]:
# Quick exploration
df.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [6]:
#Handel Null Value
df.isna().sum()

anime_id      0
name          0
genre         0
type          0
episodes      0
rating      230
members       0
dtype: int64

In [7]:
#Feature Engineering
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [8]:
# Genres → Bag-of-Words 
vectoriser = CountVectorizer(tokenizer=lambda s: [g.strip() for g in s.split(',')])
vectoriser

In [9]:
genre_bow   = vectoriser.fit_transform(df['genre'])
genre_bow 



<12294x44 sparse matrix of type '<class 'numpy.int64'>'
	with 36346 stored elements in Compressed Sparse Row format>

In [10]:
#Rating → 0-1 scaled column
scaler = MinMaxScaler()
rating_norm = scaler.fit_transform(df[['rating']].fillna(0))
rating_norm

array([[0.937],
       [0.926],
       [0.925],
       ...,
       [0.488],
       [0.498],
       [0.546]])

In [11]:
#genres + rating
from scipy import sparse
feature_mat = sparse.hstack([genre_bow, rating_norm])
feature_mat

<12294x45 sparse matrix of type '<class 'numpy.float64'>'
	with 48410 stored elements in COOrdinate format>

In [12]:
#Similarity Matrix
cosine_sim = cosine_similarity(feature_mat, dense_output=False)

In [13]:
#Recommendation Function
#helper look-ups
index_of = pd.Series(df.index, index=df['name']).to_dict()
index_of

{'Kimi no Na wa.': 0,
 'Fullmetal Alchemist: Brotherhood': 1,
 'Gintama°': 2,
 'Steins;Gate': 3,
 'Gintama&#039;': 4,
 'Haikyuu!!: Karasuno Koukou VS Shiratorizawa Gakuen Koukou': 5,
 'Hunter x Hunter (2011)': 6,
 'Ginga Eiyuu Densetsu': 7,
 'Gintama Movie: Kanketsu-hen - Yorozuya yo Eien Nare': 8,
 'Gintama&#039;: Enchousen': 9,
 'Clannad: After Story': 10,
 'Koe no Katachi': 11,
 'Gintama': 12,
 'Code Geass: Hangyaku no Lelouch R2': 13,
 'Haikyuu!! Second Season': 14,
 'Sen to Chihiro no Kamikakushi': 15,
 'Shigatsu wa Kimi no Uso': 16,
 'Mushishi Zoku Shou 2nd Season': 17,
 'Ookami Kodomo no Ame to Yuki': 18,
 'Code Geass: Hangyaku no Lelouch': 19,
 'Hajime no Ippo': 20,
 'Rurouni Kenshin: Meiji Kenkaku Romantan - Tsuioku-hen': 21,
 'Cowboy Bebop': 22,
 'One Punch Man': 23,
 'Mononoke Hime': 24,
 'Suzumiya Haruhi no Shoushitsu': 25,
 'Monogatari Series: Second Season': 26,
 'Mushishi Zoku Shou': 27,
 'Mushishi': 28,
 'Tengen Toppa Gurren Lagann': 29,
 'Great Teacher Onizuka': 30,
 '

In [14]:
titles= df['name'].tolist()
titles

['Kimi no Na wa.',
 'Fullmetal Alchemist: Brotherhood',
 'Gintama°',
 'Steins;Gate',
 'Gintama&#039;',
 'Haikyuu!!: Karasuno Koukou VS Shiratorizawa Gakuen Koukou',
 'Hunter x Hunter (2011)',
 'Ginga Eiyuu Densetsu',
 'Gintama Movie: Kanketsu-hen - Yorozuya yo Eien Nare',
 'Gintama&#039;: Enchousen',
 'Clannad: After Story',
 'Koe no Katachi',
 'Gintama',
 'Code Geass: Hangyaku no Lelouch R2',
 'Haikyuu!! Second Season',
 'Sen to Chihiro no Kamikakushi',
 'Shigatsu wa Kimi no Uso',
 'Mushishi Zoku Shou 2nd Season',
 'Ookami Kodomo no Ame to Yuki',
 'Code Geass: Hangyaku no Lelouch',
 'Hajime no Ippo',
 'Rurouni Kenshin: Meiji Kenkaku Romantan - Tsuioku-hen',
 'Cowboy Bebop',
 'One Punch Man',
 'Mononoke Hime',
 'Suzumiya Haruhi no Shoushitsu',
 'Monogatari Series: Second Season',
 'Mushishi Zoku Shou',
 'Mushishi',
 'Tengen Toppa Gurren Lagann',
 'Great Teacher Onizuka',
 'Natsume Yuujinchou Go',
 'Hajime no Ippo: New Challenger',
 'Mushishi Zoku Shou: Suzu no Shizuku',
 'Natsume Yuuji

In [15]:
def recommend(title, top_k=10, score_threshold=0.4):
    """Return top-k similar anime above threshold (excludes the query itself)."""
    idx   = index_of.get(title)
    if idx is None:                       # unknown title
        return []
    sims      = list(enumerate(cosine_sim[idx].toarray().ravel()))
    sims      = [(i, score) for i, score in sims if i != idx and score >= score_threshold]
    sims.sort(key=lambda x: x[1], reverse=True)
    return [(titles[i], round(score, 3)) for i, score in sims[:top_k]]

#Example call
recommend("Kimi no Na wa.")

[('Wind: A Breath of Heart OVA', 0.991),
 ('Wind: A Breath of Heart (TV)', 0.99),
 ('Aura: Maryuuin Kouga Saigo no Tatakai', 0.904),
 ('Kokoro ga Sakebitagatterunda.', 0.891),
 ('Clannad: After Story - Mou Hitotsu no Sekai, Kyou-hen', 0.89),
 ('Angel Beats!: Another Epilogue', 0.889),
 ('True Tears', 0.888),
 ('&quot;Bungaku Shoujo&quot; Memoire', 0.888),
 ('Harmonie', 0.888),
 ('Kimikiss Pure Rouge', 0.888)]

In [16]:
##Evaluation
from sklearn.model_selection import train_test_split
from collections import Counter

In [17]:
#Split on rows- 80/20
train_idx, test_idx = train_test_split(df.index, test_size=0.2, random_state=42)
train_mask = np.zeros(len(df), dtype=bool); train_mask[train_idx] = True
train_mask

array([False,  True,  True, ...,  True, False,  True])

In [18]:
hits, total_rec, total_possible = 0, 0, 0
for idx in test_idx:
    title      = titles[idx]
    recs       = recommend(title, top_k=5, score_threshold=0.3)
    rec_titles = {r[0] for r in recs}
    total_rec += len(recs)

    test_genres = set([g.strip() for g in df.at[idx, 'genre'].split(',')])
    for rt in rec_titles:
        if not recs: continue
        test_possible = len(test_genres)          # any shared genre counts as success
        rec_genres  = set([g.strip() for g in df.loc[index_of[rt], 'genre'].split(',')])
        if test_genres & rec_genres:
            hits += 1
    total_possible += test_possible

precision_at5 = hits / total_rec
recall_at5    = hits / total_possible
f"P@5={precision_at5:.2%}, R@5={recall_at5:.2%}"

'P@5=99.98%, R@5=166.53%'

In [19]:
##################Interview Questions:###################
###Can you explain the difference between user-based and item-based collaborative filtering?
#Ans-User-based: Finds similar users and recommends items liked by them.
#Ans-Item-based: Finds similar items and recommends those similar to what the user already liked.
###What is collaborative filtering, and how does it work?
#Collaborative Filtering (CF) is a recommendation system technique that suggests items (movies, books, products, music, etc.)
#to users based on the preferences and behaviors of other similar users.