In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [2]:
# load the data set 
data = pd.read_csv('/DataScienceNotes/Assignments/16-Recommendation System/anime.csv')

In [3]:
# checking the data
data.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [4]:
# get information about the data
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12294 non-null  int64  
 1   name      12294 non-null  object 
 2   genre     12232 non-null  object 
 3   type      12269 non-null  object 
 4   episodes  12294 non-null  object 
 5   rating    12064 non-null  float64
 6   members   12294 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 672.5+ KB


data is haaving both categorical and numerical columns also having null values

In [5]:
# episodes column seems as categorical
data['episodes'].unique()

array(['1', '64', '51', '24', '10', '148', '110', '13', '201', '25', '22',
       '75', '4', '26', '12', '27', '43', '74', '37', '2', '11', '99',
       'Unknown', '39', '101', '47', '50', '62', '33', '112', '23', '3',
       '94', '6', '8', '14', '7', '40', '15', '203', '77', '291', '120',
       '102', '96', '38', '79', '175', '103', '70', '153', '45', '5',
       '21', '63', '52', '28', '145', '36', '69', '60', '178', '114',
       '35', '61', '34', '109', '20', '9', '49', '366', '97', '48', '78',
       '358', '155', '104', '113', '54', '167', '161', '42', '142', '31',
       '373', '220', '46', '195', '17', '1787', '73', '147', '127', '16',
       '19', '98', '150', '76', '53', '124', '29', '115', '224', '44',
       '58', '93', '154', '92', '67', '172', '86', '30', '276', '59',
       '72', '330', '41', '105', '128', '137', '56', '55', '65', '243',
       '193', '18', '191', '180', '91', '192', '66', '182', '32', '164',
       '100', '296', '694', '95', '68', '117', '151', '130',

numbers are stored as string and the 'Unkown' is making etier column as categorical

In [6]:
# repalce unkown
data['episodes'] = data['episodes'].replace('Unknown', np.nan)

In [7]:
# making it numerical
data['episodes'] = pd.to_numeric(data['episodes'])

In [8]:
# checking data types
data['episodes'].dtypes

dtype('float64')

In [9]:
# checking missing values
data.isna().sum()

anime_id      0
name          0
genre        62
type         25
episodes    340
rating      230
members       0
dtype: int64

In [10]:
# computing missing values
data['episodes'].fillna(data['episodes'].median(), inplace =True)
data['episodes'] = data['episodes'].astype('int64')

In [11]:
data['genre'].fillna('Unknown', inplace=True)

In [12]:
data['type'].fillna('Unknown', inplace=True)

In [13]:
data['rating'].fillna(data['rating'].median(), inplace= True)

In [14]:
# feature selection for recomdation
# converting genre to numerical using tf-idf better than on-hot encoding
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(stop_words = 'english')
genre_tfidf = tfidf.fit_transform(data['genre'])

In [15]:
# normilizing the numerical features, episode, rating & members
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
num_features = scaler.fit_transform(data[['episodes', 'rating', 'members']])

In [16]:
# combining all the features 
from scipy.sparse import hstack
final_features = hstack([genre_tfidf, num_features])

In [17]:
# finding cosine similarity
from sklearn.metrics.pairwise import cosine_similarity
cosine_sim = cosine_similarity(final_features)
cosine_sim

array([[1.        , 0.53222152, 0.46238363, ..., 0.24157137, 0.24807781,
        0.27820617],
       [0.53222152, 1.        , 0.51704586, ..., 0.20970181, 0.21529733,
        0.24142514],
       [0.46238363, 0.51704586, 1.        , ..., 0.24116795, 0.24763425,
        0.27771187],
       ...,
       [0.24157137, 0.20970181, 0.24116795, ..., 1.        , 0.99994463,
        0.99824866],
       [0.24807781, 0.21529733, 0.24763425, ..., 0.99994463, 1.        ,
        0.99881138],
       [0.27820617, 0.24142514, 0.27771187, ..., 0.99824866, 0.99881138,
        1.        ]])

In [18]:
# recomdation function

def recommend_anime(title, data, cosine_sim, top_n=10, similarity_threshold=0.3):
    
    if title not in data['name'].values:
        return "Anime not found"
    
    idx = data[data['name'] == title].index[0]
    similarity_scores = list(enumerate(cosine_sim[idx]))
    
    similarity_scores = sorted(
        similarity_scores, key=lambda x: x[1], reverse=True
    )
    
    filtered_scores = [
        (i, score) for i, score in similarity_scores[1:]
        if score >= similarity_threshold
    ]
    
    top_indices = [i for i, _ in filtered_scores[:top_n]]
    
    return data.loc[top_indices, ['name', 'genre', 'rating', 'episodes']]


In [19]:
# checking with different threshold value
recommend_anime("Naruto", data, cosine_sim, top_n=5, similarity_threshold=0.3)

Unnamed: 0,name,genre,rating,episodes
615,Naruto: Shippuuden,"Action, Comedy, Martial Arts, Shounen, Super P...",7.94,2
206,Dragon Ball Z,"Action, Adventure, Comedy, Fantasy, Martial Ar...",8.32,291
346,Dragon Ball,"Adventure, Comedy, Fantasy, Martial Arts, Shou...",8.16,153
1472,Naruto: Shippuuden Movie 4 - The Lost Tower,"Action, Comedy, Martial Arts, Shounen, Super P...",7.53,1
1573,Naruto: Shippuuden Movie 3 - Hi no Ishi wo Tsu...,"Action, Comedy, Martial Arts, Shounen, Super P...",7.5,1


In [20]:
recommend_anime("Naruto", data, cosine_sim, top_n=10, similarity_threshold=0.2)

Unnamed: 0,name,genre,rating,episodes
615,Naruto: Shippuuden,"Action, Comedy, Martial Arts, Shounen, Super P...",7.94,2
206,Dragon Ball Z,"Action, Adventure, Comedy, Fantasy, Martial Ar...",8.32,291
346,Dragon Ball,"Adventure, Comedy, Fantasy, Martial Arts, Shou...",8.16,153
1472,Naruto: Shippuuden Movie 4 - The Lost Tower,"Action, Comedy, Martial Arts, Shounen, Super P...",7.53,1
1573,Naruto: Shippuuden Movie 3 - Hi no Ishi wo Tsu...,"Action, Comedy, Martial Arts, Shounen, Super P...",7.5,1
486,Boruto: Naruto the Movie,"Action, Comedy, Martial Arts, Shounen, Super P...",8.03,1
1343,Naruto x UT,"Action, Comedy, Martial Arts, Shounen, Super P...",7.58,1
2997,Naruto Soyokazeden Movie: Naruto to Mashin to ...,"Action, Comedy, Martial Arts, Shounen, Super P...",7.11,1
588,Dragon Ball Kai,"Action, Adventure, Comedy, Fantasy, Martial Ar...",7.95,97
1103,Boruto: Naruto the Movie - Naruto ga Hokage ni...,"Action, Comedy, Martial Arts, Shounen, Super P...",7.68,1


In [21]:
recommend_anime("Naruto", data, cosine_sim, top_n=10, similarity_threshold=0.4)

Unnamed: 0,name,genre,rating,episodes
615,Naruto: Shippuuden,"Action, Comedy, Martial Arts, Shounen, Super P...",7.94,2
206,Dragon Ball Z,"Action, Adventure, Comedy, Fantasy, Martial Ar...",8.32,291
346,Dragon Ball,"Adventure, Comedy, Fantasy, Martial Arts, Shou...",8.16,153
1472,Naruto: Shippuuden Movie 4 - The Lost Tower,"Action, Comedy, Martial Arts, Shounen, Super P...",7.53,1
1573,Naruto: Shippuuden Movie 3 - Hi no Ishi wo Tsu...,"Action, Comedy, Martial Arts, Shounen, Super P...",7.5,1
486,Boruto: Naruto the Movie,"Action, Comedy, Martial Arts, Shounen, Super P...",8.03,1
1343,Naruto x UT,"Action, Comedy, Martial Arts, Shounen, Super P...",7.58,1
2997,Naruto Soyokazeden Movie: Naruto to Mashin to ...,"Action, Comedy, Martial Arts, Shounen, Super P...",7.11,1
588,Dragon Ball Kai,"Action, Adventure, Comedy, Fantasy, Martial Ar...",7.95,97
1103,Boruto: Naruto the Movie - Naruto ga Hokage ni...,"Action, Comedy, Martial Arts, Shounen, Super P...",7.68,1


In [22]:
recommend_anime("Naruto", data, cosine_sim, top_n=10, similarity_threshold=0.6)

Unnamed: 0,name,genre,rating,episodes
615,Naruto: Shippuuden,"Action, Comedy, Martial Arts, Shounen, Super P...",7.94,2
206,Dragon Ball Z,"Action, Adventure, Comedy, Fantasy, Martial Ar...",8.32,291
346,Dragon Ball,"Adventure, Comedy, Fantasy, Martial Arts, Shou...",8.16,153
1472,Naruto: Shippuuden Movie 4 - The Lost Tower,"Action, Comedy, Martial Arts, Shounen, Super P...",7.53,1
1573,Naruto: Shippuuden Movie 3 - Hi no Ishi wo Tsu...,"Action, Comedy, Martial Arts, Shounen, Super P...",7.5,1
486,Boruto: Naruto the Movie,"Action, Comedy, Martial Arts, Shounen, Super P...",8.03,1
1343,Naruto x UT,"Action, Comedy, Martial Arts, Shounen, Super P...",7.58,1
2997,Naruto Soyokazeden Movie: Naruto to Mashin to ...,"Action, Comedy, Martial Arts, Shounen, Super P...",7.11,1
588,Dragon Ball Kai,"Action, Adventure, Comedy, Fantasy, Martial Ar...",7.95,97
1103,Boruto: Naruto the Movie - Naruto ga Hokage ni...,"Action, Comedy, Martial Arts, Shounen, Super P...",7.68,1


User-Based Collaborative Filtering

User-based collaborative filtering recommends items by finding users with similar preferences.

Item-based collaborative filtering recommends items by finding items similar to those the user already liked.

Collaborative Filtering is a recommendation technique that predicts a user’s interests by learning from interactions of many users instead of relying 

on item content.