In [1]:
# Step 1: Load and Explore the Dataset
import pandas as pd

In [2]:
# Load the dataset
from google.colab import files
uploaded = files.upload()

Saving anime.csv to anime.csv


In [3]:
df = pd.read_csv('anime.csv')

In [4]:
# Display basic info
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12294 non-null  int64  
 1   name      12294 non-null  object 
 2   genre     12232 non-null  object 
 3   type      12269 non-null  object 
 4   episodes  12294 non-null  object 
 5   rating    12064 non-null  float64
 6   members   12294 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 672.5+ KB
None


In [5]:
# Show first few rows
print(df.head())

   anime_id                              name  \
0     32281                    Kimi no Na wa.   
1      5114  Fullmetal Alchemist: Brotherhood   
2     28977                          Gintama°   
3      9253                       Steins;Gate   
4      9969                     Gintama&#039;   

                                               genre   type episodes  rating  \
0               Drama, Romance, School, Supernatural  Movie        1    9.37   
1  Action, Adventure, Drama, Fantasy, Magic, Mili...     TV       64    9.26   
2  Action, Comedy, Historical, Parody, Samurai, S...     TV       51    9.25   
3                                   Sci-Fi, Thriller     TV       24    9.17   
4  Action, Comedy, Historical, Parody, Samurai, S...     TV       51    9.16   

   members  
0   200630  
1   793665  
2   114262  
3   673572  
4   151266  


In [6]:
# Check for missing values
print(df.isnull().sum())


anime_id      0
name          0
genre        62
type         25
episodes      0
rating      230
members       0
dtype: int64


In [7]:
# Step 2: Handle Missing Values
# Fill missing numerical values with mean
df.fillna(df.mean(numeric_only=True), inplace=True)


In [8]:
# Fill missing categorical values with mode
df.fillna(df.mode().iloc[0], inplace=True)

In [9]:
# Step 3: Feature Selection
# Select important columns
df = df[['anime_id', 'name', 'genre', 'rating', 'members']]

In [10]:
# Step 4: Convert Categorical Features into Numeric
from sklearn.feature_extraction.text import TfidfVectorizer

# Convert genre column into numerical representation
tfidf = TfidfVectorizer(stop_words='english')
genre_matrix = tfidf.fit_transform(df['genre'].astype(str))

In [11]:
# Step 5: Compute Cosine Similarity
from sklearn.metrics.pairwise import cosine_similarity

# Compute cosine similarity
cosine_sim = cosine_similarity(genre_matrix, genre_matrix)


In [14]:
# Step 6: Build the Recommendation Function
def recommend_anime(title, df, cosine_sim):
    # Get index of the anime
    idx = df[df['name'] == title].index[0]

    # Get similarity scores for all anime
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort by similarity score
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

     # Get top 5 similar anime
    sim_scores = sim_scores[1:6]

    # Get anime names
    anime_indices = [i[0] for i in sim_scores]
    return df['name'].iloc[anime_indices]

# Example usage
print(recommend_anime("Naruto", df, cosine_sim))

615                                    Naruto: Shippuuden
841                                                Naruto
1103    Boruto: Naruto the Movie - Naruto ga Hokage ni...
1343                                          Naruto x UT
1472          Naruto: Shippuuden Movie 4 - The Lost Tower
Name: name, dtype: object


In [15]:
# Step 7: Evaluation
# To evaluate the system, we can use precision, recall, and F1-score.
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score

# Split dataset (not needed for content-based, but useful for collaborative filtering)
train, test = train_test_split(df, test_size=0.2, random_state=42)

# Placeholder for evaluation (we need user ratings for proper evaluation)
y_true = [1] * len(test)  # Assuming all recommendations are relevant (for simplicity)
y_pred = [1 if score >= 0.5 else 0 for score in cosine_sim[0][:len(test)]]

# Compute precision, recall, and F1-score
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)

print(f"Precision: {precision}, Recall: {recall}, F1-score: {f1}")

Precision: 1.0, Recall: 0.058560390402602684, F1-score: 0.11064156742220514


In [None]:
# Interview Questions:
# 1. Can you explain the difference between user-based and item-based collaborative filtering?
# Ans 1: Feature:	  User-Based Collaborative Filtering:             	                                                        Item-Based Collaborative Filtering:
# Definition:	      Finds similar users and recommends items they liked.	                                                    Finds similar items and recommends them to users.
# Approach:	        "People similar to you liked this, so you might too."	                                                    "You liked this item, so you might like similar items."
# Similarity Basis:	User behavior, past interactions, and ratings.	                                                          Item characteristics and how users interact with them.
# Example:	        If two users have watched the same anime, one user's highly-rated anime will be recommended to the other. If a user liked "Naruto," they may also like "Bleach" because both share common features.
# Computational Complexity:	High for large datasets due to dynamic user preferences.	                                        More stable and computationally efficient.


In [None]:
# 2. What is collaborative filtering, and how does it work?
# Definition:
# Collaborative filtering is a recommendation system technique that predicts a user's preferences by collecting preferences from multiple users.
# How It Works:
# It assumes that:

# Users who have agreed in the past will agree again in the future.
# Items liked by similar users will be liked by the target user.
