# **Recommendation System using Cosine Similarity on Anime Dataset**

# Data Preprocessing

In [2]:
# Load the dataset
from google.colab import files
uploaded = files.upload()
import pandas as pd
df = pd.read_csv('anime.csv')

Saving anime.csv to anime (2).csv


In [3]:
df

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266
...,...,...,...,...,...,...,...
12289,9316,Toushindai My Lover: Minami tai Mecha-Minami,Hentai,OVA,1,4.15,211
12290,5543,Under World,Hentai,OVA,1,4.28,183
12291,5621,Violence Gekiga David no Hoshi,Hentai,OVA,4,4.88,219
12292,6133,Violence Gekiga Shin David no Hoshi: Inma Dens...,Hentai,OVA,1,4.98,175


In [4]:
import pandas as pd
import numpy as np

# Load the dataset
df = pd.read_csv('anime.csv')

# Handle missing values
num_cols = df.select_dtypes(include=['int64', 'float64']).columns
cat_cols = df.select_dtypes(include=['object']).columns

# Fill missing values in numerical columns with the mean
df[num_cols] = df[num_cols].fillna(df[num_cols].mean())

# Fill missing values in categorical columns with the mode
df[cat_cols] = df[cat_cols].fillna(df[cat_cols].mode().iloc[0])

# Explore the dataset
print("\nDisplaying the first few rows")
print(df.head())  # Display the first few rows
print("\nDisplaying dataset information")
print(df.info())  # Display dataset information
print("\nDisplaying summary statistics")
print(df.describe())  # Display summary statistics


Displaying the first few rows
   anime_id                              name  \
0     32281                    Kimi no Na wa.   
1      5114  Fullmetal Alchemist: Brotherhood   
2     28977                          Gintama°   
3      9253                       Steins;Gate   
4      9969                     Gintama&#039;   

                                               genre   type episodes  rating  \
0               Drama, Romance, School, Supernatural  Movie        1    9.37   
1  Action, Adventure, Drama, Fantasy, Magic, Mili...     TV       64    9.26   
2  Action, Comedy, Historical, Parody, Samurai, S...     TV       51    9.25   
3                                   Sci-Fi, Thriller     TV       24    9.17   
4  Action, Comedy, Historical, Parody, Samurai, S...     TV       51    9.16   

   members  
0   200630  
1   793665  
2   114262  
3   673572  
4   151266  

Displaying dataset information
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data co

In [5]:
df.isnull().sum()

Unnamed: 0,0
anime_id,0
name,0
genre,0
type,0
episodes,0
rating,0
members,0


In [6]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the genre feature
genre_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
# Correctly access the 'genre' column within the DataFrame
genre_onehot = genre_encoder.fit_transform(df[['genre']])

# Concatenate the one-hot encoded genre feature with the rating feature
features = pd.concat([pd.DataFrame(genre_onehot), df[['rating']]], axis=1)
features.index = df['anime_id']

#Step 3: Recommendation System

In [12]:
from sklearn.metrics.pairwise import cosine_similarity

def recommend_anime(target_anime_id, num_recommendations=5, threshold=0.5):
    # Get the feature vector for the target anime
    target_features = features.loc[target_anime_id]

    # Convert target_features to a NumPy array and reshape
    target_features = target_features.values.reshape(1, -1)

    # Compute cosine similarity scores with all anime
    similarity_scores = cosine_similarity(target_features, features)

    # Get the indices of the top-N similar anime
    top_n_indices = np.argsort(-similarity_scores[0])[:num_recommendations]

    # Get a boolean array indicating anime above the threshold
    above_threshold = similarity_scores[0][top_n_indices] > threshold

    # Filter the top_n_indices based on the threshold
    filtered_indices = top_n_indices[above_threshold]

    # Use 'anime_id' index to filter the DataFrame with the filtered indices
    recommended_anime = df[df['anime_id'].isin(features.index[filtered_indices])]

    return recommended_anime

# Step 4: Evaluation

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score

# Extract the actual DataFrame from the nested structure (if needed)
df = df[0][0] if isinstance(df, list) and isinstance(df[0], list) and isinstance(df[0][0], pd.DataFrame) else df
# Split data into train and test sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

def evaluate_recommendation_system():
    actual_genres, recommended_genres = [], []
    test_anime_ids = test_df['anime_id'].values
    # Precompute similarity scores for test anime against all anime
    similarity_scores = cosine_similarity(features.loc[test_anime_ids], features)

    for i, target_anime_id in enumerate(test_anime_ids):
        anime_similarity_scores = similarity_scores[i]
        # Get indices of recommended anime above threshold (0.5)
        filtered_indices = np.argsort(-anime_similarity_scores)[:5][anime_similarity_scores[np.argsort(-anime_similarity_scores)[:5]] > 0.5]
        recommended_anime = df[df['anime_id'].isin(features.index[filtered_indices])]

        if recommended_anime.empty:
            continue

        target_genres_set = set(test_df.iloc[i]['genre'].split(','))
        recommended_genres_set = set(recommended_anime['genre'].str.cat(sep=',').split(','))

        # Compare actual and recommended genres for precision/recall calculation
        actual_genres.extend([1 if genre in target_genres_set else 0 for genre in recommended_genres_set])
        recommended_genres.extend([1] * len(recommended_genres_set))

    # Calculate and print evaluation metrics
    precision = precision_score(actual_genres, recommended_genres, zero_division=0)
    recall = recall_score(actual_genres, recommended_genres, zero_division=0)
    f1 = f1_score(actual_genres, recommended_genres, zero_division=0)

    print(f'Precision: {precision:.3f}\nRecall: {recall:.3f}\nF1-score: {f1:.3f}')

evaluate_recommendation_system()

Precision: 0.611
Recall: 1.000
F1-score: 0.759


In [9]:
df.columns

Index(['anime_id', 'name', 'genre', 'type', 'episodes', 'rating', 'members'], dtype='object')

#Collaborative filtering methods(UBCF,IBCF)

In [10]:
!pip install scikit-surprise --quiet

from surprise import Reader, Dataset, KNNBasic
from surprise.model_selection import train_test_split
from surprise import accuracy

# Assuming 'df' contains 'anime_id' and other columns (but no user_id or rating)

# 1. Generate synthetic user-item interactions
num_users = 100
num_ratings_per_user = 10
ratings_data = []
for user_id in range(1, num_users + 1):
    anime_ids = df['anime_id'].sample(num_ratings_per_user, replace=False).values
    ratings = np.random.randint(1, 11, size=num_ratings_per_user)
    ratings_data.extend([[user_id, anime_id, rating] for anime_id, rating in zip(anime_ids, ratings)])

ratings_df = pd.DataFrame(ratings_data, columns=['user_id', 'anime_id', 'rating'])

# 2. Prepare data for Surprise
reader = Reader(rating_scale=(1, 10))
data = Dataset.load_from_df(ratings_df[['user_id', 'anime_id', 'rating']], reader)
trainset, testset = train_test_split(data, test_size=.25)

# 3. Train and evaluate collaborative filtering models
for algo_name, sim_options in [('UBCF', {'name': 'cosine', 'user_based': True}), ('IBCF', {'name': 'cosine', 'user_based': False})]:

    algo = KNNBasic(sim_options=sim_options)
    algo.fit(trainset)
    predictions = algo.test(testset)
    rmse = accuracy.rmse(predictions)
    print(f"{algo_name} RMSE: {rmse}")

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/154.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m153.6/154.4 kB[0m [31m5.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.4/154.4 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for scikit-surprise (pyproject.toml) ... [?25l[?25hdone
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 2.8341
UBCF RMSE: 2.8341002883533326
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 2.8341
IBCF RMSE: 2.8341002883533326


In [11]:
ratings_df.columns

Index(['user_id', 'anime_id', 'rating'], dtype='object')

# IMPROVEMENTS
Addressing Data Sparsity:

**Dimensionality Reduction:** Apply Principal Component Analysis (PCA) to the features DataFrame to reduce the number of features while preserving important information. This can mitigate the impact of sparse one-hot encoded genre data.

# **Interview Questions**

**1. Can you explain the difference between user-based and item-based collaborative filtering?**

**User-Based:** Finds similar users and recommends items they like.

**Item-Based:** Finds similar items and recommends them to users who like those items.


---
**2. What is collaborative filtering, and how does it work?**

**Definition:** Predicts user preferences based on other users' behavior.

**Steps:**

1.Collect user behavior data.

2.Preprocess data.

3.Build a model.

4.Train the model.

5.Make predictions.

5.Generate recommendations.
