In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [None]:
# Load the anime table, using the first CSV column as the row index,
# and order the rows by that index
df = pd.read_csv('anime.csv', index_col=0).sort_index()
df

In [None]:
# Check datatypes: show the dtype pandas assigned to each column
df.dtypes

In [None]:
# List the distinct values stored in the 'episodes' column
df['episodes'].unique()

In [None]:
# Show the rows whose episode count is the 'Unknown' placeholder
df.loc[df['episodes'].eq('Unknown')]

In [None]:
# Share of rows whose episode count is 'Unknown', computed from the data itself
# rather than from hard-coded row counts
unknown_count = int(df['episodes'].eq('Unknown').sum())
f'{100*unknown_count/len(df)} % of Unknown episodes'

Since fewer than 5% of the rows have an Unknown episode count, we can drop these rows from our analysis.

In [None]:
# Index labels of every row whose episode count is the 'Unknown' placeholder
is_unknown = df['episodes'].eq('Unknown')
drop_indices = df.index[is_unknown]

# Remove those rows from df in place
df.drop(index=drop_indices, inplace=True)

In [None]:
# Cast the 'episodes' column to int, leaving every other column as it is

df = df.astype({'episodes': int})
df.dtypes

In [None]:
# Check missing values: count the NaN entries in each column
df.isna().sum()

In [None]:
# Drop every row that has at least one missing value (modifies df in place),
# then re-count missing values per column to confirm none remain
df.dropna(inplace=True)
df.isna().sum()

In [None]:
# Check duplicate values: count rows that exactly repeat an earlier row
df.duplicated().sum()

In [None]:
# Preview the first rows of the cleaned frame
df.head()

In [None]:
# Split each comma-separated genre string into a list of individual genres

df['genre_values'] = df['genre'].str.split(', ')

# One flat list holding every genre occurrence across all rows

genre_list = df['genre_values'].explode().tolist()

In [None]:
# One-hot encode the genre lists.
# explode() emits one row per (anime, genre) pair while keeping the anime's index
# label, so grouping on the index afterwards collapses the indicator rows back to
# one row per anime. This avoids apply(pd.Series), which builds a Series per row
# and a wide NaN-padded intermediate frame.

df_encoded = df['genre_values'].explode().str.get_dummies().groupby(level=0).sum()

In [None]:
# Append the genre indicator columns to the original frame, aligning rows on the index
df = pd.concat(objs=[df, df_encoded], axis='columns')
df

In [None]:
# The raw genre string and its list form are no longer needed once encoded
df.drop(['genre', 'genre_values'], axis='columns', inplace=True)
df

In [None]:
# Remove the limit on how many columns pandas displays
pd.options.display.max_columns = None

# Descriptive statistics
df.describe()

In [None]:
# Preview the first five rows after the genre encoding
df.head(5)

In [None]:
# Count of Type of anime: one bar per value of 'type', coloured by the same column

sns.countplot(data=df,x='type',hue='type')

In [None]:
# Rating distribution, split by anime type via hue
sns.displot(data=df,x='rating',hue='type', height=6, aspect=1.5)

In [None]:
# Episodes distribution, split by anime type via hue
sns.displot(data=df,x='episodes',hue='type', height=6, aspect=1.5)

In [None]:
# Members distribution as a kernel density estimate, one curve per anime type
sns.displot(data=df, x='members', hue='type', kind='kde', height=6, aspect=1.5)

In [None]:
# Preview the frame that goes into the modelling steps
df.head(5)

# **Model building**

In [None]:
# Standardization: create the scaler that is fitted on the numeric columns in the next step
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

In [None]:
# Standardize the three continuous columns, overwriting them in df with the scaled values
numeric_cols = ['episodes', 'rating', 'members']
df[numeric_cols] = sc.fit_transform(df[numeric_cols])

In [None]:
# Inspect the three columns that were just rescaled
df[['episodes','rating','members']]

In [None]:
# Build the item-feature matrix (one row per anime, in df's row order) used by the
# similarity step.
# Pivoting 'rating' by 'name' against the existing anime_id index leaves a single
# non-null cell per row, so any two differently named titles get a cosine similarity
# of exactly 0 and the "most similar" anime is arbitrary. Taking every numeric column
# of df (the scaled episodes/rating/members plus the genre indicator columns) gives
# each anime a real feature vector to compare.

pivot = df.select_dtypes(include='number').copy()

In [None]:
# Display the matrix that feeds the distance computation
pivot

In [None]:
# Cosine distance cannot be computed with NaN entries, so treat missing cells as 0
pivot = pivot.fillna(value=0)

In [None]:
# Calculate distances and similarities
from sklearn.metrics import pairwise_distances

In [None]:
# Pairwise cosine distance between every pair of rows in pivot
cosine_dist = pairwise_distances(pivot, metric='cosine')
pd.DataFrame(cosine_dist)

In [None]:
# The smaller the distance between two anime, the more similar they are
# Calculate similarity as the complement of the cosine distance (1 - distance)
sim = 1 - pairwise_distances(pivot, metric='cosine')
pd.DataFrame(sim)

In [None]:
# Change diagonal values to 0 (in place) so that an anime's similarity with itself
# is never picked up as its best match by the idxmax calls below
np.fill_diagonal(sim,0)

In [None]:
# Label both axes of the similarity matrix with df's index (the anime IDs)
# instead of the default positional 0..n-1 labels
similarity = pd.DataFrame(sim, index=df.index, columns=df.index)
similarity

In [None]:
# Find out similar anime: for each column, the row label holding the highest similarity
similarity.idxmax()

In [None]:
# Check the unique values: the distinct anime IDs that appear as someone's best match
similarity.idxmax().unique()

In [None]:
# Pair each anime's name and type with the ID of its most similar anime.
# assign() returns a new frame, so the new column is not written into a slice of df
# (assigning a column on df[['name','type']] directly triggers pandas'
# SettingWithCopyWarning).
anime_similarities = df[['name','type']].assign(similarity=similarity.idxmax())
anime_similarities

In [None]:
# Order the table by the 'similarity' column (the matched anime ID), largest first
anime_similarities.sort_values(by='similarity', ascending=False)

In [None]:
def recommend_anime(anime_id, similarity_matrix, anime_df, top_n=5):
    """
    Recommends similar animes based on the scores in a similarity matrix.

    Parameters:
    anime_id (int): The ID of the anime to find recommendations for.
    similarity_matrix (pd.DataFrame): Square matrix of similarity values whose index and
        columns are both labelled with anime IDs.
    anime_df (pd.DataFrame): Dataframe indexed by anime ID that has a 'name' column.
    top_n (int): The number of similar animes to recommend (default is 5).

    Returns:
    recommended_animes (list): (anime name, similarity score) tuples ordered from most to
        least similar, holding at most top_n entries. If anime_id is not in
        similarity_matrix.index, a message string is returned instead of a list.
    """

    # Unknown IDs produce a message string rather than an exception
    if anime_id not in similarity_matrix.index:
        return f"Anime ID {anime_id} not found in the dataset."

    # Scores between the requested anime and every anime in the matrix
    similarity_scores = similarity_matrix[anime_id]

    # Exclude the anime itself by label, not by position: its self-similarity may have
    # been zeroed (so it would not sort first), and slicing off the first sorted row
    # would then throw away the best real match.
    similar_animes = similarity_scores.drop(labels=anime_id).nlargest(top_n)

    # Look up the name of each recommended anime ID in anime_df
    recommended_animes = []
    for similar_anime_id, score in similar_animes.items():
        anime_name = anime_df.loc[anime_df.index == similar_anime_id, 'name'].iloc[0]
        recommended_animes.append((anime_name, score))

    return recommended_animes


In [None]:
# Top five recommendations for anime ID 22399, printed one per line
recommendations = recommend_anime(
    anime_id=22399,
    similarity_matrix=similarity,
    anime_df=df,
    top_n=5,
)

for title, sim_score in recommendations:
    print(f"Recommended Anime: {title}, Similarity Score: {sim_score:.4f}")

In [None]:
def get_similarity_scores(similarity_matrix, anime_ids):
    """
    Retrieves, for each given anime ID, its most similar anime and the matching score.

    Parameters:
    similarity_matrix (pd.DataFrame): Matrix of similarity values whose index and columns
        are labelled with anime IDs.
    anime_ids (iterable): Anime IDs to look up; each must be a column label of
        similarity_matrix (e.g. the values of similarity.idxmax().unique()).

    Returns:
    similarity_scores (list of tuples): A list of tuples where each tuple contains
        (anime_id, max_similar_anime_id, similarity_score), in the order of anime_ids.
    """
    similarity_scores = []

    for anime_id in anime_ids:
        # Column of scores between this anime and every anime in the matrix
        scores_for_anime = similarity_matrix[anime_id]

        # Row label with the highest score in that column
        most_similar_anime_id = scores_for_anime.idxmax()

        # Read the score from the same column the maximum was found in; reading it from
        # the row instead is only correct when the matrix is symmetric
        similarity_score = scores_for_anime.loc[most_similar_anime_id]

        similarity_scores.append((anime_id, most_similar_anime_id, similarity_score))

    return similarity_scores


In [None]:
# Look up the best match and its score for every anime ID that is some anime's best match
best_match_ids = similarity.idxmax().unique()
similarity_scores = get_similarity_scores(similarity, best_match_ids)

for source_id, match_id, match_score in similarity_scores:
    print(f"Anime ID {source_id} is most similar to Anime ID {match_id} with a similarity score of {match_score:.4f}")

Interview Questions:
1 - Can you explain the difference between user-based and item-based collaborative filtering?
User-based collaborative filtering:
a) Compares users with each other, by assuming users who have similar preferences in the past will have similar preferences in the future.
b) Recommends new items that similar users have liked.
c) Useful in situations where users have diverse preferences.
d) Performance decreases with a large number of users, and if users have rated very few items, it is hard to find similarities between them.

Item-based collaborative filtering:
a) Compares items instead of users.
b) Items that are rated similarly by users are likely to be related.
c) Scales better with a large number of users because the item space is generally smaller than the user space.
d) More stable than user-based filtering.
e) If users haven't interacted with enough items, it becomes harder to find good items.

2 - What is collaborative filtering, and how does it work?
Collaborative filtering is a popular technique used in recommendation systems to predict user preferences by leveraging the preferences of other users. It works on the principle that if users have had similar tastes in the past, they will continue to have similar preferences in the future. Similarly, items that users rate or interact with similarly are considered related.
How it works-
a) User Interaction Data: Collaborative filtering starts with a user-item interaction matrix, where rows represent users and columns represent items (such as movies, products, or animes). The values in the matrix are typically ratings, but can also be implicit feedback (e.g., clicks, purchases, or views).
b) Types of Collaborative Filtering: Collaborative filtering can be broadly categorized into two main types: User-Based Collaborative Filtering and Item-Based Collaborative Filtering.
c) Calculate the similarity of items or users using Cosine similarity or Pearson's correlation.
d) Recommend users similar items or highly rated items using a set threshold value of similarity.