In [1]:
# Data Preprocessing
#
# Read the anime catalogue CSV into a pandas DataFrame and display it.
import pandas as pd

df = pd.read_csv("anime-11.csv")
df

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266
...,...,...,...,...,...,...,...
12289,9316,Toushindai My Lover: Minami tai Mecha-Minami,Hentai,OVA,1,4.15,211
12290,5543,Under World,Hentai,OVA,1,4.28,183
12291,5621,Violence Gekiga David no Hoshi,Hentai,OVA,4,4.88,219
12292,6133,Violence Gekiga Shin David no Hoshi: Inma Dens...,Hentai,OVA,1,4.98,175


In [2]:
# Explore the dataset to understand its structure and attributes:
# print every column's non-null count and dtype, plus the frame's memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12294 non-null  int64  
 1   name      12294 non-null  object 
 2   genre     12232 non-null  object 
 3   type      12269 non-null  object 
 4   episodes  12294 non-null  object 
 5   rating    12064 non-null  float64
 6   members   12294 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 672.5+ KB


In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

Unnamed: 0,anime_id,rating,members
count,12294.0,12064.0,12294.0
mean,14058.221653,6.473902,18071.34
std,11455.294701,1.026746,54820.68
min,1.0,1.67,5.0
25%,3484.25,5.88,225.0
50%,10260.5,6.57,1550.0
75%,24794.5,7.18,9437.0
max,34527.0,10.0,1013917.0


In [4]:
# Count the missing values in each column.
df.isnull().sum()

anime_id      0
name          0
genre        62
type         25
episodes      0
rating      230
members       0
dtype: int64

In [5]:
# Handle missing values.
# Categorical columns are filled with their most frequent value ...
for categorical_col in ('genre', 'type'):
    most_common = df[categorical_col].mode()[0]
    df[categorical_col] = df[categorical_col].fillna(most_common)

# ... and the numeric rating with the column mean.
mean_rating = df['rating'].mean()
df['rating'] = df['rating'].fillna(mean_rating)

# Show the remaining null count per column.
df.isnull().sum()

anime_id    0
name        0
genre       0
type        0
episodes    0
rating      0
members     0
dtype: int64

In [12]:
# Feature Extraction
# Columns kept as candidate inputs for the similarity computation.
# NOTE(review): this list still contains the identifier columns 'anime_id'
# and 'name' alongside the descriptive ones - confirm that is intended.
selected_features = [
    'anime_id',
    'name',
    'genre',
    'type',
    'episodes',
    'rating',
    'members',
]

# Restrict the frame to the selected columns.
features_df = df[selected_features]

# Preview the first rows of the frame used for similarity calculations.
print(features_df.head())

   anime_id                              name  \
0     32281                    Kimi no Na wa.   
1      5114  Fullmetal Alchemist: Brotherhood   
2     28977                          Gintama°   
3      9253                       Steins;Gate   
4      9969                     Gintama&#039;   

                                               genre   type episodes  \
0               Drama, Romance, School, Supernatural  Movie        1   
1  Action, Adventure, Drama, Fantasy, Magic, Mili...     TV       64   
2  Action, Comedy, Historical, Parody, Samurai, S...     TV       51   
3                                   Sci-Fi, Thriller     TV       24   
4  Action, Comedy, Historical, Parody, Samurai, S...     TV       51   

     rating   members  
0  0.924370  0.197872  
1  0.911164  0.782770  
2  0.909964  0.112689  
3  0.900360  0.664325  
4  0.899160  0.149186  


In [13]:
# Convert categorical features into numerical representations if necessary.
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Demonstration frame holding one label per row. It lives in its own variable
# so that the dataset loaded into `df` is not overwritten by this toy example.
# NOTE(review): the labels below are the dataset's column *names*, not its
# values; to encode real data, fit the encoder on a categorical column such
# as df['type'].
data = {'category': ['anime_id', 'name', 'genre','type','episodes','rating','members']}
category_df = pd.DataFrame(data)

# Initialize LabelEncoder
label_encoder = LabelEncoder()

# Store the integer code assigned to each label next to the label itself.
category_df['category_encoded'] = label_encoder.fit_transform(category_df['category'])

print(category_df)


   category  category_encoded
0  anime_id                 0
1      name                 4
2     genre                 2
3      type                 6
4  episodes                 1
5    rating                 5
6   members                 3


In [14]:
# Normalize numerical features if required.
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Reload the raw data so this cell starts from a known state.
df = pd.read_csv('anime-11.csv')

# 'episodes' is read as text (object dtype); coerce it to numbers and turn
# every non-numeric entry into NaN.
df['episodes'] = pd.to_numeric(df['episodes'], errors='coerce')

# Impute the missing episode counts with the median. The result is assigned
# back to the column: calling fillna(..., inplace=True) on the selection
# df['episodes'] is chained assignment, which pandas deprecates and which
# leaves `df` untouched when Copy-on-Write is enabled.
median_episodes = df['episodes'].median()
df['episodes'] = df['episodes'].fillna(median_episodes)

# Initialize MinMaxScaler for normalization
scaler = MinMaxScaler()

# Select columns to normalize
columns_to_normalize = ['rating', 'members', 'episodes']

# Rescale each selected column to the [0, 1] range.
# NOTE(review): 'rating' is not imputed in this cell, so its missing values
# presumably stay NaN after scaling - confirm before using it downstream.
df[columns_to_normalize] = scaler.fit_transform(df[columns_to_normalize])

# Display the first few rows of the normalized DataFrame
print(df.head())


   anime_id                              name  \
0     32281                    Kimi no Na wa.   
1      5114  Fullmetal Alchemist: Brotherhood   
2     28977                          Gintama°   
3      9253                       Steins;Gate   
4      9969                     Gintama&#039;   

                                               genre   type  episodes  \
0               Drama, Romance, School, Supernatural  Movie  0.000000   
1  Action, Adventure, Drama, Fantasy, Magic, Mili...     TV  0.034673   
2  Action, Comedy, Historical, Parody, Samurai, S...     TV  0.027518   
3                                   Sci-Fi, Thriller     TV  0.012658   
4  Action, Comedy, Historical, Parody, Samurai, S...     TV  0.027518   

     rating   members  
0  0.924370  0.197872  
1  0.911164  0.782770  
2  0.909964  0.112689  
3  0.900360  0.664325  
4  0.899160  0.149186  


In [15]:
#Recommendation System:

#Design a function to recommend anime based on cosine similarity.
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def preprocess_data(df):
    """Fill missing genre/type values and make ``episodes`` numeric.

    The frame is modified in place and also returned, so both
    ``preprocess_data(df)`` and ``df = preprocess_data(df)`` work.

    Args:
        df: DataFrame with 'genre', 'type' and 'episodes' columns.

    Returns:
        The same DataFrame object with missing genres replaced by '',
        missing types replaced by 'Unknown', and 'episodes' converted to
        numbers (non-numeric or missing entries become the column median).
    """
    # Assign the filled columns back instead of calling
    # df[col].fillna(..., inplace=True): that form is chained assignment,
    # which pandas deprecates and which does not update `df` when
    # Copy-on-Write is enabled.
    df['genre'] = df['genre'].fillna('')
    df['type'] = df['type'].fillna('Unknown')

    # Non-numeric episode entries become NaN first, then take the median.
    df['episodes'] = pd.to_numeric(df['episodes'], errors='coerce')
    df['episodes'] = df['episodes'].fillna(df['episodes'].median())

    return df

def vectorize_features(df):
    """Build a token-count matrix from each row's genre and type text.

    Side effect: adds a 'combined_features' column to ``df`` holding the
    genre string and the type joined by a single space.

    Returns:
        The matrix returned by ``CountVectorizer().fit_transform`` for the
        'combined_features' column (one row per row of ``df``).
    """
    # NOTE(review): the default CountVectorizer tokenizer presumably splits
    # multi-word or hyphenated genres into separate tokens - confirm that is
    # acceptable for the similarity scores.
    df['combined_features'] = df['genre'] + ' ' + df['type']
    token_counts = CountVectorizer().fit_transform(df['combined_features'])
    return token_counts

def compute_cosine_similarity(matrix):
    """Return the pairwise cosine similarity between the rows of ``matrix``."""
    return cosine_similarity(matrix)

def get_recommendations(anime_name, df, cosine_sim):
    """Return the names of the 10 anime most similar to ``anime_name``.

    Args:
        anime_name: Exact title to look up in ``df['name']``.
        df: DataFrame with a 'name' column; its i-th row must correspond to
            row/column i of ``cosine_sim``.
        cosine_sim: Square matrix (or nested sequence) of pairwise
            similarity scores.

    Returns:
        Series with up to 10 names, most similar first. Ties keep the row
        order of ``df``. The target anime itself is never included.

    Raises:
        IndexError: If no row of ``df`` is named ``anime_name``.
    """
    # Work with the row *position*, not the index label: cosine_sim is
    # addressed by position, and so is the final .iloc lookup.
    name_matches = (df['name'] == anime_name).tolist()
    if True not in name_matches:
        raise IndexError(f"anime {anime_name!r} not found in df['name']")
    idx = name_matches.index(True)

    # Drop the target explicitly. Slicing off the first sorted entry is not
    # enough: a row with an identical score that sorts ahead of the target
    # would be removed instead, leaving the target among its own results.
    sim_scores = [
        (pos, score)
        for pos, score in enumerate(cosine_sim[idx])
        if pos != idx
    ]

    # Stable sort: highest similarity first, ties in original row order.
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)

    sim_indices = [pos for pos, _ in sim_scores[:10]]
    return df['name'].iloc[sim_indices]

# Load the raw catalogue and clean it in one step.
df = preprocess_data(pd.read_csv('anime-11.csv'))

# Turn the genre/type text into token counts, then into pairwise similarities.
matrix = vectorize_features(df)
cosine_sim = compute_cosine_similarity(matrix)

# Recommend titles similar to an example anime.
anime_name = "Kimi no Na wa."  # Example anime name
recommended_animes = get_recommendations(anime_name, df, cosine_sim)

print("Recommended Animes:")
print(recommended_animes)


Recommended Animes:
1111                 Aura: Maryuuin Kouga Saigo no Tatakai
208                          Kokoro ga Sakebitagatterunda.
1494                                              Harmonie
1959                                             Air Movie
60                                      Hotarubi no Mori e
1199                      &quot;Bungaku Shoujo&quot; Movie
2103                                         Clannad Movie
5805                           Wind: A Breath of Heart OVA
6394                          Wind: A Breath of Heart (TV)
11082    Suki ni Naru Sono Shunkan wo.: Kokuhaku Jikkou...
Name: name, dtype: object


In [18]:
# Given a target anime, recommend a list of similar anime based on cosine similarity scores.
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# The helper functions, the cleaned `df` and the `cosine_sim` matrix built in
# the "Recommendation System" cell above are reused here. Declaring identical
# copies of the functions, re-reading the CSV and rebuilding the full pairwise
# similarity matrix would repeat the same work and produce the same values.

# Example usage
anime_name = "Kimi no Na wa."  # Example anime name
recommended_animes = get_recommendations(anime_name, df, cosine_sim)

print("Recommended Animes:")
print(recommended_animes)


Recommended Animes:
1111                 Aura: Maryuuin Kouga Saigo no Tatakai
208                          Kokoro ga Sakebitagatterunda.
1494                                              Harmonie
1959                                             Air Movie
60                                      Hotarubi no Mori e
1199                      &quot;Bungaku Shoujo&quot; Movie
2103                                         Clannad Movie
5805                           Wind: A Breath of Heart OVA
6394                          Wind: A Breath of Heart (TV)
11082    Suki ni Naru Sono Shunkan wo.: Kokuhaku Jikkou...
Name: name, dtype: object


In [19]:
# Experiment with different threshold values for similarity scores to adjust the recommendation list size.

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# preprocess_data, vectorize_features and compute_cosine_similarity are reused
# from the "Recommendation System" cell above instead of being declared again.

def get_recommendations(anime_name, df, cosine_sim, threshold=0.0):
    """Return every anime whose similarity to ``anime_name`` exceeds ``threshold``.

    Args:
        anime_name: Exact title to look up in ``df['name']``.
        df: DataFrame with a 'name' column; its i-th row must correspond to
            row/column i of ``cosine_sim``.
        cosine_sim: Square matrix (or nested sequence) of pairwise
            similarity scores.
        threshold: Only anime with a score strictly greater than this value
            are returned; raising it shortens the list.

    Returns:
        Series of names, most similar first (ties keep the row order of
        ``df``). The target anime itself is excluded. May be empty.

    Raises:
        IndexError: If no row of ``df`` is named ``anime_name``.
    """
    # Work with the row *position*, not the index label: cosine_sim is
    # addressed by position, and so is the final .iloc lookup.
    name_matches = (df['name'] == anime_name).tolist()
    if True not in name_matches:
        raise IndexError(f"anime {anime_name!r} not found in df['name']")
    idx = name_matches.index(True)

    # Keep only the other anime whose score clears the threshold.
    sim_scores = [
        (pos, score)
        for pos, score in enumerate(cosine_sim[idx])
        if pos != idx and score > threshold
    ]

    # Stable sort: highest similarity first, ties in original row order.
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)

    sim_indices = [pos for pos, _ in sim_scores]
    return df['name'].iloc[sim_indices]

# `df` and `cosine_sim` from the "Recommendation System" cell above are reused;
# re-reading the CSV and rebuilding the full pairwise similarity matrix would
# repeat the same work and give the same values.

# Example usage
anime_name = "Kimi no Na wa."  # Example anime name
threshold = 0.5  # Example threshold value
recommended_animes = get_recommendations(anime_name, df, cosine_sim, threshold)

print("Recommended Animes:")
print(recommended_animes)


Recommended Animes:
1111              Aura: Maryuuin Kouga Saigo no Tatakai
208                       Kokoro ga Sakebitagatterunda.
1494                                           Harmonie
1959                                          Air Movie
60                                   Hotarubi no Mori e
                              ...                      
8448                                             Dodani
8969              Indie Anibox: Selma Danbaekjil Coffee
10531        The Kabocha Wine: Nita no Aijou Monogatari
10917                                             Fuuka
10969    Detective Conan Movie 21: Deep Red Love Letter
Name: name, Length: 437, dtype: object


In [7]:
# Split the dataset into training and testing sets.

# The recommendation cells above reload `df` from the CSV and never impute
# 'rating', so fill its missing values (column mean) before building the
# feature matrix; cosine_similarity does not accept NaN input. This is a
# no-op when the ratings have already been imputed.
df['rating'] = df['rating'].fillna(df['rating'].mean())

# One-hot encode the 'genre' column
genres_one_hot = df['genre'].str.get_dummies(sep=', ')

# Normalize the 'rating' and 'members' columns
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
# NOTE(review): the scaler is fitted on all rows before the split, so the
# test rows influence the scaling - confirm this is acceptable.
df[['rating', 'members']] = scaler.fit_transform(df[['rating', 'members']])

# Combine the one-hot encoded genres with the normalized numerical features
features = pd.concat([df[['rating', 'members']], genres_one_hot], axis=1)

# Split the dataset into training and testing sets
# (features and the full frame are split together so their rows stay aligned)
from sklearn.model_selection import train_test_split
train_features, test_features, train_df, test_df = train_test_split(features, df, test_size=0.2, random_state=42)

# Display the size of training and testing sets
print(f"Training set size: {train_df.shape[0]}")
print(f"Testing set size: {test_df.shape[0]}")

# Compute the cosine similarity matrix on the training features
from sklearn.metrics.pairwise import cosine_similarity
cosine_sim_matrix_train = cosine_similarity(train_features)

# Display the cosine similarity matrix for the training set
print("\nCosine Similarity Matrix for Training Set:")
print(cosine_sim_matrix_train)

Training set size: 9835
Testing set size: 2459

Cosine Similarity Matrix for Training Set:
[[1.         0.11772381 0.18327672 ... 0.29529261 0.12142222 0.54515362]
 [0.11772381 1.         0.63822152 ... 0.07634241 0.37038742 0.06864127]
 [0.18327672 0.63822152 1.         ... 0.11922851 0.17953341 0.10722638]
 ...
 [0.29529261 0.07634241 0.11922851 ... 1.         0.07896427 0.56889931]
 [0.12142222 0.37038742 0.17953341 ... 0.07896427 1.         0.07101364]
 [0.54515362 0.06864127 0.10722638 ... 0.56889931 0.07101364 1.        ]]


In [10]:
# Evaluate the recommendation system using appropriate metrics such as precision, recall, and F1-score.
import pandas as pd
from sklearn.metrics import precision_score, recall_score, f1_score

def calculate_metrics(test_df):
    """Compute precision, recall and F1 from a frame of per-item predictions.

    Each row needs a 'name', a truthy/falsy 'relevant' flag and a 'predicted'
    value. A row counts as a positive prediction only when 'predicted' is a
    list that contains the row's own 'name'.

    Returns:
        Tuple ``(precision, recall, f1)``; each score is 0 when undefined.
    """
    # Ground truth: 1 for relevant rows, 0 otherwise.
    y_true = [1 if relevant else 0 for relevant in test_df['relevant']]

    # Prediction: 1 when the row's own name appears in its predicted list.
    y_pred = [
        1 if isinstance(predicted, list) and name in predicted else 0
        for name, predicted in zip(test_df['name'], test_df['predicted'])
    ]

    return (
        precision_score(y_true, y_pred, zero_division=0),
        recall_score(y_true, y_pred, zero_division=0),
        f1_score(y_true, y_pred, zero_division=0),
    )

# Example usage with a sample DataFrame
# Each row is one item: its name, whether it is truly relevant, and the list
# of names that were predicted for it.
# NOTE(review): `test_df` here replaces the `test_df` produced by the
# train/test split cell with this three-row sample.
data = {
    'name': ['doc1', 'doc2', 'doc3'],
    'relevant': [True, False, True],
    'predicted': [['doc1'], ['doc1', 'doc3'], []]
}

test_df = pd.DataFrame(data)

# Only doc1 appears in its own predicted list, so it is the single positive
# prediction; doc3 is relevant but not predicted.
precision, recall, f1 = calculate_metrics(test_df)

print(f"\nPrecision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1-score: {f1:.2f}")


Precision: 1.00
Recall: 0.50
F1-score: 0.67


In [22]:
# Analyze the performance of the recommendation system and identify areas of improvement.

import numpy as np
from sklearn.metrics import precision_score, recall_score

# Simulated ground truth data (1 if user liked the anime, 0 otherwise)
# Outer keys are user ids; inner keys are anime titles.
user_anime_interactions = {
    'user1': {'Kimi no Na wa.': 1, 'Anime 1': 1, 'Anime 2': 0, 'Anime 3': 0},
    'user2': {'Kimi no Na wa.': 0, 'Anime 1': 0, 'Anime 2': 1, 'Anime 3': 1},
    # Add more users and their interactions
}

# Simulated recommendations for users
# Keys are the same user ids; values are the titles recommended to that user.
recommendations = {
    'user1': ['Anime 1', 'Anime 3'],
    'user2': ['Anime 2', 'Anime 3'],
    # Add more users and their recommendations
}

def evaluate_accuracy(user_anime_interactions, recommendations):
    """Average the per-user precision and recall of the recommendations.

    Args:
        user_anime_interactions: Mapping of user -> {anime: liked}, where
            ``liked`` is 1 if the user liked the anime and 0 otherwise.
        recommendations: Mapping of user -> list of recommended anime names.
            A user missing from this mapping is treated as having received
            no recommendations.

    Returns:
        Tuple ``(avg_precision, avg_recall)``; ``(0.0, 0.0)`` when
        ``user_anime_interactions`` is empty.
    """
    precisions = []
    recalls = []

    for user, relevant_animes in user_anime_interactions.items():
        # Set lookup; a user without an entry simply has no recommendations.
        recommended = set(recommendations.get(user, ()))

        true_labels = list(relevant_animes.values())
        pred_labels = [1 if anime in recommended else 0 for anime in relevant_animes]

        # zero_division=0 (as in calculate_metrics) scores an undefined
        # precision/recall as 0 without emitting a warning.
        precisions.append(precision_score(true_labels, pred_labels, zero_division=0))
        recalls.append(recall_score(true_labels, pred_labels, zero_division=0))

    # np.mean of an empty list would be NaN; report 0.0 for "no users".
    if not precisions:
        return 0.0, 0.0

    avg_precision = np.mean(precisions)
    avg_recall = np.mean(recalls)

    return avg_precision, avg_recall

# Evaluate accuracy: per-user precision and recall, averaged over the simulated users
avg_precision, avg_recall = evaluate_accuracy(user_anime_interactions, recommendations)
print(f"Average Precision: {avg_precision}")
print(f"Average Recall: {avg_recall}")
def evaluate_diversity(recommendations):
    """Return the share of distinct items among all recommended items.

    Args:
        recommendations: Mapping of user -> list of recommended anime names.

    Returns:
        ``unique items / total items`` pooled across all users (1.0 means no
        item was recommended twice); 0.0 when nothing was recommended.
    """
    all_recommendations = [
        anime
        for recs in recommendations.values()
        for anime in recs
    ]

    # Guard the division: no recommendations at all means no diversity to measure.
    if not all_recommendations:
        return 0.0

    return len(set(all_recommendations)) / len(all_recommendations)

def evaluate_novelty(recommendations, popular_items):
    """Return the average share of non-popular items per user.

    Args:
        recommendations: Mapping of user -> list of recommended anime names.
        popular_items: Iterable of names considered popular.

    Returns:
        Mean over users of ``non-popular recommendations / recommendations``.
        Users with an empty list are skipped (their share is undefined);
        0.0 is returned when no user has any recommendation.
    """
    popular = set(popular_items)

    novelty_scores = [
        sum(1 for anime in recs if anime not in popular) / len(recs)
        for recs in recommendations.values()
        if len(recs) > 0  # an empty list would divide by zero
    ]

    # np.mean of an empty list would be NaN.
    if not novelty_scores:
        return 0.0

    return np.mean(novelty_scores)

# Simulated popular items
# Recommendations found in this list do not count as novel.
popular_items = ['Kimi no Na wa.', 'Anime 1']

# Evaluate diversity and novelty
diversity = evaluate_diversity(recommendations)
novelty = evaluate_novelty(recommendations, popular_items)

print(f"Diversity: {diversity}")
print(f"Novelty: {novelty}")
# Simulated user feedback (1 if user liked the recommendation, 0 otherwise)
# Outer keys are user ids; inner keys are the recommended titles.
user_feedback = {
    'user1': {'Anime 1': 1, 'Anime 3': 0},
    'user2': {'Anime 2': 1, 'Anime 3': 1},
    # Add more user feedback
}

def evaluate_user_satisfaction(user_feedback):
    """Return the average share of liked recommendations per user.

    Args:
        user_feedback: Mapping of user -> {anime: liked}, where ``liked`` is
            1 if the user liked the recommendation and 0 otherwise.

    Returns:
        Mean over users of ``sum(liked) / number of rated items``. Users with
        no feedback are skipped (their share is undefined); 0.0 is returned
        when no user gave any feedback.
    """
    satisfaction_scores = [
        sum(feedback.values()) / len(feedback)
        for feedback in user_feedback.values()
        if len(feedback) > 0  # an empty dict would divide by zero
    ]

    # np.mean of an empty list would be NaN.
    if not satisfaction_scores:
        return 0.0

    return np.mean(satisfaction_scores)

# Evaluate user satisfaction: share of liked recommendations, averaged over users
avg_satisfaction = evaluate_user_satisfaction(user_feedback)
print(f"Average User Satisfaction: {avg_satisfaction}")


Average Precision: 0.75
Average Recall: 0.75
Diversity: 0.75
Novelty: 0.75
Average User Satisfaction: 0.75


Areas of improvement are:
Feature engineering
Model complexity
User feedback loop
Scalability

Interview Questions:
 
# Can you explain the difference between user-based and item-based collaborative filtering?

User-based collaborative filtering:
This approach focuses on finding similarities between users. The idea is that if two users have similar preferences or behaviors,
the items liked or rated highly by one user can be recommended to the other user.

Item-based collaborative filtering:
This approach focuses on finding similarities between items. The idea is that if two items are rated similarly by many users,
they are considered similar. If a user likes one item, similar items can be recommended.

In [None]:
# What is collaborative filtering, and how does it work?

Collaborative filtering is a technique used in recommendation systems to predict the preferences of a user by collecting preferences from 
many users. It works on the principle that users who agreed in the past will agree in the future and that users will prefer items that are
similar to items they liked in the past.

There are two types:
1. User-based collaborative filtering
2. Item-based collaborative filtering

1. User-Based Collaborative Filtering
This approach focuses on finding similarities between users. The idea is that if two users have similar preferences or behaviors,
the items liked or rated highly by one user can be recommended to the other.

Steps:
Similarity Calculation: Calculate the similarity between users based on their item ratings. Common similarity measures include cosine similarity, 
Pearson correlation, and Jaccard similarity.
Neighborhood Formation: Identify a set of users (neighbors) who are most similar to the target user.
Recommendation Generation: Aggregate the ratings of these similar users to predict the target user's rating for an item.

2. Item-Based Collaborative Filtering
This approach focuses on finding similarities between items. The idea is that if two items are rated similarly by many users,
they are considered similar. If a user likes one item, similar items can be recommended.

Steps:
Similarity Calculation: Calculate the similarity between items based on user ratings. Common similarity measures include cosine similarity and
adjusted cosine similarity.
Neighborhood Formation: Identify a set of items (neighbors) that are most similar to the items the target user has liked.
Recommendation Generation: Aggregate the ratings of the target user for similar items to predict their rating for a new item.