In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load dataset
anime_df = pd.read_csv(r"D:\Excelr\Data Science\Data Science Assignment\Recommendation System\Recommendation System\anime.csv")
# Fill missing values
anime_df['genre'].fillna('Unknown', inplace=True)
anime_df['type'].fillna('Unknown', inplace=True)
median_rating = anime_df['rating'].median()
anime_df['rating'].fillna(median_rating, inplace=True)
# Clean genres
anime_df['genre'] = anime_df['genre'].replace({'&amp;': '&'}, regex=True)
# TF-IDF vectorizer
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(anime_df['genre'])
# cosine similarity
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
# Function to recommend anime
def get_recommendations(title, cosine_sim=cosine_sim):
    idx = anime_df[anime_df['name'] == title].index[0]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:11]  # Get top 10 similar anime
    anime_indices = [i[0] for i in sim_scores]
    return anime_df['name'].iloc[anime_indices]
# Get recommendations for a given anime
print(get_recommendations('Shakugan no Shana'))

986                         Shakugan no Shana
1604                      Shakugan no Shana S
1003            Shakugan no Shana III (Final)
0                              Kimi no Na wa.
5805              Wind: A Breath of Heart OVA
6394             Wind: A Breath of Heart (TV)
1111    Aura: Maryuuin Kouga Saigo no Tatakai
2103                            Clannad Movie
4572                                  Rewrite
159                              Angel Beats!
Name: name, dtype: object


In [2]:
# groupby-based recommendation system
import pandas as pd

# Load the dataset again
file_path = r"D:\Excelr\Data Science\Data Science Assignment\Recommendation System\Recommendation System\anime.csv"
anime_df = pd.read_csv(file_path)
# Fill missing values as before
anime_df['genre'].fillna('Unknown', inplace=True)
anime_df['type'].fillna('Unknown', inplace=True)
median_rating = anime_df['rating'].median()
anime_df['rating'].fillna(median_rating, inplace=True)
# Group the anime by 'genre' and calculate the mean rating and members count for each genre
grouped_anime = anime_df.groupby('genre').agg({'rating': 'mean', 'members': 'mean'}).sort_values(by='rating', ascending=False)
# Display the top genres sorted by average rating
grouped_anime.head()

Unnamed: 0_level_0,rating,members
genre,Unnamed: 1_level_1,Unnamed: 2_level_1
"Action, Adventure, Drama, Fantasy, Magic, Military, Shounen",9.26,793665.0
"Drama, Fantasy, Romance, Slice of Life, Supernatural",9.06,456749.0
"Drama, School, Shounen",9.05,102733.0
"Adventure, Drama, Supernatural",8.93,466254.0
"Drama, Music, Romance, School, Shounen",8.92,416397.0


In [3]:
# Function to recommend anime based on a given anime's genre
def recommend_anime_by_genre(anime_title, anime_df, top_n=5):
    # Find the genre of the given anime
    genre = anime_df[anime_df['name'] == anime_title]['genre'].values
    
    if len(genre) == 0:
        return f"Anime titled '{anime_title}' not found."
    
    # Filter other anime from the same genre
    genre_anime = anime_df[anime_df['genre'] == genre[0]]
    
    # Sort by rating and return top N recommendations excluding the original anime
    recommendations = genre_anime[genre_anime['name'] != anime_title].sort_values(by='rating', ascending=False).head(top_n)
    
    return recommendations[['name', 'rating', 'members']]

# Get recommendations based on 'Kimi no Na wa.'
recommend_anime_by_genre('Wind: A Breath of Heart (TV)', anime_df)

Unnamed: 0,name,rating,members
0,Kimi no Na wa.,9.37,200630
5805,Wind: A Breath of Heart OVA,6.35,2043


In [4]:
# Load the dataset
anime_df = pd.read_csv(r"D:\Excelr\Data Science\Data Science Assignment\Recommendation System\Recommendation System\anime.csv")

# Check for missing values
missing_values = anime_df.isnull().sum()

# summary of the dataset
dataset_shape = anime_df.shape
dataset_info = anime_df.info()
dataset_description = anime_df.describe()

missing_values, dataset_shape, dataset_info, dataset_description

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12294 non-null  int64  
 1   name      12294 non-null  object 
 2   genre     12232 non-null  object 
 3   type      12269 non-null  object 
 4   episodes  12294 non-null  object 
 5   rating    12064 non-null  float64
 6   members   12294 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 672.5+ KB


(anime_id      0
 name          0
 genre        62
 type         25
 episodes      0
 rating      230
 members       0
 dtype: int64,
 (12294, 7),
 None,
            anime_id        rating       members
 count  12294.000000  12064.000000  1.229400e+04
 mean   14058.221653      6.473902  1.807134e+04
 std    11455.294701      1.026746  5.482068e+04
 min        1.000000      1.670000  5.000000e+00
 25%     3484.250000      5.880000  2.250000e+02
 50%    10260.500000      6.570000  1.550000e+03
 75%    24794.500000      7.180000  9.437000e+03
 max    34527.000000     10.000000  1.013917e+06)

In [5]:
# Fill missing values in the 'rating' column with the median rating
median_rating = anime_df['rating'].median()
anime_df['rating'].fillna(median_rating, inplace=True)

# missing values in 'rating' column are filled
anime_df['rating'].isnull().sum()

0

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Convert 'genre' into numerical format using TF-IDF
tfidf = TfidfVectorizer(stop_words='english')
# TF-IDF vectorizer to the 'genre' column
anime_df['genre'] = anime_df['genre'].fillna('Unknown')  # Handle any remaining missing genres
tfidf_matrix_genre = tfidf.fit_transform(anime_df['genre'])
# shape of the TF-IDF matrix for genres
tfidf_matrix_genre.shape

(12294, 47)

In [7]:
# Convert 'type' into numerical format using one-hot encoding
anime_df['type'] = anime_df['type'].fillna('Unknown')  # Handle any missing values
type_one_hot = pd.get_dummies(anime_df['type'], prefix='type')
# shape of the one-hot encoded 'type' matrix
type_one_hot.shape

(12294, 7)

In [8]:
from sklearn.preprocessing import MinMaxScaler

# Normalize the 'rating' column using MinMaxScaler
scaler = MinMaxScaler()
anime_df['rating_normalized'] = scaler.fit_transform(anime_df[['rating']])
# normalized rating
anime_df[['rating', 'rating_normalized']].head()

Unnamed: 0,rating,rating_normalized
0,9.37,0.92437
1,9.26,0.911164
2,9.25,0.909964
3,9.17,0.90036
4,9.16,0.89916


In [9]:
import numpy as np
from scipy.sparse import hstack

# Combine the TF-IDF genre matrix, one-hot encoded 'type', and normalized 'rating'
# Convert 'rating_normalized' to a sparse matrix and concatenate with other features
rating_matrix = np.array(anime_df['rating_normalized']).reshape(-1, 1)
# Combine all feature matrices: genre (TF-IDF), type (one-hot), and rating (normalized)
combined_features = hstack([tfidf_matrix_genre, type_one_hot, rating_matrix])
# shape of the combined feature matrix
combined_features.shape

(12294, 55)

In [10]:
from sklearn.metrics.pairwise import cosine_similarity

# cosine similarity matrix based on the combined features
cosine_sim = cosine_similarity(combined_features, combined_features)

# shape of the cosine similarity matrix
cosine_sim.shape

(12294, 12294)

In [11]:
# cosine similarity matrix
cosine_sim = cosine_similarity(combined_features, combined_features)

In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the dataset again
file_path = r"D:\Excelr\Data Science\Data Science Assignment\Recommendation System\Recommendation System\anime.csv"
anime_df = pd.read_csv(file_path)
# Fill missing values as done earlier
anime_df['genre'].fillna('Unknown', inplace=True)
anime_df['type'].fillna('Unknown', inplace=True)
median_rating = anime_df['rating'].median()
anime_df['rating'].fillna(median_rating, inplace=True)
# Split the dataset into training and testing sets (20% test size)
train_df, test_df = train_test_split(anime_df, test_size=0.2, random_state=42)
# training and testing sets
train_df.shape, test_df.shape

((9835, 7), (2459, 7))

In [16]:
# Recommendation function to recommend anime from the training set only
def recommend_anime_train_only(anime_title, train_df, cosine_sim, top_n=10):
    if anime_title not in train_df['name'].values:
        return []
    
    # Get the index of the anime that matches the title in the training set
    idx = train_df[train_df['name'] == anime_title].index[0]
    
    # Get the similarity scores of all anime with that anime
    sim_scores = list(enumerate(cosine_sim[idx]))
    
    # Sort the anime based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # Get the indices of the top N similar anime from the training set
    sim_scores = sim_scores[1:top_n+1]  # Exclude the anime itself
    
    # Get the anime names from the training set
    anime_indices = [i[0] for i in sim_scores]
    return train_df[['name']].iloc[anime_indices].values.flatten()

In [17]:
# Define evaluation metrics: Precision, Recall, F1-Score
def calculate_metrics(test_df, train_df, cosine_sim, top_n=10):
    true_positives = 0
    false_positives = 0
    false_negatives = 0
    
    for anime in test_df['name'].values:
        recommended_anime = recommend_anime_train_only(anime, train_df, cosine_sim, top_n=top_n)
        
        # Check for overlap between recommended and test set anime
        if anime in recommended_anime:
            true_positives += 1
        else:
            false_negatives += 1
        
        # False positives would be any recommendations not in the test set
        for rec in recommended_anime:
            if rec not in test_df['name'].values:
                false_positives += 1
    
    # Calculate precision, recall, and F1-score
    precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0
    recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    
    return precision, recall, f1_score

# metrics based on the recommendations
precision, recall, f1_score = calculate_metrics(test_df, train_df, cosine_sim, top_n=10)

print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-Score: {f1_score}")

Precision: 0
Recall: 0.0
F1-Score: 0
