In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.metrics import precision_score, recall_score, f1_score


In [3]:
# Load the dataset
anime_df = pd.read_csv('/content/anime.csv')

# Display the first few rows
anime_df.head()


Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [4]:
# Check for missing values
print(anime_df.isnull().sum())

# Fill missing values in 'rating' with the mean value
anime_df['rating'].fillna(anime_df['rating'].mean(), inplace=True)

# Drop any remaining rows with missing values (if necessary)
anime_df.dropna(inplace=True)


anime_id      0
name          0
genre        62
type         25
episodes      0
rating      230
members       0
dtype: int64


In [5]:
# Dataset information
anime_df.info()

# Summary statistics
anime_df.describe()


<class 'pandas.core.frame.DataFrame'>
Index: 12210 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12210 non-null  int64  
 1   name      12210 non-null  object 
 2   genre     12210 non-null  object 
 3   type      12210 non-null  object 
 4   episodes  12210 non-null  object 
 5   rating    12210 non-null  float64
 6   members   12210 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 763.1+ KB


Unnamed: 0,anime_id,rating,members
count,12210.0,12210.0,12210.0
mean,13936.486486,6.478195,18178.71
std,11398.045316,1.015732,54989.78
min,1.0,1.67,5.0
25%,3460.25,5.9,229.0
50%,10168.5,6.55,1571.0
75%,24442.5,7.17,9530.0
max,34527.0,10.0,1013917.0


In [6]:
# Extract features
features = anime_df[['genre', 'rating', 'episodes']]


In [7]:
# Label encode the genre column
encoder = LabelEncoder()
anime_df['genre_encoded'] = encoder.fit_transform(anime_df['genre'])

# Update features with the encoded genre
features = anime_df[['genre_encoded', 'rating', 'episodes']]


In [10]:
# Normalize numerical features
scaler = MinMaxScaler()

# Select only numerical columns for scaling
numerical_features = features[['rating', 'episodes']]

# Convert 'episodes' column to numeric, handling non-numeric values
numerical_features['episodes'] = pd.to_numeric(numerical_features['episodes'], errors='coerce')  # 'coerce' will replace non-numeric values with NaN

# Drop rows with NaN values in 'episodes' resulting from the conversion
numerical_features.dropna(inplace=True)

# Now scale the numerical features
features_scaled = scaler.fit_transform(numerical_features)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  numerical_features['episodes'] = pd.to_numeric(numerical_features['episodes'], errors='coerce')  # 'coerce' will replace non-numeric values with NaN
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  numerical_features.dropna(inplace=True)


In [12]:
# Compute cosine similarity
cos_sim = cosine_similarity(features_scaled)

# Get the names corresponding to the remaining rows after preprocessing
remaining_names = numerical_features.index  # Get indices from the preprocessed numerical features

# Create a DataFrame to hold similarity values, using the correct names
similarity_df = pd.DataFrame(cos_sim, index=remaining_names, columns=remaining_names)

In [14]:
def recommend_anime(anime_name, top_n=10, threshold=0.5):
    # Handle the case where the anime name might not be in the similarity DataFrame
    if anime_name not in similarity_df.columns:
        return f"Anime '{anime_name}' not found in the dataset."  # Return a message if anime not found

    # Get the similarity scores for the target anime
    sim_scores = similarity_df[anime_name]

    # Filter out anime with a similarity score below the threshold
    sim_scores = sim_scores[sim_scores >= threshold]

    # Sort and get the top N similar anime
    top_anime = sim_scores.sort_values(ascending=False).head(top_n)

    return top_anime.index.tolist()  # Return recommendations if anime is found

# Example: Recommend similar anime to a specific title
recommend_anime('Naruto')

"Anime 'Naruto' not found in the dataset."

In [15]:
train_df, test_df = train_test_split(anime_df, test_size=0.2, random_state=42)


In [16]:
# Assume a function 'evaluate_recommendations' that checks the presence of test anime in recommendations
def evaluate_recommendations(test_df, top_n=10, threshold=0.5):
    y_true = []
    y_pred = []

    for anime in test_df['name']:
        recommended = recommend_anime(anime, top_n, threshold)
        y_true.append(anime)
        y_pred.append(recommended)

    # Calculate evaluation metrics
    precision = precision_score(y_true, y_pred, average='micro')
    recall = recall_score(y_true, y_pred, average='micro')
    f1 = f1_score(y_true, y_pred, average='micro')

    return precision, recall, f1

# Evaluate the system
precision, recall, f1 = evaluate_recommendations(test_df)
print(f'Precision: {precision}\nRecall: {recall}\nF1 Score: {f1}')


Precision: 0.0
Recall: 0.0
F1 Score: 0.0
