In [10]:
# --- 1. Import Libraries ---
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score

# --- 2. Load Dataset ---
df = pd.read_csv("/content/anime.csv")
print("Original Data Shape:", df.shape)

# --- 3. Data Cleaning ---
# Drop duplicate titles, then renumber the rows 0..n-1.
# The feature matrices and the similarity matrix built below are ordered by
# row position; without the reset, the index labels keep gaps where the
# duplicates were removed and no longer agree with those positions.
df = df.drop_duplicates(subset='name').reset_index(drop=True)

# Fill missing values
df['genre'] = df['genre'].fillna('')
df['type'] = df['type'].fillna('Unknown')
# Non-numeric episode counts are coerced to NaN and then stored as 0.
df['episodes'] = pd.to_numeric(df['episodes'], errors='coerce').fillna(0).astype(int)
df['rating'] = df['rating'].fillna(df['rating'].mean())
df['members'] = df['members'].fillna(0)

print("After Cleaning Shape:", df.shape)

# --- 4. Feature Engineering ---
# Text feature: combine genre + type into one string per title
df['text_features'] = df['genre'] + " " + df['type']

# Vectorize text features (token counts per title)
vectorizer = CountVectorizer()
text_matrix = vectorizer.fit_transform(df['text_features'])

# Numeric features: episodes, rating, members, each rescaled by MinMaxScaler
scaler = MinMaxScaler()
num_matrix = scaler.fit_transform(df[['episodes', 'rating', 'members']])

# Combine text and numeric features column-wise; row i still describes df row i
from scipy.sparse import hstack
final_matrix = hstack([text_matrix, num_matrix])

# --- 5. Cosine Similarity Matrix ---
# cosine_sim[i, j] is the similarity between the titles at row positions i and j
cosine_sim = cosine_similarity(final_matrix, final_matrix)

# --- 6. Recommendation Function ---
def recommend(anime_title, top_n=5, threshold=None):
    """Return the titles most similar to ``anime_title``.

    Similarities are read from the module-level ``cosine_sim`` matrix, whose
    rows and columns follow the row order of the module-level ``df``.

    Args:
        anime_title: Exact value of the ``name`` column to look up.
        top_n: Maximum number of recommendations to return.
        threshold: Optional minimum similarity; ``None`` disables filtering.
            An explicit ``0`` / ``0.0`` is honoured as a real threshold.

    Returns:
        A DataFrame with the columns name, genre, type, episodes, rating and
        members, ordered from most to least similar, or a message string
        when the title is not present in ``df``.
    """
    # Look the title up by row *position*: cosine_sim is positional, while the
    # DataFrame's index labels may have gaps (e.g. after dropping duplicates).
    matches = np.flatnonzero((df['name'] == anime_title).to_numpy())
    if matches.size == 0:
        return f"Anime '{anime_title}' not found in dataset."
    query_pos = int(matches[0])

    scores = np.asarray(cosine_sim[query_pos], dtype=float)

    # Highest similarity first; the stable sort keeps tied rows in row order.
    ranked = np.argsort(-scores, kind='stable')

    # Exclude the query explicitly instead of assuming it sorts first, since
    # another title with an identical feature vector can tie with it.
    ranked = ranked[ranked != query_pos]

    if threshold is not None:
        ranked = ranked[scores[ranked] >= threshold]

    top_positions = ranked[:top_n]
    return df.iloc[top_positions][['name', 'genre', 'type', 'episodes', 'rating', 'members']]

# --- 7. Train-Test Split for Evaluation ---
train, test = train_test_split(df['name'], test_size=0.2, random_state=42)

# Build the held-out title lookup once so each membership check below is a
# hash lookup rather than a scan over the whole test split.
test_titles = set(test)

# Dummy evaluation: check if recommended titles appear in test set.
# NOTE: y_true can only be 1 when rec_list is non-empty, and a non-empty
# rec_list also sets y_pred to 1, so this scheme never produces a false
# negative. Recall therefore cannot fall below 1.0 whenever a positive exists;
# treat these numbers as a smoke test, not a quality measure.
y_true, y_pred = [], []
for anime in train.sample(10, random_state=42):
    recs = recommend(anime, top_n=10)
    # recommend() returns a message string when the title is unknown
    rec_list = recs['name'].tolist() if isinstance(recs, pd.DataFrame) else []

    y_true.append(1 if any(title in test_titles for title in rec_list) else 0)
    y_pred.append(1 if rec_list else 0)

# --- 8. Metrics ---
precision = precision_score(y_true, y_pred, zero_division=0)
recall = recall_score(y_true, y_pred, zero_division=0)
f1 = f1_score(y_true, y_pred, zero_division=0)

print(f"Precision: {precision:.2f}")
print(f"Recall:    {recall:.2f}")
print(f"F1-Score:  {f1:.2f}")

# --- 9. Example Output ---
print("\nRecommendations for 'Naruto':\n")
print(recommend("Naruto", top_n=5))


Original Data Shape: (12294, 7)
After Cleaning Shape: (12292, 7)
Precision: 0.80
Recall:    1.00
F1-Score:  0.89

Recommendations for 'Naruto':

                    name                                              genre  \
615   Naruto: Shippuuden  Action, Comedy, Martial Arts, Shounen, Super P...   
206        Dragon Ball Z  Action, Adventure, Comedy, Fantasy, Martial Ar...   
588      Dragon Ball Kai  Action, Adventure, Comedy, Fantasy, Martial Ar...   
1930   Dragon Ball Super  Action, Adventure, Comedy, Fantasy, Martial Ar...   
2615          Medaka Box  Action, Comedy, Ecchi, Martial Arts, School, S...   

     type  episodes  rating  members  
615    TV         0    7.94   533578  
206    TV       291    8.32   375662  
588    TV        97    7.95   116832  
1930   TV         0    7.40   111443  
2615   TV        12    7.21   110042  


Interview Questions

1. Can you explain the difference between user-based and item-based collaborative filtering?

In user-based collaborative filtering, the system finds users with similar tastes or behavior to the target user and recommends items those similar users liked. In item-based collaborative filtering, the system finds items similar to what the target user has liked before and recommends those similar items; the similarity between items is computed from how all users have rated or interacted with them, rather than by searching for look-alike users.


2. What is collaborative filtering, and how does it work?

Collaborative filtering is a recommendation method that predicts a user’s interests by learning from patterns in past behavior, such as ratings, purchases, or clicks. It works by finding similarities—either between users or between items—and using those similarities to suggest new items that the user is likely to enjoy.