In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler,MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score

In [2]:
df=pd.read_csv('anime.csv')

In [3]:
df

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266
...,...,...,...,...,...,...,...
12289,9316,Toushindai My Lover: Minami tai Mecha-Minami,Hentai,OVA,1,4.15,211
12290,5543,Under World,Hentai,OVA,1,4.28,183
12291,5621,Violence Gekiga David no Hoshi,Hentai,OVA,4,4.88,219
12292,6133,Violence Gekiga Shin David no Hoshi: Inma Dens...,Hentai,OVA,1,4.98,175


In [4]:
df.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12294 non-null  int64  
 1   name      12294 non-null  object 
 2   genre     12232 non-null  object 
 3   type      12269 non-null  object 
 4   episodes  12294 non-null  object 
 5   rating    12064 non-null  float64
 6   members   12294 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 672.5+ KB


In [6]:
df.shape

(12294, 7)

In [7]:
df.dtypes

anime_id      int64
name         object
genre        object
type         object
episodes     object
rating      float64
members       int64
dtype: object

In [8]:
df.describe()

Unnamed: 0,anime_id,rating,members
count,12294.0,12064.0,12294.0
mean,14058.221653,6.473902,18071.34
std,11455.294701,1.026746,54820.68
min,1.0,1.67,5.0
25%,3484.25,5.88,225.0
50%,10260.5,6.57,1550.0
75%,24794.5,7.18,9437.0
max,34527.0,10.0,1013917.0


In [9]:
df.isnull().sum()

anime_id      0
name          0
genre        62
type         25
episodes      0
rating      230
members       0
dtype: int64

In [10]:
# Handle Missing Values
df['type'] = df['type'].fillna('type')
df.dropna(subset=['anime_id', 'type','rating',], inplace=True) 

In [11]:
df['type']

0        Movie
1           TV
2           TV
3           TV
4           TV
         ...  
12289      OVA
12290      OVA
12291      OVA
12292      OVA
12293    Movie
Name: type, Length: 12064, dtype: object

In [12]:
df['genre'] = df['genre'].fillna('unkown')
df['genre']

0                     Drama, Romance, School, Supernatural
1        Action, Adventure, Drama, Fantasy, Magic, Mili...
2        Action, Comedy, Historical, Parody, Samurai, S...
3                                         Sci-Fi, Thriller
4        Action, Comedy, Historical, Parody, Samurai, S...
                               ...                        
12289                                               Hentai
12290                                               Hentai
12291                                               Hentai
12292                                               Hentai
12293                                               Hentai
Name: genre, Length: 12064, dtype: object

In [13]:
# Preprocess Genres
df['genre'] = df['genre'].str.split(', ')
df['genre']

0                   [Drama, Romance, School, Supernatural]
1        [Action, Adventure, Drama, Fantasy, Magic, Mil...
2        [Action, Comedy, Historical, Parody, Samurai, ...
3                                       [Sci-Fi, Thriller]
4        [Action, Comedy, Historical, Parody, Samurai, ...
                               ...                        
12289                                             [Hentai]
12290                                             [Hentai]
12291                                             [Hentai]
12292                                             [Hentai]
12293                                             [Hentai]
Name: genre, Length: 12064, dtype: object

In [14]:
# One-hot encode genres
mlb = MultiLabelBinarizer()
mlb

In [15]:
genres_encoded = mlb.fit_transform(df['genre'])
genres_encoded

array([[0, 0, 0, ..., 0, 0, 0],
       [1, 1, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [32]:
# One-Hot Encode Broadcast Type
broadcast_encoded = pd.get_dummies(df['anime_id'], prefix='broadcast')
broadcast_encoded

Unnamed: 0,broadcast_1,broadcast_5,broadcast_6,broadcast_7,broadcast_8,broadcast_15,broadcast_16,broadcast_17,broadcast_18,broadcast_19,...,broadcast_34412,broadcast_34447,broadcast_34453,broadcast_34464,broadcast_34475,broadcast_34476,broadcast_34490,broadcast_34503,broadcast_34514,broadcast_34519
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12289,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
12290,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
12291,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
12292,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [34]:
# Scale Numerical Features
scaler = MinMaxScaler()
scaler

In [38]:
df['rating_scaled'] = scaler.fit_transform(df[['rating']])
print(df['rating_scaled'] )

0        0.924370
1        0.911164
2        0.909964
3        0.900360
4        0.899160
           ...   
12289    0.297719
12290    0.313325
12291    0.385354
12292    0.397359
12293    0.454982
Name: rating_scaled, Length: 12064, dtype: float64


In [40]:
df['community_members'] = scaler.fit_transform(df[['members']])
df['community_members']

0        0.197867
1        0.782769
2        0.112683
3        0.664323
4        0.149180
           ...   
12289    0.000196
12290    0.000169
12291    0.000204
12292    0.000161
12293    0.000128
Name: community_members, Length: 12064, dtype: float64

In [44]:
# Combine Features
features = np.hstack((genres_encoded,broadcast_encoded.values, df[['rating_scaled', 'community_members']].values))
print(features)

[[0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 0.00000000e+00
  9.24369748e-01 1.97866664e-01]
 [1.00000000e+00 1.00000000e+00 0.00000000e+00 ... 0.00000000e+00
  9.11164466e-01 7.82768603e-01]
 [1.00000000e+00 0.00000000e+00 0.00000000e+00 ... 0.00000000e+00
  9.09963986e-01 1.12683141e-01]
 ...
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 0.00000000e+00
  3.85354142e-01 2.04161139e-04]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 0.00000000e+00
  3.97358944e-01 1.60764569e-04]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 0.00000000e+00
  4.54981993e-01 1.28217141e-04]]


### Recommendation System:

In [46]:
from sklearn.metrics.pairwise import cosine_similarity

In [48]:

# Compute cosine similarity matrix
cosine_sim = cosine_similarity(features)

In [52]:
# Create a function to recommend similar anime
def recommend_anime(title, df, sim_matrix, top_n=5):
    idx = df.index[df['name'] == title].tolist()
    if not idx:
        return "Anime not found."
    
    idx = idx[0]
    sim_scores = list(enumerate(sim_matrix[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:top_n+1]  # Exclude itself
    
    anime_indices = [i[0] for i in sim_scores]
    return df.iloc[anime_indices][['name', 'genre', 'rating']]

# Example usage
recommend_anime("Naruto",df, cosine_sim)

Unnamed: 0,name,genre,rating
615,Naruto: Shippuuden,"[Action, Comedy, Martial Arts, Shounen, Super ...",7.94
486,Boruto: Naruto the Movie,"[Action, Comedy, Martial Arts, Shounen, Super ...",8.03
1472,Naruto: Shippuuden Movie 4 - The Lost Tower,"[Action, Comedy, Martial Arts, Shounen, Super ...",7.53
1573,Naruto: Shippuuden Movie 3 - Hi no Ishi wo Tsu...,"[Action, Comedy, Martial Arts, Shounen, Super ...",7.5
1343,Naruto x UT,"[Action, Comedy, Martial Arts, Shounen, Super ...",7.58


### Evaluation:

In [56]:
# Split dataset (dummy approach, actual evaluation varies)
train, test = train_test_split(df, test_size=0.2, random_state=42)

In [58]:
# Check performance on test set (example using a simple threshold)
threshold = 0.5
predictions = (cosine_sim > threshold).astype(int)

In [60]:
# Assuming ground truth relevance (dummy values)
true_labels = np.random.randint(0, 2, size=predictions.shape)  # Replace with real labels

In [62]:
precision = precision_score(true_labels.flatten(), predictions.flatten(), average='weighted')
recall = recall_score(true_labels.flatten(), predictions.flatten(), average='weighted')
f1 = f1_score(true_labels.flatten(), predictions.flatten(), average='weighted')

In [63]:
print(f'Precision: {precision}, Recall: {recall}, F1-score: {f1}')

Precision: 0.5001096712994334, Recall: 0.4999984402923576, F1-score: 0.36059346785397717


### Interview Questions: