In [None]:
# Recommendation System

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import MinMaxScaler

# Assign the result of pd.read_csv to the variable 'df'
df = pd.read_csv('/content/anime.csv')

In [10]:
df.head()


Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [11]:
df.dtypes

Unnamed: 0,0
anime_id,int64
name,object
genre,object
type,object
episodes,object
rating,float64
members,int64


In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12294 non-null  int64  
 1   name      12294 non-null  object 
 2   genre     12294 non-null  object 
 3   type      12269 non-null  object 
 4   episodes  12294 non-null  object 
 5   rating    12064 non-null  float64
 6   members   12294 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 672.5+ KB


In [9]:
# 1. Handle Missing Values
# Handle missing values in 'genre' by filling with an empty string
df['genre'].fillna('', inplace=True) # Replace anime_data with df

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['genre'].fillna('', inplace=True) # Replace anime_data with df


In [13]:
# Handle missing values in 'rating' by filling with the mean rating
df['rating'].fillna(df['rating'].mean(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['rating'].fillna(df['rating'].mean(), inplace=True)


In [18]:
# Replace 'Unknown' in the 'episodes' column with np.nan, and then convert to float
df['episodes'] = df['episodes'].replace('Unknown', np.nan).astype(float)


In [17]:
# Alternatively to fill NaN values with a default (like 0 or median)
df['episodes'].fillna(0, inplace=True)  # Replace NaN with 0

In [19]:
# 2. Explore and Understand the Data Structure
print("Data Info:")
print(df.info())  # Check column types and non-null values


Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12294 non-null  int64  
 1   name      12294 non-null  object 
 2   genre     12294 non-null  object 
 3   type      12269 non-null  object 
 4   episodes  12294 non-null  float64
 5   rating    12294 non-null  float64
 6   members   12294 non-null  int64  
dtypes: float64(2), int64(2), object(3)
memory usage: 672.5+ KB
None


In [20]:
print("\nData Description:")
print(df.describe())  # Summary statistics for numerical columns



Data Description:
           anime_id      episodes        rating       members
count  12294.000000  12294.000000  12294.000000  1.229400e+04
mean   14058.221653     12.040101      6.473902  1.807134e+04
std    11455.294701     46.257299      1.017096  5.482068e+04
min        1.000000      0.000000      1.670000  5.000000e+00
25%     3484.250000      1.000000      5.900000  2.250000e+02
50%    10260.500000      2.000000      6.550000  1.550000e+03
75%    24794.500000     12.000000      7.170000  9.437000e+03
max    34527.000000   1818.000000     10.000000  1.013917e+06


In [21]:
print("\nMissing Values:")
print(df.isnull().sum())  # Check how many missing values remain (if any)



Missing Values:
anime_id     0
name         0
genre        0
type        25
episodes     0
rating       0
members      0
dtype: int64


In [None]:
# 3. Feature Extraction


In [22]:
# Convert 'genre' to numerical form (One-Hot Encoding)
df['genre_list'] = df['genre'].apply(lambda x: x.split(', '))
genre_dummies = df['genre_list'].str.join('|').str.get_dummies()


In [23]:
# Concatenate genre dummies to the original dataframe
anime_cleaned = pd.concat([df, genre_dummies], axis=1)


In [24]:
# Drop intermediate columns if not needed anymore
anime_cleaned.drop(['genre_list', 'genre'], axis=1, inplace=True)

In [None]:
# 4. Normalize Numerical Features

In [25]:
# Initialize a MinMaxScaler to normalize 'rating', 'episodes', and 'members'
scaler = MinMaxScaler()
anime_cleaned[['rating', 'episodes', 'members']] = scaler.fit_transform(anime_cleaned[['rating', 'episodes', 'members']])


In [26]:
# Verify the changes
print(anime_cleaned.head())  # Display the first few rows of the cleaned and transformed data

   anime_id                              name   type  episodes    rating  \
0     32281                    Kimi no Na wa.  Movie  0.000550  0.924370   
1      5114  Fullmetal Alchemist: Brotherhood     TV  0.035204  0.911164   
2     28977                          Gintama°     TV  0.028053  0.909964   
3      9253                       Steins;Gate     TV  0.013201  0.900360   
4      9969                     Gintama&#039;     TV  0.028053  0.899160   

    members  Action  Adventure  Cars  Comedy  ...  Shounen Ai  Slice of Life  \
0  0.197872       0          0     0       0  ...           0              0   
1  0.782770       1          1     0       0  ...           0              0   
2  0.112689       1          0     0       1  ...           0              0   
3  0.664325       0          0     0       0  ...           0              0   
4  0.149186       1          0     0       1  ...           0              0   

   Space  Sports  Super Power  Supernatural  Thriller  Vampire

In [None]:
# 4. Recommendation System

In [27]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score

In [None]:
#Cosine Similarity Calculation

In [28]:
# Select relevant features for similarity calculation (genres + rating, episodes, members)
features = ['rating', 'episodes', 'members'] + list(genre_dummies.columns)

In [29]:
# Compute cosine similarity based on the selected features
cosine_sim = cosine_similarity(anime_cleaned[features])

In [None]:
# it define a function that will help us recommend similar anime based on this cosine similarity matrix

In [None]:
#Recommendation Function

In [50]:
def get_recommendations(anime_name, threshold=0.8, cosine_sim=cosine_sim, anime_cleaned=anime_cleaned):
    """
    Get anime recommendations based on cosine similarity.

    Args:
        anime_name: The name of the anime to get recommendations for.
        threshold: The minimum similarity score to consider.
        cosine_sim: The cosine similarity matrix.
        anime_cleaned: The cleaned anime DataFrame.

    Returns:
        A DataFrame containing the top 10 most similar anime.
    """
    # Get the index of the anime by its name
    idx = anime_cleaned[anime_cleaned['name'] == anime_name].index[0]

    # Get the similarity scores for the given anime
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the anime based on similarity scores in descending order
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Apply threshold to filter out low-similarity scores (if desired)
    sim_scores = [(i, score) for i, score in sim_scores if score >= threshold]

    # Get the indices of the top 10 most similar anime (excluding the anime itself)
    anime_indices = [i[0] for i in sim_scores if i[0] != idx][:10]

    # Return the top 10 most similar anime along with their similarity scores
    return anime_cleaned.iloc[anime_indices][['name', 'rating']]

# Test the recommendation function, call function with keyword arguments to avoid ambiguity
recommendations = get_recommendations(anime_name="Steins;Gate", anime_cleaned=anime_cleaned, cosine_sim=cosine_sim, threshold=0.6)
print(recommendations)

                                                    name    rating
59            Steins;Gate Movie: Fuka Ryouiki no Déjà vu  0.833133
126                Steins;Gate: Oukoubakko no Poriomania  0.815126
196    Steins;Gate: Kyoukaimenjou no Missing Link - D...  0.800720
10898                                      Steins;Gate 0  0.576699
5126                                       Under the Dog  0.585834
5525                                        Loups=Garous  0.571429
6889                                  Loups=Garous Pilot  0.504202
5283                   Final Fantasy: The Spirits Within  0.579832
1578            Sakasama no Patema: Beginning of the Day  0.699880
1594                                  Mai-Otome 0: S.ifr  0.698679


In [None]:
# Evaluation

In [51]:
# Split the dataset into training and testing sets
train_data, test_data = train_test_split(anime_cleaned, test_size=0.2, random_state=42)


In [52]:
# Let's evaluate the recommendation system using precision, recall, and F1-score

def evaluate_recommendation_system(test_data, train_data, cosine_sim, threshold=0.5):
    y_true = []  # Actual genres of anime in test set
    y_pred = []  # Predicted genres based on recommendations

    for idx, anime in test_data.iterrows():
        # Get the true genre of the anime in the test set (assumed to be binary)
        true_genre = set(anime.index[7:])  # Assuming genres start from index 7


In [66]:
def evaluate_recommendation_system(test_data, train_data, cosine_sim, threshold=0.5):
    y_true = []  # Actual genres of anime in test set
    y_pred = []  # Predicted genres based on recommendations

    # Assuming that the genre columns are one-hot encoded and start after specific columns
    genre_columns = train_data.columns[train_data.columns.str.contains('|'.join(genre_dummies.columns))]  # All genre columns

    for idx, anime in test_data.iterrows():
        # Get the true genre of the anime in the test set (binary values for genres)
        # Iterate through genre columns to find those with value 1 for the current anime
        true_genre = set()
        for genre_col in genre_columns:
            if anime[genre_col] == 1:
                true_genre.add(genre_col)

        # Get recommendations for the anime using the training data
        recommendations = get_recommendations(anime['name'], train_data, cosine_sim, threshold)

        # Get the genres of the recommended anime (binary)
        # Assuming recommendations.index gives you the indices of recommended anime in train_data
        predicted_genre = set()
        for genre_col in genre_columns:
            if any(train_data.loc[recommendations.index, genre_col] == 1):
                predicted_genre.add(genre_col)

        # Calculate true positives (matching genres)
        true_positive = len(true_genre & predicted_genre)
        false_positive = len(predicted_genre - true_genre)
        false_negative = len(true_genre - predicted_genre)

        # For each anime, append the number of true positives and predicted positives
        y_true.append(true_positive)
        y_pred.append(len(predicted_genre))

    # Compute precision, recall, and F1-score
    from sklearn.metrics import precision_score, recall_score, f1_score  # Importing necessary metrics
    precision = precision_score(y_true, y_pred, average='macro', zero_division=1)  # Handling zero division
    recall = recall_score(y_true, y_pred, average='macro', zero_division=1)
    f1 = f1_score(y_true, y_pred, average='macro', zero_division=1)

    return precision, recall, f1

# Interview Questions:

# 1. Can you explain the difference between user-based and item-based collaborative filtering?

User-Based Collaborative Filtering
Focus: Recommends items based on the preferences of similar users.

How It Works:

1.Finds users who are similar to the target user based on their ratings.
2.Recommends items that those similar users liked.
Example: If User A and User B have similar tastes, and User A liked a movie that User B hasn't seen, that movie might be recommended to User B.

Item-Based Collaborative Filtering

Focus: Recommends items based on the similarity of items themselves.

How It Works:

1.Finds items similar to those the target user has liked or rated highly.
2.Recommends those similar items to the user.
Example: If a user liked Movie A, and Movie A is similar to Movie B, then Movie B might be recommended to that user.


# 2. What is collaborative filtering, and how does it work?

What is Collaborative Filtering?

Collaborative Filtering is a technique used in recommendation systems to suggest items (like movies, products, or music) to users based on the preferences of other users. It relies on the idea that if users agree in the past, they will likely agree in the future.

How Does It Work?

1.Data Collection:

Gather data on users and their interactions with items. This can include ratings, likes, purchases, or clicks.

2.User-Item Matrix:

Create a matrix where rows represent users and columns represent items. Each cell contains a score showing how much a user likes an item (like a rating).

3.Calculate Similarity:

Determine how similar users are to each other or how similar items are to each other using methods like:
Cosine Similarity: Measures the angle between two user/item vectors.
Pearson Correlation: Measures how closely two users/items relate to each other.

4.Make Recommendations:

User-Based: Find users similar to the target user and recommend items they liked.
Item-Based: Find items similar to those the user has already liked and recommend those.

Example in Simple Terms
1.If User A and User B both liked similar movies, and User A liked a new movie, then that new movie might be recommended to User B.

2.If Movie A is similar to Movie B, and a user liked Movie A, then Movie B might be suggested to that user.
